In [1]:
from abc import ABC, abstractmethod
import re
from gensim.models import Word2Vec
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from sacremoses import MosesTokenizer
import Levenshtein
import spacy
import nltk
import pickle
import urllib
import os
import tarfile
import zipfile
from gensim import models
import seaborn as sns
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import numpy as np
from string import punctuation
import matplotlib.pyplot as plt
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords
# Root folder that holds the datasets and the cached dumps. It defaults to the
# original author's local checkout; set the SIMPLETEXT_RESOURCES_DIR environment
# variable before starting the kernel to run the notebook from another machine.
RESOURCES_DIR = Path(os.environ.get(
    "SIMPLETEXT_RESOURCES_DIR",
    "C:/Users/Antonio/PycharmProjects/simpleTextCLEF/resources",
))
DATASETS_PATH = RESOURCES_DIR / "datasets"
# Base name (no extension) of the embeddings file used to build the word-rank table.
WORD_EMBEDDINGS_NAME = "glove.42B.300d"
# Folder for downloaded archives, extracted vectors and pickled caches.
DUMPS_DIR = RESOURCES_DIR / "DUMPS"

# Rebind `stopwords` from the NLTK corpus reader imported above to a plain set
# of the English stopwords. From here on the name refers to the set (used for
# membership tests in WordRankRatio._remove_stopwords), not to the corpus reader.
stopwords = set(stopwords.words("english"))

In [2]:
def convert_glove_to_word2vec(path_glove):
    """Convert a GloVe text file to word2vec text format and load the result.

    The converted copy is written to ``DUMPS_DIR / "test_word2vec.txt"``
    (replacing any earlier conversion) and then loaded from there.

    Args:
        path_glove: Location (``str`` or ``Path``) of the GloVe ``.txt`` file.
            A relative path is resolved against the current working directory.

    Returns:
        The ``KeyedVectors`` loaded from the converted file.
    """
    # Use the paths directly. gensim.test.utils.datapath/get_tmpfile are test
    # fixtures: the former resolves relative names against gensim's bundled
    # test data, the latter creates a fresh temporary directory on every call.
    glove_file = str(path_glove)
    DUMPS_DIR.mkdir(parents=True, exist_ok=True)
    word2vec_file = str(DUMPS_DIR / "test_word2vec.txt")
    glove2word2vec(glove_file, word2vec_file)
    return KeyedVectors.load_word2vec_format(word2vec_file)

In [3]:
def ControlDivisionByZero(numerator, denominator):
    """Return ``numerator / denominator``, or 0 when the denominator equals zero."""
    if denominator == 0:
        return 0
    return numerator / denominator

class WordRankRatio():
    """Scores the lexical complexity of a text from word ranks.

    A word's rank is its line index in the embeddings text file named by
    ``WORD_EMBEDDINGS_NAME`` (presumably ordered from most to least frequent
    word -- confirm for the embeddings in use). The word-to-rank table is built
    once and cached as a pickle in ``DUMPS_DIR``.
    """

    # Archive URLs of the GloVe releases that `_download_glove` can fetch.
    _GLOVE_URLS = {
        'glove.6B': 'http://nlp.stanford.edu/data/glove.6B.zip',
        'glove.42B.300d': 'http://nlp.stanford.edu/data/glove.42B.300d.zip',
        'glove.840B.300d': 'http://nlp.stanford.edu/data/glove.840B.300d.zip',
        'glove.twitter.27B': 'http://nlp.stanford.edu/data/glove.twitter.27B.zip',
    }

    def __init__(self):
        """Create the tokenizer and load (or build on first use) the rank table."""
        self.tokenizer = MosesTokenizer(lang='en')
        self.word2rank = self._get_word2rank()
        self.length_rank = len(self.word2rank)

    def calculate_ratio(self, simple_text, original_text):
        """Return the complexity score of ``simple_text`` relative to ``original_text``.

        The ratio is capped at 2 and rounded to two decimals. It is 0 when the
        score of ``original_text`` is 0.
        """
        ratio = ControlDivisionByZero(self.get_lexical_complexity_score(simple_text),
                                      self.get_lexical_complexity_score(original_text))
        return round(min(ratio, 2), 2)

    def get_lexical_complexity_score(self, sentence):
        """Return the first quartile of ``log(1 + rank)`` over the sentence's words.

        Punctuation tokens, English stopwords and words missing from the rank
        table are ignored. When nothing is left, ``log(1 + table size)`` is
        returned instead.
        """
        words = self.tokenizer.tokenize(self._remove_stopwords(self._remove_punctuation(sentence)))
        words = [word for word in words if word in self.word2rank]
        if len(words) == 0:
            return np.log(1 + self.length_rank)
        return np.quantile([self._get_rank(word) for word in words], 0.25)

    def _remove_punctuation(self, text):
        """Re-join the tokens of ``text``, dropping tokens made only of punctuation."""
        return ' '.join([word for word in self.tokenizer.tokenize(text) if not self._is_punctuation(word)])

    def _remove_stopwords(self, text):
        """Re-join the tokens of ``text``, dropping (case-insensitively) the stopwords."""
        return ' '.join([w for w in self.tokenizer.tokenize(text) if w.lower() not in stopwords])

    def _is_punctuation(self, word):
        """Return True when every character of ``word`` is in ``string.punctuation``."""
        return ''.join([char for char in word if char not in punctuation]) == ''

    def _get_rank(self, word):
        """Return ``log(1 + rank)`` of ``word``; unknown words get the table size as rank."""
        rank = self.word2rank.get(word, self.length_rank)
        return np.log(1 + rank)

    def _get_word2rank(self, vocab_size=np.inf):
        """Load the cached word-to-rank table, building it on the first run.

        Building downloads and extracts the embeddings named by
        ``WORD_EMBEDDINGS_NAME``, maps the first field of each line to its line
        index, pickles the table and then deletes the archive and text file.

        Args:
            vocab_size: Maximum number of lines to rank. Only applied when the
                table is built; an existing cache is returned unchanged.

        Returns:
            dict mapping word -> rank (0-based line index).
        """
        model_filepath = DUMPS_DIR / f"{WORD_EMBEDDINGS_NAME}.pk"
        if model_filepath.exists():
            # NOTE: unpickling runs arbitrary code from the file; only load a
            # cache that this notebook wrote itself.
            with open(model_filepath, 'rb') as f:
                return pickle.load(f)

        # The target folder must exist before anything is downloaded into it.
        DUMPS_DIR.mkdir(parents=True, exist_ok=True)
        print(f"Downloading {WORD_EMBEDDINGS_NAME} ...")
        self._download_glove(model_name=WORD_EMBEDDINGS_NAME, dest_dir=str(DUMPS_DIR))
        print("Preprocessing word2rank...")
        txt_file = DUMPS_DIR / f'{WORD_EMBEDDINGS_NAME}.txt'
        zip_file = DUMPS_DIR / f'{WORD_EMBEDDINGS_NAME}.zip'

        # No header line is skipped: ranking starts at the first line of the file.
        lines = self._yield_lines(txt_file)
        word2rank = {}
        try:
            for i, line in enumerate(lines):
                if i >= vocab_size:
                    break
                word2rank[line.split(' ')[0]] = i
        finally:
            # Close the generator (and its file handle) even after an early
            # break, so the text file can be deleted below.
            lines.close()

        with open(model_filepath, 'wb') as f:
            pickle.dump(word2rank, f)
        # The bulky inputs are no longer needed once the table is cached.
        if txt_file.exists():
            txt_file.unlink()
        if zip_file.exists():
            zip_file.unlink()
        return word2rank

    def _download_glove(self, model_name, dest_dir):
        """Download the archive of ``model_name`` into ``dest_dir`` and extract it.

        Extraction is skipped when a ``.txt`` file with the archive's base name
        already exists next to it.

        Raises:
            ValueError: If ``model_name`` is not one of the known GloVe releases.
        """
        if model_name not in self._GLOVE_URLS:
            possible_values = list(self._GLOVE_URLS)
            raise ValueError('Unknown model_name. Possible values are {}'.format(possible_values))
        url = self._GLOVE_URLS[model_name]
        file_path = self._download_url(url, dest_dir)
        archive_path = Path(file_path)
        out_filepath = archive_path.parent / f'{archive_path.stem}.txt'
        if not out_filepath.exists():
            print("Extracting: ", archive_path.name)
            self._unzip(file_path, dest_dir)

    def _yield_lines(self, filepath):
        """Yield the lines of ``filepath`` with trailing whitespace stripped."""
        filepath = Path(filepath)
        # NOTE(review): latin-1 never fails to decode, but if the embeddings file
        # is UTF-8 its non-ASCII tokens are mis-decoded -- confirm this is intended.
        with filepath.open('r', encoding="latin-1") as f:
            for line in f:
                yield line.rstrip()

    def _download_url(self, url, output_path):
        """Download ``url`` into the folder ``output_path`` unless already present.

        Returns:
            The path of the downloaded file, as ``f'{output_path}/{name}'``.
        """
        # The file-level `import urllib` does not by itself load the submodule.
        import urllib.request

        name = url.split('/')[-1]
        file_path = f'{output_path}/{name}'
        if not Path(file_path).exists():
            # Download under a temporary name and rename on success, so an
            # interrupted transfer is never mistaken for a complete archive.
            partial_path = f'{file_path}.part'
            with tqdm(unit='B', unit_scale=True, leave=True, miniters=1,
                      desc=name) as t:
                urllib.request.urlretrieve(url, filename=partial_path,
                                           reporthook=self._download_report_hook(t), data=None)
            os.replace(partial_path, file_path)
        return file_path

    def _unzip(self, file_path, dest_dir=None):
        """Extract a ``.zip``, ``.tar.gz``/``.tgz`` or ``.tar`` archive.

        Args:
            file_path: Path of the archive, as a string.
            dest_dir: Target folder; defaults to the archive's own folder.

        Files with any other extension are left untouched.
        """
        if dest_dir is None:
            dest_dir = os.path.dirname(file_path)
        if file_path.endswith('.zip'):
            with zipfile.ZipFile(file_path, "r") as zip_ref:
                zip_ref.extractall(dest_dir)
        elif file_path.endswith(("tar.gz", "tgz")):
            with tarfile.open(file_path, "r:gz") as tar:
                tar.extractall(dest_dir)
        elif file_path.endswith("tar"):
            with tarfile.open(file_path, "r:") as tar:
                tar.extractall(dest_dir)

    def _download_report_hook(self, t):
        """Build a ``urlretrieve`` report hook that advances the progress bar ``t``."""
        last_block = 0

        def inner(b=1, bsize=1, tsize=None):
            # b: blocks transferred so far, bsize: block size, tsize: total size.
            nonlocal last_block
            if tsize is not None:
                t.total = tsize
            t.update((b - last_block) * bsize)
            last_block = b

        return inner

In [4]:
# Example pair scored in the cells below: an original sentence and a simplified rewrite of it.
complex_sentence = "We introduce Ignition: an end-to-end neural network architecture for training unconstrained self-driving vehicles in simulated environments."
simple_sentence = "Ignition is a neural network for training unconstrained self-driving vehicles in simulated environments."

In [5]:
# Building the scorer loads the pickled rank table from DUMPS_DIR; when no cache
# exists yet it first downloads and preprocesses the GloVe archive.
wordRank = WordRankRatio()

In [6]:
# Lexical complexity score of the original (complex) sentence.
wordRank.get_lexical_complexity_score(complex_sentence)

7.896465224107381

In [7]:
# Score of the simplified sentence; in the recorded run it is lower than the
# score of the complex sentence (about 7.46 vs 7.90).
wordRank.get_lexical_complexity_score(simple_sentence)

7.4619136612316534

In [8]:
# Expected location of the binary word2vec dump loaded in the next cell.
dump_path = DUMPS_DIR / "PubMed-w2v.bin"

In [9]:
# Load the vectors in binary word2vec format. No cell shown here creates this
# file, so it has to be placed in DUMPS_DIR beforehand; the recorded run failed
# with FileNotFoundError because it was missing.
w2v_model = models.KeyedVectors.load_word2vec_format(dump_path, binary=True)

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Antonio\\PycharmProjects\\simpleTextCLEF\\resources\\DUMPS\\PubMed-w2v.bin'

In [None]:
# Display the loaded model object (requires the load cell above to have succeeded).
w2v_model

In [None]:
# Query the PubMed-w2v.bin vectors for the entries most similar to "hypertrophy".
w2v_model.most_similar("hypertrophy")

In [None]:
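# One-off conversion step, left disabled: it writes DUMPS_DIR / "test_word2vec.txt",
# which is the file the next cell loads. WordRankRatio deletes glove.42B.300d.txt once
# its rank cache has been built, so that text file must be present in DUMPS_DIR again
# before this cell is re-enabled.
#glove_embeddings = convert_glove_to_word2vec(DUMPS_DIR / "glove.42B.300d.txt")
#glove_embeddings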

In [None]:
# Load the word2vec-format copy of the GloVe vectors produced by
# convert_glove_to_word2vec (a text file, so the default non-binary loader is used).
glove_path = DUMPS_DIR.joinpath("test_word2vec.txt")
glove_model = KeyedVectors.load_word2vec_format(glove_path)

In [None]:
# Check whether the hyphenated token from the example sentences is in the GloVe
# vocabulary; a missing key makes this lookup fail rather than return a value.
glove_model.key_to_index["self-driving"]

In [None]:
# Same vocabulary check against the PubMed-w2v.bin vectors.
w2v_model.key_to_index["self-driving"]

In [None]:
# get_index() needs the key to look up; called without one it cannot run.
# Probe the same token as the two lookups above, returning -1 instead of
# raising when the token is not in the vocabulary.
glove_model.get_index("self-driving", default=-1)

In [None]:
# Size of the vectors loaded from PubMed-w2v.bin, for comparison with the GloVe model below.
len(w2v_model)

In [None]:
# Size of the converted GloVe vectors.
len(glove_model)

In [None]:
# Neighbours of "Multi-Layer" in the PubMed vectors, followed by the vocabulary
# positions of "Multi-Layer" and "Perceptron".
(
    w2v_model.most_similar("Multi-Layer"),
    w2v_model.get_index("Multi-Layer"),
    w2v_model.get_index("Perceptron"),
)

In [None]:
# Neighbours of "Multi-Layer" in the GloVe vectors, followed by the vocabulary
# positions of "Multi-Layer" and "innovative".
(
    glove_model.most_similar("Multi-Layer"),
    glove_model.get_index("Multi-Layer"),
    glove_model.get_index("innovative"),
)