In [5]:
from sacremoses import MosesDetokenizer, MosesTokenizer
import Levenshtein
import spacy
import numpy as np
import nltk
import os
import tarfile
import zipfile
from tqdm import tqdm
import urllib
nltk.download('stopwords')
from nltk.corpus import stopwords
from pathlib import Path
from string import punctuation
import pickle
stopwords = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Antonio\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Resources

In [4]:
RESOURCES_DIR = Path("../../resources").resolve()

In [5]:
DUMPS_DIR = RESOURCES_DIR / "DUMPS"
WORD_EMBEDDINGS_NAME = "glove.42B.300d"
WORD_FREQUENCY_FILEPATH = RESOURCES_DIR / 'others/enwiki_freq.txt'

In [6]:
DUMPS_DIR

WindowsPath('C:/Users/Antonio/PhD/Simplification_experiments/resources/DUMPS')

In [7]:
def load_dump(filepath):
    return pickle.load(open(filepath, 'rb'))


def dump(obj, filepath):
    pickle.dump(obj, open(filepath, 'wb'))

In [8]:
def yield_lines(filepath):
    filepath = Path(filepath)
    with filepath.open('r', encoding="latin-1") as f:
        for line in f:
            yield line.rstrip()

In [9]:
def download_report_hook(t):
  last_b = [0]
  def inner(b=1, bsize=1, tsize=None):
    if tsize is not None:
        t.total = tsize
    t.update((b - last_b[0]) * bsize)
    last_b[0] = b
  return inner

In [10]:
def download_url(url, output_path):
    name = url.split('/')[-1]
    file_path = f'{output_path}/{name}'
    if not Path(file_path).exists():
        # with DownloadProgressBar(unit='B', unit_scale=True, miniters=1, desc=url.split('/')[-1]) as t:
        #     urllib.request.urlretrieve(url, filename=output_path, reporthook=t.update_to)
        with tqdm(unit='B', unit_scale=True, leave=True, miniters=1,
                  desc=name) as t:  # all optional kwargs
            urllib.request.urlretrieve(url, filename=file_path, reporthook=download_report_hook(t), data=None)
    return file_path

In [11]:
def unzip(file_path, dest_dir=None):
    if dest_dir is None:
        dest_dir = os.path.dirname(file_path)
    if file_path.endswith('.zip'):
        with zipfile.ZipFile(file_path, "r") as zip_ref:
            zip_ref.extractall(dest_dir)
    elif file_path.endswith("tar.gz") or file_path.endswith("tgz"):
        tar = tarfile.open(file_path, "r:gz")
        tar.extractall(dest_dir)
        tar.close()
    elif file_path.endswith("tar"):
        tar = tarfile.open(file_path, "r:")
        tar.extractall(dest_dir)
        tar.close()


In [12]:
def download_glove(model_name, dest_dir):
    url = ''
    if model_name == 'glove.6B':
        url = 'http://nlp.stanford.edu/data/glove.6B.zip'
    elif model_name == 'glove.42B.300d':
        url = 'http://nlp.stanford.edu/data/glove.42B.300d.zip'
    elif model_name == 'glove.840B.300d':
        url = 'http://nlp.stanford.edu/data/glove.840B.300d.zip'
    elif model_name == 'glove.twitter.27B':
        url = 'http://nlp.stanford.edu/data/glove.twitter.27B.zip',
    else:
        possible_values = ['glove.6B', 'glove.42B.300d', 'glove.840B.300d', 'glove.twitter.27B']
        raise ValueError('Unknown model_name. Possible values are {}'.format(possible_values))
    file_path = download_url(url, dest_dir)
    out_filepath = Path(file_path)
    out_filepath = out_filepath.parent / f'{out_filepath.stem}.txt'
    # print(out_filepath, out_filepath.exists())
    if not out_filepath.exists():
        print("Extracting: ", Path(file_path).name)
        unzip(file_path, dest_dir)

In [13]:
print("Downloading glove.42B.300d ...")
download_glove(model_name='glove.42B.300d', dest_dir=str(DUMPS_DIR))

Downloading glove.42B.300d ...


glove.42B.300d.zip: 1.88GB [06:08, 5.09MB/s]                                


Extracting:  glove.42B.300d.zip


Tokenizer

In [14]:
def get_tokenizer():
    return MosesTokenizer(lang='en')


def get_detokenizer():
    return MosesDetokenizer(lang='en')

In [15]:
def tokenize(sentence):
    return get_tokenizer().tokenize(sentence)

In [16]:
def safe_division(a, b):
    return a / b if b else 0

Dependency Tree Ratio Functions

In [6]:
def get_spacy_model():
    model = 'en_core_web_sm'
    if not spacy.util.is_package(model):
        spacy.cli.download(model)
        spacy.cli.link(model, model, force=True, model_path=spacy.util.get_package_path(model))
    return spacy.load(model)

In [10]:
get_spacy_model()

<spacy.lang.en.English at 0x2837a4c3f10>

In [18]:
def spacy_process(text):
        return get_spacy_model()(text)

In [19]:
def get_dependency_tree_depth(sentence):
        def get_subtree_depth(node):
            if len(list(node.children)) == 0:
                return 0
            return 1 + max([get_subtree_depth(child) for child in node.children])

        tree_depths = [get_subtree_depth(spacy_sentence.root) for spacy_sentence in spacy_process(sentence).sents]
        if len(tree_depths) == 0:
            return 0
        return max(tree_depths)

Word Rank Functions

In [20]:
def is_punctuation(word):
    return ''.join([char for char in word if char not in punctuation]) == ''

In [21]:
def remove_punctuation(text):
    return ' '.join([word for word in tokenize(text) if not is_punctuation(word)])


def remove_stopwords(text):
    return ' '.join([w for w in tokenize(text) if w.lower() not in stopwords])

In [22]:
def get_word2rank(vocab_size=np.inf):
    model_filepath = DUMPS_DIR / f"{WORD_EMBEDDINGS_NAME}.pk"
    if model_filepath.exists():
        return load_dump(model_filepath)
    else:
        print("Downloading glove.42B.300d ...")
        download_glove(model_name='glove.42B.300d', dest_dir=str(DUMPS_DIR))
        print("Preprocessing word2rank...")
        DUMPS_DIR.mkdir(parents=True, exist_ok=True)
        WORD_EMBEDDINGS_PATH = DUMPS_DIR / f'{WORD_EMBEDDINGS_NAME}.txt'
        lines_generator = yield_lines(WORD_EMBEDDINGS_PATH)
        word2rank = {}
        # next(lines_generator)
        for i, line in enumerate(lines_generator):
            if i >= vocab_size: break
            word = line.split(' ')[0]
            word2rank[word] = i
        dump(word2rank, model_filepath)
        txt_file = DUMPS_DIR / f'{WORD_EMBEDDINGS_NAME}.txt'
        zip_file = DUMPS_DIR / f'{WORD_EMBEDDINGS_NAME}.zip'
        if txt_file.exists(): txt_file.unlink()
        if zip_file.exists(): zip_file.unlink()
        return word2rank

In [23]:
def get_normalized_rank(word):
    max = len(get_word2rank())
    rank = get_word2rank().get(word, max)
    return np.log(1 + rank) / np.log(1 + max)

In [24]:
def get_word_frequency():
    model_filepath = DUMPS_DIR / f'{WORD_FREQUENCY_FILEPATH.stem}.pk'
    if model_filepath.exists():
        return load_dump(model_filepath)
    else:
        DUMPS_DIR.mkdir(parents=True, exist_ok=True)
        word_freq = {}
        for line in yield_lines(WORD_FREQUENCY_FILEPATH):
            chunks = line.split(' ')
            word = chunks[0]
            freq = int(chunks[1])
            word_freq[word] = freq
        dump(word_freq, model_filepath)
        return word_freq

In [25]:
def get_normalized_frequency(word):
    max = 153141437 # the 153141437, the max frequency
    freq = get_word_frequency().get(word, 0)
    return 1.0 - np.log(1 + freq) / np.log(1 + max)

In [26]:
def get_complexity_score(sentence):
    # words = tokenize(remove_stopwords(remove_punctuation(sentence)))
    words = tokenize(remove_punctuation(sentence))
    words = [word for word in words if word in get_word2rank()]  # remove unknown words
    if len(words) == 0:
        return 1.0

    return np.array([get_normalized_frequency(word.lower()) for word in words]).mean()

In [27]:
def get_lexical_complexity_score(sentence):
        words = tokenize(remove_stopwords(remove_punctuation(sentence)))
        words = [word for word in words if word in get_word2rank()]
        if len(words) == 0:
            return np.log(1 + len(get_word2rank()))
        return np.quantile([get_rank(word) for word in words], 0.75)

In [28]:
def get_rank(word):
    rank = get_word2rank().get(word, len(get_word2rank()))
    return np.log(1 + rank)

In [29]:
sentence = "As early as 1100 , Bishopwearmouth parish included a small fishing village at the southern mouth of the river -LRB- modern day Hendon -RRB- known as ` Soender-land ' -LRB- which evolved into ` Sunderland ' -RRB- ."


In [30]:
words = tokenize(remove_stopwords(remove_punctuation(sentence)))
print(words)
words = [word for word in words if word in get_word2rank()]
print(words)
print(np.quantile([get_rank(word) for word in words], 0.75))

['Mandaean', 'scholar', 'Säve-Söderberg', 'showed', 'Mani', '&amp;', 'amp', ';', 'apos', ';', 'Psalms', 'Thomas', 'closely', 'related', 'Mandaean', 'texts']
['scholar', 'showed', '&amp;', 'amp', ';', 'apos', ';', 'closely', 'related', 'texts']
8.94934839193785


Main Features

In [31]:
def get_word_length_ratio(complex_sentence, simple_sentence):
    return round(safe_division(len(tokenize(simple_sentence)), len(tokenize(complex_sentence))))

In [32]:
def get_char_length_ratio(complex_sentence, simple_sentence):
        return round(safe_division(len(simple_sentence), len(complex_sentence)))

In [33]:
def get_levenshtein_ratio(complex_sentence, simple_sentence):
        return round(Levenshtein.ratio(complex_sentence, simple_sentence))

In [34]:
def get_dependency_tree_depth_ratio(complex_sentence, simple_sentence):
        return round(
            safe_division(get_dependency_tree_depth(simple_sentence),
                          get_dependency_tree_depth(complex_sentence)))

In [35]:
def get_word_rank_ratio(complex_sentence, simple_sentence):
        return round(min(safe_division(get_lexical_complexity_score(simple_sentence),
                                       get_lexical_complexity_score(complex_sentence)), 2))

In [41]:
simple = "A signal transduction in biology , is a cellular mechanism ."

complex = "Sensing of both the external and internal environments at the cellular level relies on signal transduction . Many disease processes , such as diabetes , heart disease , autoimmunity , and cancer arise from defects in signal transduction pathways , further highlighting the critical importance of signal transduction to biology , as well as medicine ."

In [47]:
get_word_length_ratio(complex, simple)

0

In [43]:
get_char_length_ratio(complex, simple)

0

In [44]:
get_levenshtein_ratio(complex, simple)

0

In [45]:
get_dependency_tree_depth_ratio(complex, simple)

0

In [46]:
get_word_rank_ratio(complex, simple)

1

In [1]:
round(4/120,2)

0.03