In [None]:
import os
import pandas as pd
from more_itertools import windowed
from tqdm import tqdm
import editdistance
from tensorflow.keras.preprocessing.text import Tokenizer

# Register `progress_apply` on pandas objects; the apply calls further down
# use it to show a progress bar while they run.
tqdm.pandas()

In [None]:
def _char_ngrams(chars, n, step):
    """Join every window of ``n`` consecutive items of ``chars``.

    Windows start at 0, ``step``, 2 * ``step``, ... A sequence shorter than
    ``n``, or a trailing window cut short because ``step`` > 1, is emitted
    unpadded.
    """
    grams = []
    start = 0
    while start < len(chars):
        grams.append(''.join(chars[start:start + n]))
        # this window already reached the end of the sequence
        if start + n >= len(chars):
            break
        start += step
    return grams


def get_ngrams(text: str, n: int = 3, step=1) -> str:
    """
    Turn a text into a space separated string of tagged character n-grams.

    The text is split on whitespace. Each token is broken into characters
    and wrapped in boundary tags, each tag counting as ONE item of the
    window:

    * ``<BOS>`` opens the first token (beginning of sentence),
    * ``<EOS>`` closes the last token (end of sentence),
    * ``<IOS>`` marks every other token boundary (inside of sentence).

    A single-token text is therefore wrapped as ``<BOS>`` ... ``<EOS>``.
    E.g. ``get_ngrams('ab cd')`` gives ``'<BOS>ab ab<IOS> <IOS>cd cd<EOS>'``.

    :param text: the text to convert. Leading/trailing whitespace is
        ignored.
    :param n: window size, in characters/tags. Applied to every token.
    :param step: distance between the starts of consecutive windows.
        Applied to every token.
    :return: the n-grams of all the tokens joined with a single space, or
        an empty string when ``text`` contains no token.
    :raises ValueError: if ``n`` or ``step`` is smaller than 1.
    """
    if n < 1 or step < 1:
        raise ValueError('n and step must be positive integers')

    # split the sentence in tokens; an empty / blank text gives no tokens
    # and falls through to an empty result.
    tokens = text.split()
    last_idx = len(tokens) - 1

    output = []
    for idx, token in enumerate(tokens):
        # the first token opens the sentence, the last one closes it; every
        # other boundary is an "inside of sentence" one.
        opening = '<BOS>' if idx == 0 else '<IOS>'
        closing = '<EOS>' if idx == last_idx else '<IOS>'

        output.extend(_char_ngrams([opening, *token, closing], n, step))

    return ' '.join(output)

In [None]:
# Read at most the first 200_000 rows of the csv, keeping only the id and
# the two name columns, then discard every row with a missing value.
csv_path = '../../data/toponym_distances.csv'
wanted_columns = ['gid', 'anchor', 'alternate']

df = pd.read_csv(csv_path, usecols=wanted_columns, nrows=200_000)
df.dropna(inplace=True)

In [None]:
# character length of each of the two name columns
for source_col, length_col in (('anchor', 'len_anchor'),
                               ('alternate', 'len_alternate')):
    df[length_col] = df[source_col].str.len()

In [None]:
# tagged character n-grams (get_ngrams defaults) of both name columns
for source_col, ngram_col in (('anchor', 'anchor_3grams'),
                              ('alternate', 'alternate_3grams')):
    df[ngram_col] = df[source_col].progress_apply(get_ngrams)

In [None]:
def _char_level_distance(row):
    """Edit distance between the raw anchor and alternate strings of a row."""
    return editdistance.eval(row['anchor'], row['alternate'])


df['1gram_dist'] = df.progress_apply(_char_level_distance, axis=1)

In [None]:
def _ngram_level_distance(row):
    """Edit distance between the n-gram lists of a row, one n-gram per symbol."""
    anchor_grams = row['anchor_3grams'].split()
    alternate_grams = row['alternate_3grams'].split()
    return editdistance.eval(anchor_grams, alternate_grams)


df['3gram_dist'] = df.progress_apply(_ngram_level_distance, axis=1)

In [None]:
# every n-gram string (anchors first, then alternates) under a fresh 0..N-1 index
all_texts_ngrams = pd.concat(
    [df['anchor_3grams'], df['alternate_3grams']],
    ignore_index=True,
)

In [None]:
# number of n-grams in each anchor
anchor_gram_lists = df['anchor_3grams'].str.split()
df['len_anchor_3grams'] = anchor_gram_lists.map(len)

In [None]:
# peek at the first rows with the columns computed so far
df.head(n=5)

In [None]:
def calculate_tokenizer_distances(texts,
                                  data,
                                  num_words = 10000):
    """
    Edit distance between the tokenizer-encoded n-grams of every row.

    A Keras ``Tokenizer`` is fitted on ``texts``; the ``anchor_3grams`` and
    ``alternate_3grams`` columns of ``data`` are then encoded as sequences
    of integer ids and the edit distance between the two sequences of each
    row is computed.

    :param texts: the n-gram strings the tokenizer vocabulary is fitted on.
    :param data: DataFrame with the ``anchor_3grams`` and
        ``alternate_3grams`` columns (space separated n-grams).
    :param num_words: passed to ``Tokenizer(num_words=...)``; n-grams that
        fall outside the kept vocabulary are encoded as ``<OOV>``.
    :return: Series of distances carrying the SAME index as ``data``, so
        that ``data[col] = result`` lines up row by row even when the index
        of ``data`` has gaps (e.g. after a ``dropna``).
    """
    # filters='' keeps the '<' / '>' of the boundary tags.
    # NOTE(review): lower=True lower-cases the n-grams, so this distance is
    # case-insensitive, unlike one computed on the raw n-gram strings.
    # Confirm this is intended.
    tokenizer = Tokenizer(
        filters='',
        lower=True,
        split=' ',
        char_level=False,
        num_words=num_words,
        oov_token='<OOV>')

    tokenizer.fit_on_texts(texts=texts)

    # Build the intermediate frame on the index of `data`. A default
    # RangeIndex here would make the returned Series align by label with
    # the wrong rows of `data` as soon as its index is not 0..N-1.
    res = pd.DataFrame(index=data.index)
    res['anchor_seqs'] = pd.Series(
        tokenizer.texts_to_sequences(data['anchor_3grams']),
        index=data.index)
    res['alternate_seqs'] = pd.Series(
        tokenizer.texts_to_sequences(data['alternate_3grams']),
        index=data.index)

    distances = res.progress_apply(
        lambda row: editdistance.eval(row['anchor_seqs'],
                                      row['alternate_seqs']),
        axis=1)

    return distances

In [None]:
# one distance column per tokenizer vocabulary size
vocab_size_by_column = {
    'trigram_10k_dist': 10_000,
    'trigram_20k_dist': 20_000,
    'trigram_25k_dist': 25_000,
    'trigram_50k_dist': 50_000,
    'trigram_75k_dist': 75_000,
    'trigram_100k_dist': 100_000,
}

for dist_column, vocab_size in vocab_size_by_column.items():
    df[dist_column] = calculate_tokenizer_distances(all_texts_ngrams, df, vocab_size)

In [None]:
# rows where the plain 3-gram distance and the 10k-vocabulary distance disagree
dist_mismatch = df['3gram_dist'].ne(df['trigram_10k_dist'])
df.loc[dist_mismatch]