In [29]:
from IPython.display import display, HTML

# Inject CSS that resizes the notebook container, the menubar and the main
# toolbar so that wide outputs get more horizontal room.
display(HTML("""
<style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 65%; }
    div#maintoolbar-container { width: 99%; }
</style>
"""))

In [30]:
import os
import pandas as pd
from more_itertools import windowed
from tqdm import tqdm
import editdistance
from tensorflow.keras.preprocessing.text import Tokenizer

# Register tqdm's `progress_apply` on pandas objects; the cells below call
# `progress_apply` on both Series and DataFrames.
tqdm.pandas()

In [31]:
def get_ngrams(text: str, n: int = 3, step: int = 1) -> str:
    """
    Turn a text into a space-separated string of tagged character n-grams.

    The text is split on whitespace. Every token is expanded to its
    characters and wrapped in boundary tags before a sliding window is run
    over it:

    * ``<BOS>`` opens the first token and ``<EOS>`` closes the last one,
    * ``<IOS>`` marks every token boundary that lies inside the text.

    Each tag counts as one symbol of the window, so with the defaults
    ``'roc mélé'`` becomes
    ``'<BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>'``.

    :param text: the text to convert.
    :param n: window size, i.e. the number of symbols (characters or tags)
        in each n-gram.
    :param step: number of symbols the window advances between n-grams.
    :return: the n-grams of all tokens joined by single spaces; an empty
        string when the text contains no tokens.
    """
    tokens = text.split()

    # Empty / whitespace-only input: there is nothing to window over.
    if not tokens:
        return ''

    last_position = len(tokens) - 1
    output = []

    for position, token in enumerate(tokens):
        # A single-token text is both first and last: it gets <BOS>...<EOS>.
        opening = '<BOS>' if position == 0 else '<IOS>'
        closing = '<EOS>' if position == last_position else '<IOS>'
        symbols = [opening] + list(token) + [closing]

        # `windowed` pads an incomplete window with None (token shorter than
        # the window, or a step that overshoots the end). Drop that padding
        # so that joining the symbols never fails.
        grams = [''.join(s for s in window if s is not None)
                 for window in windowed(seq=symbols, n=n, step=step)]

        output.append(' '.join(grams))

    return ' '.join(output)

In [32]:
# Read the first 10,000 rows of the toponym pairs (id, anchor name, alternate
# name) and discard every row with a missing value in any of those columns.
df = pd.read_csv(
    '../../data/toponym_distances.csv',
    usecols=['gid', 'anchor', 'alternate'],
    nrows=10_000,
).dropna()

In [33]:
# Preview the first five rows of the loaded frame.
df.head(5)

Unnamed: 0,gid,anchor,alternate
0,2986043,pic de font blanca,pic de font blanca
1,2986043,pic de font blanca,pic du port
2,2994701,roc mélé,roc mele
3,2994701,roc mélé,roc meler
4,2994701,roc mélé,roc mélé


In [34]:
# Character length of every anchor name.
df = df.assign(len_anchor=df['anchor'].str.len())

df.head(5)

Unnamed: 0,gid,anchor,alternate,len_anchor
0,2986043,pic de font blanca,pic de font blanca,18
1,2986043,pic de font blanca,pic du port,18
2,2994701,roc mélé,roc mele,8
3,2994701,roc mélé,roc meler,8
4,2994701,roc mélé,roc mélé,8


In [35]:
# Character length of every alternate name.
df = df.assign(len_alternate=df['alternate'].str.len())

df.head(5)

Unnamed: 0,gid,anchor,alternate,len_anchor,len_alternate
0,2986043,pic de font blanca,pic de font blanca,18,18
1,2986043,pic de font blanca,pic du port,18,11
2,2994701,roc mélé,roc mele,8,8
3,2994701,roc mélé,roc meler,8,9
4,2994701,roc mélé,roc mélé,8,8


In [36]:
# Tagged character n-grams of the anchor names (get_ngrams defaults).
df = df.assign(anchor_3grams=df['anchor'].progress_apply(get_ngrams))

df.head(5)

100%|██████████| 10000/10000 [00:00<00:00, 70920.65it/s]


Unnamed: 0,gid,anchor,alternate,len_anchor,len_alternate,anchor_3grams
0,2986043,pic de font blanca,pic de font blanca,18,18,<BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>fo fo...
1,2986043,pic de font blanca,pic du port,18,11,<BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>fo fo...
2,2994701,roc mélé,roc mele,8,8,<BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>
3,2994701,roc mélé,roc meler,8,9,<BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>
4,2994701,roc mélé,roc mélé,8,8,<BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>


In [37]:
# Tagged character n-grams of the alternate names (get_ngrams defaults).
df = df.assign(alternate_3grams=df['alternate'].progress_apply(get_ngrams))

df.head(5)

100%|██████████| 10000/10000 [00:00<00:00, 71158.16it/s]


Unnamed: 0,gid,anchor,alternate,len_anchor,len_alternate,anchor_3grams,alternate_3grams
0,2986043,pic de font blanca,pic de font blanca,18,18,<BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>fo fo...,<BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>fo fo...
1,2986043,pic de font blanca,pic du port,18,11,<BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>fo fo...,<BOS>pi pic ic<IOS> <IOS>du du<IOS> <IOS>po po...
2,2994701,roc mélé,roc mele,8,8,<BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>,<BOS>ro roc oc<IOS> <IOS>me mel ele le<EOS>
3,2994701,roc mélé,roc meler,8,9,<BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>,<BOS>ro roc oc<IOS> <IOS>me mel ele ler er<EOS>
4,2994701,roc mélé,roc mélé,8,8,<BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>,<BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>


In [38]:
# Row-wise `editdistance.eval` on the raw strings, i.e. the distance is
# measured over single characters.
df['1gram_dist'] = df.progress_apply(
    lambda pair: editdistance.eval(pair['anchor'], pair['alternate']),
    axis=1,
)

df.head(5)

100%|██████████| 10000/10000 [00:00<00:00, 36625.21it/s]


Unnamed: 0,gid,anchor,alternate,len_anchor,len_alternate,anchor_3grams,alternate_3grams,1gram_dist
0,2986043,pic de font blanca,pic de font blanca,18,18,<BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>fo fo...,<BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>fo fo...,0
1,2986043,pic de font blanca,pic du port,18,11,<BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>fo fo...,<BOS>pi pic ic<IOS> <IOS>du du<IOS> <IOS>po po...,10
2,2994701,roc mélé,roc mele,8,8,<BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>,<BOS>ro roc oc<IOS> <IOS>me mel ele le<EOS>,2
3,2994701,roc mélé,roc meler,8,9,<BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>,<BOS>ro roc oc<IOS> <IOS>me mel ele ler er<EOS>,3
4,2994701,roc mélé,roc mélé,8,8,<BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>,<BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>,0


In [39]:
# Row-wise `editdistance.eval` on the lists of n-gram strings, so every
# n-gram is treated as one symbol.
df['3gram_dist'] = df.progress_apply(
    lambda pair: editdistance.eval(
        pair['anchor_3grams'].split(),
        pair['alternate_3grams'].split(),
    ),
    axis=1,
)

df.head(5)

100%|██████████| 10000/10000 [00:00<00:00, 32774.32it/s]


Unnamed: 0,gid,anchor,alternate,len_anchor,len_alternate,anchor_3grams,alternate_3grams,1gram_dist,3gram_dist
0,2986043,pic de font blanca,pic de font blanca,18,18,<BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>fo fo...,<BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>fo fo...,0,0
1,2986043,pic de font blanca,pic du port,18,11,<BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>fo fo...,<BOS>pi pic ic<IOS> <IOS>du du<IOS> <IOS>po po...,10,12
2,2994701,roc mélé,roc mele,8,8,<BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>,<BOS>ro roc oc<IOS> <IOS>me mel ele le<EOS>,2,4
3,2994701,roc mélé,roc meler,8,9,<BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>,<BOS>ro roc oc<IOS> <IOS>me mel ele ler er<EOS>,3,5
4,2994701,roc mélé,roc mélé,8,8,<BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>,<BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>,0,0


In [40]:
# Stack the anchor and the alternate n-gram strings into a single Series with
# a fresh 0..N-1 index.
all_texts_ngrams = pd.concat(
    [df['anchor_3grams'], df['alternate_3grams']],
    ignore_index=True,
)

all_texts_ngrams

0        <BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>fo fo...
1        <BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>fo fo...
2              <BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>
3              <BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>
4              <BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>
5        <BOS>pi pic ic<IOS> <IOS>de des es<IOS> <IOS>l...
6        <BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>le le...
7        <BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>le le...
8        <BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>le le...
9        <BOS>es est sta tan any ny<IOS> <IOS>de de<IOS...
10       <BOS>es est sta tan any ny<IOS> <IOS>de de<IOS...
11       <BOS>es est sta tan any ny<IOS> <IOS>de de<IOS...
12       <BOS>po por ort rt<IOS> <IOS>vi vie ieu eux ux...
13       <BOS>po por ort rt<IOS> <IOS>vi vie ieu eux ux...
14       <BOS>po por ort rt<IOS> <IOS>vi vie ieu eux ux...
15       <BOS>po por ort rt<IOS> <IOS>vi vie ieu eux ux...
16       <BOS>po por ort rt<IOS> <IOS>de de<IOS> <IOS>l.

In [41]:
# Number of n-grams in every anchor n-gram string.
df['len_anchor_3grams'] = df['anchor_3grams'].str.split().map(len)

In [42]:
# Preview the first five rows including the new n-gram count column.
df.head(5)

Unnamed: 0,gid,anchor,alternate,len_anchor,len_alternate,anchor_3grams,alternate_3grams,1gram_dist,3gram_dist,len_anchor_3grams
0,2986043,pic de font blanca,pic de font blanca,18,18,<BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>fo fo...,<BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>fo fo...,0,0,15
1,2986043,pic de font blanca,pic du port,18,11,<BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>fo fo...,<BOS>pi pic ic<IOS> <IOS>du du<IOS> <IOS>po po...,10,12,15
2,2994701,roc mélé,roc mele,8,8,<BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>,<BOS>ro roc oc<IOS> <IOS>me mel ele le<EOS>,2,4,7
3,2994701,roc mélé,roc meler,8,9,<BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>,<BOS>ro roc oc<IOS> <IOS>me mel ele ler er<EOS>,3,5,7
4,2994701,roc mélé,roc mélé,8,8,<BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>,<BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>,0,0,7


In [43]:
def calculate_tokenizer_distances(texts,
                                  data,
                                  num_words=10000,
                                  anchor_col='anchor_3grams',
                                  alternate_col='alternate_3grams'):
    """
    Edit distance between the encoded anchor and alternate n-gram sequences.

    A Keras ``Tokenizer`` is fitted on ``texts`` and used to encode the
    n-gram strings of two columns of ``data`` as integer sequences; the edit
    distance is then computed row by row on those sequences.

    :param texts: iterable of space-separated n-gram strings the tokenizer
        is fitted on.
    :param data: DataFrame holding the n-gram strings to compare.
    :param num_words: forwarded to ``Tokenizer(num_words=...)``, which limits
        the vocabulary used for encoding (see the Keras documentation for
        how words outside it are mapped to ``oov_token``).
    :param anchor_col: column of ``data`` with the anchor n-gram strings.
    :param alternate_col: column of ``data`` with the alternate n-gram
        strings.
    :return: Series of distances that carries the index of ``data``, so
        that ``data[col] = result`` lines up row by row even when
        ``data.index`` is not a plain 0..N-1 range (e.g. after ``dropna``).
    """
    tokenizer = Tokenizer(
        filters='',        # no character filtering is applied
        lower=True,
        split=' ',         # n-grams are separated by single spaces
        char_level=False,  # every n-gram is one token
        num_words=num_words,
        oov_token='<OOV>')

    tokenizer.fit_on_texts(texts=texts)

    # Key both encoded columns by the index of `data`. A default RangeIndex
    # here would make a later `data[col] = result` align on the wrong labels
    # whenever rows were dropped from `data`.
    res = pd.DataFrame({
        'anchor_seqs': pd.Series(
            tokenizer.texts_to_sequences(data[anchor_col]),
            index=data.index, dtype=object),
        'alternate_seqs': pd.Series(
            tokenizer.texts_to_sequences(data[alternate_col]),
            index=data.index, dtype=object),
    })

    # A row-wise apply on an empty frame does not return a Series; answer
    # with an explicitly empty one instead.
    if res.empty:
        return pd.Series(index=data.index, dtype='int64')

    distances = res.progress_apply(
        lambda row: editdistance.eval(row['anchor_seqs'],
                                      row['alternate_seqs']),
        axis=1)

    return distances

In [44]:
# Distances on tokenizer-encoded n-grams, tokenizer built with num_words=10,000.
df = df.assign(
    trigram_10k_dist=calculate_tokenizer_distances(
        texts=all_texts_ngrams, data=df, num_words=10_000))

df.head(5)

100%|██████████| 10000/10000 [00:00<00:00, 29310.26it/s]


Unnamed: 0,gid,anchor,alternate,len_anchor,len_alternate,anchor_3grams,alternate_3grams,1gram_dist,3gram_dist,len_anchor_3grams,trigram_10k_dist
0,2986043,pic de font blanca,pic de font blanca,18,18,<BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>fo fo...,<BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>fo fo...,0,0,15,0
1,2986043,pic de font blanca,pic du port,18,11,<BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>fo fo...,<BOS>pi pic ic<IOS> <IOS>du du<IOS> <IOS>po po...,10,12,15,12
2,2994701,roc mélé,roc mele,8,8,<BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>,<BOS>ro roc oc<IOS> <IOS>me mel ele le<EOS>,2,4,7,4
3,2994701,roc mélé,roc meler,8,9,<BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>,<BOS>ro roc oc<IOS> <IOS>me mel ele ler er<EOS>,3,5,7,5
4,2994701,roc mélé,roc mélé,8,8,<BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>,<BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>,0,0,7,0


In [45]:
# Same comparison with num_words=20,000.
df = df.assign(
    trigram_20k_dist=calculate_tokenizer_distances(
        texts=all_texts_ngrams, data=df, num_words=20_000))

100%|██████████| 10000/10000 [00:00<00:00, 29887.48it/s]


In [46]:
# Same comparison with num_words=25,000.
df = df.assign(
    trigram_25k_dist=calculate_tokenizer_distances(
        texts=all_texts_ngrams, data=df, num_words=25_000))

100%|██████████| 10000/10000 [00:00<00:00, 27998.26it/s]


In [47]:
# Same comparison with num_words=50,000.
df = df.assign(
    trigram_50k_dist=calculate_tokenizer_distances(
        texts=all_texts_ngrams, data=df, num_words=50_000))

100%|██████████| 10000/10000 [00:00<00:00, 29192.73it/s]


In [48]:
# Same comparison with num_words=75,000.
df = df.assign(
    trigram_75k_dist=calculate_tokenizer_distances(
        texts=all_texts_ngrams, data=df, num_words=75_000))

100%|██████████| 10000/10000 [00:00<00:00, 28848.42it/s]


In [49]:
# Same comparison with num_words=100,000.
df = df.assign(
    trigram_100k_dist=calculate_tokenizer_distances(
        texts=all_texts_ngrams, data=df, num_words=100_000))

100%|██████████| 10000/10000 [00:00<00:00, 28506.60it/s]


In [50]:
# Show only the first rows next to each other for comparison instead of
# dumping the whole frame into the notebook output.
df.head(10)

Unnamed: 0,gid,anchor,alternate,len_anchor,len_alternate,anchor_3grams,alternate_3grams,1gram_dist,3gram_dist,len_anchor_3grams,trigram_10k_dist,trigram_20k_dist,trigram_25k_dist,trigram_50k_dist,trigram_75k_dist,trigram_100k_dist
0,2986043,pic de font blanca,pic de font blanca,18,18,<BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>fo fo...,<BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>fo fo...,0,0,15,0,0,0,0,0,0
1,2986043,pic de font blanca,pic du port,18,11,<BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>fo fo...,<BOS>pi pic ic<IOS> <IOS>du du<IOS> <IOS>po po...,10,12,15,12,12,12,12,12,12
2,2994701,roc mélé,roc mele,8,8,<BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>,<BOS>ro roc oc<IOS> <IOS>me mel ele le<EOS>,2,4,7,4,4,4,4,4,4
3,2994701,roc mélé,roc meler,8,9,<BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>,<BOS>ro roc oc<IOS> <IOS>me mel ele ler er<EOS>,3,5,7,5,5,5,5,5,5
4,2994701,roc mélé,roc mélé,8,8,<BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>,<BOS>ro roc oc<IOS> <IOS>mé mél élé lé<EOS>,0,0,7,0,0,0,0,0,0
5,3007683,pic des langounelles,pic des langounelles,20,20,<BOS>pi pic ic<IOS> <IOS>de des es<IOS> <IOS>l...,<BOS>pi pic ic<IOS> <IOS>de des es<IOS> <IOS>l...,0,0,18,0,0,0,0,0,0
6,3017832,pic de les abelletes,pic de la font-negre,20,20,<BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>le le...,<BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>la la...,11,12,17,12,12,12,12,12,12
7,3017832,pic de les abelletes,pic de la font-nègre,20,20,<BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>le le...,<BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>la la...,12,12,17,12,12,12,12,12,12
8,3017832,pic de les abelletes,pic de les abelletes,20,20,<BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>le le...,<BOS>pi pic ic<IOS> <IOS>de de<IOS> <IOS>le le...,0,0,17,0,0,0,0,0,0
9,3017833,estany de les abelletes,estany de les abelletes,23,23,<BOS>es est sta tan any ny<IOS> <IOS>de de<IOS...,<BOS>es est sta tan any ny<IOS> <IOS>de de<IOS...,0,0,20,0,0,0,0,0,0
