# LGM-EmbeddingToponymInterlinking

This code implements a Toponym Interlinking task. A FastText embedding model is trained from scratch in order to be able to efficiently produce toponym dense representations (embeddings) which are then utilized in a binary classification task.

In [1]:
import numpy as np
import string
from tqdm import tqdm
from string import punctuation, ascii_lowercase
from gensim.models import FastText
from gensim.test.utils import common_texts
import time
from gensim.test.utils import get_tmpfile
import pandas as pd 
from numpy.linalg import norm
import gc
from text_unidecode import unidecode
import re
import unicodedata

import keras
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from keras import backend as K

### Dataset

In [2]:
train_df = pd.read_csv('./data/train.csv')
val_df = pd.read_csv('./data/val.csv')
test_df = pd.read_csv('./data/test.csv')

train_df.shape, val_df.shape, test_df.shape

((1999994, 3), (499999, 3), (2499991, 3))

In [3]:
train_df.head()

Unnamed: 0,s1,s2,label
0,Karpova,Карпове-Кріпенське,0
1,dojeongyo,도전교,1
2,Gaomo,gao mo,1
3,Bieddjujavri,Gåldinjavri davit,0
4,Zanjon La Noria,Sitio Arqueológico La Muralla,0


### Preprocessing

In [4]:
def strip_accents(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

def ascii_transliteration_and_punctuation_strip(s):
    # NFKD: first applies a canonical decomposition, i.e., translates each character into its decomposed form.
    # and afterwards apply the compatibility decomposition, i.e. replace all compatibility characters with their
    # equivalents.
    s = unidecode(strip_accents(s.lower()))
    s = punctuation_regex.sub('', s)
    return s

def data_preprocessing(source):
    source = source.replace('[^A-Za-z]',' ')
    source = source.lower()
    source = source.replace("\s\s+" , " ")
    source = source.replace('\s+[a-z]{1,2}(?!\S)',' ')
    source = source.replace("\s\s+" , " ")
    return source

punctuation_regex = re.compile(u'[‘’“”\'"!?;/⧸⁄‹›«»`ʿ,.-]')


train_df.iloc[:,0]=train_df.iloc[:,0].apply(lambda row: ascii_transliteration_and_punctuation_strip(row))
train_df.iloc[:,1]=train_df.iloc[:,1].apply(lambda row: ascii_transliteration_and_punctuation_strip(row))
val_df.iloc[:,0]=val_df.iloc[:,0].apply(lambda row: ascii_transliteration_and_punctuation_strip(row))
val_df.iloc[:,1]=val_df.iloc[:,1].apply(lambda row: ascii_transliteration_and_punctuation_strip(row))
test_df.iloc[:,0]=test_df.iloc[:,0].apply(lambda row: ascii_transliteration_and_punctuation_strip(row))
test_df.iloc[:,1]=test_df.iloc[:,1].apply(lambda row: ascii_transliteration_and_punctuation_strip(row))

train_df.iloc[:,1] = train_df.iloc[:,1].apply(lambda row: data_preprocessing(row))
train_df.iloc[:,0] = train_df.iloc[:,0].apply(lambda row: data_preprocessing(row))
val_df.iloc[:,1] = val_df.iloc[:,1].apply(lambda row: data_preprocessing(row))
val_df.iloc[:,0] = val_df.iloc[:,0].apply(lambda row: data_preprocessing(row))
test_df.iloc[:,1] = test_df.iloc[:,1].apply(lambda row: data_preprocessing(row))
test_df.iloc[:,0] = test_df.iloc[:,0].apply(lambda row: data_preprocessing(row))

train_df.iloc[:,1] = train_df.iloc[:,1].apply(lambda x: x.translate(str.maketrans('','','1234567890')))
train_df.iloc[:,0] = train_df.iloc[:,0].apply(lambda x: x.translate(str.maketrans('','','1234567890')))
val_df.iloc[:,1] = val_df.iloc[:,1].apply(lambda x: x.translate(str.maketrans('','','1234567890')))
val_df.iloc[:,0] = val_df.iloc[:,0].apply(lambda x: x.translate(str.maketrans('','','1234567890')))
test_df.iloc[:,1] = test_df.iloc[:,1].apply(lambda x: x.translate(str.maketrans('','','1234567890')))
test_df.iloc[:,0] = test_df.iloc[:,0].apply(lambda x: x.translate(str.maketrans('','','1234567890')))

train_df.iloc[:,1] = train_df.iloc[:,1].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
train_df.iloc[:,0] = train_df.iloc[:,0].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
val_df.iloc[:,1] = val_df.iloc[:,1].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
val_df.iloc[:,0] = val_df.iloc[:,0].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
test_df.iloc[:,1] = test_df.iloc[:,1].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
test_df.iloc[:,0] = test_df.iloc[:,0].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

train_df.iloc[:,1]=train_df.iloc[:,1].apply(lambda x: x.lower())
train_df.iloc[:,0]=train_df.iloc[:,0].apply(lambda x: x.lower())
val_df.iloc[:,1]=val_df.iloc[:,1].apply(lambda x: x.lower())
val_df.iloc[:,0]=val_df.iloc[:,0].apply(lambda x: x.lower())
test_df.iloc[:,1]=test_df.iloc[:,1].apply(lambda x: x.lower())
test_df.iloc[:,0]=test_df.iloc[:,0].apply(lambda x: x.lower())

In [5]:
train_df.head()

Unnamed: 0,s1,s2,label
0,karpova,karpovekripenske,0
1,dojeongyo,dojeongyo,1
2,gaomo,gao mo,1
3,bieddjujavri,galdinjavri davit,0
4,zanjon la noria,sitio arqueologico la muralla,0


### Data preparation for embedding model

In [6]:
# For each matching toponym pair <T1, T2>:
#   we split T1, T2 into their tokens T1t1,...,T1tn and T2t1,...,T2tn
#   we add to the training sequences list the following lists:
#   [T1t1,...,T1tn,T2t1,...,T2tn] and [T2t1,...,T2tn,T1t1,...,T1tn]

k=[]
for m in tqdm(train_df[train_df['label']==1].index):
    k.append((train_df.iloc[m,0] + " " + train_df.iloc[m,1]).split())
    k.append((train_df.iloc[m,1] + " " + train_df.iloc[m,0]).split())

100%|██████████| 999767/999767 [00:21<00:00, 45938.13it/s]


### Training the embedding model

In [7]:
# Build the FastText model
model = FastText(size=100, window=3, min_count=1)
model.build_vocab(sentences=k)
model.train(sentences=k, total_examples=len(k), epochs=20)

In [8]:
# fname = "fasttext.model"
# model.save(fname)

In [9]:
# fname = "fasttext.model"
# model = FastText.load(fname)

In [10]:
# Here we built the feautures as the concatenation
# of the embeddings of the pair's toponyms

train_list=[]
for i in tqdm(train_df.index):
    train_list.append(np.concatenate((model.wv[train_df.iloc[i,0]], model.wv[train_df.iloc[i,1]]),axis=0))

X_train=np.array(train_list)
X_train=np.reshape(X_train,(X_train.shape[0],X_train.shape[1]))

y_train=np.array(train_df.iloc[:,2])
y_train=np.reshape(y_train,(y_train.shape[0]))
y_train=y_train.astype('int')

100%|██████████| 1999994/1999994 [02:55<00:00, 11364.51it/s]


In [11]:
val_list=[]
for i in tqdm(val_df.index):
    val_list.append(np.concatenate((model.wv[val_df.iloc[i,0]], model.wv[val_df.iloc[i,1]]),axis=0))

X_val=np.array(val_list)
X_val=np.reshape(X_val,(X_val.shape[0],X_val.shape[1]))

y_val=np.array(val_df.iloc[:,2])
y_val=np.reshape(y_val,(y_val.shape[0]))
y_val=y_val.astype('int')

100%|██████████| 499999/499999 [00:49<00:00, 10186.83it/s]


In [12]:
test_list=[]
for i in tqdm(test_df.index):
    test_list.append(np.concatenate((model.wv[test_df.iloc[i,0]],model.wv[test_df.iloc[i,1]]),axis=0))

X_test = np.array(test_list)
X_test=np.reshape(X_test,(X_test.shape[0],X_test.shape[1]))

y_test=np.array(test_df.iloc[:, 2])
y_test=np.reshape(y_test,(y_test.shape[0]))
y_test=y_test.astype('int')

gc.collect()

100%|██████████| 2499991/2499991 [04:04<00:00, 10213.92it/s]


0

### Classification through a fully-connected NN

In [13]:
nb_classes = 1
max_words=200
print('Number of Classes: {}'.format(nb_classes))

# Number of Epochs that we will train our Feed Forward Network
nb_epoch = 30
# The batch_size of the data that  will be fed to the Model when training
batch_size = 1024 
# Dropout Rate of the Dropout Layer
dropout_rate = 0.2

model1 = Sequential()
model1.add(Dense(2048, input_shape=(max_words,)))
model1.add(Activation('relu'))
model1.add(Dropout(dropout_rate))
model1.add(Dense(512))
model1.add(Activation('relu'))
model1.add(Dropout(dropout_rate))
model1.add(Dense(nb_classes))
model1.add(Activation('sigmoid'))

# print model layers' info
print(model1.summary())



def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

model1.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[
        'accuracy', f1_m, precision_m,recall_m
    ],
)

es = EarlyStopping(monitor='val_loss', patience=3, verbose=0, mode='auto')

Using TensorFlow backend.


Number of Classes (lyricists): 1
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 2048)              411648    
_________________________________________________________________
activation (Activation)      (None, 2048)              0         
_________________________________________________________________
dropout (Dropout)            (None, 2048)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               1049088   
_________________________________________________________________
activation_1 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)        

In [14]:
# We train (fit our data to) our model
history = model1.fit(
    X_train,
    y_train,
    epochs=nb_epoch,
    batch_size=batch_size,
    validation_data=(X_val, y_val),
    verbose=1,
    callbacks=[es])

Train on 1999994 samples, validate on 499999 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30


In [15]:
# evaluate and store on score variable on the TEST DATASET
score = model1.evaluate(
    X_test,
    y_test,
    batch_size=batch_size,
    verbose=1
)

