In [1]:
import pandas as pd
from tqdm import tqdm

tqdm.pandas()

# Train - Val - Test Dataset

In [2]:
df = pd.read_csv('allCountries.txt', sep='\t', header=None, usecols=[1,2,3])
df = df.where((pd.notnull(df)), None)

In [3]:
# convert second column from string to a set with a single string
df[2] = df[2].progress_apply(lambda x: {x} if x else set())
# split the variations and add them to a set
df[3] = df[3].progress_apply(lambda x: set(x.split(',')) if x else set())

100%|██████████| 11947366/11947366 [00:22<00:00, 534715.42it/s]
100%|██████████| 11947366/11947366 [00:26<00:00, 455146.14it/s]


In [4]:
# calculate the variations by taking the union on the 2, 3 columns and removing (if exists) the one from the first column
df['variations'] = df.progress_apply(lambda row: (row[2] | row[3]) - {row[1]}, axis=1)

100%|██████████| 11947366/11947366 [08:31<00:00, 23368.70it/s]


In [5]:
# keeping only the toponym and it's variations
df = df[[1, 'variations']]
df.columns = ['toponym', 'variations']

In [6]:
# sorting the variations and converting back to string using the " || " separator
df['variations'] = df['variations'].progress_apply(lambda x: ' || '.join(sorted(x)))

100%|██████████| 11947366/11947366 [00:15<00:00, 775704.35it/s]


In [7]:
df.to_csv('all_countries_cleaned.csv', index=False, encoding='utf-8')

# Test Dataset

In [8]:
# reading the pre-constructed test dataset. 
# We want to remove any of the instances from the train-val-test dataset in order to avoid
# information leakage
test_df = pd.read_csv('dataset-string-similarity.txt', sep='\t', header=None, encoding='utf-8', usecols=[0,1])
test_df = test_df.where((pd.notnull(test_df)), None)
test_df.head()

Unnamed: 0,0,1
0,la dom nxy,ลำโดมน้อย
1,Sharunyata,Shartjugskij
2,Krutoy,Крутой
3,Sutangcun,羊山村
4,Jowkār-e Shafī‘,جوکار شفیع


In [9]:
# gathering all the toponyms and variations from the test dataset in a single set
all_test_toponyms = set(filter(None, set(test_df[0]) | set(test_df[1])))

In [10]:
len(all_test_toponyms)

4587775

In [11]:
df['variations'] = df['variations'].progress_apply(lambda x: set(x.split(' || ')))

100%|██████████| 11947366/11947366 [00:42<00:00, 281039.63it/s]


In [12]:
# get a single set from each row from the train-val-test dataset
records = df.progress_apply(lambda row: {row['toponym']} | row['variations'], axis=1)

100%|██████████| 11947366/11947366 [05:31<00:00, 36010.08it/s]


In [13]:
records = records.to_frame()
records.columns = ['records']

In [14]:
records['len_records'] = records['records'].progress_apply(len)

100%|██████████| 11947366/11947366 [00:11<00:00, 1074537.19it/s]


In [15]:
records['len_records_after'] = records['records'].progress_apply(lambda x: len(x - all_test_toponyms))

100%|██████████| 11947366/11947366 [00:20<00:00, 577360.34it/s]


In [16]:
records['in_test_set'] = records['len_records'] != records['len_records_after']

In [17]:
train_val = records[~records['in_test_set']]
test = records[records['in_test_set']]

In [18]:
train_val_index = train_val.index
test_index = test.index

In [19]:
# del train_val
# del records

In [20]:
train_val_df = df.loc[train_val_index]
test_df = df.loc[test_index]

In [21]:
# sorting the variations and converting back to string using the " || " separator
train_val_df['variations'] = train_val_df['variations'].progress_apply(lambda x: ' || '.join(sorted(x)))

100%|██████████| 7861098/7861098 [00:09<00:00, 798553.13it/s]


In [22]:
train_val_df.to_csv('train_val_countries_cleaned.csv', index=False, encoding='utf-8')

In [27]:
# sorting the variations and converting back to string using the " || " separator
test_df['variations'] = test_df['variations'].progress_apply(lambda x: ' || '.join(sorted(x)))

100%|██████████| 4086268/4086268 [00:06<00:00, 598575.99it/s]


In [28]:
test_df.to_csv('test_countries_cleaned.csv', index=False, encoding='utf-8')