In [1]:
import pandas as pd
from tqdm import tqdm
from tqdm import tqdm

# Register tqdm's `progress_apply` / `progress_map` methods on pandas objects
# so that long element-wise operations can show a progress bar.
tqdm.pandas()

In [2]:
# Location of the tab-separated name-pair dataset, relative to this notebook.
path = "../../data/dataset-string-similarity.txt"

In [3]:
# Load the raw tab-separated file; it has no header row, so columns come back
# numbered 0..N-1 until they are named explicitly.
#
# pandas' default NA sentinels include strings such as 'NA', 'nan', 'null' and
# 'None'. Any field that literally contains one of those (e.g. an 'NA' country
# code or a transliterated place name 'nan') would silently become NaN.
# Restrict missing-value detection to genuinely empty fields instead.
df = pd.read_csv(
    path,
    sep='\t',
    header=None,
    keep_default_na=False,
    na_values=[''],
)

In [4]:
# Name the nine positional columns: the two names of a pair, the match label,
# then the per-side geoname id, script and country code.
df.columns = [
    'name1', 'name2', 'match',
    'geoname_id_1', 'geoname_id_2',
    'lang1', 'lang2',
    'country1', 'country2',
]

In [5]:
# Preview ten random rows. The seed is fixed so that the displayed sample is
# the same on every Restart & Run All.
df.sample(10, random_state=0)

Unnamed: 0,name1,name2,match,geoname_id_1,geoname_id_2,lang1,lang2,country1,country2
1530619,전남중학교,광주시청,False,11172509,11172510,CJK,CJK,KR,KR
3377614,El Sol,Río del Sol,True,9625541,9625541,LATIN,LATIN,MX,MX
666293,Malaja Os'janka,Malaya Oslyanka,False,531472,531473,LATIN,LATIN,RU,RU
1031899,Arjadzor,Arjadzori,False,10426768,10426769,LATIN,LATIN,AM,AM
1406059,Paliepiai,Paliepių,False,596212,596213,LATIN,LATIN,LT,LT
4699636,Guidan Atchyé,Guidan Tadjae,True,2444028,2444028,LATIN,LATIN,NE,NE
3397083,Omskij Nauchno-issledovatel'skij Institut Prir...,Tsentral’naya Poliklinika Omskogo Rayona,False,8468187,8468188,LATIN,LATIN,RU,RU
2434695,Aḑ Ḑab‘ah,Kusin,False,283103,283104,LATIN,LATIN,PS,PS
2231811,Débarégati,Debe Debe Peul,False,2445818,2445819,LATIN,LATIN,NE,NE
3339243,General Pedro Colorado,Zapotal San Miguel Primera Seccion,False,3802248,3802250,LATIN,LATIN,MX,MX


In [6]:
# Split the pairs positionally at the midpoint: the first half of the rows
# (in file order) becomes `train`, the remainder becomes `test`.
split_at = len(df) // 2
train = df.iloc[:split_at]
test = df.iloc[split_at:]

In [7]:
# Reshape the training pairs into one long (gid, name) table by stacking the
# left-hand and right-hand side of every pair on top of each other.
# NOTE: this rebinds `train`; re-running this cell without first re-running
# the split cell fails because the pair columns are no longer present.
train1 = train[['geoname_id_1', 'name1']].rename(
    columns={'geoname_id_1': 'gid', 'name1': 'name'}
)
train2 = train[['geoname_id_2', 'name2']].rename(
    columns={'geoname_id_2': 'gid', 'name2': 'name'}
)

train = pd.concat([train1, train2])

In [8]:
# Reshape the test pairs into one long (gid, name) table by stacking the
# left-hand and right-hand side of every pair on top of each other.
# NOTE: this rebinds `test`; re-running this cell without first re-running
# the split cell fails because the pair columns are no longer present.
test1 = test[['geoname_id_1', 'name1']].rename(
    columns={'geoname_id_1': 'gid', 'name1': 'name'}
)
test2 = test[['geoname_id_2', 'name2']].rename(
    columns={'geoname_id_2': 'gid', 'name2': 'name'}
)

test = pd.concat([test1, test2])

In [9]:
def _distinct_names(names):
    """Collapse all names recorded for one gid into a set of distinct values."""
    return set(names)


# One row per gid (as the index), with a single `name` column holding the set
# of names seen for that gid in the respective half.
name_aggregation = {'name': _distinct_names}
grouped_train = train.groupby('gid').agg(name_aggregation)
grouped_test = test.groupby('gid').agg(name_aggregation)

In [10]:
# Inner-join the two per-gid tables on their gid index, so only gids that
# occur in both halves survive, then expose gid as a regular column.
out = (
    grouped_train
    .merge(grouped_test, left_index=True, right_index=True)
    .reset_index()
)
out.columns = ['gid', 'train_names_set', 'test_names_set']

In [12]:
# For every shared gid, intersect its training and test name sets.
# Iterating the two columns in lockstep avoids `apply(axis=1)`, which builds a
# full Series object per row and does not return a Series on an empty frame.
# The tqdm wrapper keeps the progress bar the row-wise version provided.
common_sets = [
    train_names & test_names
    for train_names, test_names in tqdm(
        zip(out['train_names_set'], out['test_names_set']),
        total=len(out),
    )
]
# Build the column explicitly as an object Series aligned on out's index so
# each set is stored as a single cell value.
out['common'] = pd.Series(common_sets, index=out.index, dtype=object)

# Number of names that appear on both sides for each gid.
out['len_common'] = out['common'].map(len)

100%|██████████| 1637435/1637435 [00:54<00:00, 29795.71it/s]
100%|██████████| 1637435/1637435 [00:01<00:00, 1106176.47it/s]


In [13]:
# gids whose train and test halves share at least one identical name.
# Filter on the precomputed integer count rather than comparing an object
# column against a `set()` literal: the integer comparison is vectorised and
# does not depend on how pandas handles a set as a comparison operand.
common_df = out[out['len_common'] > 0]

In [14]:
# Preview ten random overlapping gids. The seed is fixed so that the displayed
# sample is the same on every Restart & Run All.
common_df.sample(10, random_state=0)

Unnamed: 0,gid,train_names_set,test_names_set,common,len_common
678233,2500279,{Djebel Djama Draa},"{Djebel Djama Draa, Djebel Djama' Dra'}",{Djebel Djama Draa},1
492388,1601397,{Ban Mueang Tao},"{บ้านเมืองเตา, Ban Mueang Tao}",{Ban Mueang Tao},1
459648,1496658,"{Novotroitskoye, Novotroitskoe}","{Novotroitskoye, Novotroitskoe}","{Novotroitskoye, Novotroitskoe}",2
110010,257657,"{Μάρμαρα, Mármara}","{Marmara, Μάρμαρα, Sadovitsa}",{Μάρμαρα},1
219941,533799,{Лубянка},"{Lubjanka, Лубянка}",{Лубянка},1
1024309,7099696,"{Rongrian Chaloem Mani Chai Witthayakhan, rong...",{Rongrian Chaloem Mani Chai Witthayakhan},{Rongrian Chaloem Mani Chai Witthayakhan},1
1028283,7322881,{er jiao zhen},"{er jiao, Erjiao, er jiao zhen}",{er jiao zhen},1
9459,14519,"{Darreh Shūr, Shahid Sheykhi-ye Darreh Shur, D...",{Shahid Sheykhi-ye Darreh Shur},{Shahid Sheykhi-ye Darreh Shur},1
1051468,7421572,{hnxng cx nxy},{hnxng cx nxy},{hnxng cx nxy},1
785290,3385730,"{Rio Urahim, Rio Uraim}","{Rio Ururaím, Rio Urahim}",{Rio Urahim},1


In [15]:
# Row count: shared gids with at least one name present in both halves.
common_df.shape[0]

1392225

In [16]:
# gids present in both halves whose name sets do not overlap at all.
# Filter on the precomputed integer count rather than comparing an object
# column against a `set()` literal: the integer comparison is vectorised and
# does not depend on how pandas handles a set as a comparison operand.
separate_df = out[out['len_common'] == 0]

In [17]:
# Row count: shared gids with no name in common between the two halves.
separate_df.shape[0]

245210

In [18]:
# Distinct gids that occur anywhere in the training half.
train_ids = set(pd.unique(train['gid']))

In [19]:
# Distinct gids that occur anywhere in the test half.
test_ids = set(pd.unique(test['gid']))

In [20]:
# Report how many distinct gids the training half contains.
n_train_ids = len(train_ids)
print(f'Number of Training IDs: {n_train_ids}')

Number of Training IDs: 1999066


In [21]:
# Report how many distinct gids the test half contains.
n_test_ids = len(test_ids)
print(f'Number of Testing IDs: {n_test_ids}')

Number of Testing IDs: 1999112


In [22]:
# gids that appear in both the training and the test half.
common_ids = train_ids.intersection(test_ids)
n_common_ids = len(common_ids)
print(f'Number of Common IDs: {n_common_ids}')

Number of Common IDs: 1637435


In [23]:
# Every gid seen in either half.
all_ids = train_ids.union(test_ids)
n_all_ids = len(all_ids)
print(f'Total IDs: {n_all_ids}')

Total IDs: 2360743


In [24]:
# Training gids that never occur in the test half.
clean_train_ids = train_ids.difference(test_ids)
n_clean_train_ids = len(clean_train_ids)
print(f'Number of "clean" training IDs: {n_clean_train_ids}')

Number of "clean" training IDs: 361631
