In [1]:
import numpy as np
import pandas as pd
import textdistance
#https://pypi.org/project/textdistance/
import os

### This notebook adds the Xinhua transliteration to the Wikidata dataset and then computes similarity metrics between the two sets of Chinese labels

In [2]:
# Project-local transliterator class, aliased to `T`; later in the notebook it is
# instantiated as `T(word, table)` and its `.output_word` attribute is read.
from transliterator import Transliterator as T

# Semicolon-separated table that is handed to the transliterator as its second argument.
table=pd.read_csv('translit_pdf_version.csv',sep=';')

In [17]:
# Two preprocessed inputs are named here; only the "with duplicates" one is loaded.
# NOTE(review): `filename` is assigned but not read by this cell.
filename = 'data_preprocessed_no_duplicates.csv'
filename_2 = 'data_preprocessed_with_duplicates.csv'
df = pd.read_csv(filename_2)
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,wiki_id,label_zh,label_ru,country,type
0,265997,伊曼德拉,Имандра,http://www.wikidata.org/entity/Q159,lake
1,166162,奥涅加,Онежское,http://www.wikidata.org/entity/Q159,lake
2,233071,汉泰,Хантайское,http://www.wikidata.org/entity/Q159,lake
3,245763,奥特拉德诺耶,Отрадное,http://www.wikidata.org/entity/Q159,lake
4,117036,绍托泽罗,Шотозеро,http://www.wikidata.org/entity/Q159,lake


In [18]:
# (rows, columns) of the loaded frame.
df.shape

(36261, 5)

In [2]:
# Wikidata entity URI of a country -> human-readable country name.
c_dict = {
    "http://www.wikidata.org/entity/Q159": 'Russia',
    "http://www.wikidata.org/entity/Q212": 'Ukraine',
    "http://www.wikidata.org/entity/Q15180": 'USSR',
    "http://www.wikidata.org/entity/Q34266": 'Russian Empire',
    "http://www.wikidata.org/entity/Q184": 'Belarus',
    "http://www.wikidata.org/entity/Q907112": 'Transnistria',
}

# Entity type -> integer code (not applied anywhere in the cells shown in this notebook).
t_dict = {
    "lake": 1,
    "island": 2,
    "river": 3,
    "mountain": 4,
    "settlement": 5,
    "person": 6,
}

In [35]:
# Turn Wikidata country URIs into readable names. The nested-dict form restricts
# the replacement to the `country` column instead of scanning every column, and
# rebinding `df` (rather than `inplace=True`) avoids mutating the frame in place.
df = df.replace({'country': c_dict})

In [36]:
# Preview the first rows to check the `country` values after the replacement.
df.head()

Unnamed: 0,wiki_id,label_zh,label_ru,country,type
0,265997,伊曼德拉,Имандра,Russia,lake
1,166162,奥涅加,Онежское,Russia,lake
2,233071,汉泰,Хантайское,Russia,lake
3,245763,奥特拉德诺耶,Отрадное,Russia,lake
4,117036,绍托泽罗,Шотозеро,Russia,lake


In [21]:
# Drop rows whose Chinese label contains Latin letters. `na=False` makes a missing
# label count as "no match" (so the row is kept, as before) and yields a boolean
# mask directly; the previous `.fillna(False)` on the object-dtype result relied on
# implicit downcasting that newer pandas versions deprecate.
df = df[~df['label_zh'].str.contains("[a-zA-Z]", na=False)]

In [24]:
# (rows, columns) of the frame after the Latin-letter filter.
df.shape

(36257, 5)

In [22]:
# Column dtypes of the frame.
df.dtypes

wiki_id      int64
label_zh    object
label_ru    object
country     object
type        object
dtype: object

In [37]:
# Save the cleaned frame without its index; this file is re-read as raw text lines
# in the next section.
df.to_csv('data_with_duplicates_pre_final.csv', index=False)

## Creating the "orthodox" Xinhua transliteration

In [38]:
# Load the CSV saved above as a list of raw text lines (its header line is element 0).
with open('data_with_duplicates_pre_final.csv', mode='r', encoding='utf-8') as pre_final:
    data = list(pre_final)

# Number of lines excluding the header line.
length = len(data) - 1

In [39]:
# Cut the list of lines into consecutive slices of at most 100 lines each.
data_chunks = []
for start in range(0, len(data), 100):
    data_chunks.append(data[start:start + 100])

In [23]:
import os

# Folder that receives the per-chunk part files. `exist_ok=True` lets this cell be
# re-run without raising FileExistsError when the folder is already there.
os.makedirs('data_with_duplicates_by_parts', exist_ok=True)

In [25]:
# Number of data lines (all lines of the re-read CSV minus its header line).
# Derived from `data` instead of the hard-coded 36257 so the value cannot go stale
# when the input file changes.
length = len(data) - 1

In [42]:
# One part-file stem per chunk, numbered from 1. Counting `data_chunks` directly
# keeps the names and the chunks in step; the previous `length//100+2` arithmetic
# depended on a separately maintained `length` and would raise IndexError or drop
# chunks in the write loop if the two ever disagreed.
filenames = [f'data_with_dupl_pt_{part_no}' for part_no in range(1, len(data_chunks) + 1)]
filenames

['data_with_dupl_pt_1',
 'data_with_dupl_pt_2',
 'data_with_dupl_pt_3',
 'data_with_dupl_pt_4',
 'data_with_dupl_pt_5',
 'data_with_dupl_pt_6',
 'data_with_dupl_pt_7',
 'data_with_dupl_pt_8',
 'data_with_dupl_pt_9',
 'data_with_dupl_pt_10',
 'data_with_dupl_pt_11',
 'data_with_dupl_pt_12',
 'data_with_dupl_pt_13',
 'data_with_dupl_pt_14',
 'data_with_dupl_pt_15',
 'data_with_dupl_pt_16',
 'data_with_dupl_pt_17',
 'data_with_dupl_pt_18',
 'data_with_dupl_pt_19',
 'data_with_dupl_pt_20',
 'data_with_dupl_pt_21',
 'data_with_dupl_pt_22',
 'data_with_dupl_pt_23',
 'data_with_dupl_pt_24',
 'data_with_dupl_pt_25',
 'data_with_dupl_pt_26',
 'data_with_dupl_pt_27',
 'data_with_dupl_pt_28',
 'data_with_dupl_pt_29',
 'data_with_dupl_pt_30',
 'data_with_dupl_pt_31',
 'data_with_dupl_pt_32',
 'data_with_dupl_pt_33',
 'data_with_dupl_pt_34',
 'data_with_dupl_pt_35',
 'data_with_dupl_pt_36',
 'data_with_dupl_pt_37',
 'data_with_dupl_pt_38',
 'data_with_dupl_pt_39',
 'data_with_dupl_pt_40',
 'data_wi

In [43]:
# Number of chunks; this must match the number of names in `filenames`, because the
# write loop pairs them up position by position.
len(data_chunks)

363

In [44]:
# Write every chunk of lines to its own part file.
# - The explicit length check replaces positional indexing, which raised IndexError
#   when there were more names than chunks and silently skipped chunks otherwise.
# - Mode 'w' makes the cell safe to re-run; mode 'a' appended another copy of each
#   chunk to the existing part file on every execution.
if len(filenames) != len(data_chunks):
    raise ValueError(
        f'{len(filenames)} file names for {len(data_chunks)} chunks; they must match'
    )

for part_name, chunk in zip(filenames, data_chunks):
    filename = 'data_with_duplicates_by_parts/'+part_name+'.csv'
    with open(filename, 'w', encoding='utf-8') as f:
        f.writelines(chunk)

### Below we split the dataset into parts to avoid overloading RAM while processing the whole dataset

In [11]:
# NOTE(review): this repeats the chunking already done in the previous section with
# the same chunk size; it needs `data` to be loaded.
data_chunks = []
for start in range(0, len(data), 100):
    data_chunks.append(data[start:start + 100])

In [3]:
import os

# os.listdir() returns entries in arbitrary order and includes whatever else ends up
# in the folder, so keep only the .csv part files and sort them: the processing
# order is then the same on every run and platform.
all_files = sorted(
    name for name in os.listdir('data_with_duplicates_by_parts')
    if name.endswith('.csv')
)
# Same files, prefixed with the source folder so they can be opened directly.
all_files_src = ['data_with_duplicates_by_parts/'+f for f in all_files]
all_files_src

['data_with_duplicates_by_parts/data_with_dupl_pt_1.csv',
 'data_with_duplicates_by_parts/data_with_dupl_pt_10.csv',
 'data_with_duplicates_by_parts/data_with_dupl_pt_100.csv',
 'data_with_duplicates_by_parts/data_with_dupl_pt_101.csv',
 'data_with_duplicates_by_parts/data_with_dupl_pt_102.csv',
 'data_with_duplicates_by_parts/data_with_dupl_pt_103.csv',
 'data_with_duplicates_by_parts/data_with_dupl_pt_104.csv',
 'data_with_duplicates_by_parts/data_with_dupl_pt_105.csv',
 'data_with_duplicates_by_parts/data_with_dupl_pt_106.csv',
 'data_with_duplicates_by_parts/data_with_dupl_pt_107.csv',
 'data_with_duplicates_by_parts/data_with_dupl_pt_108.csv',
 'data_with_duplicates_by_parts/data_with_dupl_pt_109.csv',
 'data_with_duplicates_by_parts/data_with_dupl_pt_11.csv',
 'data_with_duplicates_by_parts/data_with_dupl_pt_110.csv',
 'data_with_duplicates_by_parts/data_with_dupl_pt_111.csv',
 'data_with_duplicates_by_parts/data_with_dupl_pt_112.csv',
 'data_with_duplicates_by_parts/data_with_du

In [10]:
# Output folder for the transliterated part files. `exist_ok=True` lets this cell be
# re-run without raising FileExistsError when the folder is already there.
os.makedirs('with_duplicates_by_parts_final', exist_ok=True)

In [11]:
#os.mkdir('no_duplicates_by_parts_final')
# Destination path for every part file: same file name, placed in the output folder.
new_files = []
for part_name in all_files:
    new_files.append('with_duplicates_by_parts_final/' + part_name)

In [12]:
# Display the destination paths for a visual check.
new_files

['with_duplicates_by_parts_final/data_with_dupl_pt_1.csv',
 'with_duplicates_by_parts_final/data_with_dupl_pt_10.csv',
 'with_duplicates_by_parts_final/data_with_dupl_pt_100.csv',
 'with_duplicates_by_parts_final/data_with_dupl_pt_101.csv',
 'with_duplicates_by_parts_final/data_with_dupl_pt_102.csv',
 'with_duplicates_by_parts_final/data_with_dupl_pt_103.csv',
 'with_duplicates_by_parts_final/data_with_dupl_pt_104.csv',
 'with_duplicates_by_parts_final/data_with_dupl_pt_105.csv',
 'with_duplicates_by_parts_final/data_with_dupl_pt_106.csv',
 'with_duplicates_by_parts_final/data_with_dupl_pt_107.csv',
 'with_duplicates_by_parts_final/data_with_dupl_pt_108.csv',
 'with_duplicates_by_parts_final/data_with_dupl_pt_109.csv',
 'with_duplicates_by_parts_final/data_with_dupl_pt_11.csv',
 'with_duplicates_by_parts_final/data_with_dupl_pt_110.csv',
 'with_duplicates_by_parts_final/data_with_dupl_pt_111.csv',
 'with_duplicates_by_parts_final/data_with_dupl_pt_112.csv',
 'with_duplicates_by_parts_f

In [16]:
import csv
import re
import time

# Transliterate every part file and write the result twice: to the matching file in
# `new_files` and to one combined CSV.
#
# NOTE(review): `HanziConv` is used below but is not imported anywhere in this
# notebook, so the cell raises NameError on a fresh kernel. It presumably comes from
# the third-party `hanziconv` package (`from hanziconv import HanziConv`) — confirm
# and add the import to the import cell.
#
# Fixes compared with the previous version of this cell:
# - the combined file was reopened with mode 'w' for every row, so it only ever held
#   the last row; it is now opened once and rows are appended to it;
# - its header listed the columns in a different order than the rows were written
#   (and ended with a stray comma); it now matches the row layout;
# - rows are parsed and written with the csv module, so quoted fields that contain a
#   comma are no longer split in the wrong place, and a last line without a trailing
#   newline no longer loses its final character;
# - the header line of the source CSV (first line of the first part file, because
#   the chunks are cut from all lines of that CSV) is skipped instead of being
#   transliterated as if it were data;
# - each output part file is opened once with mode 'w', so re-running the cell no
#   longer appends duplicate rows.

FINAL_CSV = 'data_with_duplicates_final.csv'
# Same order as the fields of every row written below.
OUTPUT_COLUMNS = ['wiki_id', 'label_zh', 'label_ru', 'xinhua', 'country', 'type']
GUILLEMETS = re.compile(r'[«»]')

with open(FINAL_CSV, 'w', encoding='utf-8', newline='') as final_file:
    final_writer = csv.writer(final_file, lineterminator='\n')
    final_writer.writerow(OUTPUT_COLUMNS)

    for idx, (src_path, out_path) in enumerate(zip(all_files_src, new_files), start=1):
        with open(src_path, 'r', encoding='utf-8', newline='') as src:
            rows = list(csv.reader(src))
        print(src_path+' is processed...')

        with open(out_path, 'w', encoding='utf-8', newline='') as output:
            chunk_writer = csv.writer(output, lineterminator='\n')
            for row in rows:
                # Skip blank lines and the source header line.
                if not row or row[0] == 'wiki_id':
                    continue
                wiki_id, to_compare, to_translate, c, t = row[:5]
                # Guillemets are removed from the Russian label before transliteration.
                to_translate = GUILLEMETS.sub('', to_translate)
                to_compare = HanziConv.toSimplified(to_compare)
                translation = T(to_translate, table).output_word
                new_row = [wiki_id, to_compare, to_translate, translation, c, t]
                chunk_writer.writerow(new_row)
                final_writer.writerow(new_row)

        print(f'chunk {idx} completed')
        # Pause kept from the original cell; its purpose is not evident from the code.
        time.sleep(1)

data_with_duplicates_by_parts/data_with_dupl_pt_1.csv is processed...
chunk 1 completed
data_with_duplicates_by_parts/data_with_dupl_pt_10.csv is processed...
chunk 2 completed
data_with_duplicates_by_parts/data_with_dupl_pt_100.csv is processed...
chunk 3 completed
data_with_duplicates_by_parts/data_with_dupl_pt_101.csv is processed...
chunk 4 completed
data_with_duplicates_by_parts/data_with_dupl_pt_102.csv is processed...
chunk 5 completed
data_with_duplicates_by_parts/data_with_dupl_pt_103.csv is processed...
chunk 6 completed
data_with_duplicates_by_parts/data_with_dupl_pt_104.csv is processed...
chunk 7 completed
data_with_duplicates_by_parts/data_with_dupl_pt_105.csv is processed...
chunk 8 completed
data_with_duplicates_by_parts/data_with_dupl_pt_106.csv is processed...
chunk 9 completed
data_with_duplicates_by_parts/data_with_dupl_pt_107.csv is processed...
chunk 10 completed
data_with_duplicates_by_parts/data_with_dupl_pt_108.csv is processed...
chunk 11 completed
data_with_d

chunk 91 completed
data_with_duplicates_by_parts/data_with_dupl_pt_181.csv is processed...
chunk 92 completed
data_with_duplicates_by_parts/data_with_dupl_pt_182.csv is processed...
chunk 93 completed
data_with_duplicates_by_parts/data_with_dupl_pt_183.csv is processed...
chunk 94 completed
data_with_duplicates_by_parts/data_with_dupl_pt_184.csv is processed...
chunk 95 completed
data_with_duplicates_by_parts/data_with_dupl_pt_185.csv is processed...
chunk 96 completed
data_with_duplicates_by_parts/data_with_dupl_pt_186.csv is processed...
chunk 97 completed
data_with_duplicates_by_parts/data_with_dupl_pt_187.csv is processed...
chunk 98 completed
data_with_duplicates_by_parts/data_with_dupl_pt_188.csv is processed...
chunk 99 completed
data_with_duplicates_by_parts/data_with_dupl_pt_189.csv is processed...
chunk 100 completed
data_with_duplicates_by_parts/data_with_dupl_pt_19.csv is processed...
chunk 101 completed
data_with_duplicates_by_parts/data_with_dupl_pt_190.csv is processed..

chunk 181 completed
data_with_duplicates_by_parts/data_with_dupl_pt_262.csv is processed...
chunk 182 completed
data_with_duplicates_by_parts/data_with_dupl_pt_263.csv is processed...
chunk 183 completed
data_with_duplicates_by_parts/data_with_dupl_pt_264.csv is processed...
chunk 184 completed
data_with_duplicates_by_parts/data_with_dupl_pt_265.csv is processed...
chunk 185 completed
data_with_duplicates_by_parts/data_with_dupl_pt_266.csv is processed...
chunk 186 completed
data_with_duplicates_by_parts/data_with_dupl_pt_267.csv is processed...
chunk 187 completed
data_with_duplicates_by_parts/data_with_dupl_pt_268.csv is processed...
chunk 188 completed
data_with_duplicates_by_parts/data_with_dupl_pt_269.csv is processed...
chunk 189 completed
data_with_duplicates_by_parts/data_with_dupl_pt_27.csv is processed...
chunk 190 completed
data_with_duplicates_by_parts/data_with_dupl_pt_270.csv is processed...
chunk 191 completed
data_with_duplicates_by_parts/data_with_dupl_pt_271.csv is pr

data_with_duplicates_by_parts/data_with_dupl_pt_342.csv is processed...
chunk 271 completed
data_with_duplicates_by_parts/data_with_dupl_pt_343.csv is processed...
chunk 272 completed
data_with_duplicates_by_parts/data_with_dupl_pt_344.csv is processed...
chunk 273 completed
data_with_duplicates_by_parts/data_with_dupl_pt_345.csv is processed...
chunk 274 completed
data_with_duplicates_by_parts/data_with_dupl_pt_346.csv is processed...
chunk 275 completed
data_with_duplicates_by_parts/data_with_dupl_pt_347.csv is processed...
chunk 276 completed
data_with_duplicates_by_parts/data_with_dupl_pt_348.csv is processed...
chunk 277 completed
data_with_duplicates_by_parts/data_with_dupl_pt_349.csv is processed...
chunk 278 completed
data_with_duplicates_by_parts/data_with_dupl_pt_35.csv is processed...
chunk 279 completed
data_with_duplicates_by_parts/data_with_dupl_pt_350.csv is processed...
chunk 280 completed
data_with_duplicates_by_parts/data_with_dupl_pt_351.csv is processed...
chunk 281

data_with_duplicates_by_parts/data_with_dupl_pt_97.csv is processed...
chunk 361 completed
data_with_duplicates_by_parts/data_with_dupl_pt_98.csv is processed...
chunk 362 completed
data_with_duplicates_by_parts/data_with_dupl_pt_99.csv is processed...
chunk 363 completed


### Now we aggregate all the files into one .csv dataset again

In [18]:
# os.listdir() returns entries in arbitrary order and includes anything else found
# in the folder, so keep only the .csv part files and sort them: the row order of
# the aggregated dataset is then reproducible.
res = sorted(
    name for name in os.listdir('with_duplicates_by_parts_final')
    if name.endswith('.csv')
)
res = ['with_duplicates_by_parts_final/'+l for l in res]

In [19]:
# Rebuild the combined CSV from scratch: header line first, then the lines of every
# part file, in the order given by `res`.
with open('data_with_duplicates_final.csv', 'w', encoding='utf-8') as combined:
    combined.write('wiki_id,label_zh,label_ru,xinhua,country,type\n')
    for part_path in res:
        with open(part_path, 'r', encoding='utf-8') as part:
            combined.writelines(part.readlines())

In [21]:
d = pd.read_csv('data_with_duplicates_final.csv')
# Keep only rows whose Chinese label contains no Latin or Cyrillic letters.
# - `na=False` treats a missing label as "no match" (row kept, as before) and yields
#   a boolean mask directly, instead of relying on `.fillna(False)` downcasting an
#   object-dtype result, which newer pandas versions deprecate.
# - Ё/ё lie outside the А-я range, so they are listed explicitly.
# - `.copy()` makes `d` an independent frame, so the column assignments in the
#   metric cells below do not trigger SettingWithCopyWarning.
d = d[~d['label_zh'].str.contains("[a-zA-ZА-Яа-яЁё]", na=False)].copy()
#d = d.replace(to_replace=r'·', value=r'',regex=True)
d.shape

(36250, 6)

In [27]:
# Preview the first rows.
# NOTE(review): the saved output of this cell already contains the metric columns,
# i.e. it was last executed after the "Applying 3 metrics" cells that follow it.
d.head()

Unnamed: 0,wiki_id,label_zh,label_ru,xinhua,country,type,levenstein_abs,levenstein_norm,jaccard
0,265997,伊曼德拉,Имандра,伊曼德拉,Russia,lake,0,0.0,1.0
1,166162,奥涅加,Онежское,奥涅日斯科耶,Russia,lake,4,0.666667,0.285714
2,233071,汉泰,Хантайское,汉泰斯科耶,Russia,lake,3,0.6,0.4
3,245763,奥特拉德诺耶,Отрадное,奥特拉德诺耶,Russia,lake,0,0.0,1.0
4,117036,绍托泽罗,Шотозеро,绍托泽罗,Russia,lake,0,0.0,1.0


### Applying 3 metrics

In [23]:
def _levenshtein_abs(pair):
    """Return textdistance's Levenshtein distance between the two labels of one row."""
    zh_label, xinhua_label = pair
    return textdistance.levenshtein.distance(zh_label, xinhua_label)


# Row-wise comparison of the Wikidata Chinese label with the generated transliteration.
d["levenstein_abs"] = d[["label_zh", "xinhua"]].apply(_levenshtein_abs, axis=1)

In [25]:
def _levenshtein_norm(pair):
    """Return textdistance's normalized Levenshtein distance for one row's two labels."""
    zh_label, xinhua_label = pair
    return textdistance.levenshtein.normalized_distance(zh_label, xinhua_label)


# Row-wise comparison of the Wikidata Chinese label with the generated transliteration.
d["levenstein_norm"] = d[["label_zh", "xinhua"]].apply(_levenshtein_norm, axis=1)

In [26]:
def _jaccard(pair):
    """Return the value of `textdistance.jaccard` for the two labels of one row."""
    zh_label, xinhua_label = pair
    return textdistance.jaccard(zh_label, xinhua_label)


# Row-wise comparison of the Wikidata Chinese label with the generated transliteration.
d["jaccard"] = d[["label_zh", "xinhua"]].apply(_jaccard, axis=1)

### Saving to the final dataset

In [31]:
# Write the final table, including the three metric columns.
# NOTE(review): unlike the earlier `to_csv(..., index=False)` call, this one also
# writes the DataFrame index as an unnamed first column — confirm that is intended.
d.to_csv('data_total.csv')