In [1]:
import pandas as pd

import numpy as np
from numpy import nan as Nan

import math

import transliterate
from transliterate import translit, get_available_language_codes

In [2]:
def extract_list(s):
    """Extract the titles from a stringified list of ``(score, title)`` tuples.

    Parameters
    ----------
    s : str
        Text shaped like ``"[(0.05, 'Title A'), (0.03, 'Title B')]"``.

    Returns
    -------
    list of str
        The titles in their original order with the surrounding quotes
        removed. The empty list string ``"[]"`` yields ``[]``.

    Notes
    -----
    Tuples are separated by looking for the text ``"), ("``, so a title that
    itself contains that exact sequence would still be split incorrectly.
    """
    inner = s[2:-2]  # drop the leading "[(" and the trailing ")]"
    if not inner.strip():
        return []
    # Split on the FIRST comma only: everything after it is the title, so
    # titles that contain commas (e.g. 'Washington, D.C.') are kept whole.
    return [x.split(',', 1)[1].strip("', ,\"") for x in inner.split('), (')]

def extract_score(s):
    """Extract the scores from a stringified list of ``(score, title)`` tuples.

    Parameters
    ----------
    s : str
        Text shaped like ``"[(0.05, 'Title A'), (0.03, 'Title B')]"``.

    Returns
    -------
    list of float
        The scores in their original order. The empty list string ``"[]"``
        yields ``[]`` instead of failing on ``float('')``.
    """
    inner = s[2:-2]  # drop the leading "[(" and the trailing ")]"
    if not inner.strip():
        return []
    # The score is whatever precedes the first comma of each tuple.
    return [float(x.split(',')[0].strip("', ,\"")) for x in inner.split('), (')]

def levenshteinDistance(s1, s2):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the number of single-item insertions, deletions and
    substitutions needed to turn one sequence into the other. Only two rows
    of the dynamic-programming table are kept at any time.
    """
    # Use the shorter sequence for the row so the working buffers stay small.
    shorter, longer = (s2, s1) if len(s1) > len(s2) else (s1, s2)

    previous = list(range(len(shorter) + 1))
    for row_no, longer_item in enumerate(longer, start=1):
        current = [row_no]
        for col, shorter_item in enumerate(shorter):
            if shorter_item == longer_item:
                # Matching items cost nothing: carry the diagonal value over.
                cost = previous[col]
            else:
                # Cheapest of substitution, deletion and insertion, plus one edit.
                cost = 1 + min(previous[col], previous[col + 1], current[-1])
            current.append(cost)
        previous = current
    return previous[-1]

In [3]:
# Root data directory. The other locations are derived from it, so moving the
# data to another machine only requires editing this one line.
PATH_TO_DATA = '/media/andrii/earth/Katia/CS_MasterThesis/data/'
PATH_TO_DATA_RL = PATH_TO_DATA + 'ukwiki/uk_red_links/'      # red-link CSVs read and written below
PATH_TO_DATA_UK = PATH_TO_DATA + 'ukwiki/ukwiki_20180920/'   # presumably the 2018-09-20 ukwiki dump -- confirm
PATH_TO_DATA_EN = PATH_TO_DATA + 'ukwiki/enwiki_20180920/'   # presumably the 2018-09-20 enwiki dump -- confirm

In [4]:
# Load the red-link sample CSV; the train/test split below is drawn from this frame.
uk_redlinks_sample = pd.read_csv(PATH_TO_DATA_RL+'df_sample_red_links_uk.csv', encoding = 'UTF-8')

In [5]:
uk_redlinks_sample

Unnamed: 0,red_link_name
0,Каунт Бейсі
1,аль-Каїм
2,королева
3,Prentice Hall
4,Tokar.ua
5,Monthly Notices of the Royal Astronomical Society
6,Світ (видавництво)
7,Дубович Іван Андрійович
8,помірні повітряні маси
9,Доповідь про торгівлю людьми


In [6]:
# get train and test samples
# Each row goes to the train set with probability 0.8, so the split is only
# approximately 80/20. The generator is seeded so that re-running the notebook
# reproduces the same split (and therefore the same saved CSV files);
# changing SPLIT_SEED changes which rows land in each set.
SPLIT_SEED = 42
rng = np.random.RandomState(SPLIT_SEED)
msk = rng.rand(len(uk_redlinks_sample)) < 0.8
train = uk_redlinks_sample[msk]
test = uk_redlinks_sample[~msk]

print('train length =', len(train))
print('test length =', len(test))

train length = 2518
test length = 676


In [7]:
# save train and test sets to files
# index=False: the row index is not written, only the data columns with a header.
# NOTE: re-running this cell overwrites both files with the current split.
train.to_csv(PATH_TO_DATA_RL+'train_red_links_uk.csv', header=True, index=False)
test.to_csv(PATH_TO_DATA_RL+'test_red_links_uk.csv', header=True, index=False)

In [8]:
# read train and test samples
# These are the files written by the save cell; reading them back gives both
# frames a fresh 0..n-1 index.
train_sample = pd.read_csv(PATH_TO_DATA_RL+'train_red_links_uk.csv', encoding = 'UTF-8')
test_sample = pd.read_csv(PATH_TO_DATA_RL+'test_red_links_uk.csv', encoding = 'UTF-8')

In [None]:
# test set

In [9]:
# Plain Python list of the red link names in the test set.
test_list = test_sample['red_link_name'].tolist()


In [10]:
# Load the candidate table that is merged onto the test set on 'red_link_name' below.
# Its 'en_similar' column holds a stringified list of (score, title) tuples.
candidates = pd.read_csv(PATH_TO_DATA_RL+'uk_red_links_results_all_candidates.csv', encoding = 'UTF-8')

In [11]:
# Inner join on the shared key: keep only the test red links that also have a
# row in `candidates`, bringing in that row's columns.
test_candidates = test_sample.merge(candidates, how='inner', on='red_link_name')

In [12]:
test_candidates.head()

Unnamed: 0,red_link_name,en_similar
0,Tokar.ua,"[(0.05, 'List of virtual communities with more..."
1,Вакуленко Олена Володимирівна,"[(0.053, 'Our World In Data'), (0.042, 'Break ..."
2,Головне управління геодезії та картографії СРСР,"[(0.109, 'Our World In Data'), (0.049, 'Ageing..."
3,Simon & Schuster,"[(0.038, 'The Years of Rice and Salt'), (0.037..."
4,Панчакутек Юпанкі,"[(0.036, 'Zhu Quan'), (0.036, 'Liverpool Castl..."


In [13]:
[test_candidates['red_link_name'][0]]

['Tokar.ua']

In [14]:
# Parse every row's 'en_similar' string into a list of candidate titles.
# `all_results` maps red link name -> titles; `all_results_list` holds the same
# lists in row order. Rows whose string is '[]' get an empty list.
all_results = {}
all_results_list = []
for red_link, en_similar in zip(test_candidates['red_link_name'], test_candidates['en_similar']):
    titles = [] if en_similar == '[]' else extract_list(en_similar)
    all_results[red_link] = titles
    all_results_list.append(titles)

In [15]:
len(all_results.keys())

676

In [16]:
# Wide frame: one row per red link (the dict key becomes the index), one column
# per candidate position; shorter candidate lists are padded with missing values.
df = pd.DataFrame.from_dict(all_results, orient = 'index')

In [17]:
# Wide -> long: stack() yields one entry per (red link, column position) pair
# (missing padding is dropped by stack's default behaviour); dropping index
# level 1 (the column position) leaves the red link name as the only index.
rl_with_cand = df.stack().reset_index(level=1, drop=True).to_frame(name='candidate')

In [18]:
# Copy the index (the red link name) into a regular column so it can be used as a merge key.
rl_with_cand['red_link_name'] = rl_with_cand.index

In [19]:
rl_with_cand

Unnamed: 0,candidate,red_link_name
Tokar.ua,List of virtual communities with more than 100...,Tokar.ua
Tokar.ua,Solar Energy Generating Systems,Tokar.ua
Tokar.ua,Stand-alone power system,Tokar.ua
Tokar.ua,Construction Equipment,Tokar.ua
Tokar.ua,Solar tracker,Tokar.ua
Tokar.ua,Comparison of instant messaging clients,Tokar.ua
Tokar.ua,Comparison of VoIP software,Tokar.ua
Tokar.ua,Yahoo! Tech,Tokar.ua
Tokar.ua,Trombe wall,Tokar.ua
Tokar.ua,Trinidadian and Tobagonian Americans,Tokar.ua


In [20]:
# add BabelNet results
# Load the evaluated BabelNet results; note that its key column is named
# 'red link name' (with spaces), unlike 'red_link_name' used elsewhere.
BN_results = pd.read_csv(PATH_TO_DATA_RL+'BN_ukredlinks_wiki_evaluated.csv', encoding = 'UTF-8')

In [21]:
BN_results.head()

Unnamed: 0,red link name,EN BN,EN if else,Evaluation,Remarks
0,Каунт Бейсі,,Count Basie,FN,
1,аль-Каїм,Al-Qa'im (caliph),"also Al-Qa'im (town), Al-Qa'im (Cairo)",FP,article in UK Wiki exists. Impossible to know ...
2,королева,"gyne, monarch, queen regnant, queen (chess), T...",,FP,right item not first in the list
3,Prentice Hall,,Prentice Hall,FN,
4,Tokar.ua,,,TN,


In [22]:
# Inner join of the (red link, candidate) rows with the BabelNet table.
# Red links that have no row in BN_results are dropped from the result.
df_with_BN = pd.merge(rl_with_cand, BN_results, how='inner', left_on='red_link_name', right_on='red link name')

In [23]:
# Keep only the key, the candidate title and the BabelNet suggestion ('EN BN').
df_with_BN = df_with_BN[['red_link_name','candidate', 'EN BN']]

In [24]:
df_with_BN

Unnamed: 0,red_link_name,candidate,EN BN
0,Tokar.ua,List of virtual communities with more than 100...,
1,Tokar.ua,Solar Energy Generating Systems,
2,Tokar.ua,Stand-alone power system,
3,Tokar.ua,Construction Equipment,
4,Tokar.ua,Solar tracker,
5,Tokar.ua,Comparison of instant messaging clients,
6,Tokar.ua,Comparison of VoIP software,
7,Tokar.ua,Yahoo! Tech,
8,Tokar.ua,Trombe wall,
9,Tokar.ua,Trinidadian and Tobagonian Americans,


In [25]:
# strip all string values in a dataframe
# Only object-dtype columns are touched; leading/trailing whitespace is removed
# so that the equality comparison further down is not defeated by stray spaces.

df_with_BN_obj = df_with_BN.select_dtypes(['object'])
df_with_BN[df_with_BN_obj.columns] = df_with_BN_obj.apply(lambda x: x.str.strip())

In [26]:
df_with_BN

Unnamed: 0,red_link_name,candidate,EN BN
0,Tokar.ua,List of virtual communities with more than 100...,
1,Tokar.ua,Solar Energy Generating Systems,
2,Tokar.ua,Stand-alone power system,
3,Tokar.ua,Construction Equipment,
4,Tokar.ua,Solar tracker,
5,Tokar.ua,Comparison of instant messaging clients,
6,Tokar.ua,Comparison of VoIP software,
7,Tokar.ua,Yahoo! Tech,
8,Tokar.ua,Trombe wall,
9,Tokar.ua,Trinidadian and Tobagonian Americans,


In [27]:
# For every (red link, candidate) row, flag whether the candidate title is
# exactly equal to the BabelNet suggestion in 'EN BN'. A missing suggestion
# (NaN) never equals a title, so those rows are False.
# This is a single vectorised comparison rather than a Python-level iterrows()
# loop: the resulting list of booleans is the same, but it avoids building a
# Series object for each of the several hundred thousand rows.
bn_results = (df_with_BN['candidate'] == df_with_BN['EN BN']).tolist()

In [28]:
# Single-column frame of the match flags, in the same row order as df_with_BN.
BN_df = pd.DataFrame(bn_results)

In [29]:
# Attach the match flags as a fourth column (axis=1 concat aligns on the index),
# give the columns explicit names, then drop 'EN BN' and keep only the flag.
with_BN_df = pd.concat([df_with_BN, BN_df], axis=1)
with_BN_df.columns = ['red_link_name', 'candidate', 'EN BN', 'BN_results']
with_BN_df = with_BN_df[['red_link_name', 'candidate', 'BN_results']]

In [30]:
with_BN_df.head()

Unnamed: 0,red_link_name,candidate,BN_results
0,Tokar.ua,List of virtual communities with more than 100...,False
1,Tokar.ua,Solar Energy Generating Systems,False
2,Tokar.ua,Stand-alone power system,False
3,Tokar.ua,Construction Equipment,False
4,Tokar.ua,Solar tracker,False


In [31]:
# Encode "no match" as the string '0' (a string, not the integer 0).
with_BN_df['BN_results'] = with_BN_df['BN_results'].replace({False: '0'})

In [32]:
# Encode "match" as the string '1' (a string, not the integer 1).
with_BN_df['BN_results'] = with_BN_df['BN_results'].replace({True: '1'})

In [33]:
with_BN_df

Unnamed: 0,red_link_name,candidate,BN_results
0,Tokar.ua,List of virtual communities with more than 100...,0
1,Tokar.ua,Solar Energy Generating Systems,0
2,Tokar.ua,Stand-alone power system,0
3,Tokar.ua,Construction Equipment,0
4,Tokar.ua,Solar tracker,0
5,Tokar.ua,Comparison of instant messaging clients,0
6,Tokar.ua,Comparison of VoIP software,0
7,Tokar.ua,Yahoo! Tech,0
8,Tokar.ua,Trombe wall,0
9,Tokar.ua,Trinidadian and Tobagonian Americans,0


In [34]:
# Replace the index with a fresh 0..n-1 range; the old index is discarded.
with_BN_df = with_BN_df.reset_index(level=0, drop=True)

In [35]:
# extract jaccard scores

In [36]:
# Parse every row's 'en_similar' string into a list of float scores.
# `all_results_scores` maps red link name -> scores; `all_results_list` is
# rebuilt to hold the same lists in row order. Rows whose string is '[]' get
# an empty list.
all_results_scores = {}
all_results_list = []
for red_link, en_similar in zip(test_candidates['red_link_name'], test_candidates['en_similar']):
    scores = [] if en_similar == '[]' else extract_score(en_similar)
    all_results_scores[red_link] = scores
    all_results_list.append(scores)

In [37]:
len(all_results_scores)

676

In [38]:
# Wide frame of scores: one row per red link, one column per candidate position.
# NOTE: this rebinds `df`, which previously held the candidate titles.
df = pd.DataFrame.from_dict(all_results_scores, orient = 'index')

# Wide -> long, one row per score. Here index level 0 (the red link name) is
# dropped, so the remaining index is the column position within each red link.
rl_with_score = df.stack().reset_index(level=0, drop=True).to_frame(name='jaccard_score')

In [39]:
# Replace the per-red-link position index with a fresh 0..n-1 range.
rl_with_score = rl_with_score.reset_index(level=0, drop=True)

In [40]:
# The column-wise concat pairs rows purely by index label, so each score only
# lands next to its own candidate if both frames list the same rows in the same
# order. If the earlier inner merge dropped or duplicated rows, the frames
# differ in length and the scores would be silently shifted or NaN-padded;
# fail loudly instead.
assert len(with_BN_df) == len(rl_with_score), (
    'candidate rows (%d) and score rows (%d) differ; scores would be misaligned'
    % (len(with_BN_df), len(rl_with_score))
)
df_2f = pd.concat([with_BN_df, rl_with_score], axis=1)

In [41]:
df_2f.head()

Unnamed: 0,red_link_name,candidate,BN_results,jaccard_score
0,Tokar.ua,List of virtual communities with more than 100...,0,0.05
1,Tokar.ua,Solar Energy Generating Systems,0,0.032
2,Tokar.ua,Stand-alone power system,0,0.027
3,Tokar.ua,Construction Equipment,0,0.025
4,Tokar.ua,Solar tracker,0,0.023


In [46]:
df_2f.shape

(624806, 5)

In [42]:
# add normalized Levenshtein distance score

In [44]:
def calculate_norm_levenstein_df(row):
    """Return the normalised Levenshtein distance between a red link and a candidate.

    Parameters
    ----------
    row : mapping
        A row providing ``'red_link_name'`` and ``'candidate'``; both values
        are converted with ``str()`` before use.

    Returns
    -------
    float
        Edit distance between the transliterated red link name and the
        candidate title, divided by the length of the longer of those two
        strings and rounded to 3 decimals. 0.0 means identical, 1.0 means
        entirely different. Two empty strings give 0.0.
    """
    # Transliterate once and use the result for BOTH the distance and the
    # normalisation: transliteration can change the string length, so dividing
    # by the length of the untransliterated name could give scores above 1.
    red_link = translit(str(row['red_link_name']), 'uk', reversed=True)
    candidate = str(row['candidate'])

    longest = max(len(red_link), len(candidate))
    if longest == 0:
        # Both strings are empty: identical, and avoids dividing by zero.
        return 0.0

    abs_dist = levenshteinDistance(red_link, candidate)
    return round(abs_dist / longest, 3)

In [45]:
# Compute the score row by row (axis=1 passes each row to the function).
df_2f['levenstein_score'] = df_2f.apply(calculate_norm_levenstein_df, axis=1)

In [47]:
df_2f

Unnamed: 0,red_link_name,candidate,BN_results,jaccard_score,levenstein_score
0,Tokar.ua,List of virtual communities with more than 100...,0,0.050,0.940
1,Tokar.ua,Solar Energy Generating Systems,0,0.032,0.871
2,Tokar.ua,Stand-alone power system,0,0.027,0.917
3,Tokar.ua,Construction Equipment,0,0.025,0.864
4,Tokar.ua,Solar tracker,0,0.023,0.692
5,Tokar.ua,Comparison of instant messaging clients,0,0.022,0.897
6,Tokar.ua,Comparison of VoIP software,0,0.020,0.852
7,Tokar.ua,Yahoo! Tech,0,0.018,0.909
8,Tokar.ua,Trombe wall,0,0.018,0.727
9,Tokar.ua,Trinidadian and Tobagonian Americans,0,0.018,0.861


In [49]:
# create a table with features for the train set