In [1]:
import pandas as pd

import numpy as np
from numpy import nan as Nan

import math

import transliterate
from transliterate import translit, get_available_language_codes

In [2]:
def extract_list(s):
    return [x.split(',')[1].strip("', ,\"") for x in s[2:-2].split('), (')]

def extract_score(s):
    return [float(x.split(',')[0].strip("', ,\"")) for x in s[2:-2].split('), (')]

def levenshteinDistance(s1, s2):
    if len(s1) > len(s2):
        s1, s2 = s2, s1

    distances = range(len(s1) + 1)
    for i2, c2 in enumerate(s2):
        distances_ = [i2+1]
        for i1, c1 in enumerate(s1):
            if c1 == c2:
                distances_.append(distances[i1])
            else:
                distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
        distances = distances_
    return distances[-1]

In [3]:
PATH_TO_DATA = '/media/andrii/earth/Katia/CS_MasterThesis/data/'
PATH_TO_DATA_RL = '/media/andrii/earth/Katia/CS_MasterThesis/data/ukwiki/uk_red_links/'
PATH_TO_DATA_UK = '/media/andrii/earth/Katia/CS_MasterThesis/data/ukwiki/ukwiki_20180920/'
PATH_TO_DATA_EN = '/media/andrii/earth/Katia/CS_MasterThesis/data/ukwiki/enwiki_20180920/'

In [4]:
uk_redlinks_sample = pd.read_csv(PATH_TO_DATA_RL+'df_sample_red_links_uk.csv', encoding = 'UTF-8')

In [5]:
uk_redlinks_sample

Unnamed: 0,red_link_name
0,Каунт Бейсі
1,аль-Каїм
2,королева
3,Prentice Hall
4,Tokar.ua
5,Monthly Notices of the Royal Astronomical Society
6,Світ (видавництво)
7,Дубович Іван Андрійович
8,помірні повітряні маси
9,Доповідь про торгівлю людьми


In [6]:
# get train and test samples
msk = np.random.rand(len(uk_redlinks_sample)) < 0.8
train = uk_redlinks_sample[msk]
test = uk_redlinks_sample[~msk]

print('train length =', len(train))
print('test length =', len(test))

train length = 2557
test length = 637


In [7]:
# save train and test sets to files
train.to_csv(PATH_TO_DATA_RL+'train_red_links_uk.csv', header=True, index=False)
test.to_csv(PATH_TO_DATA_RL+'test_red_links_uk.csv', header=True, index=False)

In [8]:
# read train and test samples
train_sample = pd.read_csv(PATH_TO_DATA_RL+'train_red_links_uk.csv', encoding = 'UTF-8')
test_sample = pd.read_csv(PATH_TO_DATA_RL+'test_red_links_uk.csv', encoding = 'UTF-8')

## Table for test set

In [9]:
test_list = test_sample['red_link_name'].tolist()

In [10]:
candidates = pd.read_csv(PATH_TO_DATA_RL+'uk_red_links_results_all_candidates.csv', encoding = 'UTF-8')

In [11]:
test_candidates = pd.merge(test_sample, candidates, how='inner', left_on='red_link_name', right_on='red_link_name')

In [12]:
all_results = {}
all_results_list = [[]] * test_candidates.shape[0]
for i in range(test_candidates.shape[0]):
    if test_candidates['en_similar'][i] == '[]':
        all_results[test_candidates['red_link_name'][i]] = []
        all_results_list[i] = []
    else:
        res = extract_list(test_candidates['en_similar'][i])
        all_results[test_candidates['red_link_name'][i]] = res
        all_results_list[i] = res

In [13]:
len(all_results.keys())

637

In [14]:
df = pd.DataFrame.from_dict(all_results, orient = 'index')

rl_with_cand = df.stack().reset_index(level=1, drop=True).to_frame(name='candidate')

In [15]:
rl_with_cand['red_link_name'] = rl_with_cand.index

In [16]:
rl_with_cand

Unnamed: 0,candidate,red_link_name
аль-Каїм,Ahmad al-Muqtadir,аль-Каїм
аль-Каїм,Cai Xiang,аль-Каїм
аль-Каїм,Odo II,аль-Каїм
аль-Каїм,Judith of Swabia,аль-Каїм
аль-Каїм,Eudokia Makrembolitissa,аль-Каїм
аль-Каїм,Pontificate,аль-Каїм
аль-Каїм,Kadam (Tibetan Buddhism),аль-Каїм
аль-Каїм,Common year starting on Wednesday,аль-Каїм
аль-Каїм,Common year starting on Saturday,аль-Каїм
аль-Каїм,Common year starting on Tuesday,аль-Каїм


In [17]:
# add BabelNet results
BN_results = pd.read_csv(PATH_TO_DATA_RL+'BN_ukredlinks_wiki_evaluated.csv', encoding = 'UTF-8')

In [18]:
BN_results.head()

Unnamed: 0,red link name,EN BN,EN if else,Evaluation,Remarks
0,Каунт Бейсі,,Count Basie,FN,
1,аль-Каїм,Al-Qa'im (caliph),"also Al-Qa'im (town), Al-Qa'im (Cairo)",FP,article in UK Wiki exists. Impossible to know ...
2,королева,"gyne, monarch, queen regnant, queen (chess), T...",,FP,right item not first in the list
3,Prentice Hall,,Prentice Hall,FN,
4,Tokar.ua,,,TN,


In [19]:
df_with_BN = pd.merge(rl_with_cand, BN_results, how='inner', left_on='red_link_name', right_on='red link name')

In [20]:
df_with_BN = df_with_BN[['red_link_name','candidate', 'EN BN']]

In [21]:
df_with_BN.head()

Unnamed: 0,red_link_name,candidate,EN BN
0,аль-Каїм,Ahmad al-Muqtadir,Al-Qa'im (caliph)
1,аль-Каїм,Cai Xiang,Al-Qa'im (caliph)
2,аль-Каїм,Odo II,Al-Qa'im (caliph)
3,аль-Каїм,Judith of Swabia,Al-Qa'im (caliph)
4,аль-Каїм,Eudokia Makrembolitissa,Al-Qa'im (caliph)


In [22]:
bn_results = []
for index, row in df_with_BN.iterrows():
    bn_result = row['candidate'] == row['EN BN']
    bn_results.append(bn_result)

In [23]:
BN_df = pd.DataFrame(bn_results)

In [24]:
with_BN_df = pd.concat([df_with_BN, BN_df], axis=1)
with_BN_df.columns = ['red_link_name', 'candidate', 'EN BN', 'BN_results']
with_BN_df = with_BN_df[['red_link_name', 'candidate', 'BN_results']]

In [25]:
with_BN_df['BN_results'] = with_BN_df['BN_results'].replace(False, 0)

In [26]:
with_BN_df['BN_results'] = with_BN_df['BN_results'].replace(True, 1)

In [27]:
with_BN_df

Unnamed: 0,red_link_name,candidate,BN_results
0,Tokar.ua,List of virtual communities with more than 100...,False
1,Tokar.ua,Solar Energy Generating Systems,False
2,Tokar.ua,Stand-alone power system,False
3,Tokar.ua,Construction Equipment,False
4,Tokar.ua,Solar tracker,False
5,Tokar.ua,Comparison of instant messaging clients,False
6,Tokar.ua,Comparison of VoIP software,False
7,Tokar.ua,Yahoo! Tech,False
8,Tokar.ua,Trombe wall,False
9,Tokar.ua,Trinidadian and Tobagonian Americans,False


In [28]:
with_BN_df = with_BN_df.reset_index(level=0, drop=True)

In [29]:
# extract jaccard scores

In [30]:
all_results_scores = {}
all_results_list = [[]] * test_candidates.shape[0]
for i in range(test_candidates.shape[0]):
    if test_candidates['en_similar'][i] == '[]':
        all_results_scores[test_candidates['red_link_name'][i]] = []
        all_results_list[i] = []
    else:
        res = extract_score(test_candidates['en_similar'][i])
        all_results_scores[test_candidates['red_link_name'][i]] = res
        all_results_list[i] = res

In [31]:
len(all_results_scores)

625

In [32]:
df = pd.DataFrame.from_dict(all_results_scores, orient = 'index')

rl_with_score = df.stack().reset_index(level=0, drop=True).to_frame(name='jaccard_score')

In [33]:
rl_with_score = rl_with_score.reset_index(level=0, drop=True)

In [34]:
df_2f = pd.concat([with_BN_df, rl_with_score], axis=1)

In [35]:
df_2f.head()

Unnamed: 0,red_link_name,candidate,BN_results,jaccard_score
0,Tokar.ua,List of virtual communities with more than 100...,False,0.05
1,Tokar.ua,Solar Energy Generating Systems,False,0.032
2,Tokar.ua,Stand-alone power system,False,0.027
3,Tokar.ua,Construction Equipment,False,0.025
4,Tokar.ua,Solar tracker,False,0.023


In [36]:
# add levenstein distance score

In [37]:
def calculate_levendtein_df(row):
    return -levenshteinDistance(translit(row['red_link_name'], 'uk', reversed=True), row['candidate'])

In [38]:
df_2f['levenstein_score'] = df_2f.apply(calculate_levendtein_df, axis=1)

AttributeError: ("'float' object has no attribute 'translate'", 'occurred at index 563255')

In [None]:
df_3f = df_2f
df_3f