In [1]:
import pandas as pd

In [2]:
import numpy as np
from numpy import nan as Nan

In [3]:
import math

In [4]:
import transliterate
from transliterate import translit, get_available_language_codes

In [5]:
def extract_list(s):
    """Return the titles from a stringified list of ``(score, title)`` tuples.

    ``s`` is expected to look like
    ``"[(0.081, 'Title A'), (0.076, 'Title B')]"``. The leading ``[(`` and
    trailing ``)]`` are dropped, the remainder is split on ``'), ('`` and,
    for every pair, everything after the FIRST comma is taken as the title.
    Surrounding quotes, commas and spaces are then stripped.

    Parameters
    ----------
    s : str
        Stringified list of ``(score, title)`` tuples.

    Returns
    -------
    list of str
        One title per tuple, in input order; ``[]`` when ``s`` holds no
        tuples (e.g. ``"[]"``).
    """
    body = s[2:-2]
    if not body:
        return []
    # maxsplit=1: only the comma between score and title is a delimiter, so
    # titles that themselves contain commas are kept whole.
    return [pair.split(',', 1)[1].strip("', ,\"") for pair in body.split('), (')]

In [6]:
def extract_score(s):
    """Return the scores from a stringified list of ``(score, title)`` tuples.

    ``s`` is expected to look like
    ``"[(0.081, 'Title A'), (0.076, 'Title B')]"``. The leading ``[(`` and
    trailing ``)]`` are dropped, the remainder is split on ``'), ('`` and the
    text before the first comma of every pair is converted to ``float``.

    Parameters
    ----------
    s : str
        Stringified list of ``(score, title)`` tuples.

    Returns
    -------
    list of float
        One score per tuple, in input order; ``[]`` when ``s`` holds no
        tuples (e.g. ``"[]"``).

    Raises
    ------
    ValueError
        If the text before the first comma of a pair is not a valid float.
    """
    body = s[2:-2]
    if not body:
        # Without this guard float('') would raise on an empty list string.
        return []
    return [float(pair.split(',')[0].strip("', ,\"")) for pair in body.split('), (')]

In [7]:
def levenshteinDistance(s1, s2):
    """Return the Levenshtein (edit) distance between ``s1`` and ``s2``.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn one string into the other.
    The comparison is case-sensitive. Only two rows of the dynamic-programming
    table are kept, and the row length follows the shorter string.
    """
    # Keep the shorter string along the row so each row is as short as possible.
    shorter, longer = (s1, s2) if len(s1) <= len(s2) else (s2, s1)

    previous_row = list(range(len(shorter) + 1))
    for row_number, longer_char in enumerate(longer, start=1):
        current_row = [row_number]
        for col, shorter_char in enumerate(shorter):
            if shorter_char == longer_char:
                # Characters match: carry the diagonal cost over unchanged.
                cost = previous_row[col]
            else:
                # Cheapest of substitution, deletion and insertion, plus one edit.
                cost = 1 + min(previous_row[col], previous_row[col + 1], current_row[-1])
            current_row.append(cost)
        previous_row = current_row
    return previous_row[-1]

In [8]:
# Root data directory. Every other path is derived from it, so moving the
# data only requires changing this one line. All paths keep a trailing slash
# because file names are appended with plain string concatenation.
PATH_TO_DATA = '/media/andrii/earth/Katia/CS_MasterThesis/data/'
PATH_TO_DATA_RL = PATH_TO_DATA + 'ukwiki/uk_red_links/'
PATH_TO_DATA_UK = PATH_TO_DATA + 'ukwiki/ukwiki_20180920/'
PATH_TO_DATA_EN = PATH_TO_DATA + 'ukwiki/enwiki_20180920/'

In [9]:
uk_redlinks_sample = pd.read_csv(PATH_TO_DATA_RL+'df_sample_red_links_uk.csv', encoding = 'UTF-8')

In [10]:
df_basie = uk_redlinks_sample.iloc[[0]]

In [11]:
basie_value = uk_redlinks_sample.iloc[0]['red_link_name']

In [12]:
basie_value

'Каунт Бейсі'

In [13]:
candidates = pd.read_csv(PATH_TO_DATA_RL+'uk_red_links_results_all_candidates.csv', encoding = 'UTF-8')

In [14]:
list_of_basie_candidates = extract_list(candidates['en_similar'][0])

In [15]:
len(list_of_basie_candidates)

1018

In [16]:
df_basie_candidates = pd.DataFrame(list_of_basie_candidates)

In [17]:
# Flat list of labels: a nested list ([['candidates']]) is read by pandas as
# the levels of a MultiIndex rather than as a single column name.
df_basie_candidates.columns = ['candidates']

In [18]:
df_basie

Unnamed: 0,red_link_name
0,Каунт Бейсі


In [19]:
df_basie_candidates

Unnamed: 0,candidates
0,The Count Basie Story
1,The Atomic Mr. Basie
2,Basie in London
3,Basie Jazz
4,Basie (album)
5,Everyday I Have the Blues (album)
6,The Count!
7,Golden Boy (Quincy Jones album)
8,Dance Session Album No. 2
9,Count Basie at Newport


In [20]:
df_zero_candidate = pd.DataFrame(['0'])
df_zero_candidate.columns = ['candidates']

In [21]:
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# stacks the extra '0' row below the candidates and renumbers the index.
df_basie_candidates = pd.concat([df_basie_candidates, df_zero_candidate], ignore_index=True)

In [22]:
df_basie_candidates['red_link_name'] = basie_value

In [23]:
df_basie_candidates

Unnamed: 0,candidates,red_link_name
0,The Count Basie Story,Каунт Бейсі
1,The Atomic Mr. Basie,Каунт Бейсі
2,Basie in London,Каунт Бейсі
3,Basie Jazz,Каунт Бейсі
4,Basie (album),Каунт Бейсі
5,Everyday I Have the Blues (album),Каунт Бейсі
6,The Count!,Каунт Бейсі
7,Golden Boy (Quincy Jones album),Каунт Бейсі
8,Dance Session Album No. 2,Каунт Бейсі
9,Count Basie at Newport,Каунт Бейсі


In [24]:
BN_results = pd.read_csv(PATH_TO_DATA_RL+'BN_ukredlinks_wiki_evaluated.csv', encoding = 'UTF-8')

In [25]:
BN_results = BN_results.iloc[0]['EN_BN']

In [26]:
# Wrap the EN_BN cell in a one-column frame so it can be merged on.
# A missing value (NaN) is replaced by the placeholder string '0'.
# pd.isna is used instead of math.isnan because math.isnan raises TypeError
# when the cell holds a string; the isinstance guard makes the cell safe to
# re-run once BN_results is already a DataFrame.
if not isinstance(BN_results, pd.DataFrame):
    bn_value = '0' if pd.isna(BN_results) else BN_results
    BN_results = pd.DataFrame({'BN_results': [bn_value]})

In [27]:
BN_results

Unnamed: 0,BN_results
0,0


In [28]:
df_with_BN = pd.merge(df_basie_candidates, BN_results, how='outer', left_on='candidates', right_on='BN_results')

In [29]:
df_with_BN

Unnamed: 0,candidates,red_link_name,BN_results
0,The Count Basie Story,Каунт Бейсі,
1,The Atomic Mr. Basie,Каунт Бейсі,
2,Basie in London,Каунт Бейсі,
3,Basie Jazz,Каунт Бейсі,
4,Basie (album),Каунт Бейсі,
5,Everyday I Have the Blues (album),Каунт Бейсі,
6,The Count!,Каунт Бейсі,
7,Golden Boy (Quincy Jones album),Каунт Бейсі,
8,Dance Session Album No. 2,Каунт Бейсі,
9,Count Basie at Newport,Каунт Бейсі,


In [30]:
df_with_BN['BN_results'] = df_with_BN['BN_results'].fillna(-1)

In [31]:
df_with_BN['BN_results'] = df_with_BN['BN_results'].replace('0', 1)

In [32]:
df_with_BN

Unnamed: 0,candidates,red_link_name,BN_results
0,The Count Basie Story,Каунт Бейсі,-1
1,The Atomic Mr. Basie,Каунт Бейсі,-1
2,Basie in London,Каунт Бейсі,-1
3,Basie Jazz,Каунт Бейсі,-1
4,Basie (album),Каунт Бейсі,-1
5,Everyday I Have the Blues (album),Каунт Бейсі,-1
6,The Count!,Каунт Бейсі,-1
7,Golden Boy (Quincy Jones album),Каунт Бейсі,-1
8,Dance Session Album No. 2,Каунт Бейсі,-1
9,Count Basie at Newport,Каунт Бейсі,-1


In [33]:
# extract Jaccard scores

In [34]:
cand_str = candidates['en_similar'][0]

In [35]:
list_of_jaccard_scores = extract_score(cand_str)

In [36]:
len(list_of_jaccard_scores)

1018

In [37]:
list_of_jaccard_scores

[0.081,
 0.076,
 0.068,
 0.066,
 0.066,
 0.065,
 0.063,
 0.063,
 0.063,
 0.062,
 0.061,
 0.059,
 0.051,
 0.051,
 0.051,
 0.05,
 0.05,
 0.049,
 0.049,
 0.049,
 0.048,
 0.048,
 0.048,
 0.047,
 0.046,
 0.046,
 0.045,
 0.043,
 0.041,
 0.038,
 0.036,
 0.034,
 0.033,
 0.033,
 0.032,
 0.032,
 0.032,
 0.032,
 0.032,
 0.032,
 0.031,
 0.031,
 0.031,
 0.031,
 0.029,
 0.029,
 0.029,
 0.029,
 0.027,
 0.027,
 0.027,
 0.027,
 0.027,
 0.027,
 0.026,
 0.026,
 0.026,
 0.025,
 0.025,
 0.024,
 0.023,
 0.023,
 0.023,
 0.023,
 0.023,
 0.023,
 0.023,
 0.023,
 0.023,
 0.022,
 0.022,
 0.022,
 0.022,
 0.021,
 0.021,
 0.02,
 0.019,
 0.019,
 0.019,
 0.019,
 0.019,
 0.019,
 0.019,
 0.018,
 0.018,
 0.018,
 0.018,
 0.018,
 0.018,
 0.018,
 0.018,
 0.018,
 0.018,
 0.017,
 0.017,
 0.017,
 0.017,
 0.017,
 0.017,
 0.017,
 0.017,
 0.017,
 0.016,
 0.016,
 0.016,
 0.016,
 0.016,
 0.016,
 0.016,
 0.016,
 0.016,
 0.016,
 0.016,
 0.016,
 0.016,
 0.016,
 0.016,
 0.016,
 0.016,
 0.016,
 0.016,
 0.016,
 0.016,
 0.016,
 0.016,
 0.

In [38]:
df_of_jaccard_scores = pd.DataFrame(list_of_jaccard_scores)

In [39]:
# Flat list of labels: a nested list ([['jaccard_score']]) is read by pandas
# as the levels of a MultiIndex rather than as a single column name.
df_of_jaccard_scores.columns = ['jaccard_score']

In [40]:
df_of_jaccard_scores

Unnamed: 0,jaccard_score
0,0.081
1,0.076
2,0.068
3,0.066
4,0.066
5,0.065
6,0.063
7,0.063
8,0.063
9,0.062


In [41]:
df_zero_score = pd.DataFrame([0])
df_zero_score.columns = ['jaccard_score']

In [42]:
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# stacks the extra zero score below the others and renumbers the index.
df_of_jaccard_scores = pd.concat([df_of_jaccard_scores, df_zero_score], ignore_index=True)

In [63]:
df_2f = pd.concat([df_with_BN, df_of_jaccard_scores], axis=1)

In [64]:
df_2f

Unnamed: 0,candidates,red_link_name,BN_results,jaccard_score
0,The Count Basie Story,Каунт Бейсі,-1,0.081
1,The Atomic Mr. Basie,Каунт Бейсі,-1,0.076
2,Basie in London,Каунт Бейсі,-1,0.068
3,Basie Jazz,Каунт Бейсі,-1,0.066
4,Basie (album),Каунт Бейсі,-1,0.066
5,Everyday I Have the Blues (album),Каунт Бейсі,-1,0.065
6,The Count!,Каунт Бейсі,-1,0.063
7,Golden Boy (Quincy Jones album),Каунт Бейсі,-1,0.063
8,Dance Session Album No. 2,Каунт Бейсі,-1,0.063
9,Count Basie at Newport,Каунт Бейсі,-1,0.062


In [65]:
# add Levenshtein distance score

In [66]:
transliterated_rl = translit(basie_value, 'uk', reversed=True)
transliterated_rl

'Kaunt Bejsi'

In [67]:
levenshteinDistance('Kaunt Bejsi', 'The Count Basie Story')

15

In [68]:
# Negated edit distance between each candidate title and the transliterated
# red link name, so that a closer match gets a higher (less negative) score.
df_2f['levenstein_score'] = df_2f['candidates'].apply(
    lambda candidate_title: -levenshteinDistance(candidate_title, transliterated_rl)
)

In [69]:
df_2f.dtypes

candidates           object
red_link_name        object
BN_results            int64
jaccard_score       float64
levenstein_score      int64
dtype: object

In [70]:
df_2f

Unnamed: 0,candidates,red_link_name,BN_results,jaccard_score,levenstein_score
0,The Count Basie Story,Каунт Бейсі,-1,0.081,-15
1,The Atomic Mr. Basie,Каунт Бейсі,-1,0.076,-16
2,Basie in London,Каунт Бейсі,-1,0.068,-13
3,Basie Jazz,Каунт Бейсі,-1,0.066,-9
4,Basie (album),Каунт Бейсі,-1,0.066,-11
5,Everyday I Have the Blues (album),Каунт Бейсі,-1,0.065,-28
6,The Count!,Каунт Бейсі,-1,0.063,-11
7,Golden Boy (Quincy Jones album),Каунт Бейсі,-1,0.063,-27
8,Dance Session Album No. 2,Каунт Бейсі,-1,0.063,-20
9,Count Basie at Newport,Каунт Бейсі,-1,0.062,-16


In [71]:
# Total score is the plain sum of the three component scores, added
# column-wise in the same left-to-right order as before.
df_2f['total_score'] = df_2f['BN_results'] + df_2f['jaccard_score'] + df_2f['levenstein_score']

In [72]:
df_2f

Unnamed: 0,candidates,red_link_name,BN_results,jaccard_score,levenstein_score,total_score
0,The Count Basie Story,Каунт Бейсі,-1,0.081,-15,-15.919
1,The Atomic Mr. Basie,Каунт Бейсі,-1,0.076,-16,-16.924
2,Basie in London,Каунт Бейсі,-1,0.068,-13,-13.932
3,Basie Jazz,Каунт Бейсі,-1,0.066,-9,-9.934
4,Basie (album),Каунт Бейсі,-1,0.066,-11,-11.934
5,Everyday I Have the Blues (album),Каунт Бейсі,-1,0.065,-28,-28.935
6,The Count!,Каунт Бейсі,-1,0.063,-11,-11.937
7,Golden Boy (Quincy Jones album),Каунт Бейсі,-1,0.063,-27,-27.937
8,Dance Session Album No. 2,Каунт Бейсі,-1,0.063,-20,-20.937
9,Count Basie at Newport,Каунт Бейсі,-1,0.062,-16,-16.938


In [73]:
df_2f.loc[df_2f['candidates'] == 'Count Basie']

Unnamed: 0,candidates,red_link_name,BN_results,jaccard_score,levenstein_score,total_score
74,Count Basie,Каунт Бейсі,-1,0.021,-5,-5.979


In [74]:
# Position of the best-scoring row, for use with .iloc.
# Series.argmax returned a label in older pandas and a position in newer
# releases; idxmax always returns the label, and get_loc converts it to a
# position explicitly, so the result is the same on every pandas version.
max_indx = df_2f.index.get_loc(df_2f['total_score'].idxmax())

In [75]:
final_answer = df_2f.iloc[max_indx]
final_answer

candidates          Count Basie
red_link_name       Каунт Бейсі
BN_results                   -1
jaccard_score             0.021
levenstein_score             -5
total_score              -5.979
Name: 74, dtype: object

In [76]:
df_2f.head()

Unnamed: 0,candidates,red_link_name,BN_results,jaccard_score,levenstein_score,total_score
0,The Count Basie Story,Каунт Бейсі,-1,0.081,-15,-15.919
1,The Atomic Mr. Basie,Каунт Бейсі,-1,0.076,-16,-16.924
2,Basie in London,Каунт Бейсі,-1,0.068,-13,-13.932
3,Basie Jazz,Каунт Бейсі,-1,0.066,-9,-9.934
4,Basie (album),Каунт Бейсі,-1,0.066,-11,-11.934


In [78]:
df_2f = df_2f[['red_link_name','candidates','BN_results','jaccard_score','levenstein_score','total_score']]

In [79]:
df_2f.head()

Unnamed: 0,red_link_name,candidates,BN_results,jaccard_score,levenstein_score,total_score
0,Каунт Бейсі,The Count Basie Story,-1,0.081,-15,-15.919
1,Каунт Бейсі,The Atomic Mr. Basie,-1,0.076,-16,-16.924
2,Каунт Бейсі,Basie in London,-1,0.068,-13,-13.932
3,Каунт Бейсі,Basie Jazz,-1,0.066,-9,-9.934
4,Каунт Бейсі,Basie (album),-1,0.066,-11,-11.934


In [81]:
# add ground truth
ground_truth = pd.read_csv(PATH_TO_DATA_RL+'ukredlinks_ground_truth_3194.csv', encoding = 'UTF-8')

In [93]:
ground_truth_df = ground_truth.iloc[[0]]
ground_truth_df = ground_truth_df[['ground_truth']]

In [94]:
ground_truth_df

Unnamed: 0,ground_truth
0,Count Basie


In [104]:
ground_truth_name = ground_truth_df['ground_truth'][0]

In [105]:
ground_truth_name

'Count Basie'

In [96]:
df_3f_with_gt = pd.merge(df_2f,ground_truth_df, how='left', left_on='candidates', right_on='ground_truth')

In [107]:
df_3f_with_gt[df_3f_with_gt['ground_truth']!=0]

Unnamed: 0,red_link_name,candidates,BN_results,jaccard_score,levenstein_score,total_score,ground_truth
74,Каунт Бейсі,Count Basie,-1,0.021,-5,-5.979,1


In [98]:
df_3f_with_gt['ground_truth'] = df_3f_with_gt['ground_truth'].fillna(0)

In [106]:
df_3f_with_gt['ground_truth'] = df_3f_with_gt['ground_truth'].replace(ground_truth_name, 1)

In [115]:
# get train and test samples
# A fixed seed makes the split identical on every run; each row lands in the
# training set with probability TRAIN_FRACTION, so the sizes are approximate.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.8

sample = ground_truth[['red_link_name']]
split_rng = np.random.RandomState(SPLIT_SEED)
msk = split_rng.rand(len(sample)) < TRAIN_FRACTION
train = sample[msk]
test = sample[~msk]

print('train length =', len(train))
print('test length =', len(test))