In [1]:
import pandas as pd
import os
import glob
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import sklearn.metrics
from collections import defaultdict
import shutil
import pickle

# Restore every pandas option to its default value.
# NOTE(review): the deprecation notices printed in this cell's output appear to come from
# resetting 'all' (which includes deprecated options) — confirm whether a narrower reset suffices.
pd.reset_option('all')

As the xlwt package is no longer maintained, the xlwt engine will be removed in a future version of pandas. This is the only engine in pandas that supports writing in the xls format. Install openpyxl and write to an xlsx file instead.

: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.



In [16]:
# Run configuration: the table to link, the column holding the mentions, where results go,
# and the search service (--url / --index) that the `tl` commands query.
# NOTE(review): input_file_path is an absolute path on one developer machine — adjust before running elsewhere.
input_file_path = "/Users/amandeep/Github/wikidata-wikifier/wikifier/sample_files/cricketers.csv"
wikify_column_name = 'cricketers'
output_path = "/tmp/cricketers"
es_url = "http://ckg07:9200"
es_index = "wikidatadwd-augmented-04"

In [3]:
# Scratch directory for intermediate files, nested under the output directory.
temp_dir = output_path + '/temp'

In [4]:
# Create the output directory and its temp sub-directory; do nothing if they already exist.
# os.makedirs replaces the shell `mkdir -p` so that paths containing spaces or shell
# metacharacters are handled correctly and the cell does not depend on a POSIX shell.
os.makedirs(output_path, exist_ok=True)
os.makedirs(temp_dir, exist_ok=True)

In [11]:
# Intermediate files: every pipeline stage writes its result under temp_dir.
canonical = f'{temp_dir}/canonical.csv'
candidates = f"{temp_dir}/candidates.csv"
singleton_feature = f"{temp_dir}/singleton.csv"
feature_class_count = f"{temp_dir}/feature_class_count.csv"
feature_property_count = f"{temp_dir}/feature_property_count.csv"
feature_class_property_count = f"{temp_dir}/feature_property_class_count.csv"
score_file = f"{temp_dir}/scores.csv"
model_name = 'rf_tuned_ranking.pkl'

embedding_file = f'{temp_dir}/graph_embedding_complex.tsv'
# Comma-separated auxiliary fields requested from the candidate-generation commands.
aux_field = 'graph_embedding_complex,class_count,property_count,context'
final_score = f'{temp_dir}/final_score.csv'

# Name the top-k and final outputs after the input file instead of the stale "hormones"
# suffix that was left over from a different dataset.
dataset_name = os.path.splitext(os.path.basename(input_file_path))[0]
top_k_file = f"{temp_dir}/topk-{dataset_name}.csv"
final_output = f"{output_path}/linked-{dataset_name}.csv"

## Peek at the input file

In [12]:
# Show the raw input table, with missing cells rendered as empty strings.
(
    pd.read_csv(filepath_or_buffer=input_file_path)
    .fillna(value="")
)

Unnamed: 0,cricketers,teams,weight,dob
0,Virat Kohli,royal challengers bangalore,152,5/11/88
1,Tendulkar,mumbai indians,137,24/04/1973
2,Dhoni,chennai super kings,154,7/7/81
3,Jasprit Bumrah,mumbai indians,154,6/12/93
4,Ajinkya Rahane,rajasthan royals,134,6/6/88
5,Rohit Sharma,mumbai indians,159,30/04/1987
6,Bhuvneshwar Kumar,deccan chargers,154,5/2/90
7,Ravindra Jadeja,chennai super kings,132,6/12/88
8,Rishabh Pant,delhi capitals,136,4/8/97
9,Shikhar Dhawan,delhi capitals,157,5/12/85


## Canonicalize

In [13]:
!tl canonicalize \
-c "$wikify_column_name" \
--add-context \
{input_file_path} > {canonical}

In [14]:
# Load the canonicalized table and display it.
df = pd.read_csv(filepath_or_buffer=canonical)
df

Unnamed: 0,column,row,label,context
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88
1,0,1,Tendulkar,mumbai indians|137|24/04/1973
2,0,2,Dhoni,chennai super kings|154|7/7/81
3,0,3,Jasprit Bumrah,mumbai indians|154|6/12/93
4,0,4,Ajinkya Rahane,rajasthan royals|134|6/6/88
5,0,5,Rohit Sharma,mumbai indians|159|30/04/1987
6,0,6,Bhuvneshwar Kumar,deccan chargers|154|5/2/90
7,0,7,Ravindra Jadeja,chennai super kings|132|6/12/88
8,0,8,Rishabh Pant,delhi capitals|136|4/8/97
9,0,9,Shikhar Dhawan,delhi capitals|157|5/12/85


## Candidate Generation

In [17]:
%%time
!tl clean -c label -o label_clean {canonical} / \
--url $es_url --index $es_index \
get-fuzzy-augmented-matches -c label_clean \
--auxiliary-fields {aux_field} \
--auxiliary-folder $temp_dir / \
--url $es_url --index $es_index \
get-exact-matches -c label_clean \
--auxiliary-fields {aux_field} \
--auxiliary-folder {temp_dir} > {candidates}

CPU times: user 4.35 s, sys: 1.28 s, total: 5.63 s
Wall time: 5min 8s


In [19]:
# Consolidate the per-command auxiliary files in temp_dir into one TSV per auxiliary field.
# Maps each auxiliary field name to the column name written to the consolidated file.
column_rename_dict = {
    'graph_embedding_complex': 'embedding',
    'class_count': 'class_count',
    'property_count': 'property_count',
    'context': 'context'
}
for field in aux_field.split(','):
    aux_list = []
    # sorted() makes the duplicate that survives drop_duplicates deterministic.
    for f in sorted(glob.glob(f'{temp_dir}/*{field}.tsv')):
        part = pd.read_csv(f, sep='\t', dtype=object)
        # The glob also matches unrelated files on a re-run (e.g. class_property_count.tsv for
        # 'property_count', or a previously consolidated file whose column was already renamed).
        # Keep only files that actually carry this field keyed by qnode.
        if 'qnode' not in part.columns or field not in part.columns:
            continue
        aux_list.append(part)
    if not aux_list:
        # pd.concat([]) would fail with an opaque "No objects to concatenate" error.
        raise FileNotFoundError(
            f"No auxiliary file with columns 'qnode' and '{field}' found in {temp_dir}; "
            "run the candidate generation cell first."
        )
    aux_df = (
        pd.concat(aux_list)
        .drop_duplicates(subset=['qnode'])
        .rename(columns={field: column_rename_dict[field]})
    )
    aux_df.to_csv(f'{temp_dir}/{field}.tsv', sep='\t', index=False)

In [21]:
# Preview the first 20 rows of the consolidated context file.
(
    pd.read_csv(filepath_or_buffer=f'{temp_dir}/context.tsv', sep='\t')
    .head(n=20)
)

Unnamed: 0,qnode,context
0,Q213854,"d""1988"":P569|e""/m/03qkvyf"":P646|e""101095"":P269..."
1,Q1711834,"i""tendulkar"":P1889:Q7699668"
2,Q7699668,"i""tendulkar"":P1889:Q1711834"
3,Q142613,"d""1988"":P569|e""/m/0bgrs7"":P646|e""17473"":P2698|..."
4,Q3522062,"d""1988"":P569|e""/m/027bdbn"":P646|e""100673"":P269..."
5,Q7487531,"d""1990"":P569|e""/m/0j4c8gg"":P646|e""481896"":P269..."
6,Q101197030,"i""akathethara grama panchayat"":P361:Q16133575|..."
7,Q253202,"e""/m/09tsv8"":P646"
8,Q5269735,"i""aryavratt"":P17:Q668|i""bharat"":P17:Q668|i""bha..."
9,Q5269737,"d""2012"":P577|e""/m/0h3tpdp"":P646|e""313880"":P252..."


In [11]:
# Read only the first 150 candidate rows and blank out missing values for display.
(
    pd.read_csv(filepath_or_buffer=candidates, nrows=150)
    .fillna(value="")
)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,3.983031e-09,36.393840
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q102354285,Marie Virat,,fuzzy-augmented,Ph. D. 2009,5.918546e-09,23.484630
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16027751,Bernard Virat,,fuzzy-augmented,French biologist (1921-2003),3.740191e-09,23.484630
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7907059,VIRAT,,fuzzy-augmented,,0.000000e+00,20.582134
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q2978459,Virata,Virat,fuzzy-augmented,character from the epic Mahabharata,6.890132e-09,20.520416
...,...,...,...,...,...,...,...,...,...,...,...,...
145,0,2,Dhoni,chennai super kings|154|7/7/81,Dhoni,Q31728772,Radio Dhoni,,fuzzy-augmented,radio station in Dhaka,0.000000e+00,17.156477
146,0,2,Dhoni,chennai super kings|154|7/7/81,Dhoni,Q102351448,Shahnawaz Dhani,,fuzzy-augmented,Pakistani cricketer,3.539613e-09,17.115705
147,0,2,Dhoni,chennai super kings|154|7/7/81,Dhoni,Q2627581,Omar Dhani,Omar Dhani,fuzzy-augmented,Commander of the Indonesian Air Force,3.539613e-09,17.115705
148,0,2,Dhoni,chennai super kings|154|7/7/81,Dhoni,Q4695305,Ahmad Dhani,,fuzzy-augmented,Indonesian musician and songwriter,2.755712e-08,17.115705


### Add singleton feature

In [12]:
!tl create-singleton-feature -o singleton  {candidates} > {singleton_feature}

In [13]:
# Preview the singleton feature file (all columns read as strings, blanks for missing values).
(
    pd.read_csv(filepath_or_buffer=singleton_feature, dtype=object)
    .head()
    .fillna(value="")
)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,singleton
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,3.983031232217997e-09,36.39384,0
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q102354285,Marie Virat,,fuzzy-augmented,Ph. D. 2009,5.918546005357847e-09,23.48463,0
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16027751,Bernard Virat,,fuzzy-augmented,French biologist (1921-2003),3.7401912005599e-09,23.48463,0
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7907059,VIRAT,,fuzzy-augmented,,0.0,20.582134,0
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q2978459,Virata,Virat,fuzzy-augmented,character from the epic Mahabharata,6.8901323967569805e-09,20.520416,0


### Add Class Count TF IDF Feature

In [14]:
!tl compute-tf-idf \
--feature-file /tmp/cricketers/temp/class_count.tsv \
--feature-name class_count \
--singleton-column singleton \
-o class_count_tf_idf_score \
{singleton_feature} > {feature_class_count}

#### Peek at the class count TF-IDF feature file

In [15]:
# Preview the first 20 rows of the class-count TF-IDF output.
(
    pd.read_csv(filepath_or_buffer=feature_class_count, dtype=object)
    .head(n=20)
    .fillna(value="")
)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,singleton,class_count_tf_idf_score
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,3.983031232217997e-09,36.39384,0,1.0000000000000002
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q102354285,Marie Virat,,fuzzy-augmented,Ph. D. 2009,5.918546005357847e-09,23.48463,0,0.5442234316047089
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16027751,Bernard Virat,,fuzzy-augmented,French biologist (1921-2003),3.7401912005599e-09,23.48463,0,0.5442234316047089
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7907059,VIRAT,,fuzzy-augmented,,0.0,20.582134,0,0.0
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q2978459,Virata,Virat,fuzzy-augmented,character from the epic Mahabharata,6.8901323967569805e-09,20.520416,0,0.0311056621541158
5,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16682735,,,fuzzy-augmented,,3.5396131256502836e-09,19.623405,0,0.2028730148266441
6,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q6426050,Kohli,,fuzzy-augmented,,3.5396131256502836e-09,19.601744,0,0.0181540368050153
7,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q46251,Fränzi Mägert-Kohli,Franziska Kohli|Fraenzi Maegert-Kohli,fuzzy-augmented,Swiss snowboarder,3.5396131256502836e-09,19.233713,0,0.6945347101120541
8,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16434086,Wirat Wachirarattanawong,,fuzzy-augmented,,3.5396131256502836e-09,19.010628,0,0.5442234316047089
9,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q19899153,Virat Singh,,fuzzy-augmented,Indian cricketer,3.5396131256502836e-09,19.010628,0,1.0000000000000002


#### Get top 1 candidate for each cell

In [16]:
!tl get-kg-links -c class_count_tf_idf_score -l label -k 1 --k-rows  $feature_class_count > $temp_dir/class_count_top_k.csv

In [17]:
# Display the class-count top-k result, with missing values shown as blanks.
(
    pd.read_csv(filepath_or_buffer=f"{temp_dir}/class_count_top_k.csv")
    .fillna(value="")
)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,singleton,class_count_tf_idf_score
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,3.983031e-09,36.39384,0,1.0
1,0,1,Tendulkar,mumbai indians|137|24/04/1973,Tendulkar,Q9488,Sachin Tendulkar,Sachin Ramesh Tendulkar|Master Blaster,fuzzy-augmented,Indian former cricketer,1.196003e-08,28.334663,0,1.0
2,0,10,Cheteshwar Pujara,deccan chargers|157|25/01/1988,Cheteshwar Pujara,Q142613,Cheteshwar Pujara,Cheteshwar Arvind Pujara,fuzzy-augmented,Indian cricket player,3.93661e-09,40.74549,0,1.0
3,0,11,Ishant Sharma,delhi capitals|168|2/9/88,Ishant Sharma,Q3522062,Ishant Sharma,,fuzzy-augmented,Indian cricket player.,3.539613e-09,30.923111,0,1.0
4,0,12,Mohammad Shami,kings XI punjab|152|3/9/90,Mohammad Shami,Q7487531,Mohammed Shami,Mohammad Shami|Mohammed Shami Ahmed|Mohammad S...,fuzzy-augmented,Indian cricketer,3.539613e-09,28.241823,0,1.0
5,0,2,Dhoni,chennai super kings|154|7/7/81,Dhoni,Q470774,MS Dhoni,Mr Cool|Mahi|Mahendra Singh Dhoni|Finisher|Cap...,fuzzy-augmented,Indian cricket player,6.350345e-09,21.508753,0,1.0
6,0,3,Jasprit Bumrah,mumbai indians|154|6/12/93,Jasprit Bumrah,Q16227998,Jasprit Bumrah,Jasprit Jasbirsingh Bumrah|Jasprit Jasbir Sing...,fuzzy-augmented,cricketer,3.539613e-09,40.825333,0,1.0
7,0,4,Ajinkya Rahane,rajasthan royals|134|6/6/88,Ajinkya Rahane,Q137669,Ajinkya Rahane,Ajinkya Madhukar Rahane|rahane,fuzzy-augmented,Indian cricketer,3.539613e-09,41.210064,0,1.0
8,0,5,Rohit Sharma,mumbai indians|159|30/04/1987,Rohit Sharma,Q3520045,Rohit Sharma,Rohit Gurunath Sharma|Hitman,fuzzy-augmented,Indian cricketer,3.84048e-09,31.262672,0,1.0
9,0,6,Bhuvneshwar Kumar,deccan chargers|154|5/2/90,Bhuvneshwar Kumar,Q2003153,Bhuvneshwar Kumar,Bhuvneshwar Kumar Singh,fuzzy-augmented,Indian cricket player,3.539613e-09,43.885715,0,1.0


### Add Property Count TF IDF Feature

In [18]:
!tl compute-tf-idf \
--feature-file /tmp/cricketers/temp/property_count.tsv \
--feature-name property_count \
--singleton-column singleton \
-o property_count_tf_idf_score \
{singleton_feature} > {feature_property_count}

#### Peek at the property count TF-IDF feature file

In [19]:
# Preview the first 20 rows of the property-count TF-IDF output.
(
    pd.read_csv(filepath_or_buffer=feature_property_count, dtype=object)
    .head(n=20)
    .fillna(value="")
)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,singleton,property_count_tf_idf_score
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,3.983031232217997e-09,36.39384,0,0.890500842236226
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q102354285,Marie Virat,,fuzzy-augmented,Ph. D. 2009,5.918546005357847e-09,23.48463,0,0.00020108672886
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16027751,Bernard Virat,,fuzzy-augmented,French biologist (1921-2003),3.7401912005599e-09,23.48463,0,0.1796921790537892
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7907059,VIRAT,,fuzzy-augmented,,0.0,20.582134,0,0.0233418341311859
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q2978459,Virata,Virat,fuzzy-augmented,character from the epic Mahabharata,6.8901323967569805e-09,20.520416,0,0.0710312443418589
5,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16682735,,,fuzzy-augmented,,3.5396131256502836e-09,19.623405,0,0.0243861245257598
6,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q6426050,Kohli,,fuzzy-augmented,,3.5396131256502836e-09,19.601744,0,0.023542920860046
7,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q46251,Fränzi Mägert-Kohli,Franziska Kohli|Fraenzi Maegert-Kohli,fuzzy-augmented,Swiss snowboarder,3.5396131256502836e-09,19.233713,0,0.2273182570951894
8,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16434086,Wirat Wachirarattanawong,,fuzzy-augmented,,3.5396131256502836e-09,19.010628,0,0.1279399831341767
9,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q19899153,Virat Singh,,fuzzy-augmented,Indian cricketer,3.5396131256502836e-09,19.010628,0,0.4087703490443972


#### Get top 1 candidate for each cell

In [20]:
!tl get-kg-links -c property_count_tf_idf_score -l label -k 1 --k-rows  $feature_property_count > $temp_dir/property_count_top_k.csv

In [21]:
# Display the property-count top-k result, with missing values shown as blanks.
(
    pd.read_csv(filepath_or_buffer=f"{temp_dir}/property_count_top_k.csv")
    .fillna(value="")
)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,singleton,property_count_tf_idf_score
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,3.983031e-09,36.39384,0,0.890501
1,0,1,Tendulkar,mumbai indians|137|24/04/1973,Tendulkar,Q9488,Sachin Tendulkar,Sachin Ramesh Tendulkar|Master Blaster,fuzzy-augmented,Indian former cricketer,1.196003e-08,28.334663,0,0.894584
2,0,10,Cheteshwar Pujara,deccan chargers|157|25/01/1988,Cheteshwar Pujara,Q142613,Cheteshwar Pujara,Cheteshwar Arvind Pujara,fuzzy-augmented,Indian cricket player,3.93661e-09,40.74549,0,0.867785
3,0,11,Ishant Sharma,delhi capitals|168|2/9/88,Ishant Sharma,Q3522062,Ishant Sharma,,fuzzy-augmented,Indian cricket player.,3.539613e-09,30.923111,0,0.866405
4,0,12,Mohammad Shami,kings XI punjab|152|3/9/90,Mohammad Shami,Q7487531,Mohammed Shami,Mohammad Shami|Mohammed Shami Ahmed|Mohammad S...,fuzzy-augmented,Indian cricketer,3.539613e-09,28.241823,0,0.69542
5,0,2,Dhoni,chennai super kings|154|7/7/81,Dhoni,Q470774,MS Dhoni,Mr Cool|Mahi|Mahendra Singh Dhoni|Finisher|Cap...,fuzzy-augmented,Indian cricket player,6.350345e-09,21.508753,0,0.939707
6,0,3,Jasprit Bumrah,mumbai indians|154|6/12/93,Jasprit Bumrah,Q16227998,Jasprit Bumrah,Jasprit Jasbirsingh Bumrah|Jasprit Jasbir Sing...,fuzzy-augmented,cricketer,3.539613e-09,40.825333,0,0.790571
7,0,4,Ajinkya Rahane,rajasthan royals|134|6/6/88,Ajinkya Rahane,Q137669,Ajinkya Rahane,Ajinkya Madhukar Rahane|rahane,fuzzy-augmented,Indian cricketer,3.539613e-09,41.210064,0,0.851704
8,0,5,Rohit Sharma,mumbai indians|159|30/04/1987,Rohit Sharma,Q3520045,Rohit Sharma,Rohit Gurunath Sharma|Hitman,fuzzy-augmented,Indian cricketer,3.84048e-09,31.262672,0,0.854757
9,0,6,Bhuvneshwar Kumar,deccan chargers|154|5/2/90,Bhuvneshwar Kumar,Q2003153,Bhuvneshwar Kumar,Bhuvneshwar Kumar Singh,fuzzy-augmented,Indian cricket player,3.539613e-09,43.885715,0,0.82672


## Use the combined property and class counts

In [22]:
# Build one combined "property|class" count string per qnode and write it as a new feature file.
# Both inputs are read as strings so the join below never sees non-string values.
pdf = pd.read_csv(f"{temp_dir}/property_count.tsv", sep='\t', dtype=str)
cdf = pd.read_csv(f"{temp_dir}/class_count.tsv", sep='\t', dtype=str)

class_prop_file = f"{temp_dir}/class_property_count.tsv"
# Left merge: every qnode of the property-count file is kept; a missing class count becomes "".
merged_counts = pdf.merge(cdf, on='qnode', how='left').fillna("")
# Join only the non-empty parts so a missing property or class count never leaves a
# leading/trailing "|" (i.e. an empty token) in the combined string.
merged_counts['class_property_count'] = [
    "|".join(part for part in (prop_counts, class_counts) if part != "")
    for prop_counts, class_counts in zip(merged_counts.property_count, merged_counts.class_count)
]
df = merged_counts.drop(columns=['class_count', 'property_count'])
df.to_csv(class_prop_file, sep='\t', index=False)
df

Unnamed: 0,qnode,class_property_count
0,Q213854,P106:6339031|P140:361513|P1532:159268|P166:515...
1,Q1711834,P1889:440686|P31:41379394|Q104624828:6675312|Q...
2,Q7699668,P1705:608753|P1889:440686|P31:41379394|Q101352...
3,Q142613,P106:6339031|P1532:159268|P166:515832|P18:3544...
4,Q3522062,P106:6339031|P1532:159268|P166:515832|P18:3544...
...,...,...
1252,Q16045629,P106:6339031|P19:2698945|P214:2562940|P21:6913...
1253,Q4121790,P106:6339031|P18:3544200|P19:2698945|P2020:491...
1254,Q13512643,P106:6339031|P1412:909907|P18:3544200|P19:2698...
1255,Q21064029,P102:392278|P106:6339031|P19:2698945|P21:69137...


In [23]:
!tl compute-tf-idf \
--feature-file /tmp/cricketers/temp/class_property_count.tsv \
--feature-name class_property_count \
--singleton-column singleton \
-o class_property_count_tf_idf_score \
{singleton_feature} > {feature_class_property_count}

#### Peek at the class property count TF-IDF feature file

In [24]:
# Preview the first 20 rows of the combined class/property-count TF-IDF output.
(
    pd.read_csv(filepath_or_buffer=feature_class_property_count, dtype=object)
    .head(n=20)
    .fillna(value="")
)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,singleton,class_property_count_tf_idf_score
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,3.983031232217997e-09,36.39384,0,0.9296449012188996
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q102354285,Marie Virat,,fuzzy-augmented,Ph. D. 2009,5.918546005357847e-09,23.48463,0,0.1946796853830472
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16027751,Bernard Virat,,fuzzy-augmented,French biologist (1921-2003),3.7401912005599e-09,23.48463,0,0.3100058101564548
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7907059,VIRAT,,fuzzy-augmented,,0.0,20.582134,0,0.0149975313013316
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q2978459,Virata,Virat,fuzzy-augmented,character from the epic Mahabharata,6.8901323967569805e-09,20.520416,0,0.0567585378115842
5,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16682735,,,fuzzy-augmented,,3.5396131256502836e-09,19.623405,0,0.0881921071858676
6,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q6426050,Kohli,,fuzzy-augmented,,3.5396131256502836e-09,19.601744,0,0.0216164878206615
7,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q46251,Fränzi Mägert-Kohli,Franziska Kohli|Fraenzi Maegert-Kohli,fuzzy-augmented,Swiss snowboarder,3.5396131256502836e-09,19.233713,0,0.3943400750531187
8,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16434086,Wirat Wachirarattanawong,,fuzzy-augmented,,3.5396131256502836e-09,19.010628,0,0.2767541310529668
9,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q19899153,Virat Singh,,fuzzy-augmented,Indian cricketer,3.5396131256502836e-09,19.010628,0,0.6201247448401982


#### Get top 3 candidates for each cell

In [25]:
!tl get-kg-links -c class_property_count_tf_idf_score -l label -k 3 --k-rows  $feature_class_property_count > $temp_dir/class_property_count_top_k.csv

In [26]:
# Display the combined class/property-count top-k result, with missing values shown as blanks.
(
    pd.read_csv(filepath_or_buffer=f"{temp_dir}/class_property_count_top_k.csv")
    .fillna(value="")
)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,singleton,class_property_count_tf_idf_score
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,3.983031e-09,36.39384,0,0.929645
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q19899153,Virat Singh,,fuzzy-augmented,Indian cricketer,3.539613e-09,19.010628,0,0.620125
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7686953,Taruwar Kohli,Taruwar Sushil Kohli,fuzzy-augmented,Indian cricketer,3.539613e-09,17.400097,0,0.586583
3,0,1,Tendulkar,mumbai indians|137|24/04/1973,Tendulkar,Q9488,Sachin Tendulkar,Sachin Ramesh Tendulkar|Master Blaster,fuzzy-augmented,Indian former cricketer,1.196003e-08,28.334663,0,0.932268
4,0,1,Tendulkar,mumbai indians|137|24/04/1973,Tendulkar,Q22327439,Arjun Tendulkar,,fuzzy-augmented,cricketer,4.609075e-09,20.530342,0,0.566297
5,0,1,Tendulkar,mumbai indians|137|24/04/1973,Tendulkar,Q55744,Vijay Tendulkar,Vijay Dhondopant Tendulkar,fuzzy-augmented,Indian writer,1.156015e-08,20.728312,0,0.340692
6,0,10,Cheteshwar Pujara,deccan chargers|157|25/01/1988,Cheteshwar Pujara,Q142613,Cheteshwar Pujara,Cheteshwar Arvind Pujara,fuzzy-augmented,Indian cricket player,3.93661e-09,40.74549,0,0.91505
7,0,10,Cheteshwar Pujara,deccan chargers|157|25/01/1988,Cheteshwar Pujara,Q16225224,Arvind Pujara,,fuzzy-augmented,cricketer,3.736444e-09,21.39463,0,0.543533
8,0,10,Cheteshwar Pujara,deccan chargers|157|25/01/1988,Cheteshwar Pujara,Q5833940,Enrique Wong,Enrique Wong Pujada,fuzzy-augmented,Peruvian politician,3.539613e-09,15.250788,0,0.39257
9,0,11,Ishant Sharma,delhi capitals|168|2/9/88,Ishant Sharma,Q3522062,Ishant Sharma,,fuzzy-augmented,Indian cricket player.,3.539613e-09,30.923111,0,0.914163
