In [1]:
import glob
import shutil
import tempfile
from pathlib import Path

import pandas as pd

In [2]:
# Scratch directory that holds every intermediate artifact of this run
# (candidates, features, auxiliary files, colorized workbook).
temp_dir = tempfile.mkdtemp()

In [3]:
# Parameters
# NOTE(review): the defaults below are absolute, machine-specific paths;
# override them before running on another machine.

# CSV file handed to `tl canonicalize` as the table to wikify.
input_file_path = "/Users/amandeep/Github/wikidata-wikifier/wikifier/sample_files/cricketers.csv"
# Column of the input file passed to `tl canonicalize -c`.
wikify_column_name = "cricketers"
# Destination of the ranked links written by `tl get-kg-links`.
output_file = '/tmp/cricketers_results.csv'
# Value passed as `-k` to `get-kg-links` and `add-color`.
k = 5

In [15]:
# Elasticsearch endpoint and index handed to `tl` through --url / --index.
es_url = 'http://ckg07:9200'
es_index = 'wikidatadwd-augmented'

# Auxiliary fields requested from every candidate matcher (comma-separated for the CLI).
aux_field = ','.join([
    'graph_embedding_complex',
    'class_count',
    'property_count',
    'context',
])

# Folder that receives the consolidated auxiliary files.
aux_path = temp_dir + "/aux_files"
Path(aux_path).mkdir(parents=True, exist_ok=True)

# Intermediate artifacts, all kept inside the scratch directory.
candidates_file = temp_dir + "/candidates.csv"
features_file = temp_dir + "/features.csv"
context_property_file = temp_dir + "/context_properties.csv"
colorized_file = temp_dir + "/colorized.xlsx"

# Pseudo-ground-truth ranking model and its normalization factor.
pseudo_gt_model = './models/epoch_1_loss_0.534353494644165_top1_0.7883487007544007.pth'
pseudo_gt_min_max_scaler_path = './models/normalization_factor.pkl'

# Table Linker ranking model and its normalization factor.
min_max_scaler_path = './models/tl_pipeline_normalization_factor.pkl'
tbl_prediction_model = './models/epoch_17_loss_0.014523069374263287_top1_0.9675043327556326.pth'

# Column that receives the final model score; `threshold` derives from it.
# NOTE(review): `threshold` is not referenced by any visible cell — confirm it is still needed.
final_score_column = 'siamese_prediction'
threshold = f"{final_score_column}:median"

# Feature columns fed to the pseudo-ground-truth model (`--features`).
pseudo_gt_features = [
    "monge_elkan",
    "monge_elkan_aliases",
    "jaro_winkler",
    "levenshtein",
    "singleton",
    "pgr_rts",
    "context_score",
    "smc_class_score",
    "smc_property_score",
]
pgt_feature_str = ",".join(pseudo_gt_features)

# Feature columns fed to the final ranking model (`--features`).
features = [
    "monge_elkan",
    "monge_elkan_aliases",
    "jaro_winkler",
    "levenshtein",
    "singleton",
    "context_score_3",
    "pgt_centroid_score",
    "pgt_class_count_tf_idf_score",
    "pgt_property_count_tf_idf_score",
    "num_occurences",
]
feature_str = ",".join(features)

In [5]:
# Echo the scratch directory path returned by mkdtemp.
temp_dir

'/var/folders/qv/cxzpwz3j29x7n79vwpw253v80000gn/T/tmpb4kwvfte'

## Generate Candidates

In [6]:
!tl canonicalize -c "$wikify_column_name" --add-context $input_file_path \
    / clean -c label -o label_clean \
    / --url $es_url --index $es_index \
    get-fuzzy-augmented-matches -c label_clean \
    --auxiliary-fields {aux_field} \
    --auxiliary-folder "$temp_dir" \
    / get-exact-matches -c label_clean \
    --auxiliary-fields {aux_field} \
    --auxiliary-folder "$temp_dir" \
    / get-ngram-matches -c label_clean  \
    --auxiliary-fields {aux_field} \
    --auxiliary-folder "$temp_dir" \
    / get-trigram-matches -c label_clean \
    --auxiliary-fields {aux_field} \
    --auxiliary-folder "$temp_dir" > "$candidates_file"

canonicalize Time: 0.0019409656524658203s
clean Time: 0.001583099365234375s
get-fuzzy-augmented-matches Time: 2.6989941596984863s
get-exact-matches Time: 0.24348711967468262s
get-ngram-matches Time: 1.110356092453003s
get-trigram-matches Time: 3.3257977962493896s


In [7]:
def _concatenate_lines(sources, destination):
    """Write every line of each file in `sources`, in order, to `destination`.

    An empty `sources` produces an empty destination file.
    """
    with open(destination, 'w') as merged:
        for source in sources:
            with open(source) as part:
                for line in part:
                    # A part whose last line lacks a newline would otherwise be
                    # fused with the first line of the next part.
                    merged.write(line if line.endswith('\n') else line + '\n')


def _merge_qnode_tables(sources, destination):
    """Concatenate the TSV files in `sources`, keep the first row per `qnode`,
    and write the result to `destination` as TSV.

    Raises:
        ValueError: if `sources` is empty (nothing to merge).
    """
    tables = [pd.read_csv(source, sep='\t', dtype=object) for source in sources]
    if not tables:
        raise ValueError(f"no auxiliary TSV files found to merge into {destination}")
    merged = pd.concat(tables).drop_duplicates(subset=['qnode'])
    merged.to_csv(destination, sep='\t', index=False)


# Destinations are defined up front so later cells can always interpolate them,
# whichever fields `aux_field` happens to list.
context_file = f"{aux_path}/context.jl"
class_count_file = f"{aux_path}/class_count.tsv"
prop_count_file = f"{aux_path}/prop_count.tsv"
graph_embedding_file = f"{aux_path}/graph_embedding_complex.tsv"

table_destinations = {
    'class_count': class_count_file,
    'property_count': prop_count_file,
}

# Consolidate the per-matcher auxiliary files found in the scratch directory
# into one file per field under aux_path.
for field in aux_field.split(','):
    # glob order is arbitrary; sorting makes both the concatenation order and
    # the row kept by drop_duplicates deterministic.
    if field == 'context':
        _concatenate_lines(sorted(glob.glob(f'{temp_dir}/*{field}.jl')), context_file)
    else:
        # Any field other than class_count / property_count is written to the
        # graph-embedding file, matching the previous fall-through behavior.
        destination = table_destinations.get(field, graph_embedding_file)
        _merge_qnode_tables(sorted(glob.glob(f'{temp_dir}/*{field}.tsv')), destination)

## Features

In [8]:
!tl deduplicate-candidates -c kg_id $candidates_file \
    / string-similarity -i --method symmetric_monge_elkan:tokenizer=word -o monge_elkan --threshold 0.5 \
    / string-similarity -i --method symmetric_monge_elkan:tokenizer=word -c label_clean kg_aliases -o monge_elkan_aliases --threshold 0.5 \
    / string-similarity -i --method jaro_winkler -o jaro_winkler --threshold 0.5 \
    / string-similarity -i --method levenshtein -o levenshtein --threshold 0.5 \
    / create-singleton-feature -o singleton \
    / pick-hc-candidates -o ignore_candidate --string-similarity-label-columns monge_elkan,jaro_winkler,levenshtein --string-similarity-alias-columns monge_elkan_aliases \
    / context-match --debug --context-file $context_file --ignore-column-name ignore_candidate -o context_score \
    --similarity-string-threshold 0.85 --similarity-quantity-threshold 0.9 \
    --save-relevant-properties --context-properties-path $context_property_file \
    / kth-percentile -c context_score -o kth_percenter --ignore-column ignore_candidate --k-percentile 0.75  --minimum-cells 10 \
    / pgt-semantic-tf-idf \
    -o smc_class_score \
    --pagerank-column pagerank \
    --retrieval-score-column retrieval_score \
    --feature-file "$class_count_file" \
    --feature-name class_count \
    --high-confidence-column kth_percenter \
    / pgt-semantic-tf-idf \
    -o smc_property_score \
    --pagerank-column pagerank \
    --retrieval-score-column retrieval_score \
    --feature-file "$prop_count_file" \
    --feature-name property_count \
    --high-confidence-column kth_percenter \
    / predict-using-model -o pseudo_gt_prediction \
    --features $pgt_feature_str \
    --ranking-model $pseudo_gt_model \
    --ignore-column ignore_candidate \
    --normalization-factor $pseudo_gt_min_max_scaler_path \
    / create-pseudo-gt -o pseudo_gt \
    --column-thresholds pseudo_gt_prediction:mean \
    --filter smc_class_score:0 \
    / context-match --debug --context-file $context_file -o context_score_3 \
    --similarity-string-threshold 0.85 --similarity-quantity-threshold 0.9 \
    --use-relevant-properties --context-properties-path $context_property_file \
    / mosaic-features -c kg_labels --num-char --num-tokens \
    / score-using-embedding \
    --column-vector-strategy centroid-of-lof \
    --lof-strategy pseudo-gt \
    -o pgt_centroid_score \
    --embedding-file $graph_embedding_file \
    / compute-tf-idf  \
    --feature-file $class_count_file \
    --feature-name class_count \
    --singleton-column pseudo_gt \
    -o pgt_class_count_tf_idf_score \
    / compute-tf-idf \
    --feature-file $prop_count_file \
    --feature-name property_count \
    --singleton-column pseudo_gt \
    -o pgt_property_count_tf_idf_score > $features_file

deduplicate-candidates Time: 1.1828579902648926s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.32811784744262695s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 1.4985980987548828s
string-similarity-['jaro_winkler'] Time: 0.09545707702636719s
string-similarity-['levenshtein'] Time: 0.38704514503479004s
create-singleton-feature Time: 0.06041288375854492s
pick-hc-candidates Time: 12.32506012916565s
context-match Time: 9.5367431640625e-07s
kth-percentile Time: 13.148528099060059s
pgt-semantic-tf-idf-class_count Time: 13.32775616645813s
pgt-semantic-tf-idf-property_count Time: 13.5000319480896s
predict-using-model Time: 0.6866550445556641s
create-pseudo-gt Time: 0.021252155303955078s
context-match Time: 0.0s
mosaic-features Time: 0.005371809005737305s
Qnodes to lookup: 1785
Qnodes from file: 1760
Outlier removal generates 6 lof-voted candidates
score-using-embedding Time: 15.056488037109375s
compute-tf-idf-class_count Time: 15.430599927902222s
compute

## Predictions

In [9]:
!tl predict-using-model $features_file -o $final_score_column \
    --features $feature_str \
    --ranking-model $tbl_prediction_model \
    --normalization-factor $min_max_scaler_path \
    / get-kg-links -c $final_score_column -k $k --k-rows > $output_file

predict-using-model Time: 0.4589681625366211s
get-kg-links-siamese_prediction Time: 0.11472797393798828s


In [10]:
# Load the ranked links written by the prediction cell and display them.
out_df = pd.read_csv(output_file)
out_df

Unnamed: 0,column,row,label,context,filename,column-id,label_clean,kg_id,kg_labels,kg_aliases,...,num_char,num_tokens,is_lof,pgt_centroid_score,pgt_class_count_tf_idf_score,top5_class_count,pgt_property_count_tf_idf_score,top5_property_count,siamese_prediction,rank
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,cricketers.csv,cricketers.csv-0,Virat Kohli,Q213854,Virat Kohli,V. Kohli|Cheeku|101095|Virat Kohli|1130805034|...,...,11,2,-1,0.933841,1.000000,Q12299841:0.145|Q18536342:0.083|Q4197743:0.083...,0.837123,P3526:0.103|P2698:0.090|P2697:0.083|P1532:0.06...,1.000000,1
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,cricketers.csv,cricketers.csv-0,Virat Kohli,Q7686953,Taruwar Kohli,"T. S. Kohli|Taruwar S. Kohli|Kohli, T. S. |T. ...",...,13,2,-1,0.937608,1.000000,Q12299841:0.145|Q18536342:0.083|Q4197743:0.083...,0.374657,P2698:0.090|P2697:0.083|P641:0.036|P19:0.031|P...,0.057351,2
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,cricketers.csv,cricketers.csv-0,Virat Kohli,Q7260793,Purab Kohli,Purab|/m/04gtmv4|nm1327322|561056|340461|P. Ko...,...,11,2,-1,0.797688,0.549404,Q5:0.032|Q164509:0.032|Q45983014:0.032|Q154954...,0.245024,P19:0.031|P27:0.028|P166:0.025|P569:0.024|P646...,0.000004,3
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,cricketers.csv,cricketers.csv-0,Virat Kohli,Q86510037,Hima Kohli,Justice Hima Kohli|/g/11j320m9n_|Hima Kohli|H....,...,10,2,-1,0.668254,0.549404,Q5:0.032|Q164509:0.032|Q45983014:0.032|Q154954...,0.152836,P19:0.031|P569:0.024|P106:0.021|P21:0.020|P267...,0.000002,4
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,cricketers.csv,cricketers.csv-0,Virat Kohli,Q21517679,Brij L. Kohli,"Kohli, B. L. |B. L. Kohli|Kohli|62549|Brij L. ...",...,13,3,-1,0.740785,0.549404,Q5:0.032|Q164509:0.032|Q45983014:0.032|Q154954...,0.105466,P569:0.024|P106:0.021|P428:0.000|P586:0.000|P6...,0.000002,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,0,9,Shikhar Dhawan,delhi capitals|157|5/12/85,cricketers.csv,cricketers.csv-0,Shikhar Dhawan,Q7487024,Shikhar Dhawan,/m/0273sth|sdhawan25|S. Dhawan|Gabbar|15627|Sh...,...,14,2,1,0.971510,1.000000,Q12299841:0.145|Q18536342:0.083|Q4197743:0.083...,0.668513,P2698:0.090|P2697:0.083|P1532:0.063|P3417:0.06...,1.000000,1
61,0,9,Shikhar Dhawan,delhi capitals|157|5/12/85,cricketers.csv,cricketers.csv-0,Shikhar Dhawan,Q7336038,Rishi Dhawan,"Dhawan, R. |/m/0hhr78g|66189|290727|R. Dhawan|...",...,12,2,-1,0.972536,1.000000,Q12299841:0.145|Q18536342:0.083|Q4197743:0.083...,0.606657,P3526:0.103|P2698:0.090|P2697:0.083|P1532:0.06...,0.004021,2
62,0,9,Shikhar Dhawan,delhi capitals|157|5/12/85,cricketers.csv,cricketers.csv-0,Shikhar Dhawan,Q4805162,Ashita Dhawan,"Dhawan, A. |A. Dhawan|/m/0g54rhh|Ashita Dhawan",...,13,2,-1,0.745629,0.549404,Q5:0.032|Q164509:0.032|Q45983014:0.032|Q154954...,0.205952,P27:0.028|P569:0.024|P646:0.023|P18:0.022|P106...,0.000003,3
63,0,9,Shikhar Dhawan,delhi capitals|157|5/12/85,cricketers.csv,cricketers.csv-0,Shikhar Dhawan,Q7273748,R. K. Dhawan,"R. K. Dhawan|/m/0krstr|R K Dhawan|Dhawan, R. K.",...,12,3,-1,0.855931,0.549404,Q5:0.032|Q164509:0.032|Q45983014:0.032|Q154954...,0.197916,P19:0.031|P27:0.028|P569:0.024|P646:0.023|P106...,0.000002,4


## Add color

In [16]:
!tl add-color $output_file -c $final_score_column -k $k --output $colorized_file

add-color Time: 0.08968877792358398s


In [18]:
!open $colorized_file

## Cleanup temp folder

In [None]:
# Remove the scratch directory and everything in it. Done in Python rather than
# through an unquoted shell interpolation; like `rm -rf`, a directory that is
# already gone is not an error.
shutil.rmtree(temp_dir, ignore_errors=True)