In [1]:
import tempfile
from pathlib import Path
import glob
import pandas as pd

In [19]:
# Scratch directory holding every intermediate file of this run; the last cell deletes it.
temp_dir = tempfile.mkdtemp()

In [20]:
# Parameters
# NOTE(review): the paths below are absolute and machine-specific; adjust them before running elsewhere.

# Table to wikify. A ".tsv" extension switches later cells to tab-separated parsing.
input_file_path = "/Users/amandeep/Github/wikidata-wikifier/wikifier/sample_files/cricketers.csv"
# Alternative inputs (disabled):
# input_file_path = "/Users/amandeep/Github/table-linker/data/SemTab2020/Round4/tables/5TXLGRXA.csv"
# input_file_path = '/Users/amandeep/Desktop/usda_fas_psd_countries.csv'
# Column(s) to link, passed to `tl canonicalize -c`; the disabled example shows a comma-separated pair.
wikify_column_name = "cricketers"
# wikify_column_name = "col0,col1"
# Destination of the ranked links written by the prediction cell.
# output_file = '/tmp/nih_author_org_sample.csv'
output_file = '/tmp/cricketers.csv'
# Passed as `-k` to `get-kg-links` and `add-color`.
k = 5

In [21]:
# --- Elasticsearch endpoint and index queried during candidate generation ---
es_url = "http://ckg07:9200"
es_index = "wikidatadwd-augmented"

# --- Auxiliary fields requested with each candidate query, and where merged copies are kept ---
aux_field = "graph_embedding_complex,class_count,property_count,context"
aux_path = f"{temp_dir}/aux_files"
Path(aux_path).mkdir(parents=True, exist_ok=True)

# --- Intermediate artifacts, all inside the scratch directory ---
candidates_file, features_file, context_property_file, colorized_file = (
    f"{temp_dir}/{file_name}"
    for file_name in (
        "candidates.csv",
        "features.csv",
        "context_properties.csv",
        "colorized.xlsx",
    )
)

# --- Pseudo ground-truth model and its normalization factor ---
pseudo_gt_model = "./models/epoch_1_loss_0.534353494644165_top1_0.7883487007544007.pth"
pseudo_gt_min_max_scaler_path = "./models/normalization_factor.pkl"

# --- Table Linker ranking model and its normalization factor ---
tbl_prediction_model = "./models/epoch_17_loss_0.014523069374263287_top1_0.9675043327556326.pth"
min_max_scaler_path = "./models/tl_pipeline_normalization_factor.pkl"

# Column that receives the final model score, and the threshold expression derived from it.
final_score_column = "siamese_prediction"
threshold = f"{final_score_column}:median"

# Features consumed by the pseudo ground-truth model.
pseudo_gt_features = [
    "monge_elkan",
    "monge_elkan_aliases",
    "jaro_winkler",
    "levenshtein",
    "singleton",
    "pgr_rts",
    "context_score",
    "smc_class_score",
    "smc_property_score",
]
pgt_feature_str = ",".join(pseudo_gt_features)

# Features consumed by the final ranking model.
features = [
    "monge_elkan",
    "monge_elkan_aliases",
    "jaro_winkler",
    "levenshtein",
    "singleton",
    "context_score_3",
    "pgt_centroid_score",
    "pgt_class_count_tf_idf_score",
    "pgt_property_count_tf_idf_score",
    "num_occurences",
]
feature_str = ",".join(features)

# The input extension decides both the pandas separator and the canonicalize sub-command.
# The trailing space in `canonicalize` is kept because the value is spliced into a shell line.
_input_is_tsv = input_file_path.endswith(".tsv")
sep = "\t" if _input_is_tsv else ","
canonicalize = "canonicalize --tsv " if _input_is_tsv else "canonicalize "

# Columns highlighted by the add-color step: every ranking feature plus the final score.
columns_to_color = ",".join([feature_str, final_score_column])

In [22]:
# Show the scratch directory path used by this run.
temp_dir

'/var/folders/qv/cxzpwz3j29x7n79vwpw253v80000gn/T/tmpmodtbno3'

## Peek at input file

In [23]:
# Preview the first 10 rows of the input table, parsed with the separator chosen from its extension.
pd.read_csv(input_file_path, sep=sep).head(10)

Unnamed: 0,cricketers,teams,weight,dob
0,Virat Kohli,royal challengers bangalore,152,5/11/88
1,Tendulkar,mumbai indians,137,24/04/1973
2,Dhoni,chennai super kings,154,7/7/81
3,Jasprit Bumrah,mumbai indians,154,6/12/93
4,Ajinkya Rahane,rajasthan royals,134,6/6/88
5,Rohit Sharma,mumbai indians,159,30/04/1987
6,Bhuvneshwar Kumar,deccan chargers,154,5/2/90
7,Ravindra Jadeja,chennai super kings,132,6/12/88
8,Rishabh Pant,delhi capitals,136,4/8/97
9,Shikhar Dhawan,delhi capitals,157,5/12/85


## Generate Candidates

In [24]:
!tl $canonicalize -c "$wikify_column_name" --add-context $input_file_path \
    / clean -c label -o label_clean \
    / --url $es_url --index $es_index \
    get-fuzzy-augmented-matches -c label_clean \
    --auxiliary-fields {aux_field} \
    --auxiliary-folder "$temp_dir" \
    / get-exact-matches -c label_clean \
    --auxiliary-fields {aux_field} \
    --auxiliary-folder "$temp_dir" \
    / get-ngram-matches -c label_clean  \
    --auxiliary-fields {aux_field} \
    --auxiliary-folder "$temp_dir" \
    / get-trigram-matches -c label_clean \
    --auxiliary-fields {aux_field} \
    --auxiliary-folder "$temp_dir" > "$candidates_file"

canonicalize Time: 0.004869699478149414s
clean Time: 0.0024039745330810547s
get-fuzzy-augmented-matches Time: 2.9010050296783447s
get-exact-matches Time: 0.8108069896697998s
get-ngram-matches Time: 0.9130830764770508s
get-trigram-matches Time: 2.7386600971221924s


In [25]:
# Merge the per-command auxiliary files found in temp_dir into one file per
# auxiliary field under aux_path. The *_file variables assigned here are read by
# later cells through `$variable` shell expansion, so their names must not change.
for field in aux_field.split(','):
    if field == 'context':
        # Context files are line-oriented (.jl): concatenate them verbatim.
        context_file = f"{aux_path}/context.jl"
        # sorted() makes the concatenation order reproducible; glob order is arbitrary.
        with open(context_file, 'w') as merged_context:
            for part_path in sorted(glob.glob(f'{temp_dir}/*{field}.jl')):
                with open(part_path) as part:
                    for line in part:
                        # A part lacking a final newline must not fuse its last
                        # record with the first record of the next part.
                        merged_context.write(line if line.endswith('\n') else line + '\n')
        continue

    part_paths = sorted(glob.glob(f'{temp_dir}/*{field}.tsv'))
    if not part_paths:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise FileNotFoundError(
            f"No auxiliary '{field}' files (*{field}.tsv) found in {temp_dir}; "
            "did candidate generation run successfully?"
        )

    # Keep one row per qnode; with a sorted file list the retained row is deterministic.
    aux_df = pd.concat(
        [pd.read_csv(part_path, sep='\t', dtype=object) for part_path in part_paths]
    ).drop_duplicates(subset=['qnode'])

    if field == 'class_count':
        class_count_file = f"{aux_path}/class_count.tsv"
        aux_df.to_csv(class_count_file, sep='\t', index=False)
    elif field == 'property_count':
        prop_count_file = f"{aux_path}/prop_count.tsv"
        aux_df.to_csv(prop_count_file, sep='\t', index=False)
    else:
        # Any remaining field is written to the graph-embedding file, as before.
        graph_embedding_file = f"{aux_path}/graph_embedding_complex.tsv"
        aux_df.to_csv(graph_embedding_file, sep='\t', index=False)

## Features

In [26]:
!tl deduplicate-candidates -c kg_id $candidates_file \
    / string-similarity -i --method symmetric_monge_elkan:tokenizer=word -o monge_elkan --threshold 0.5 \
    / string-similarity -i --method symmetric_monge_elkan:tokenizer=word -c label_clean kg_aliases -o monge_elkan_aliases --threshold 0.5 \
    / string-similarity -i --method jaro_winkler -o jaro_winkler --threshold 0.5 \
    / string-similarity -i --method levenshtein -o levenshtein --threshold 0.5 \
    / create-singleton-feature -o singleton \
    / pick-hc-candidates -o ignore_candidate --string-similarity-label-columns monge_elkan,jaro_winkler,levenshtein --string-similarity-alias-columns monge_elkan_aliases \
    / context-match --debug --context-file $context_file --ignore-column-name ignore_candidate -o context_score \
    --similarity-string-threshold 0.85 --similarity-quantity-threshold 0.9 \
    --save-relevant-properties --context-properties-path $context_property_file \
    / kth-percentile -c context_score -o kth_percenter --ignore-column ignore_candidate --k-percentile 0.75  --minimum-cells 10 \
    / pgt-semantic-tf-idf \
    -o smc_class_score \
    --pagerank-column pagerank \
    --retrieval-score-column retrieval_score \
    --feature-file "$class_count_file" \
    --feature-name class_count \
    --high-confidence-column kth_percenter \
    / pgt-semantic-tf-idf \
    -o smc_property_score \
    --pagerank-column pagerank \
    --retrieval-score-column retrieval_score \
    --feature-file "$prop_count_file" \
    --feature-name property_count \
    --high-confidence-column kth_percenter \
    / predict-using-model -o pseudo_gt_prediction \
    --features $pgt_feature_str \
    --ranking-model $pseudo_gt_model \
    --ignore-column ignore_candidate \
    --normalization-factor $pseudo_gt_min_max_scaler_path \
    / create-pseudo-gt -o pseudo_gt \
    --column-thresholds pseudo_gt_prediction:mean \
    --filter smc_class_score:0 \
    / context-match --debug --context-file $context_file -o context_score_3 \
    --similarity-string-threshold 0.85 --similarity-quantity-threshold 0.9 \
    --use-relevant-properties --context-properties-path $context_property_file \
    / mosaic-features -c kg_labels --num-char --num-tokens \
    / score-using-embedding \
    --column-vector-strategy centroid-of-lof \
    --lof-strategy pseudo-gt \
    -o pgt_centroid_score \
    --embedding-file $graph_embedding_file \
    / compute-tf-idf  \
    --feature-file $class_count_file \
    --feature-name class_count \
    --singleton-column pseudo_gt \
    -o pgt_class_count_tf_idf_score \
    / compute-tf-idf \
    --feature-file $prop_count_file \
    --feature-name property_count \
    --singleton-column pseudo_gt \
    -o pgt_property_count_tf_idf_score > $features_file

deduplicate-candidates Time: 2.3984198570251465s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.6663706302642822s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 3.219404935836792s
string-similarity-['jaro_winkler'] Time: 0.1859147548675537s
string-similarity-['levenshtein'] Time: 0.7358980178833008s
create-singleton-feature Time: 0.09965181350708008s
pick-hc-candidates Time: 20.185739994049072s
context-match Time: 1.6689300537109375e-06s
kth-percentile Time: 21.34679388999939s
pgt-semantic-tf-idf-class_count Time: 22.01512384414673s
pgt-semantic-tf-idf-property_count Time: 22.2899169921875s
predict-using-model Time: 2.7653310298919678s
create-pseudo-gt Time: 0.0468289852142334s
context-match Time: 2.1457672119140625e-06s
mosaic-features Time: 0.00846409797668457s
Qnodes to lookup: 1785
Qnodes from file: 1760
Outlier removal generates 3 lof-voted candidates
score-using-embedding Time: 28.412047863006592s
compute-tf-idf-class_count Time: 30.082806110

## Predictions

In [27]:
!tl predict-using-model $features_file -o $final_score_column \
    --features $feature_str \
    --ranking-model $tbl_prediction_model \
    --normalization-factor $min_max_scaler_path \
    / get-kg-links -c $final_score_column -k $k --k-rows > $output_file

predict-using-model Time: 1.4077630043029785s
get-kg-links-siamese_prediction Time: 0.17861294746398926s


In [28]:
# Load the links written by the prediction cell and preview the first rows.
out_df = pd.read_csv(output_file)
out_df.head()

Unnamed: 0,column,row,label,context,filename,column-id,label_clean,kg_id,kg_labels,kg_aliases,...,num_char,num_tokens,is_lof,pgt_centroid_score,pgt_class_count_tf_idf_score,top5_class_count,pgt_property_count_tf_idf_score,top5_property_count,siamese_prediction,rank
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,cricketers.csv,cricketers.csv-0,Virat Kohli,Q213854,Virat Kohli,Kohli|Viraj Kohli|/m/03qkvyf|Khli|V Kohli|8276...,...,11,2,1,0.966975,1.0,Q12299841:0.145|Q18536342:0.083|Q4197743:0.083...,0.889288,P2698:0.091|P3526:0.089|P2697:0.084|P1532:0.06...,1.0,1
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,cricketers.csv,cricketers.csv-0,Virat Kohli,Q7260793,Purab Kohli,Purab Kohli|nm1327322|Purab|Purab Kohli|17226...,...,11,2,-1,0.809551,0.549404,Q5:0.032|Q164509:0.032|Q45983014:0.032|Q154954...,0.239734,P19:0.032|P166:0.030|P27:0.028|P569:0.025|P106...,3e-06,2
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,cricketers.csv,cricketers.csv-0,Virat Kohli,Q86510037,Hima Kohli,Hima Kohli|Justice Hima Kohli|/g/11j320m9n_|Ko...,...,10,2,-1,0.678466,0.549404,Q5:0.032|Q164509:0.032|Q45983014:0.032|Q154954...,0.157028,P19:0.032|P569:0.025|P106:0.022|P21:0.021|P267...,2e-06,3
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,cricketers.csv,cricketers.csv-0,Virat Kohli,Q21517679,Brij L. Kohli,"62549|20012614-2|Kohli|Kohli, B. L. |B. L. Koh...",...,13,3,-1,0.737171,0.549404,Q5:0.032|Q164509:0.032|Q45983014:0.032|Q154954...,0.10604,P569:0.025|P106:0.022|P428:0.000|P586:0.000|P6...,2e-06,4
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,cricketers.csv,cricketers.csv-0,Virat Kohli,Q61072073,Sita Ram Kohli,S. R. Kohli|Sita R. Kohli|074304569|S. Ram Koh...,...,14,3,-1,0.696413,0.549404,Q5:0.032|Q164509:0.032|Q45983014:0.032|Q154954...,0.170035,P19:0.032|P569:0.025|P106:0.022|P21:0.021|P244...,1e-06,5


## Add color

In [29]:
!tl add-color $output_file -c $columns_to_color -k $k --output $colorized_file

add-color Time: 0.19469714164733887s


In [30]:
!open $colorized_file

## Cleanup temp folder

In [31]:
!rm -rf $temp_dir