In [1]:
import glob
import shutil
import tempfile
from pathlib import Path

import pandas as pd

In [2]:
# Scratch directory for every intermediate file of this run; it is deleted by the
# cleanup cell at the end of the notebook. The prefix makes a leftover directory
# recognisable if the run aborts before that cleanup cell is reached.
temp_dir = tempfile.mkdtemp(prefix="table_linker_")

In [3]:
# Parameters
# NOTE(review): the paths below are absolute and machine-specific; override them
# before running this notebook on another machine.

# Alternative sample input, kept for reference:
# input_file_path = "/Users/amandeep/Github/wikidata-wikifier/wikifier/sample_files/cricketers.csv"
# CSV table to be linked; it is previewed below and passed to `tl canonicalize`.
input_file_path = "/Users/amandeep/Github/table-linker/data/SemTab2020/Round4/tables/5TXLGRXA.csv"
# Comma-separated column names handed to `tl canonicalize -c`.
wikify_column_name = "col0,col1"
# File that receives the stdout of the prediction pipeline (the final ranked links).
output_file = '/tmp/5TXLGRXA.csv'
# Passed as `-k` to both `tl get-kg-links` and `tl add-color`.
k = 5

In [4]:
# Search backend: URL and index passed to the `tl` candidate commands via --url / --index.
es_index = 'wikidatadwd-augmented'
es_url = 'http://ckg07:9200'

# Auxiliary fields requested with every candidate lookup, and the folder that will
# hold the merged per-field auxiliary files.
aux_field = 'graph_embedding_complex,class_count,property_count,context'
aux_path = f"{temp_dir}/aux_files"
Path(aux_path).mkdir(exist_ok=True, parents=True)

# Intermediate artefacts; all of them live inside the scratch directory.
colorized_file = f"{temp_dir}/colorized.xlsx"
context_property_file = f"{temp_dir}/context_properties.csv"
features_file = f"{temp_dir}/features.csv"
candidates_file = f"{temp_dir}/candidates.csv"

# Pseudo ground-truth stage: ranking model and its normalization factor.
pseudo_gt_min_max_scaler_path = './models/normalization_factor.pkl'
pseudo_gt_model = './models/epoch_1_loss_0.534353494644165_top1_0.7883487007544007.pth'

# Table Linker stage: final ranking model and its normalization factor.
tbl_prediction_model = './models/epoch_17_loss_0.014523069374263287_top1_0.9675043327556326.pth'
min_max_scaler_path = './models/tl_pipeline_normalization_factor.pkl'

# Column that carries the final model score, and the threshold spec derived from it.
final_score_column = 'siamese_prediction'
threshold = final_score_column + ":median"

# Feature columns common to both models, in the order the models receive them.
_shared_features = [
    "monge_elkan",
    "monge_elkan_aliases",
    "jaro_winkler",
    "levenshtein",
    "singleton",
]

# Feature list for the pseudo ground-truth model (order matters).
pseudo_gt_features = _shared_features + [
    "pgr_rts",
    "context_score",
    "smc_class_score",
    "smc_property_score",
]
pgt_feature_str = ",".join(pseudo_gt_features)

# Feature list for the final Table Linker model (order matters).
features = _shared_features + [
    "context_score_3",
    "pgt_centroid_score",
    "pgt_class_count_tf_idf_score",
    "pgt_property_count_tf_idf_score",
    "num_occurences",
]
feature_str = ",".join(features)

In [5]:
# Display the scratch directory so intermediate files can be inspected while the notebook runs.
temp_dir

'/var/folders/qv/cxzpwz3j29x7n79vwpw253v80000gn/T/tmpgokykh0j'

## Peek at input file

In [15]:
# Preview the first 10 rows of the table that will be linked.
pd.read_csv(input_file_path).head(10)

Unnamed: 0,col0,col1,col2
0,Rostock University Hospital,University of Rostock,1100
1,Tübingen University Hospital,University of Tübingen,1577
2,Charité,Humboldt University of Berlin,3011
3,Hospital General Universitario de Elda Virgen ...,Miguel Hernández University of Elche,513
4,Queen Mary Hospital,Hong Kong West Cluster,1400
5,Bicêtre Hospital,Assistance Publique – Hôpitaux de Paris,2007
6,Hospital General Universitario Santa Lucía,Universidad Católica San Antonio de Murcia,667
7,Hospital Universitario Central de Asturias,University of Oviedo,1039
8,Hospital Universitario de Getafe,Universidad Europea de Madrid,650
9,Hospital Santa Cristina,Universidad Europea de Madrid,192


## Generate Candidates

In [6]:
# Candidate generation: a single `tl` pipeline whose stages are chained with " / ".
# Stages, in order:
#   1. canonicalize  - on the columns named in wikify_column_name, with --add-context
#   2. clean         - reads column `label`, writes `label_clean`
#   3. get-fuzzy-augmented-matches, get-exact-matches, get-ngram-matches,
#      get-trigram-matches - all run on `label_clean` against es_url / es_index
# Every retrieval stage requests the same auxiliary fields and is given temp_dir as
# its auxiliary folder; the next cell globs temp_dir for the resulting files.
# The pipeline's stdout is redirected to candidates_file.
# Both `$name` and `{name}` are expanded by IPython from Python variables.
# NOTE(review): $input_file_path is interpolated unquoted, so a path containing
# spaces would be split by the shell.
!tl canonicalize -c "$wikify_column_name" --add-context $input_file_path \
    / clean -c label -o label_clean \
    / --url $es_url --index $es_index \
    get-fuzzy-augmented-matches -c label_clean \
    --auxiliary-fields {aux_field} \
    --auxiliary-folder "$temp_dir" \
    / get-exact-matches -c label_clean \
    --auxiliary-fields {aux_field} \
    --auxiliary-folder "$temp_dir" \
    / get-ngram-matches -c label_clean  \
    --auxiliary-fields {aux_field} \
    --auxiliary-folder "$temp_dir" \
    / get-trigram-matches -c label_clean \
    --auxiliary-fields {aux_field} \
    --auxiliary-folder "$temp_dir" > "$candidates_file"

canonicalize Time: 0.0019681453704833984s
clean Time: 0.0043487548828125s
get-fuzzy-augmented-matches Time: 10.192392826080322s
get-exact-matches Time: 2.695819139480591s
get-ngram-matches Time: 9.330211877822876s
get-trigram-matches Time: 33.722954988479614s


In [7]:
# Consolidate the per-command auxiliary files found in temp_dir into one file per
# field under aux_path. The path variables assigned here (context_file,
# class_count_file, prop_count_file, graph_embedding_file) are interpolated into the
# `tl` command lines of the feature cell, so their names must not change.
for field in aux_field.split(','):
    if field == 'context':
        # Context files are copied line for line into a single merged file.
        # Paths are sorted so the merged output does not depend on directory order.
        context_file = f"{aux_path}/context.jl"
        context_parts = sorted(glob.glob(f'{temp_dir}/*{field}.jl'))
        with open(context_file, 'w') as merged_context:
            for part_path in context_parts:
                with open(part_path) as part:
                    merged_context.writelines(part)
    else:
        tsv_parts = sorted(glob.glob(f'{temp_dir}/*{field}.tsv'))
        if not tsv_parts:
            # pd.concat([]) would otherwise fail with an opaque
            # "No objects to concatenate" error.
            raise FileNotFoundError(
                f"No auxiliary '*{field}.tsv' files found in {temp_dir}"
            )
        # dtype=object avoids numeric type inference when reading the TSVs.
        # Only the first row seen for each qnode is kept; with sorted paths that
        # choice is deterministic.
        aux_df = pd.concat(
            [pd.read_csv(part_path, sep='\t', dtype=object) for part_path in tsv_parts]
        ).drop_duplicates(subset=['qnode'])
        if field == 'class_count':
            class_count_file = f"{aux_path}/class_count.tsv"
            aux_df.to_csv(class_count_file, sep='\t', index=False)
        elif field == 'property_count':
            prop_count_file = f"{aux_path}/prop_count.tsv"
            aux_df.to_csv(prop_count_file, sep='\t', index=False)
        else:
            # Any remaining field is written as the graph-embedding file; with the
            # aux_field value configured above that is 'graph_embedding_complex'.
            graph_embedding_file = f"{aux_path}/graph_embedding_complex.tsv"
            aux_df.to_csv(graph_embedding_file, sep='\t', index=False)

## Features

In [8]:
# Feature computation: one `tl` pipeline (stages chained with " / ") that reads
# candidates_file and redirects its stdout to features_file.
# Stages and the output columns they are asked to write (-o):
#   1. deduplicate-candidates on kg_id
#   2. string-similarity x4 (threshold 0.5) -> monge_elkan, monge_elkan_aliases,
#      jaro_winkler, levenshtein
#   3. create-singleton-feature -> singleton
#   4. pick-hc-candidates -> ignore_candidate
#   5. context-match -> context_score; saves relevant properties to context_property_file
#   6. kth-percentile on context_score -> kth_percenter
#   7. pgt-semantic-tf-idf x2 -> smc_class_score, smc_property_score
#   8. predict-using-model (pgt_feature_str, pseudo_gt_model) -> pseudo_gt_prediction
#   9. create-pseudo-gt -> pseudo_gt
#  10. context-match again, reusing the saved relevant properties -> context_score_3
#  11. mosaic-features on kg_labels (--num-char --num-tokens)
#  12. score-using-embedding -> pgt_centroid_score
#  13. compute-tf-idf x2 -> pgt_class_count_tf_idf_score, pgt_property_count_tf_idf_score
# Requires context_file, class_count_file, prop_count_file and graph_embedding_file,
# which are assigned by the auxiliary-file merge cell above.
# The column name `kth_percenter` is spelled this way in three places (stage 6 output
# and both --high-confidence-column flags); change all of them together or none.
# NOTE(review): the `features` list also names `num_occurences`, which is not the -o
# target of any stage here - presumably one of the stages emits it; confirm.
# NOTE(review): most interpolated paths are unquoted, so paths containing spaces
# would be split by the shell.
!tl deduplicate-candidates -c kg_id $candidates_file \
    / string-similarity -i --method symmetric_monge_elkan:tokenizer=word -o monge_elkan --threshold 0.5 \
    / string-similarity -i --method symmetric_monge_elkan:tokenizer=word -c label_clean kg_aliases -o monge_elkan_aliases --threshold 0.5 \
    / string-similarity -i --method jaro_winkler -o jaro_winkler --threshold 0.5 \
    / string-similarity -i --method levenshtein -o levenshtein --threshold 0.5 \
    / create-singleton-feature -o singleton \
    / pick-hc-candidates -o ignore_candidate --string-similarity-label-columns monge_elkan,jaro_winkler,levenshtein --string-similarity-alias-columns monge_elkan_aliases \
    / context-match --debug --context-file $context_file --ignore-column-name ignore_candidate -o context_score \
    --similarity-string-threshold 0.85 --similarity-quantity-threshold 0.9 \
    --save-relevant-properties --context-properties-path $context_property_file \
    / kth-percentile -c context_score -o kth_percenter --ignore-column ignore_candidate --k-percentile 0.75  --minimum-cells 10 \
    / pgt-semantic-tf-idf \
    -o smc_class_score \
    --pagerank-column pagerank \
    --retrieval-score-column retrieval_score \
    --feature-file "$class_count_file" \
    --feature-name class_count \
    --high-confidence-column kth_percenter \
    / pgt-semantic-tf-idf \
    -o smc_property_score \
    --pagerank-column pagerank \
    --retrieval-score-column retrieval_score \
    --feature-file "$prop_count_file" \
    --feature-name property_count \
    --high-confidence-column kth_percenter \
    / predict-using-model -o pseudo_gt_prediction \
    --features $pgt_feature_str \
    --ranking-model $pseudo_gt_model \
    --ignore-column ignore_candidate \
    --normalization-factor $pseudo_gt_min_max_scaler_path \
    / create-pseudo-gt -o pseudo_gt \
    --column-thresholds pseudo_gt_prediction:mean \
    --filter smc_class_score:0 \
    / context-match --debug --context-file $context_file -o context_score_3 \
    --similarity-string-threshold 0.85 --similarity-quantity-threshold 0.9 \
    --use-relevant-properties --context-properties-path $context_property_file \
    / mosaic-features -c kg_labels --num-char --num-tokens \
    / score-using-embedding \
    --column-vector-strategy centroid-of-lof \
    --lof-strategy pseudo-gt \
    -o pgt_centroid_score \
    --embedding-file $graph_embedding_file \
    / compute-tf-idf  \
    --feature-file $class_count_file \
    --feature-name class_count \
    --singleton-column pseudo_gt \
    -o pgt_class_count_tf_idf_score \
    / compute-tf-idf \
    --feature-file $prop_count_file \
    --feature-name property_count \
    --singleton-column pseudo_gt \
    -o pgt_property_count_tf_idf_score > $features_file

deduplicate-candidates Time: 3.0464608669281006s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 5.512405872344971s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 31.114870071411133s
string-similarity-['jaro_winkler'] Time: 0.8393380641937256s
string-similarity-['levenshtein'] Time: 6.617343902587891s
create-singleton-feature Time: 0.14777898788452148s
pick-hc-candidates Time: 56.273844957351685s
context-match Time: 2.1457672119140625e-06s
kth-percentile Time: 55.95626091957092s
pgt-semantic-tf-idf-class_count Time: 55.402854919433594s
pgt-semantic-tf-idf-property_count Time: 55.76263380050659s
predict-using-model Time: 0.8469710350036621s
create-pseudo-gt Time: 0.03232288360595703s
context-match Time: 2.1457672119140625e-06s
mosaic-features Time: 0.014622926712036133s
Qnodes to lookup: 3184
Qnodes from file: 3127
Outlier removal generates 4 lof-voted candidates
Outlier removal generates 3 lof-voted candidates
score-using-embedding Time: 60.7279028892

## Predictions

In [9]:
# Final prediction: score the rows of features_file with the Table Linker model
# (feature columns from feature_str, normalization factor from min_max_scaler_path),
# writing the score to the column named by final_score_column, then run
# get-kg-links on that column with -k $k and --k-rows. Stdout goes to output_file.
!tl predict-using-model $features_file -o $final_score_column \
    --features $feature_str \
    --ranking-model $tbl_prediction_model \
    --normalization-factor $min_max_scaler_path \
    / get-kg-links -c $final_score_column -k $k --k-rows > $output_file

predict-using-model Time: 0.7813351154327393s
get-kg-links-siamese_prediction Time: 0.2632908821105957s


In [10]:
# Load the links written by the prediction cell and display them.
out_df = pd.read_csv(output_file)
out_df

Unnamed: 0,column,row,label,context,filename,column-id,label_clean,kg_id,kg_labels,kg_aliases,...,num_char,num_tokens,is_lof,pgt_centroid_score,pgt_class_count_tf_idf_score,top5_class_count,pgt_property_count_tf_idf_score,top5_property_count,siamese_prediction,rank
0,0,0,Rostock University Hospital,University of Rostock|1100,5TXLGRXA.csv,5TXLGRXA.csv-0,Rostock University Hospital,Q11896324,Hôpital universitaire de Tampere|Tampere Unive...,TAYS|02hvt5f17|Tampere Hospital|60670|TAYKS|Ho...,...,60,6,-1,0.789819,0.867958,Q1059324:0.091|Q1813474:0.090|Q16917:0.052|Q81...,0.471560,P6801:0.076|P3500:0.054|P6782:0.054|P2427:0.05...,0.801745,1
1,0,0,Rostock University Hospital,University of Rostock|1100,5TXLGRXA.csv,5TXLGRXA.csv-0,Rostock University Hospital,Q7593476,St James\s University Hospital,St Jamess University Hospital Leeds|lccn-nr980...,...,30,4,-1,0.739096,0.627126,Q16917:0.052|Q814610:0.052|Q4260475:0.042|Q428...,0.533366,P6801:0.076|P3500:0.054|P6782:0.054|P2427:0.05...,0.657242,2
2,0,0,Rostock University Hospital,University of Rostock|1100,5TXLGRXA.csv,5TXLGRXA.csv-0,Rostock University Hospital,Q15214634,Dong-a University Hospital,2800934180|Dong-a University Hospital|/m/0gxyz...,...,26,3,-1,0.752932,0.627126,Q16917:0.052|Q814610:0.052|Q4260475:0.042|Q428...,0.424249,P6801:0.076|P3500:0.054|P6782:0.054|P2427:0.05...,0.590006,3
3,0,0,Rostock University Hospital,University of Rostock|1100,5TXLGRXA.csv,5TXLGRXA.csv-0,Rostock University Hospital,Q1524967,Rostock Medical Faculty|Universitätsmedizin Ro...,UMR|954598244|1029510660|Universitatsklinikum ...,...,51,4,-1,0.610216,0.429190,Q2385804:0.033|Q5341295:0.033|Q178706:0.030|Q8...,0.349374,P6801:0.076|P6782:0.054|P2427:0.054|P856:0.031...,0.005543,4
4,0,0,Rostock University Hospital,University of Rostock|1100,5TXLGRXA.csv,5TXLGRXA.csv-0,Rostock University Hospital,Q14688467,Indiana University Hospital,22533|0000 0004 0440 2445|05rbkcx47|255400825|...,...,27,3,-1,0.678413,0.867958,Q1059324:0.091|Q1813474:0.090|Q16917:0.052|Q81...,0.359632,P3500:0.054|P6782:0.054|P2427:0.054|P1416:0.04...,0.002513,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,1,9,Universidad Europea de Madrid,Hospital Santa Cristina|192,5TXLGRXA.csv,5TXLGRXA.csv-1,Universidad Europea de Madrid,Q2290716,Universidad Europea de Madrid,5232561-1|XX136694|070725942|000134355|/m/0d10...,...,29,4,-1,0.753023,0.483035,Q3918:0.048|Q4671277:0.045|Q31855:0.041|Q16647...,0.337114,P5600:0.029|P3876:0.022|P1612:0.020|P949:0.018...,0.999999,1
196,1,9,Universidad Europea de Madrid,Hospital Santa Cristina|192,5TXLGRXA.csv,5TXLGRXA.csv-1,Universidad Europea de Madrid,Q219694,Universidad Complutense de Madrid|Universidad ...,Universidad Complutense de Madrid|160251028156...,...,258,22,-1,0.814687,0.970997,Q875538:0.063|Q45400320:0.054|Q3918:0.048|Q105...,0.753449,P5600:0.029|P1075:0.025|P5242:0.024|P3153:0.02...,0.367455,2
197,1,9,Universidad Europea de Madrid,Hospital Santa Cristina|192,5TXLGRXA.csv,5TXLGRXA.csv-1,Universidad Europea de Madrid,Q788091,Autonome Universiteit van Madrid|Autonome Univ...,Autnoma Universityof Madrid|universite autonom...,...,215,21,-1,0.839504,0.970997,Q875538:0.063|Q45400320:0.054|Q3918:0.048|Q105...,0.650184,P5600:0.029|P1075:0.025|P5242:0.024|P3153:0.02...,0.170128,3
198,1,9,Universidad Europea de Madrid,Hospital Santa Cristina|192,5TXLGRXA.csv,5TXLGRXA.csv-1,Universidad Europea de Madrid,Q1247135,Carlos III University of Madrid|Universidade C...,University Carlos III|164240|University of Car...,...,176,20,-1,0.743746,0.970997,Q875538:0.063|Q45400320:0.054|Q3918:0.048|Q105...,0.614525,P5600:0.029|P3153:0.022|P3876:0.022|P3793:0.02...,0.129659,4


## Add color

In [11]:
# Comma-separated list of columns passed to `tl add-color -c`: every model feature
# plus the final score column.
columns_to_color = f"{feature_str},{final_score_column}"

In [12]:
# Run `tl add-color` on output_file for the columns in columns_to_color (with the
# same k as above), writing the result to colorized_file inside the scratch directory.
!tl add-color $output_file -c $columns_to_color -k $k --output $colorized_file

add-color Time: 0.22264790534973145s


In [13]:
# Open the colorized workbook for inspection. This must run before the cleanup cell
# below, because colorized_file lives inside temp_dir.
# NOTE(review): `open` looks like the macOS launcher - on other platforms this
# command would presumably need replacing; confirm.
!open $colorized_file

## Cleanup temp folder

In [14]:
# Delete the scratch directory and everything in it (candidates, features, auxiliary
# files, colorized workbook). Done in Python rather than through `rm -rf` so that it
# does not depend on a POSIX shell or on the path being free of spaces;
# ignore_errors=True keeps the "already gone is fine" behaviour of `rm -rf`.
shutil.rmtree(temp_dir, ignore_errors=True)