## Purpose: A quick notebook to get the predictions and metric results
Parameters to Set:
- Input data path 
- Features to include
- Model to use for prediction
- Scaler to use for prediction
- Experiment Name so as to store the results
- The new column name for the files
Outputs:
- Top 5 Ranked files
- Metrics results with & without combining
- Colorized files

In [4]:
import pandas as pd
import os
import glob
import numpy as np

In [1]:
# Locations of the complete datasets.
# Root directory that contains one sub-folder per dataset.
main_dataset_location = '/nas/ckgfs/kgtk/hrathod/scratch/datasets/'
# Path, relative to a dataset's own folder, of its feature files.
dataset_folder_location = '/complete_data/features/'
# The feature directory of a dataset is
#   main_dataset_location + <dataset name> + dataset_folder_location
# e.g. for semtab_data: main_dataset_location/semtab_data/complete_data/features/
datasets = ",".join([
    "semtab_data",
    "t2dv2_data",
    "limaye_data",
    "bio_div_data",
    "2t_data",
])


In [2]:
# Feature columns handed to `tl predict-using-model --features`, in this order.
features = [
    "monge_elkan",
    "monge_elkan_aliases",
    "jaro_winkler",
    "levenshtein",
    "singleton",
    "context_score_3",
    "pgt_centroid_score",
    "pgt_class_count_tf_idf_score",
    "pgt_property_count_tf_idf_score",
    "num_occurences",
    "incorrectness_scores",
]

In [3]:
# Name of the score column; passed as `score_column` to compute_metrics below.
final_score_column = "siamese_prediction"

In [5]:
# Mention the test dataset and the experiment the results belong to.
test_dataset = "semtab_data"
experiment_number = "Experiment_test_semtab"
titled = "_reduced"

# Every output of this run lives under one experiment-specific root folder;
# all other locations are derived from it so they cannot drift apart.
result_main_location = f"Experiment/results/{experiment_number+titled}/"
result_predictions_location = f"{result_main_location}test/test_predictions_top_k"
result_metrics_location = f"{result_main_location}test/test_predictions_metrics/"
result_colorized_location = f"{result_main_location}test/test_predictions_colored/"
# Train outputs get their own folders. Predictions and metrics must never share
# a directory: metrics files are written under the same file name as the
# prediction file they describe, so a shared folder would overwrite predictions.
result_train_predictions_location = f"{result_main_location}train/train_predictions_top_k"
result_train_metrics_location = f"{result_main_location}train/train_predictions_metrics/"

# Create all output folders up front (no error when they already exist).
for _location in (
    result_main_location,
    result_predictions_location,
    result_metrics_location,
    result_colorized_location,
    result_train_predictions_location,
    result_train_metrics_location,
):
    os.makedirs(_location, exist_ok=True)


In [6]:
# S3 client used below to download the trained model and the normalization file.
# No credentials are passed here.
import boto3
s3 = boto3.client('s3')

In [7]:
# Bucket and object keys of the trained ranking model and of the normalization
# (min-max scaler) file produced when that model was trained.
bucket_name = 'table-linker-datasets'
model_s3_key = f'Experiments/{experiment_number}/final_models/cell/top1_0.7520739963764661_epoch_1_loss_0.19324207305908203_batch_size_32_learning_rate_1e-05.pth'
scaler_s3_key = f"Experiments/{experiment_number}/model_training_data/tl_pipeline_normalization_factor.pkl"

# Local file names the two artifacts are downloaded to. The rest of the notebook
# reads `model_path` and `min_max_scaler_path`, so both must point at the
# downloaded files (previously the scaler path pointed at an unrelated local
# file and the downloaded normalization file was never used).
model_path = 'model_2.pth'
min_max_scaler_path = 'normalization_factor_2.pkl'

s3.download_file(bucket_name, model_s3_key, model_path)
s3.download_file(bucket_name, scaler_s3_key, min_max_scaler_path)

In [8]:
# Create the local `complete_dataset` folder if it is missing.
# NOTE(review): no other cell visible in this notebook refers to this folder.
!mkdir -p complete_dataset

In [9]:
def dev_prediction(dev_feature_path, dev_predictions_top_k, saved_model, output_column, min_max_scaler_path, k, dev_predictions_metrics):
    print(dev_feature_path)
    df_list = []
    for file in glob.glob(dev_feature_path + '/*.csv'):
        print(file)
        filename = file.split("/")[-1]

        dev_output = f"{dev_predictions_top_k}/{filename}"
        if os.path.exists(dev_output):
            continue
        df = pd.read_csv(file)
        df = df[df['evaluation_label'] != '[]']
        df.to_csv(file, index=False)
        feature_str =  ",".join(features)
        if os.path.getsize(file) == 0:
            continue

        
        # location where the output generated by the predictions wil be stored.

        k = '5'
        print(output_column, k, dev_output, file)
        !tl predict-using-model $file -o $output_column \
            --features $feature_str \
            --ranking-model $saved_model \
            --normalization-factor $min_max_scaler_path \
            / get-kg-links -c $output_column -k $k --k-rows \
            > $dev_output
        filename = file.split("/")[-1]
        print(filename)
        if os.path.getsize(file) == 0:
                    continue
        k_2 = '1'
        score_column = output_column
        dev_metrics_file = f"{dev_predictions_metrics}/{filename}"
        !tl metrics $dev_output -k $k_2 -c $score_column --tag $filename> $dev_metrics_file
        df_list.append(pd.read_csv(dev_metrics_file))
    return pd.concat(df_list)
        
def add_color(dev_predictions_top_k, dev_colorized_path, score_column, k=5):
    for file in glob.glob(dev_predictions_top_k + '/*.csv'):
        filename = file.split("/")[-1]
        print(filename)
        if os.path.getsize(file) == 0:
                    continue
                
        dev_color_file = f"{dev_colorized_path}/{filename[:-4]}.xlsx"
        !tl add-color $file -c "$score_column,evaluation_label" -k $k --output $dev_color_file
# The triple-quoted string below is never assigned or called, so it has no
# effect at runtime. It holds a reference copy of a `metrics` implementation
# (presumably the logic behind `tl metrics` — TODO confirm). Names it uses, such
# as read_csv, normalize_scores, defaultdict and
# RequiredInputParameterMissingException, are not imported in the cells visible
# in this notebook.
'''
def metrics(column, file_path=None, df=None, k: int = 1, tag=""):
    """
    computes the precision, recall and f1 score for the tl pipeline.
    Args:
        column: column with ranking score
        file_path: input file path
        df: or input dataframe
        k: calculate recall at top k candidates
        tag: a tag to use in the output file to identify the results of running the given pipeline
    Returns:
    """
    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format('file_path', 'df'))

    if file_path:
        df = read_csv(file_path, dtype=object)

    # remove duplicate candidates if exist
    df = normalize_scores.drop_duplicate("kg_id", [column], df=df)

    # replace na to 0.0
    df[column] = df[column].astype(float).fillna(0.0)
    df['max_score'] = df.groupby(by=['column', 'row'])[column].transform(max)

    # relevant df
    rdf = df[df['evaluation_label'].astype(float) != 0.0]

    col_grouped = rdf.groupby(by=['column'])
    results = []
    for col, cgdf in col_grouped:
        # true positive for precision at 1
        tp_ps = []

        # true positive for recall at k
        tp_rs = defaultdict(list)
        grouped = cgdf.groupby(by=['row'])
        n = len(grouped)
        for key, gdf in grouped:
            gdf = gdf.sort_values(by=[column, 'kg_id'], ascending=[False, True]).reset_index()

            for i, row in gdf.iterrows():
                if float(row['evaluation_label']) == 1 and row[column] == row['max_score']:
                    tp_ps.append(key)

                # this df is sorted by score, so highest ranked candidate is rank 1 and so on...
                rank = i + 1
                if rank <= k and (row['evaluation_label'] == '1' or row['evaluation_label'] == 1.0):
                    tp_rs[k].append(key)

        precision = float(len(tp_ps)) / float(n)
        recall = {k: float(len(each_tp_rs)) / float(n) for k, each_tp_rs in tp_rs.items()}
        # sort as k value increasing
        recall = {k: v for k, v in sorted(recall.items(), key=lambda x: x[0])}

        for _k, each_recall in recall.items():
            if precision == 0 and each_recall == 0:
                f1_score = 0.0
            else:
                f1_score = (2 * precision * each_recall) / (precision + each_recall)
            results.append({"k": _k,
                            'f1': f1_score,
                            'precision': precision,
                            'recall': each_recall,
                            'column': col,
                            'tag': tag})

    output_df = pd.DataFrame(results)
    return output_df
'''
def compute_metrics(dev_predictions_top_k, dev_predictions_metrics, score_column, k=1):
    df_list = []
    for file in glob.glob(dev_predictions_top_k + '/*.csv'):
        #print(file)
        #print(df)

        filename = file.split("/")[-1]
        print(filename)
        if os.path.getsize(file) == 0:
                    continue
                
        dev_metrics_file = f"{dev_predictions_metrics}/{filename}"
        !tl metrics $file -k $k -c $score_column --tag $filename> $dev_metrics_file
        df_list.append(pd.read_csv(dev_metrics_file))
    return pd.concat(df_list)

In [20]:
# Getting the testing metrics
metrics_df = dev_prediction('../datasets/semtab_data/complete_data/features/', result_predictions_location, model_path, 'siamese_prediction', min_max_scaler_path, 5, result_metrics_location)
#dev_prediction(main_dataset_location+test_dataset+dataset_folder_location, result_predictions_location, model_path, "siamese_prediction", min_max_scaler_path, k=5)

../datasets/semtab_data/complete_data/features/
../datasets/semtab_data/complete_data/features/00UQHNO3.csv
../datasets/semtab_data/complete_data/features/03XJOJ15.csv
../datasets/semtab_data/complete_data/features/06BZ6E4W.csv
../datasets/semtab_data/complete_data/features/06UFHM3B.csv
../datasets/semtab_data/complete_data/features/06ZS1QBE.csv
../datasets/semtab_data/complete_data/features/085QNFGZ.csv
../datasets/semtab_data/complete_data/features/090EELY2.csv
../datasets/semtab_data/complete_data/features/09Z268Z7.csv
../datasets/semtab_data/complete_data/features/0BY9CFLK.csv
../datasets/semtab_data/complete_data/features/0CHQAATH.csv
../datasets/semtab_data/complete_data/features/0D1JRXQD.csv
../datasets/semtab_data/complete_data/features/0DVQGVUW.csv
../datasets/semtab_data/complete_data/features/0HFY1TOW.csv
../datasets/semtab_data/complete_data/features/0HZG8QGJ.csv
../datasets/semtab_data/complete_data/features/0KL64BZL.csv
../datasets/semtab_data/complete_data/features/0LGYY

  if (await self.run_code(code, result,  async_=asy)):


siamese_prediction 5 Experiment/results/Experiment_test_semtab_reduced/test/test_predictions_top_k/1ZFRQBQS.csv ../datasets/semtab_data/complete_data/features/1ZFRQBQS.csv
predict-using-model Time: 3.1823534965515137s
get-kg-links-siamese_prediction Time: 1.7203984260559082s
1ZFRQBQS.csv
metrics Time: 0.7841682434082031s
../datasets/semtab_data/complete_data/features/28D7RBJT.csv
siamese_prediction 5 Experiment/results/Experiment_test_semtab_reduced/test/test_predictions_top_k/28D7RBJT.csv ../datasets/semtab_data/complete_data/features/28D7RBJT.csv
predict-using-model Time: 3.1929478645324707s
get-kg-links-siamese_prediction Time: 0.7407734394073486s
28D7RBJT.csv
metrics Time: 0.3693575859069824s
../datasets/semtab_data/complete_data/features/2FXR6BX7.csv
siamese_prediction 5 Experiment/results/Experiment_test_semtab_reduced/test/test_predictions_top_k/2FXR6BX7.csv ../datasets/semtab_data/complete_data/features/2FXR6BX7.csv
predict-using-model Time: 3.152341365814209s
get-kg-links-siam

  if (await self.run_code(code, result,  async_=asy)):


siamese_prediction 5 Experiment/results/Experiment_test_semtab_reduced/test/test_predictions_top_k/4FG1UN8O.csv ../datasets/semtab_data/complete_data/features/4FG1UN8O.csv
predict-using-model Time: 2.7607169151306152s
get-kg-links-siamese_prediction Time: 1.4005649089813232s
4FG1UN8O.csv
metrics Time: 0.7269737720489502s
../datasets/semtab_data/complete_data/features/4SOL8H0M.csv
siamese_prediction 5 Experiment/results/Experiment_test_semtab_reduced/test/test_predictions_top_k/4SOL8H0M.csv ../datasets/semtab_data/complete_data/features/4SOL8H0M.csv
predict-using-model Time: 3.749408721923828s
get-kg-links-siamese_prediction Time: 1.3504109382629395s
4SOL8H0M.csv
metrics Time: 0.6755545139312744s
../datasets/semtab_data/complete_data/features/50NWQJ1T.csv
siamese_prediction 5 Experiment/results/Experiment_test_semtab_reduced/test/test_predictions_top_k/50NWQJ1T.csv ../datasets/semtab_data/complete_data/features/50NWQJ1T.csv
predict-using-model Time: 2.2468953132629395s
get-kg-links-siam

  if (await self.run_code(code, result,  async_=asy)):


siamese_prediction 5 Experiment/results/Experiment_test_semtab_reduced/test/test_predictions_top_k/0Z8LKW3C.csv ../datasets/semtab_data/complete_data/features/0Z8LKW3C.csv
predict-using-model Time: 3.5264992713928223s
get-kg-links-siamese_prediction Time: 1.3884506225585938s
0Z8LKW3C.csv
metrics Time: 0.8001251220703125s
../datasets/semtab_data/complete_data/features/042AKDN1.csv
siamese_prediction 5 Experiment/results/Experiment_test_semtab_reduced/test/test_predictions_top_k/042AKDN1.csv ../datasets/semtab_data/complete_data/features/042AKDN1.csv
predict-using-model Time: 2.1656181812286377s
get-kg-links-siamese_prediction Time: 0.30202555656433105s
042AKDN1.csv
metrics Time: 0.15872502326965332s
../datasets/semtab_data/complete_data/features/09DZVIPV.csv
siamese_prediction 5 Experiment/results/Experiment_test_semtab_reduced/test/test_predictions_top_k/09DZVIPV.csv ../datasets/semtab_data/complete_data/features/09DZVIPV.csv
predict-using-model Time: 2.320932149887085s
get-kg-links-si

[   k   f1  precision  recall  column  num_cells           tag
 0  1  1.0        1.0     1.0       0         20  1NS33P8C.csv,
    k   f1  precision  recall  column  num_cells           tag
 0  1  1.0        1.0     1.0       0         20  1XIWQBSF.csv
 1  1  1.0        1.0     1.0       1         20  1XIWQBSF.csv,
    k   f1  precision  recall  column  num_cells           tag
 0  1  1.0        1.0     1.0       0         20  1YPLVLS9.csv
 1  1  1.0        1.0     1.0       2         20  1YPLVLS9.csv,
    k       f1  precision   recall  column  num_cells           tag
 0  1  1.00000    1.00000  1.00000       0         32  1ZFRQBQS.csv
 1  1  1.00000    1.00000  1.00000       1         32  1ZFRQBQS.csv
 2  1  0.96875    0.96875  0.96875       2         32  1ZFRQBQS.csv,
    k    f1  precision  recall  column  num_cells           tag
 0  1  1.00       1.00    1.00       0         20  28D7RBJT.csv
 1  1  0.95       0.95    0.95       2         20  28D7RBJT.csv,
    k   f1  precision  reca

In [12]:
# Run add_color over the prediction files in result_predictions_location;
# the .xlsx files are written to result_colorized_location.
add_color(result_predictions_location, result_colorized_location, 'siamese_prediction', k=5)

03XJOJ15.csv
add-color Time: 0.7236204147338867s
06UFHM3B.csv
add-color Time: 0.27489733695983887s
06ZS1QBE.csv
add-color Time: 0.17109084129333496s
090EELY2.csv
add-color Time: 0.2694880962371826s
0CHQAATH.csv
add-color Time: 0.30591297149658203s
0DVQGVUW.csv
add-color Time: 0.3859376907348633s
0HFY1TOW.csv
add-color Time: 0.1995680332183838s
0KL64BZL.csv
add-color Time: 0.1757965087890625s
0LGYY0Y7.csv
add-color Time: 0.18250274658203125s
0U3WMR09.csv
add-color Time: 0.15735101699829102s
0UJRG5XA.csv
add-color Time: 0.20933890342712402s
0VTG406B.csv
add-color Time: 0.17556428909301758s
15S9J1SE.csv
add-color Time: 0.140855073928833s
15ULGJXA.csv
add-color Time: 0.13775277137756348s
15Z7HD0M.csv
add-color Time: 0.2646012306213379s
18PQGWE2.csv
add-color Time: 3.024301767349243s
1A51CWBN.csv
add-color Time: 0.2887074947357178s
1DT331RM.csv
add-color Time: 0.3035416603088379s
1FIFDPBT.csv
add-color Time: 0.18437695503234863s
1FPJTMCX.csv
add-color Time: 0.16524839401245117s
1H6S7TGO.csv

In [14]:
# Metrics at k=200 over the saved prediction files.
# NOTE: the metrics files go to result_metrics_location under the same file names
# that dev_prediction used, so the files written there earlier are overwritten.
metrics_k_200 = compute_metrics(result_predictions_location, result_metrics_location, final_score_column, k = 200)

T7RPWH6N.csv
NDSTZH1I.csv
metrics Time: 0.66619873046875s
QIW5EC2H.csv
metrics Time: 7.522080421447754s
P8B3IAOY.csv
metrics Time: 0.2715945243835449s
ZZNW93IV.csv
PWNRGOJ5.csv
metrics Time: 0.47112107276916504s
UURPYBGQ.csv
4FW66XNV.csv
metrics Time: 0.06413602828979492s


In [12]:
# Display the k=200 metrics table.
metrics_k_200

Unnamed: 0,k,f1,precision,recall,column,num_cells,tag
0,200,0.98,0.960784,1.0,1,51,T7RPWH6N.csv
1,200,0.675325,0.509804,1.0,2,51,T7RPWH6N.csv
2,200,0.980392,0.980392,0.980392,6,51,T7RPWH6N.csv
0,200,0.808163,0.733333,0.9,0,60,NDSTZH1I.csv
1,200,0.806061,0.7,0.95,1,60,NDSTZH1I.csv
0,200,0.872972,0.826944,0.924425,0,913,QIW5EC2H.csv
1,200,1.0,1.0,1.0,1,913,QIW5EC2H.csv
0,200,1.0,1.0,1.0,0,15,P8B3IAOY.csv
1,200,0.928571,0.866667,1.0,1,15,P8B3IAOY.csv
2,200,1.0,1.0,1.0,2,15,P8B3IAOY.csv


In [19]:
# Metrics at k=1 over the same prediction files.
# NOTE: written to result_metrics_location under the same file names as the
# k=200 run, so this replaces the k=200 metrics files on disk.
metrics_k_1 = compute_metrics(result_predictions_location, result_metrics_location, final_score_column, k=1)

PWNRGOJ5.csv
4FW66XNV.csv
metrics Time: 0.06575465202331543s
