In [1]:
import pandas as pd
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import MinMaxScaler
import pickle
import glob
import os
import numpy as np

In [23]:
#Paths to the required files.

#dev_path: path where data files to be tested are stored.
#G:Drive Link - https://drive.google.com/drive/u/1/folders/13OtEj6WDIng-XfQTqjC3hkrFbavPWQfa
dev_path = "./dev_with_lof_context/"

#dev_predictions: path where the results of predictions for each data file will be present
dev_predictions = './dev_predictions/'

#temp: temporary files are stored here.
temp = "./temp/"
#temporary storing for top 5 candidates.
temp_predicted_top_5 = "./temp/predicted_top_5/"
#temporary storing the files with ranks
temp_predicted_ranks = "./temp/predicted_ranks/"

#Colored XLSX Files with top 5 candidates
dev_predictions_colored = './dev_predictions_colored/'

#correct_vs_incorrect: path where the top 1 correct vs top 1 predicted for each wrong prediction is present.
correct_vs_incorrect = "./correct_vs_incorrect.xlsx"

#dev_output_results: The final results where rank and correct column will be present as well.
dev_output_results = './dev_output_results/'
#Location where the saved models and normalization factor is present.
saved_model = "./saved_models/epoch_4_loss_0.0869859904050827_top1_0.9081920903954802.pth"
normalization_file_path = './saved_models/normalization_factor.pkl'

#Column name for predictions
output_col = 'siamese_pred'

# Create the output directories with the standard library instead of shelling
# out to `mkdir -p`, so the cell also works where no POSIX shell is available.
# `exist_ok=True` keeps the cell idempotent on re-run.
# The same five directories as before are created; temp_predicted_ranks is
# only defined here, not created.
for _out_dir in (dev_predictions, temp, dev_predictions_colored,
                 dev_output_results, temp_predicted_top_5):
    os.makedirs(_out_dir, exist_ok=True)


In [3]:
#Replace with metrics command
def _top_k_hits(task_rows, score_col, cutoffs=(1, 5, 10)):
    """Return one 0/1 flag per cutoff k: is a label-1 row among the k best rows by ``score_col``?"""
    ranked_labels = task_rows.sort_values(by=[score_col], ascending=False)['evaluation_label']
    return [int((ranked_labels.iloc[:k] == 1).any()) for k in cutoffs]


def parse_eval_files_stats(eval_data, method):
    """Compute per-task and aggregate top-k accuracy statistics.

    A "task" is one (table_id, row, column) group of candidate rows.

    Parameters
    ----------
    eval_data : pandas.DataFrame
        Candidate rows. Columns read here: 'table_id', 'row', 'column',
        'GT_kg_id', 'evaluation_label', 'lof-graph-embedding-score' and the
        column named by ``method``. The frame is not modified.
    method : str
        Name of the model-score column used for the "model" ranking.

    Returns
    -------
    (dict, pandas.DataFrame)
        ``res`` holds the aggregate counts; the frame has one row per task
        with its candidate ``count`` and the per-task 0/1 flags.

    Notes
    -----
    Labels and scores are coerced to numbers on a private copy, so frames read
    with ``dtype=object`` (label ``'1'``) and numeric frames (label ``1``) give
    the same result. Previously the model top-1 check compared against the
    string ``'1'`` while every other check compared against the int ``1``.
    """
    task_keys = ['table_id', 'row', 'column']
    graph_col = 'lof-graph-embedding-score'

    data = eval_data.copy()
    data['evaluation_label'] = pd.to_numeric(data['evaluation_label'], errors='coerce')
    # Numeric scores are required for a meaningful descending sort; strings
    # would be ordered lexicographically.
    for score_col in (graph_col, method):
        data[score_col] = pd.to_numeric(data[score_col], errors='coerce')

    candidate_eval_data = data.groupby(task_keys)['table_id'].count().reset_index(name="count")

    res = {}
    res['num_tasks'] = len(candidate_eval_data)
    res['num_tasks_with_gt'] = len(data[pd.notna(data['GT_kg_id'])].groupby(task_keys))
    res['num_tasks_with_gt_in_candidate'] = len(data[data['evaluation_label'] == 1].groupby(task_keys))
    res['num_tasks_with_singleton_candidate'] = int((candidate_eval_data['count'] == 1).sum())

    num_tasks_with_singleton_candidate_with_gt = 0
    num_tasks_with_graph_top_one_accurate = []
    num_tasks_with_graph_top_five_accurate = []
    num_tasks_with_graph_top_ten_accurate = []
    num_tasks_with_model_score_top_one_accurate = []
    num_tasks_with_model_score_top_five_accurate = []
    num_tasks_with_model_score_top_ten_accurate = []
    has_gt_list = []
    has_gt_in_candidate = []

    # groupby iterates in sorted key order, which is the row order of
    # candidate_eval_data, so the per-task lists line up with its rows.
    for _, task_rows in data.groupby(task_keys):
        # A task "has GT" when no candidate row is missing its GT id. Use
        # notna() rather than set membership, which is unreliable for NaN.
        has_gt_list.append(int(task_rows['GT_kg_id'].notna().all()))

        gt_among_candidates = int((task_rows['evaluation_label'] == 1).any())
        has_gt_in_candidate.append(gt_among_candidates)
        if len(task_rows) == 1 and gt_among_candidates:
            num_tasks_with_singleton_candidate_with_gt += 1

        # handle graph-embedding-score
        top1, top5, top10 = _top_k_hits(task_rows, graph_col)
        num_tasks_with_graph_top_one_accurate.append(top1)
        num_tasks_with_graph_top_five_accurate.append(top5)
        num_tasks_with_graph_top_ten_accurate.append(top10)

        #rank on model score
        top1, top5, top10 = _top_k_hits(task_rows, method)
        num_tasks_with_model_score_top_one_accurate.append(top1)
        num_tasks_with_model_score_top_five_accurate.append(top5)
        num_tasks_with_model_score_top_ten_accurate.append(top10)

    res['num_tasks_with_singleton_candidate_with_gt'] = num_tasks_with_singleton_candidate_with_gt

    candidate_eval_data['lof-graph_top_one_accurate'] = num_tasks_with_graph_top_one_accurate
    candidate_eval_data['lof-graph_top_five_accurate'] = num_tasks_with_graph_top_five_accurate
    # Previously this column was filled from the top-five list by mistake.
    candidate_eval_data['lof-graph_top_ten_accurate'] = num_tasks_with_graph_top_ten_accurate
    candidate_eval_data['model_top_one_accurate'] = num_tasks_with_model_score_top_one_accurate
    candidate_eval_data['model_top_five_accurate'] = num_tasks_with_model_score_top_five_accurate
    candidate_eval_data['model_top_ten_accurate'] = num_tasks_with_model_score_top_ten_accurate
    candidate_eval_data['has_gt'] = has_gt_list
    candidate_eval_data['has_gt_in_candidate'] = has_gt_in_candidate
    res['num_tasks_with_graph_top_one_accurate'] = sum(num_tasks_with_graph_top_one_accurate)
    res['num_tasks_with_graph_top_five_accurate'] = sum(num_tasks_with_graph_top_five_accurate)
    res['num_tasks_with_graph_top_ten_accurate'] = sum(num_tasks_with_graph_top_ten_accurate)
    res['num_tasks_with_model_score_top_one_accurate'] = sum(num_tasks_with_model_score_top_one_accurate)
    res['num_tasks_with_model_score_top_five_accurate'] = sum(num_tasks_with_model_score_top_five_accurate)
    res['num_tasks_with_model_score_top_ten_accurate'] = sum(num_tasks_with_model_score_top_ten_accurate)
    return res, candidate_eval_data

In [4]:
# Model Definition
class PairwiseNetwork(nn.Module):
    """Four-layer feed-forward scorer applied with shared weights to candidate feature vectors.

    Layer widths are hidden_size -> 2*hidden_size -> hidden_size -> hidden_size -> 1;
    the first three layers use ReLU and the last output goes through a sigmoid.
    """

    def __init__(self, hidden_size):
        super().__init__()
        #original 12x24, 24x12, 12x12, 12x1
        widened = 2 * hidden_size
        self.fc1 = nn.Linear(hidden_size, widened)
        self.fc2 = nn.Linear(widened, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, 1)

    def _score(self, features):
        """Run one batch of feature vectors through the shared layer stack."""
        activations = features
        for layer in (self.fc1, self.fc2, self.fc3):
            activations = F.relu(layer(activations))
        return torch.sigmoid(self.fc4(activations))

    def forward(self, pos_features, neg_features):
        """Score the positive and the negative batch with the same weights.

        Returns the tuple ``(pos_out, neg_out)``.
        """
        pos_out = self._score(pos_features)
        neg_out = self._score(neg_features)
        return pos_out, neg_out

    def predict(self, test_feat):
        """Score a single batch of feature vectors."""
        return self._score(test_feat)


def predict(output_column, ranking_model, min_max_scaler_path, file_path=None, df=None):
    """Score every candidate row with the saved PairwiseNetwork.

    Parameters
    ----------
    output_column : str
        Name of the column that receives the model scores.
    ranking_model : str
        Path to the saved ``state_dict`` of a ``PairwiseNetwork``.
    min_max_scaler_path : str
        Path to the pickled scaler; its ``transform`` is applied to the
        feature columns of each (column, row) group.
    file_path : str, optional
        CSV file to read. Takes precedence over ``df`` when given.
    df : pandas.DataFrame, optional
        Candidate rows. Used when ``file_path`` is not given. The frame is
        copied and not modified.

    Returns
    -------
    pandas.DataFrame
        The rows grouped by ('column', 'row'), with the feature columns
        replaced by their scaled values and ``output_column`` added.

    Raises
    ------
    ValueError
        If neither ``file_path`` nor ``df`` is supplied.
    """
    if file_path:
        df = pd.read_csv(file_path, dtype=object)
    elif df is None:
        # The original raised an exception class that is not defined in this
        # notebook, which surfaced as a NameError.
        raise ValueError(
            'One of the input parameters is required: {} or {}'.format("file_path", "df"))
    else:
        # Work on a copy so the caller's frame is not converted in place.
        df = df.copy()

    normalize_features = ['pagerank','retrieval_score','monge_elkan','des_cont_jaccard',
                         'jaro_winkler','levenshtein','singleton','is_lof','num_char','num_tokens', 'lof_property_count_tf_idf_score',
                         'lof-graph-embedding-score', 'lof-reciprocal-rank', 'context_score']

    # The input width follows the feature list (14 entries) instead of a
    # separate hard-coded number.
    model = PairwiseNetwork(len(normalize_features))
    # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
    model.load_state_dict(torch.load(ranking_model, map_location=torch.device('cpu')))
    model.eval()

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load scaler files from a trusted source.
    with open(min_max_scaler_path, 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)

    df[normalize_features] = df[normalize_features].astype('float64')

    scored_groups = []
    pred = []
    with torch.no_grad():
        for _, cell_rows in df.groupby(['column', 'row']):
            # Copy before assigning so the scaled values are not written into
            # a groupby slice.
            cell_df = cell_rows.copy()
            cell_df[normalize_features] = scaler.transform(cell_df[normalize_features])
            features = torch.tensor(cell_df[normalize_features].to_numpy(), dtype=torch.float32)
            scores = model.predict(features)
            # reshape(-1) always yields a list; squeeze() collapsed a
            # one-candidate cell to a bare float, which broke extend().
            pred.extend(scores.reshape(-1).tolist())
            scored_groups.append(cell_df)

    if not scored_groups:
        # No groups to score: return an empty frame with the expected columns.
        out_df = df.iloc[0:0].copy()
        out_df[output_column] = pd.Series(dtype='float64')
        return out_df

    out_df = pd.concat(scored_groups)
    out_df[output_column] = pred

    return out_df

In [34]:
#This code cell predicts for each test file and creates the top 5 candidates.
for file in glob.glob(dev_path + '*.csv'):
    filename = file.split("/")[-1]
    print("For file: ", filename)
    #location where the output generated by the predictions wil be stored.
    dev_output = os.path.join(dev_predictions, filename)
    #Location where top 5 candidates will be stored for each cell in the file.
    top_5_output_file = os.path.join(temp_predicted_top_5, filename)
    df = pd.read_csv(file,dtype=object)
    
    predicted_df = predict(output_column=output_col, ranking_model= saved_model, min_max_scaler_path=normalization_file_path, df=df)
    #Now, we add a column rank in this file which is sorted by the output_col.
    predicted_df['rank'] = 0
    predicted_df = predicted_df.sort_values([output_col], ascending = False)
    grouped_obj = predicted_df.groupby(['row', 'column'])
    for cell, group in grouped_obj:
        m = len(group.index)
        rank_list = list(range(1, m+1, 1))
        group['rank'] = rank_list
        for row, index in group.iterrows():
            predicted_df.loc[row, 'rank'] = index['rank']
            #print(predicted_df.loc[row, 'rank'])
    predicted_df.to_csv(dev_output, index = False)
    predicted_df['table_id'] ='0'
    res, candidate_eval_data = parse_eval_files_stats(predicted_df, output_col)
    top1_precision = res['num_tasks_with_model_score_top_one_accurate']/res['num_tasks_with_gt']
    print("Top_1 Accuracy: ",top1_precision)
    !tl get-kg-links -c $output_col -k 5 --k-rows {dev_output} > {top_5_output_file}
    

For file:  28086084_0_3127660530989916727.csv
Top_1 Accuracy:  0.7863636363636364
For file:  84575189_0_6365692015941409487.csv
Top_1 Accuracy:  0.9285714285714286
For file:  29414811_2_4773219892816395776.csv
Top_1 Accuracy:  0.7727272727272727
For file:  14067031_0_559833072073397908.csv
Top_1 Accuracy:  0.9433962264150944
For file:  50270082_0_444360818941411589.csv
Top_1 Accuracy:  0.9702380952380952
For file:  52299421_0_4473286348258170200.csv
Top_1 Accuracy:  0.8901098901098901
For file:  14380604_4_3329235705746762392.csv
Top_1 Accuracy:  0.95
For file:  39759273_0_1427898308030295194.csv
Top_1 Accuracy:  0.98
For file:  45073662_0_3179937335063201739.csv
Top_1 Accuracy:  1.0


In [50]:
#The following code block adds the correct column, adds the correct candidate in case it is not present in top 5. Also creates the correct_vs_incorrect.xlsx
# Values written to 'correct' for every row of a (row, column) cell:
#    1 -> the first listed candidate has evaluation_label 1
#    0 -> a label-1 candidate is listed, but not first
#   -1 -> no listed candidate has label 1
# Pieces are collected in lists and concatenated once, instead of growing
# DataFrames inside the loop with DataFrame.append (removed in pandas 2.x).
correct_parts = []
incorrect_parts = []
for file in glob.glob(temp_predicted_top_5 + '*.csv'):
    #Create a correct column with a value either -1, 1, 0.
    file_df = pd.read_csv(file)
    file_df['correct'] = 0
    if 'rank' not in file_df.columns:
        file_df['rank'] = np.nan
    filename = os.path.basename(file)
    table_id = os.path.splitext(filename)[0]
    output_file_path = os.path.join(temp, filename)
    rank_file = pd.read_csv(os.path.join(dev_predictions, filename))
    extra_parts = []
    for _, cell_rows in file_df.groupby(['row', 'column']):
        # Copy so the new columns are not written into a groupby slice.
        group = cell_rows.copy()
        group['rank'] = range(1, len(group) + 1)
        labels = group['evaluation_label'].tolist()

        if labels[0] == 1:
            group['correct'] = 1

        elif 1 in labels:
            group['correct'] = 0
            incorrect_parts.append(group.iloc[[0]].assign(table_id=table_id))
            correct_parts.append(group[group['evaluation_label'] == 1].assign(table_id=table_id))

        else:
            group['correct'] = -1
            row = group['row'].iloc[0]
            column = group['column'].iloc[0]
            # Look the correct candidate up in the full ranked file. Boolean
            # selection plus a positional iloc on the filtered frame avoids
            # mixing index labels with positions.
            gt_rows = rank_file[(rank_file['row'] == row)
                                & (rank_file['column'] == column)
                                & (rank_file['evaluation_label'] == 1)]
            if not gt_rows.empty:
                corr_df = gt_rows.iloc[[0]].copy()
                corr_df['correct'] = -1
                # The per-file output gets the row without table_id; the
                # comparison table gets a copy that carries it.
                extra_parts.append(corr_df)
                correct_parts.append(corr_df.assign(table_id=table_id))
                incorrect_parts.append(group.iloc[[0]].assign(table_id=table_id))
            # When the ranked file has no correct candidate either, the cell
            # still falls through to the write-back below. Previously a
            # `continue` skipped it and left these rows at correct=0.

        # Write the cell's values back in one step per column.
        file_df.loc[group.index, 'correct'] = group['correct'].to_numpy()
        file_df.loc[group.index, 'rank'] = group['rank'].to_numpy()

    file_df = pd.concat([file_df] + extra_parts)
    file_df = file_df.sort_values(['column', 'row', 'rank'], ascending=True)
    file_df.to_csv(output_file_path, index=False)

correct_df = pd.concat(correct_parts, ignore_index=True) if correct_parts else pd.DataFrame()
incorrect_df = pd.concat(incorrect_parts) if incorrect_parts else pd.DataFrame()
incorrect_df['row_type'] = "Incorrect"
correct_df['row_type'] = "Correct"
result_df = pd.concat([correct_df, incorrect_df])
# With no wrong predictions the sort columns do not exist, so skip the sort.
if not result_df.empty:
    result_df = result_df.sort_values(["table_id", "row", "column", "label", "rank"], ascending=True)
output_path = os.path.join(temp, "correctandincorrect.csv")
result_df.to_csv(output_path, index=False)


In [51]:
# Display the combined table of correct and incorrect candidates built in the previous cell.
result_df

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,levenshtein,class_tfidf_reciprocal_rank,property_tfidf_reciprocal_rank,context_properties,context_score,siamese_pred,rank,correct,table_id,row_type
0,0,0,Abraham the Syrian,--|Yes|--|--|--,Abraham the Syrian,Q200608,Ephrem the Syrian,Ephraem Syrus,fuzzy-augmented,"4th century Syriac deacon, hymnographer and th...",...,0.666667,0.142857,1.000000,NaN|NaN|NaN|NaN|NaN,0.0000,9.995826e-01,1,-1.0,28086084_0_3127660530989916727,Incorrect
12,0,0,Abraham the Syrian,--|Yes|--|--|--,Abraham the Syrian,Q1292819,Pope Abraham of Alexandria,,fuzzy-augmented,"Coptic Orthodox Pope of Alexandria, Egypt",...,0.423077,1.000000,0.047619,NaN|NaN|NaN|NaN|NaN,0.0000,2.628786e-11,41,-1.0,28086084_0_3127660530989916727,Correct
65,0,11,Alban,Yes|--|Yes|Yes|--,Alban,Q56464777,Ashley Alban,,fuzzy-augmented,American pornographic actress,...,0.416667,0.019231,0.009434,P1340|NaN|P1340|P1340|NaN,0.7023,9.370953e-01,1,0.0,28086084_0_3127660530989916727,Incorrect
13,0,11,Alban,Yes|--|Yes|Yes|--,Alban,Q312982,Alban,St. Alban|Albanus|Saint Alban,exact-match,English protomartyr,...,1.000000,0.014085,0.166667,NaN|NaN|NaN|NaN|NaN,0.0000,4.818838e-01,2,0.0,28086084_0_3127660530989916727,Correct
175,0,13,Alphege,--|--|--|Yes|--,Alphege,Q4735284,Alphege of Wells,Elfheah|Ælfheah,exact-match,Bishop of Wells,...,0.437500,0.333333,0.062500,NaN|NaN|NaN|NaN|NaN,0.0000,9.999716e-01,1,0.0,28086084_0_3127660530989916727,Incorrect
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54,2,60,Angewandte Chemie International Edition,61|109,Angewandte Chemie International Edition,Q538683,Angewandte Chemie,,fuzzy-augmented,journal,...,0.435897,0.250000,1.000000,NaN|NaN,0.0000,9.998993e-01,2,0.0,84575189_0_6365692015941409487,Correct
456,2,92,Journal of the American Veterinary Medical Ass...,93|78,Journal of the American Veterinary Medical Ass...,Q1470970,Journal of the American Medical Association,Jour. A.M.A.|JAMA|Journal of the American Medi...,fuzzy-augmented,peer-reviewed medical journal,...,0.796296,0.062500,1.000000,NaN|NaN,0.0000,9.999889e-01,1,-1.0,84575189_0_6365692015941409487,Incorrect
55,2,92,Journal of the American Veterinary Medical Ass...,93|78,Journal of the American Veterinary Medical Ass...,Q2843064,American Veterinary Medical Association,AVMA,fuzzy-augmented,organization,...,0.722222,0.013514,0.014286,NaN|NaN,0.0000,1.141063e-03,18,-1.0,84575189_0_6365692015941409487,Correct
486,2,98,virology,99|75,virology,Q7215,virology,,exact-match,study of viruses,...,1.000000,0.015385,0.014493,NaN|NaN,0.0000,9.999987e-01,1,0.0,84575189_0_6365692015941409487,Incorrect


In [52]:
#Adding color to the final file
# Shell call to the `tl add-color` command: the input is the CSV at $output_path
# and the result is written to $correct_vs_incorrect.
# NOTE(review): -c presumably names the score column and -k the number of
# candidates per cell to color; confirm against the table-linker documentation.
!tl add-color -c $output_col -k 2 $output_path --output $correct_vs_incorrect