## Purpose
For leave one out experiments, 
- preprocesses data to add the new feature prefix_monge_elkan similarity,
- runs the model pipeline experiment over the processed data, 
- predicts over the test dataset and 
- provides overview with cell based and column based metrics.

For data: Directly downloaded from AWS S3.
For running over different test dataset change needed
- variable setup tagged cell
- test setup tagged cell

For experimental setup, a new folder is created, changes to the path can be set in test_setup

In [1]:
###Importing all the libraries
import glob
import boto3
import time
import os
import pandas as pd
import sklearn.metrics
from sklearn.preprocessing import MinMaxScaler
import pickle
from argparse import ArgumentParser, Namespace
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from itertools import chain
import copy
import shutil
import pickle
from tqdm import tqdm
import scipy.sparse as sp
import os

In [2]:
# Setup the variables
features = ["monge_elkan","monge_elkan_aliases","jaro_winkler",
            "levenshtein", "prefix_monge_elkan_similarity","singleton","context_score_3","pgt_centroid_score","pgt_class_count_tf_idf_score",
            "pgt_property_count_tf_idf_score", "num_occurences", "incorrectness_scores"]
BATCH_SIZE = 32
LEARNING_RATE = 0.00001

In [3]:
experiment_name = f"Experiment_test_semtab_feature_addition"
experiment_data = '2t_data,limaye_data,biotab_data,t2dv2_data,biodiv_data'
### Creating the directories for the results.
experiment_train_data = experiment_data
experiment_dev_data =  experiment_data
experiment_test_data = 'semtab_data'
experiment_store_path = f"Experiment/{experiment_name}"
processed_dev_data = f"{experiment_store_path}/dev/features/"
processed_test_data = f"{experiment_store_path}/test/features/"
dev_predictions = f"{experiment_store_path}/dev/dev_predictions/"
test_predictions = f"{experiment_store_path}/test/test_predictions/"
dev_output_pred = f"{experiment_store_path}/dev/dev_output/"
dev_predictions_top_k = f"{experiment_store_path}/dev/dev_predictions_top_k/"
dev_metrics = f"{experiment_store_path}/dev/dev_metrics/"
dev_predictions_colorized = f"{experiment_store_path}/dev/dev_predictions_colorized/"
model_save_path = f'{experiment_store_path}/model_save_path/'
best_model_path = ''

training_data_path = f'{experiment_store_path}/model_training_data'

pos_output = f'{training_data_path}/tl_pipeline_pos_features_{BATCH_SIZE}.pkl'
neg_output = f'{training_data_path}/tl_pipeline_neg_features_{BATCH_SIZE}.pkl'
min_max_scaler_path = f'{training_data_path}/tl_pipeline_normalization_factor.pkl'

final_score_column = 'siamese_prediction'

extra_feat = ['column-id', 'column', 'row', 'evaluation_label', 'dataset_id', 'table_id']
for f in features:
    extra_feat.append(f)
    

!mkdir -p $experiment_store_path 
!mkdir -p $dev_predictions 
!mkdir -p $dev_output_pred
!mkdir -p $dev_predictions_top_k 
!mkdir -p $dev_metrics 
!mkdir -p $dev_predictions_colorized 
!mkdir -p $model_save_path 
!mkdir -p $training_data_path
!mkdir -p $processed_dev_data
!mkdir -p $processed_test_data
!mkdir -p $test_predictions
results_main_location = f"{experiment_store_path}/results/"
result_predictions_location = f"{results_main_location}/test_predictions_top_k"
result_metrics_location = f"{results_main_location}/test_predictions_metrics/"
result_colorized_location= f"{results_main_location}/test_predictions_colored/"

!mkdir -p $results_main_location
!mkdir -p $result_predictions_location
!mkdir -p $result_metrics_location
!mkdir -p $result_colorized_location


In [14]:
# Copying the data from aws to a local directory
aws_location = f's3://table-linker-datasets/Experiments/Experiment_test_{experiment_test_data.split("_")[0]}/'
!aws s3 cp --recursive {aws_location} {experiment_store_path}

download: s3://table-linker-datasets/Experiments/Experiment_test_semtab/final_models/cell/top1_0.7520739963764661_epoch_1_loss_0.19324207305908203_batch_size_32_learning_rate_1e-05.pth to Experiment/Experiment_test_semtab_feature_addition/final_models/cell/top1_0.7520739963764661_epoch_1_loss_0.19324207305908203_batch_size_32_learning_rate_1e-05.pth
download: s3://table-linker-datasets/Experiments/Experiment_test_semtab/final_models/cell/top1_0.7484504624773529_epoch_0_loss_0.517475962638855_batch_size_32_learning_rate_1e-05.pth to Experiment/Experiment_test_semtab_feature_addition/final_models/cell/top1_0.7484504624773529_epoch_0_loss_0.517475962638855_batch_size_32_learning_rate_1e-05.pth
download: s3://table-linker-datasets/Experiments/Experiment_test_semtab/final_models/complete_shuffle/top1_0.6955472911850438_epoch_1_loss_0.9994743466377258_batch_size_32_learning_rate_1e-05.pth to Experiment/Experiment_test_semtab_feature_addition/final_models/complete_shuffle/top1_0.6955472911850

In [7]:
train_files_path = [f'{experiment_store_path}/reduced_train_data/' + i for i in experiment_train_data.split(',')]
train_files = []
for train_path in train_files_path:
    set_of_files = glob.glob(train_path + '/*.csv')
    train_files.extend(set_of_files)
print(len(train_files))
dev_files_path = [f'{experiment_store_path}/reduced_dev_data/' + i for i in experiment_train_data.split(',')]
dev_files = []
for dev_path in dev_files_path:
    set_of_files = glob.glob(dev_path + '/*.csv')
    dev_files.extend(set_of_files)
print(len(dev_files))
test_files = []
test_files_location = [f'{experiment_store_path}/{experiment_test_data.split("_")[0]}/complete_data']
for k in test_files_location:
    test_files.extend(glob.glob(k + '/*.csv'))
print(len(test_files))

450
157
345


In [4]:
preprocessed_train_files_path = f'{experiment_store_path}/processed_train_data/'
preprocessed_dev_files_path = f'{experiment_store_path}/processed_dev_data/'
preprocessed_test_files_path = f'{experiment_store_path}/processed_test_data/'


In [5]:
### EXTRA DATA PREPROCESSING - Adding a new feature prefix column to all the data

!mkdir -p {preprocessed_train_files_path}
!mkdir -p {preprocessed_dev_files_path}
!mkdir -p {preprocessed_test_files_path}
preprocessed_train_files = []
preprocessed_dev_files = []
preprocessed_test_files = []

print("Preprocessing for train data")
for file in train_files:
    file_name = file.split('/')[-1]
    print(file_name)
    result_file = preprocessed_train_files_path + file_name
    !tl string-similarity --method prefix_monge_elkan:tokenizer=word -c label_clean kg_labels -o prefix_monge_elkan_similarity $file  > $result_file
    preprocessed_train_files.append(result_file)
    
print("Preprocessing for dev data")
for file in dev_files:
    file_name = file.split('/')[-1]
    print(file_name)
    result_file = preprocessed_dev_files_path + file_name
    !tl string-similarity --method prefix_monge_elkan:tokenizer=word -c label_clean kg_labels -o prefix_monge_elkan_similarity $file  > $result_file
    preprocessed_dev_files.append(result_file)
    
print("Preprocessing for test data")
for file in test_files:
    file_name = file.split('/')[-1]
    print(file_name)
    result_file = preprocessed_test_files_path + file_name
    !tl string-similarity --method prefix_monge_elkan:tokenizer=word -c label_clean kg_labels -o prefix_monge_elkan_similarity $file  > $result_file
    preprocessed_test_files.append(result_file)


Preprocessing for train data


NameError: name 'train_files' is not defined

In [5]:
preprocessed_train_files = glob.glob(preprocessed_train_files_path + '/*.csv')
preprocessed_dev_files = glob.glob(preprocessed_dev_files_path + '/*.csv')
preprocessed_test_files = glob.glob(preprocessed_test_files_path + '/*.csv')
len(preprocessed_train_files), len(preprocessed_dev_files), len(preprocessed_test_files)

(450, 157, 345)

In [6]:
# Helpher functions for data preprocessing
def merge_files(args):
    # datapath = args.train_path
    df_list = []
    for fn in args.train_files:
        fid = fn.split('/')[-1][:-4]
        dataset_id = fn.split('/')[-2]
        df = read_file(fn)
        if not isinstance(df, pd.DataFrame) :
            continue

        df['table_id'] = fid
        print(dataset_id, fid)
        df['dataset_id'] = dataset_id
        df['context_score'].fillna(0.0, inplace=True)
        if 'column-id' not in df.columns:
            df['column-id'] = fn.split('/')[-1] + df['column'].astype('str')

        df = df[extra_feat]
        df_list.append(df)
    return pd.concat(df_list)


def compute_normalization_factor(args, all_data):
    min_max_scaler_path = args.min_max_scaler_path
    all_data_features = all_data[features]
    scaler = MinMaxScaler()
    scaler.fit(all_data_features)
    return scaler


def read_file(key):
    #resp = s3.get_object(Bucket = bucket, Key = key)
    try:
        df = pd.read_csv(key, sep = ',')
    except pd.errors.EmptyDataError:
        df = ''
        print('Empty csv file!')
    return df

In [7]:

def generate_train_data(args, all_data, scaler, shuffle_by = None):
    num_cells_1 = 0
    scaler_path = args.min_max_scaler_path
    #scaler = pickle.load(open('./tmp/min_max_scaler_path.pkl', 'rb'))
    final_list = []
    sfeatures = copy.deepcopy(features)
    normalize_features = features
    evaluation_label = ['evaluation_label']
    positive_features_final = []
    negative_features_final = []
    super_groups = all_data.groupby(['column-id'])
    if shuffle_by == 'dataset':
        super_groups = all_data.groupby(['dataset_id'])
        for i, s_group in super_groups:
            pos_features_dataset = []
            neg_features_dataset = []
            grouped_obj = s_group.groupby(['column', 'row', 'column-id'])

            for cell in grouped_obj:
                num_cells_1 += 1
                cell[1][normalize_features] = scaler.transform(cell[1][normalize_features])
                pos_features = []
                neg_features = []
                a = cell[1][cell[1]['evaluation_label'] == 1]
                if a.empty:
                    continue
                pos_rows = cell[1][cell[1]['evaluation_label'].astype(int) == 1][sfeatures].to_numpy()
                if len(pos_rows) < 1:
                    continue
                if len(pos_rows) > 1:
                    print("here")
                for i in range(len(pos_rows)):
                    pos_features.append(pos_rows[i])
                neg_rows = cell[1][cell[1]['evaluation_label'].astype(int) == -1][sfeatures].to_numpy()
                for i in range(min(BATCH_SIZE, len(neg_rows))):
                    neg_features.append(neg_rows[i])

                for k in range(len(neg_features) - len(pos_features)):
                    pos_features.append(random.choice(pos_rows))
                if len(pos_features) != len(neg_features):
                    continue
                pos_features_dataset.append(pos_features)
                neg_features_dataset.append(neg_features)
            if len(pos_features_dataset) > 0:
                c = list(zip(pos_features_dataset, neg_features_dataset))
                random.shuffle(c)
                pos_features_dataset, neg_features_dataset = zip(*c)
                positive_features_final.extend(pos_features_dataset)
                negative_features_final.extend(neg_features_dataset)
    elif shuffle_by == 'table':
        super_groups = all_data.groupby(['table_id'])
        for i, s_group in super_groups:
            pos_features_table = []
            neg_features_table = []
            file_name = i.split('-')[0]
            #s_group_n = context_property_similarity_code.return_a_table(s_group, pca_components = N_PCA_COMPONENTS)
            #print("entering ", file_name)
            ds_id = s_group['dataset_id'].values[0]
            print("File: ", file_name, ds_id)
            grouped_obj = s_group.groupby(['column', 'row'])
    
            for cell in grouped_obj:
                num_cells_1 += 1
                cell[1][normalize_features] = scaler.transform(cell[1][normalize_features])
                pos_features = []
                neg_features = []
                a = cell[1][cell[1]['evaluation_label'] == 1]
                if a.empty:
                    continue
                pos_rows = cell[1][cell[1]['evaluation_label'].astype(int) == 1][sfeatures].to_numpy()
                if len(pos_rows) < 1:
                    continue
                if len(pos_rows) > 1:
                    print("here")
                for i in range(len(pos_rows)):
                    pos_features.append(pos_rows[i])
                neg_rows = cell[1][cell[1]['evaluation_label'].astype(int) == -1][sfeatures].to_numpy()
                for i in range(min(BATCH_SIZE, len(neg_rows))):
                    neg_features.append(neg_rows[i])

                for k in range(len(neg_features) - len(pos_features)):
                    pos_features.append(random.choice(pos_rows))
                if len(pos_features) != len(neg_features):
                    continue
                random.shuffle(pos_features)

                random.shuffle(neg_features)
                pos_features_table.append(pos_features)
                neg_features_table.append(neg_features)
            if len(pos_features_table) > 0:
                c = list(zip(pos_features_table, neg_features_table))
                random.shuffle(c)
                pos_features_table, neg_features_table = zip(*c)
                positive_features_final.extend(pos_features_table)
                negative_features_final.extend(neg_features_table)
    else:
        for i, s_group in super_groups:
            file_name = i.split('-')[0]
            #print("entering ", file_name)
            grouped_obj = s_group.groupby(['column', 'row'])
            for cell in grouped_obj:
                num_cells_1 += 1
                cell[1][normalize_features] = scaler.transform(cell[1][normalize_features])
                pos_features = []
                neg_features = []
                a = cell[1][cell[1]['evaluation_label'] == 1]
                if a.empty:
                    continue
                pos_rows = cell[1][cell[1]['evaluation_label'].astype(int) == 1][sfeatures].to_numpy()
                if len(pos_rows) < 1:
                    continue
                if len(pos_rows) > 1:
                    print("here")
                for i in range(len(pos_rows)):
                    pos_features.append(pos_rows[i])
                neg_rows = cell[1][cell[1]['evaluation_label'].astype(int) == -1][sfeatures].to_numpy()
                for i in range(min(BATCH_SIZE, len(neg_rows))):
                    neg_features.append(neg_rows[i])

                for k in range(len(neg_features) - len(pos_features)):
                    pos_features.append(random.choice(pos_rows))
                random.shuffle(pos_features)
                random.shuffle(neg_features)
                if len(pos_features) != len(neg_features):
                    print("Something Wrong")
                else:
                    positive_features_final.append(pos_features)
                    negative_features_final.append(neg_features)

    if shuffle_by == 'complete_shuffle':
        c = list(zip(positive_features_final, negative_features_final))
        random.shuffle(c)
        positive_features_final, positive_features_final = zip(*c)
    print(len(positive_features_final), len(positive_features_final[3]))
    print(len(negative_features_final), len(negative_features_final[3]))

    print(len(positive_features_final), len(positive_features_final[3]))
    print(len(negative_features_final), len(negative_features_final[3]))
    return positive_features_final, negative_features_final
def generate_dataloader(positive_feat_path, negative_feat_path):
    pos_features = pickle.load(open(positive_feat_path, 'rb'))
    neg_features = pickle.load(open(negative_feat_path, 'rb'))
    print(pos_features[10])
    pos_features_flatten = list(chain.from_iterable(pos_features))
    neg_features_flatten = list(chain.from_iterable(neg_features))
    return pos_features_flatten, neg_features_flatten

In [16]:
# Setting the scaler and merging files
gen_training_data_args = Namespace(train_files=preprocessed_train_files, pos_output=pos_output, neg_output=neg_output,min_max_scaler_path=min_max_scaler_path)
all_data = merge_files(gen_training_data_args)
scaler = compute_normalization_factor(gen_training_data_args, all_data)
pickle.dump(scaler, open(min_max_scaler_path, 'wb'))

processed_train_data 3N6S2FCX
processed_train_data CNQ5Z0BG
processed_train_data 4J75OL3W
processed_train_data HGYG2DVU
processed_train_data 3LG8J4MX
processed_train_data CHZGO92A
processed_train_data EQZ21058
processed_train_data HB00DX4L
processed_train_data FVKKTA8O


  exec(code_obj, self.user_global_ns, self.user_ns)


processed_train_data K2VEUQT0


  exec(code_obj, self.user_global_ns, self.user_ns)


processed_train_data HQRATPBV
processed_train_data IPTLCCCU
processed_train_data IYZYF533
processed_train_data KOQM4YU9
processed_train_data NV4GY44T
processed_train_data N6QAC84T
processed_train_data OKW6UUW5
processed_train_data OIDIXPNZ
processed_train_data QOL4ZIHL
processed_train_data SRVLBA90
processed_train_data PZXDACJ9
processed_train_data SPPNJXB2
processed_train_data TW1HGRFI
processed_train_data VGUZX5R3
processed_train_data UURPYBGQ
processed_train_data U1FDHL7N
processed_train_data WH6JINCM
processed_train_data file110008_0_cols1_rows66
processed_train_data file110816_0_cols1_rows40
processed_train_data file171157_0_cols1_rows21
processed_train_data file13997_0_cols1_rows20
processed_train_data file183712_0_cols1_rows19
processed_train_data file198804_0_cols1_rows39
processed_train_data file205708_0_cols1_rows20
processed_train_data file222751_0_cols1_rows13
processed_train_data file229323_0_cols1_rows7
processed_train_data file236160_0_cols1_rows28
processed_train_data f

  exec(code_obj, self.user_global_ns, self.user_ns)


processed_train_data 6C8349
processed_train_data 68D30A
processed_train_data 7CBBC4
processed_train_data AD61AB
processed_train_data 98F137
processed_train_data 861398
processed_train_data C74D97
processed_train_data D64592
processed_train_data FC490C
processed_train_data ED3D2C
processed_train_data FE9FC2
processed_train_data F89913
processed_train_data 14067031_0_559833072073397908
processed_train_data 33401079_0_9127583903019856402


  exec(code_obj, self.user_global_ns, self.user_ns)


processed_train_data 38428277_0_1311643810102462607
processed_train_data 77694908_0_6083291340991074532
processed_train_data 41480166_0_6681239260286218499
processed_train_data 80588006_0_6965325215443683359


  exec(code_obj, self.user_global_ns, self.user_ns)


processed_train_data 69537082_0_7789694313271016902
Empty csv file!


  exec(code_obj, self.user_global_ns, self.user_ns)


processed_train_data 39a2d36769294a0a846cc209c45234e4
Empty csv file!
processed_train_data 89e72a749d764c1aacd9284e01c412a4
processed_train_data file69673_0_cols1_rows58
processed_train_data file304250_0_cols1_rows14
processed_train_data file290427_0_cols1_rows14
processed_train_data file234482_0_cols1_rows68
processed_train_data file338151_0_cols1_rows6
processed_train_data file456872_2_cols1_rows15
processed_train_data E22XXKVQ
processed_train_data file77529_0_cols1_rows20
processed_train_data file137293_0_cols1_rows6
processed_train_data ORPKCFRL
processed_train_data WRPAQPNC
processed_train_data GNDO9OXJ
processed_train_data LV5N8XDB
processed_train_data B38A9Q5R
processed_train_data QPFX5Z8J
processed_train_data MANO2PKR
processed_train_data Z4M8AT89
processed_train_data YCOUS57M
processed_train_data NDSTZH1I
processed_train_data GY7KNULP
processed_train_data OZYCQ769
processed_train_data HQCEC5NO
processed_train_data GINQPZQC
processed_train_data AUU9A6KL
processed_train_data J9E

  exec(code_obj, self.user_global_ns, self.user_ns)


processed_train_data 34169c088ee848e4866f42e87b4ccbc2
Empty csv file!
processed_train_data a407fa84d55d4072945ac61346150d03
Empty csv file!


  exec(code_obj, self.user_global_ns, self.user_ns)


processed_train_data b785d9ed72b347ae9cb0997df17258d8
processed_train_data file215105_0_cols1_rows23
processed_train_data file192085_0_cols1_rows31
processed_train_data file132514_0_cols1_rows17
processed_train_data file114403_0_cols1_rows82
processed_train_data file183833_0_cols1_rows21
processed_train_data WWBIR8H6
processed_train_data file294473_0_cols1_rows17
processed_train_data S8UOQYBG
processed_train_data ET9REW9Y
processed_train_data 71SY0Z5S
processed_train_data file37584_2_cols1_rows28
processed_train_data X23TMJ3R
processed_train_data B20WIQKU
processed_train_data file244317_0_cols1_rows39
processed_train_data file268478_9_cols1_rows20
processed_train_data file234558_0_cols1_rows39
processed_train_data HLJ9HHEE
processed_train_data file119797_0_cols1_rows39
processed_train_data QW492LGU
processed_train_data file376591_0_cols1_rows19
processed_train_data DXTA1MV8
processed_train_data file268478_11_cols1_rows25
processed_train_data T7RPWH6N
processed_train_data file178948_0_c

  exec(code_obj, self.user_global_ns, self.user_ns)


processed_train_data 9F6140
processed_train_data E2EF52
processed_train_data B6D767
processed_train_data A5771B
processed_train_data AC627A
processed_train_data AAB323
processed_train_data C16A53
processed_train_data FBD793
processed_train_data D2DDEA
processed_train_data F033AB
processed_train_data D09BF4
processed_train_data D9D4F4
processed_train_data F0935E
processed_train_data 37856682_0_6818907050314633217
processed_train_data 1438042989043_35_20150728002309-00287-ip-10-236-191-2_875026214_2
processed_train_data 39107734_2_2329160387535788734
processed_train_data 39650055_5_7135804139753401681


  exec(code_obj, self.user_global_ns, self.user_ns)


processed_train_data c36f7f977dd8442681ace9541e0177c8
processed_train_data 217d8c0c33a74bb28a55efffcb047885
Empty csv file!
processed_train_data file152457_0_cols1_rows7
processed_train_data file493682_0_cols1_rows31
processed_train_data file568285_0_cols1_rows15
processed_train_data file51375_4_cols1_rows88
processed_train_data F98SUVJH
processed_train_data file224977_0_cols1_rows66
processed_train_data OHGI1JNY
processed_train_data JZ22O0DD
processed_train_data file289661_0_cols1_rows6
processed_train_data RYTFLT5K
processed_train_data QOAVEFGY
processed_train_data file267184_0_cols1_rows22
processed_train_data 3DOM5NIW
processed_train_data file313602_0_cols1_rows78
processed_train_data file269877_0_cols1_rows39
processed_train_data file322685_0_cols1_rows29
processed_train_data file167874_0_cols1_rows11
processed_train_data O0BECB72
processed_train_data file219474_11_cols1_rows22
processed_train_data file267997_0_cols1_rows69
processed_train_data I6BBMPNU
processed_train_data O0UJJV

  exec(code_obj, self.user_global_ns, self.user_ns)


processed_train_data 093F65
processed_train_data 2723D0
processed_train_data 43EC51
processed_train_data 3416A7
processed_train_data 44F683
processed_train_data 67C6A1
processed_train_data A5BFC9
processed_train_data 9778D5
processed_train_data 735B90
processed_train_data 9BF31C
processed_train_data A97DA6
processed_train_data A87FF6
processed_train_data EC8956
processed_train_data C81E72
processed_train_data C9E107
processed_train_data F457C5
processed_train_data F4B9EC
processed_train_data 24036779_0_5608105867560183058
processed_train_data 29414811_12_251152470253168163
processed_train_data 16767252_0_2409448375013995751
processed_train_data 40534006_0_4617468856744635526
processed_train_data 14380604_4_3329235705746762392
processed_train_data 1438042989018_40_20150728002309-00067-ip-10-236-191-2_57714692_2
processed_train_data 53822652_0_5767892317858575530
processed_train_data 21362676_0_6854186738074119688
processed_train_data 29414811_13_8724394428539174350
processed_train_data 

In [17]:
# Creating final data
positive_features_final, negative_features_final = generate_train_data(gen_training_data_args, all_data, scaler, shuffle_by = 'table')
pickle.dump(positive_features_final, open(pos_output, 'wb'))
pickle.dump(negative_features_final, open(neg_output, 'wb'))

File:  093F65 processed_train_data
File:  0bc67e05a4d14011a2cf3fca2f869495 processed_train_data
File:  10579449_0_1681126353774891032 processed_train_data
File:  14067031_0_559833072073397908 processed_train_data
File:  1438042986423_95_20150728002306 processed_train_data
File:  1438042989018_40_20150728002309 processed_train_data
File:  1438042989043_35_20150728002309 processed_train_data
File:  14380604_4_3329235705746762392 processed_train_data
File:  14BFA6 processed_train_data
File:  16767252_0_2409448375013995751 processed_train_data
File:  167909 processed_train_data
File:  19CA14 processed_train_data
File:  1C383C processed_train_data
File:  1F0E3D processed_train_data
File:  1FF1DE processed_train_data
File:  1d09a099d3964602aca9425adcde89cd processed_train_data
File:  20135078_0_7570343137119682530 processed_train_data
File:  21362676_0_6854186738074119688 processed_train_data
File:  217d8c0c33a74bb28a55efffcb047885 processed_train_data
File:  24036779_0_5608105867560183058 p

In [9]:
#load scaler and saved pos and neg pairs
scaler = pickle.load(open(min_max_scaler_path, 'rb'))
pos, neg = generate_dataloader(pos_output, neg_output)

[array([0.76      , 1.        , 0.76      , 0.5       , 0.76      ,
       1.        , 0.        , 0.94852424, 1.        , 0.89424682,
       1.        ]), array([0.76      , 1.        , 0.76      , 0.5       , 0.76      ,
       1.        , 0.        , 0.94852424, 1.        , 0.89424682,
       1.        ]), array([0.76      , 1.        , 0.76      , 0.5       , 0.76      ,
       1.        , 0.        , 0.94852424, 1.        , 0.89424682,
       1.        ]), array([0.76      , 1.        , 0.76      , 0.5       , 0.76      ,
       1.        , 0.        , 0.94852424, 1.        , 0.89424682,
       1.        ]), array([0.76      , 1.        , 0.76      , 0.5       , 0.76      ,
       1.        , 0.        , 0.94852424, 1.        , 0.89424682,
       1.        ]), array([0.76      , 1.        , 0.76      , 0.5       , 0.76      ,
       1.        , 0.        , 0.94852424, 1.        , 0.89424682,
       1.        ]), array([0.76      , 1.        , 0.76      , 0.5       , 0.76      ,
  

In [10]:
## Functions to train model
class T2DV2Dataset(Dataset):
    def __init__(self, pos_features, neg_features):
        self.pos_features = pos_features
        self.neg_features = neg_features

    def __len__(self):
        return len(self.pos_features)

    def __getitem__(self, idx):
        return self.pos_features[idx], self.neg_features[idx]
    
class PairwiseNetwork(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        # original 12x24, 24x12, 12x12, 12x1
        self.fc1 = nn.Linear(hidden_size, 2*hidden_size)
        
        self.fc2 = nn.Linear(2*hidden_size, hidden_size)
        #self.fc_x = nn.Linear(2*hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, 1)

    def forward(self, pos_features, neg_features):
        # Positive pass
        x = F.relu(self.fc1(pos_features))
        x = F.relu(self.fc2(x))
        #x = F.relu(self.fc_x(x))
        x = F.relu(self.fc3(x))
        pos_out = torch.sigmoid(self.fc4(x))

        # Negative Pass
        x = F.relu(self.fc1(neg_features))
        x = F.relu(self.fc2(x))
        #x = F.relu(self.fc_x(x))
        x = F.relu(self.fc3(x))
        neg_out = torch.sigmoid(self.fc4(x))

        return pos_out, neg_out

    def predict(self, test_feat):
        x = F.relu(self.fc1(test_feat))
        x = F.relu(self.fc2(x))
        #x = F.relu(self.fc_x(x))
        x = F.relu(self.fc3(x))
        test_out = torch.sigmoid(self.fc4(x))
        return test_out


# Pairwise Loss
class PairwiseLoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.m = 0

    def forward(self, pos_out, neg_out):
        distance = (1 - pos_out) + neg_out
        loss = torch.mean(torch.max(torch.tensor(0), distance))
        return loss
    
def infer_scores(min_max_scaler_path, input_table_path, output_table_path, model, test = 0):
    #scaler = pickle.loads(s3_1.Bucket("table-linker-datasets").Object(min_max_scaler_path).get()['Body'].read())
    #pca = pickle.load(open('pca_var_500.pkl', 'rb'))
    normalize_features = features
    sfeatures = copy.deepcopy(features)
    print(features)
    number_of_cells_top_1 = 0
    number_of_cells_total = 0
    for file in input_table_path:
        file_name = file.split('/')[-1]
        try:
            d_sample = pd.read_csv(file)
        except pd.errors.EmptyDataError:
            continue
        if not isinstance(d_sample, pd.DataFrame):
            continue
        #print(d_sample.columns)
        d_sample[normalize_features] = scaler.transform(d_sample[normalize_features])
        sorted_df = d_sample
        new_features = sfeatures
        sorted_df_features = sorted_df[sfeatures]
        arr = sorted_df_features.to_numpy()
        #test_inp = convert_to_matrix_vector(arr, list(all), pca)
        #print(arr)
        test_tensor = torch.tensor(arr).float()
        scores = model.predict(test_tensor)
        scores_list = torch.squeeze(scores).tolist()
        test_df = d_sample
        test_df[final_score_column] = scores_list
        test_df['table_id'] = file_name
        test_df['dataset_id'] = " "
        #df_input_table.append(test_df)
        num_of_cells_with_correct_top_1, num_of_cells = parse_eval_files_stats(test_df, 'siamese_prediction')
        number_of_cells_top_1 += num_of_cells_with_correct_top_1
        number_of_cells_total += num_of_cells
        if test:
            test_df.to_csv(test_predictions + file_name, index=False)
    # print(number_of_cells_top_1, number_of_cells_total, input_table_path)
    return number_of_cells_top_1 / number_of_cells_total

def train(args):
    if torch.cuda.is_available():
        device = torch.device('cuda')

    else:
        device = torch.device('cpu')
    device = torch.device('cpu')
    train_dataset = T2DV2Dataset(pos, neg)
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
    criterion = PairwiseLoss()
    EPOCHS = args.num_epochs
    model = PairwiseNetwork(len(pos[0])).to(device=device)
    optimizer = Adam(model.parameters(), lr=args.lr)
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.01)
    top1_max_prec = 0
    for epoch in range(EPOCHS):
        train_epoch_loss = 0
        avg_loss = 0
        model.train()
        for bid_1, batch_1 in tqdm(enumerate(train_dataloader), position=0, leave=True):
            # print("--------------")
            positive_feat = torch.tensor(batch_1[0].float())
            negative_feat = torch.tensor(batch_1[1].float())
            optimizer.zero_grad()
            # print(positive_feat.is_cuda, negative_feat.is_cuda)
            pos_out, neg_out = model(positive_feat, negative_feat)
            ##print(pos_out.is_cuda, neg_out.is_cuda, model.is_cuda)
            loss = criterion(pos_out, neg_out)
            # print(loss.is_cuda)
            loss.backward()
            optimizer.step()
            train_epoch_loss += loss
        avg_loss = train_epoch_loss / bid_1
        # scheduler.step()
        # Evaluation
        model.eval()
        print("Running on dev files")
        top1_precision = infer_scores(args.min_max_scaler_path, args.dev_path, args.dev_output, model)
        # eval_data = merge_eval_files(args.dev_output)
        # res, candidate_eval_data = parse_eval_files_stats(eval_data, final_score_column)
        # top1_precision = res['num_tasks_with_model_score_top_one_accurate'] / res['num_tasks_with_gt']
        if top1_precision > top1_max_prec:
            #copy_files(args.dev_output, dev_predictions)
            top1_max_prec = top1_precision
            model_save_name = 'top1_{}_epoch_{}_loss_{}_batch_size_{}_learning_rate_{}.pth'.format(top1_max_prec, epoch,
                                                                                                   avg_loss, BATCH_SIZE,
                                                                                                   LEARNING_RATE)
            best_model_path = args.model_save_path + model_save_name
            torch.save(model.state_dict(), best_model_path)
            print("Running on test files")
            print("Test Dataset", infer_scores(args.min_max_scaler_path, preprocessed_test_files, args.dev_output, model, test = 1))
        #s3_1.Bucket('table-linker-datasets').upload_file('/tmp/'+model_save_name, best_model_path)

        print("Epoch {}, Avg Loss is {}, epoch top1 {}, max top1 {}".format(epoch, avg_loss, top1_precision,
                                                                            top1_max_prec))
    return best_model_path, model
def parse_eval_files_stats(eval_data, method):
    res = {}
    candidate_eval_data = eval_data.groupby(['table_id', 'column', 'row'])['table_id'].count().reset_index(name="count")
    res['num_tasks_with_gt'] = len(eval_data[pd.notna(eval_data['GT_kg_id'])].groupby(['table_id', 'column', 'row']))
    num_tasks_with_model_score_top_one_accurate = []
    num_tasks_with_model_score_top_five_accurate = []
    num_tasks_with_model_score_top_ten_accurate = []
    has_gt_list = []
    has_gt_in_candidate = []
    for i, row in candidate_eval_data.iterrows():
        table_id, row_idx, col_idx = row['table_id'], row['row'], row['column']
        c_e_data = eval_data[
            (eval_data['table_id'] == table_id) & (eval_data['row'] == row_idx) & (eval_data['column'] == col_idx)]
        assert len(c_e_data) > 0
        if np.nan not in set(c_e_data['GT_kg_id']):
            has_gt_list.append(1)
        else:
            has_gt_list.append(0)
        if 1 in set(c_e_data['evaluation_label']):
            has_gt_in_candidate.append(1)
        else:
            has_gt_in_candidate.append(0)

        # rank on model score
        s_data = c_e_data.sort_values(by=[method], ascending=False)
        if s_data.iloc[0]['evaluation_label'] == 1:
            num_tasks_with_model_score_top_one_accurate.append(1)
        else:
            num_tasks_with_model_score_top_one_accurate.append(0)

    res['num_tasks_with_model_score_top_one_accurate'] = sum(num_tasks_with_model_score_top_one_accurate)
    # print(sum(num_tasks_with_model_score_top_one_accurate))
    return res['num_tasks_with_model_score_top_one_accurate'], res['num_tasks_with_gt']

In [11]:
train_dataset = T2DV2Dataset(pos, neg)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)

In [31]:
LEARNING_RATE = 0.00001
training_args = Namespace(num_epochs=12, lr=LEARNING_RATE, positive_feat_path=pos_output, negative_feat_path=neg_output,
                         dev_path=preprocessed_dev_files, dev_output=dev_predictions,
                         model_save_path=model_save_path, min_max_scaler_path=min_max_scaler_path)
best_model_path, model = train(training_args)

  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
40223it [01:17, 519.92it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'prefix_monge_elkan_similarity', 'singleton', 'context_score_3', 'pgt_centroid_score', 'pgt_class_count_tf_idf_score', 'pgt_property_count_tf_idf_score', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Running on test files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'prefix_monge_elkan_similarity', 'singleton', 'context_score_3', 'pgt_centroid_score', 'pgt_class_count_tf_idf_score', 'pgt_property_count_tf_idf_score', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Test Dataset 0.9350158982511924
Epoch 0, Avg Loss is 0.4894122779369354, epoch top1 0.7346238199675789, max top1 0.7346238199675789


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
40223it [01:15, 530.72it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'prefix_monge_elkan_similarity', 'singleton', 'context_score_3', 'pgt_centroid_score', 'pgt_class_count_tf_idf_score', 'pgt_property_count_tf_idf_score', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Running on test files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'prefix_monge_elkan_similarity', 'singleton', 'context_score_3', 'pgt_centroid_score', 'pgt_class_count_tf_idf_score', 'pgt_property_count_tf_idf_score', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Test Dataset 0.9436936936936937
Epoch 1, Avg Loss is 0.1882401406764984, epoch top1 0.7459712024411176, max top1 0.7459712024411176


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
40223it [01:17, 520.34it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'prefix_monge_elkan_similarity', 'singleton', 'context_score_3', 'pgt_centroid_score', 'pgt_class_count_tf_idf_score', 'pgt_property_count_tf_idf_score', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Running on test files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'prefix_monge_elkan_similarity', 'singleton', 'context_score_3', 'pgt_centroid_score', 'pgt_class_count_tf_idf_score', 'pgt_property_count_tf_idf_score', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Test Dataset 0.9446873343932167
Epoch 2, Avg Loss is 0.17436616122722626, epoch top1 0.7496900924954706, max top1 0.7496900924954706


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
40223it [01:21, 495.32it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'prefix_monge_elkan_similarity', 'singleton', 'context_score_3', 'pgt_centroid_score', 'pgt_class_count_tf_idf_score', 'pgt_property_count_tf_idf_score', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Running on test files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'prefix_monge_elkan_similarity', 'singleton', 'context_score_3', 'pgt_centroid_score', 'pgt_class_count_tf_idf_score', 'pgt_property_count_tf_idf_score', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Test Dataset 0.9448198198198198
Epoch 3, Avg Loss is 0.1686679720878601, epoch top1 0.7508343663583484, max top1 0.7508343663583484


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
40223it [01:26, 466.86it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'prefix_monge_elkan_similarity', 'singleton', 'context_score_3', 'pgt_centroid_score', 'pgt_class_count_tf_idf_score', 'pgt_property_count_tf_idf_score', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Running on test files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'prefix_monge_elkan_similarity', 'singleton', 'context_score_3', 'pgt_centroid_score', 'pgt_class_count_tf_idf_score', 'pgt_property_count_tf_idf_score', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Test Dataset 0.9455484896661367
Epoch 4, Avg Loss is 0.16560165584087372, epoch top1 0.7521693525317059, max top1 0.7521693525317059


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
40223it [01:19, 503.71it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'prefix_monge_elkan_similarity', 'singleton', 'context_score_3', 'pgt_centroid_score', 'pgt_class_count_tf_idf_score', 'pgt_property_count_tf_idf_score', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Running on test files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'prefix_monge_elkan_similarity', 'singleton', 'context_score_3', 'pgt_centroid_score', 'pgt_class_count_tf_idf_score', 'pgt_property_count_tf_idf_score', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Test Dataset 0.9453497615262321
Epoch 5, Avg Loss is 0.16368302702903748, epoch top1 0.7524554209974254, max top1 0.7524554209974254


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
40223it [01:20, 501.89it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'prefix_monge_elkan_similarity', 'singleton', 'context_score_3', 'pgt_centroid_score', 'pgt_class_count_tf_idf_score', 'pgt_property_count_tf_idf_score', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
46it [00:00, 454.76it/s]

Epoch 6, Avg Loss is 0.16233503818511963, epoch top1 0.7523600648421855, max top1 0.7524554209974254


40223it [01:17, 518.80it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'prefix_monge_elkan_similarity', 'singleton', 'context_score_3', 'pgt_centroid_score', 'pgt_class_count_tf_idf_score', 'pgt_property_count_tf_idf_score', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Running on test files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'prefix_monge_elkan_similarity', 'singleton', 'context_score_3', 'pgt_centroid_score', 'pgt_class_count_tf_idf_score', 'pgt_property_count_tf_idf_score', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Test Dataset 0.9426338102808691
Epoch 7, Avg Loss is 0.16134411096572876, epoch top1 0.752646133307905, max top1 0.752646133307905


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
40223it [01:17, 520.88it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'prefix_monge_elkan_similarity', 'singleton', 'context_score_3', 'pgt_centroid_score', 'pgt_class_count_tf_idf_score', 'pgt_property_count_tf_idf_score', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Running on test files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'prefix_monge_elkan_similarity', 'singleton', 'context_score_3', 'pgt_centroid_score', 'pgt_class_count_tf_idf_score', 'pgt_property_count_tf_idf_score', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Test Dataset 0.9421701112877583
Epoch 8, Avg Loss is 0.16051961481571198, epoch top1 0.7540764756365024, max top1 0.7540764756365024


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
40223it [01:15, 529.97it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'prefix_monge_elkan_similarity', 'singleton', 'context_score_3', 'pgt_centroid_score', 'pgt_class_count_tf_idf_score', 'pgt_property_count_tf_idf_score', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Epoch 9, Avg Loss is 0.15971888601779938, epoch top1 0.7525507771526652, max top1 0.7540764756365024


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
40223it [01:17, 521.08it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'prefix_monge_elkan_similarity', 'singleton', 'context_score_3', 'pgt_centroid_score', 'pgt_class_count_tf_idf_score', 'pgt_property_count_tf_idf_score', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


KeyboardInterrupt: 

In [10]:
def dev_prediction(dev_feature_path, dev_predictions_top_k, saved_model, output_column, min_max_scaler_path, k, dev_predictions_metrics):
    print(dev_feature_path)
    df_list = []
    for file in glob.glob(dev_feature_path + '/*.csv'):
        #print(file)
        filename = file.split("/")[-1]
        if filename == '13BLTPJD.csv':
            continue
        #print(filename)
        dev_output = f"{dev_predictions_top_k}/{filename}"

        df = pd.read_csv(file)
        df = df[df['evaluation_label'] != '[]']
        df.to_csv(file, index=False)
        feature_str =  ",".join(features)
        if os.path.getsize(file) == 0:
            continue

        
        # location where the output generated by the predictions wil be stored.
        
        #print(dev_output)
        #print(os.path.exists(dev_predictions_top_k))
        #print(file, output_column, feature_str, saved_model, min_max_scaler_path, final_score_column, k, dev-output)
        k = '5'
        #print(output_column, k, dev_output, file)
        if not os.path.exists(dev_output):
            !tl predict-using-model $file -o $output_column \
                --features $feature_str \
                --ranking-model $saved_model \
                --normalization-factor $min_max_scaler_path \
                / get-kg-links -c $output_column -k $k --k-rows \
                > $dev_output
        filename = file.split("/")[-1]
        #print(filename)
        if os.path.getsize(file) == 0:
                    continue
        k_2 = '1'
        score_column = output_column
        dev_metrics_file = f"{dev_predictions_metrics}/{filename}"
        #print("__________", dev_metrics_file)
        if not os.path.exists(dev_metrics_file):
            !tl metrics $dev_output -k $k_2 -c $score_column --tag $filename> $dev_metrics_file
        df_list.append(pd.read_csv(dev_metrics_file))

    return df_list
        
def add_color(dev_predictions_top_k, dev_colorized_path, score_column, k=5):
    for file in glob.glob(dev_predictions_top_k + '/*.csv'):
        filename = file.split("/")[-1]
        print(filename)
        if os.path.getsize(file) == 0:
                    continue
                
        dev_color_file = f"{dev_colorized_path}/{filename[:-4]}.xlsx"
        !tl add-color $file -c "$score_column,evaluation_label" -k $k --output $dev_color_file
def metrics(column, file_path=None, df=None, k: int = 1, tag=""):
    """
    computes the precision, recall and f1 score for the tl pipeline.
    Args:
        column: column with ranking score
        file_path: input file path
        df: or input dataframe
        k: calculate recall at top k candidates
        tag: a tag to use in the output file to identify the results of running the given pipeline
    Returns:
    """
    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format('file_path', 'df'))

    if file_path:
        df = read_csv(file_path, dtype=object)

    # remove duplicate candidates if exist
    df = normalize_scores.drop_duplicate("kg_id", [column], df=df)

    # replace na to 0.0
    df[column] = df[column].astype(float).fillna(0.0)
    df['max_score'] = df.groupby(by=['column', 'row'])[column].transform(max)

    # relevant df
    rdf = df[df['evaluation_label'].astype(float) != 0.0]

    col_grouped = rdf.groupby(by=['column'])
    results = []
    for col, cgdf in col_grouped:
        # true positive for precision at 1
        tp_ps = []

        # true positive for recall at k
        tp_rs = defaultdict(list)
        grouped = cgdf.groupby(by=['row'])
        n = len(grouped)
        for key, gdf in grouped:
            gdf = gdf.sort_values(by=[column, 'kg_id'], ascending=[False, True]).reset_index()

            for i, row in gdf.iterrows():
                if float(row['evaluation_label']) == 1 and row[column] == row['max_score']:
                    tp_ps.append(key)

                # this df is sorted by score, so highest ranked candidate is rank 1 and so on...
                rank = i + 1
                if rank <= k and (row['evaluation_label'] == '1' or row['evaluation_label'] == 1.0):
                    tp_rs[k].append(key)

        precision = float(len(tp_ps)) / float(n)
        recall = {k: float(len(each_tp_rs)) / float(n) for k, each_tp_rs in tp_rs.items()}
        # sort as k value increasing
        recall = {k: v for k, v in sorted(recall.items(), key=lambda x: x[0])}

        for _k, each_recall in recall.items():
            if precision == 0 and each_recall == 0:
                f1_score = 0.0
            else:
                f1_score = (2 * precision * each_recall) / (precision + each_recall)
            results.append({"k": _k,
                            'f1': f1_score,
                            'precision': precision,
                            'recall': each_recall,
                            'column': col,
                            'tag': tag})

    output_df = pd.DataFrame(results)
    return output_df
def compute_metrics(dev_predictions_top_k, dev_predictions_metrics, score_column, k=1):
    df_list = []
    for file in glob.glob(dev_predictions_top_k + '/*.csv'):
        #print(file)
        #print(df)

        filename = file.split("/")[-1]
        print(filename)
        if os.path.getsize(file) == 0:
                    continue
                
        dev_metrics_file = f"{dev_predictions_metrics}/{filename}"
        !tl metrics $file -k $k -c $score_column --tag $filename> $dev_metrics_file
        df_list.append(pd.read_csv(dev_metrics_file))
    return pd.concat(df_list)

In [9]:
best_model_path = f"{model_save_path}/top1_0.7398684085057691_epoch_9_loss_0.27942124009132385_batch_size_32_learning_rate_1e-06.pth"

In [11]:
# Getting the testing metrics
df_list = dev_prediction(preprocessed_test_files_path, result_predictions_location, best_model_path, 'siamese_prediction', min_max_scaler_path, 5, result_metrics_location)
#dev_prediction(main_dataset_location+test_dataset+dataset_folder_location, result_predictions_location, model_path, "siamese_prediction", min_max_scaler_path, k=5)

Experiment/Experiment_test_semtab_feature_addition/processed_test_data/
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//03XJOJ15.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//06ZS1QBE.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//080HU8A5.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//090EELY2.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//06UFHM3B.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//0DVQGVUW.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//0HFY1TOW.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//0U3WMR09.csv
__________ Experiment/Experiment_test_semtab_fea

  if (await self.run_code(code, result,  async_=asy)):


__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//1DT331RM.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//1JV6896P.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//1I3AJ8QT.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//1JMXOUP0.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//243JI1XC.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//1H6S7TGO.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//1XPHI0P2.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//25KTY2D9.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//2J3MZCRD.csv
__________

  if (await self.run_code(code, result,  async_=asy)):


__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//1ZFRQBQS.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//25EPRHUJ.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//253OSZNL.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//23NFMDP0.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//27V1TQHW.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//2CPISXSF.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//29AXQS5A.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//2BATTV6X.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//2I24Y64Q.csv
__________

  if (await self.run_code(code, result,  async_=asy)):


__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//0VC79Y6X.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//5D3I2S1N.csv


  if (await self.run_code(code, result,  async_=asy)):


__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//1JCXMKCK.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//00UQHNO3.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//4C37XZHJ.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//1J4YRLOY.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//3IBPLURT.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//5CG871YQ.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//3T93HF0D.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//0VDX10IJ.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//4SOL8H0M.csv
__________

  if (await self.run_code(code, result,  async_=asy)):


__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//0Z8LKW3C.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//042AKDN1.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//09DZVIPV.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//0UX45YZS.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//1E1WKZOC.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//0VGKU21T.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//01E31BCQ.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//00ECUL14.csv
__________ Experiment/Experiment_test_semtab_feature_addition/results//test_predictions_metrics//18TCX9AJ.csv
__________

In [19]:
metrics_df  = pd.concat(df_list)
metrics_df

Unnamed: 0,k,f1,precision,recall,column,num_cells,tag
0,1,1.00,1.00,1.00,0,20,03XJOJ15.csv
1,1,1.00,1.00,1.00,1,20,03XJOJ15.csv
0,1,0.60,0.60,0.60,0,20,06ZS1QBE.csv
0,1,1.00,1.00,1.00,0,20,080HU8A5.csv
0,1,1.00,1.00,1.00,0,20,090EELY2.csv
...,...,...,...,...,...,...,...
2,1,1.00,1.00,1.00,2,20,5NMYKF77.csv
3,1,1.00,1.00,1.00,3,20,5NMYKF77.csv
0,1,1.00,1.00,1.00,0,20,5WFYRRY7.csv
0,1,0.95,0.95,0.95,0,20,5WA7NENC.csv


In [17]:
# Column Metrics
metrics_df['precision'].mean(), metrics_df['recall'].mean(), metrics_df['f1'].mean()

(0.9518227758230172, 0.9518227758230172, 0.9518227758230172)

In [18]:
# Cell Metrics
sum(metrics_df['precision']*metrics_df['num_cells']) / sum(metrics_df['num_cells']), sum(metrics_df['recall']*metrics_df['num_cells']) / sum(metrics_df['num_cells']), 2*(sum(metrics_df['precision']*metrics_df['num_cells']) / sum(metrics_df['num_cells']))*(sum(metrics_df['recall']*metrics_df['num_cells']) / sum(metrics_df['num_cells']))/(sum(metrics_df['precision']*metrics_df['num_cells']) / sum(metrics_df['num_cells']) + sum(metrics_df['recall']*metrics_df['num_cells']) / sum(metrics_df['num_cells']))

(0.94822328294882, 0.94822328294882, 0.94822328294882)