## Purpose: Use both context similarity vector and context score
To complete this experiment, following changes have been done, 
- Calculate context similarity vector by first calculating normal context property vector from context property string
- Use PCA as dimensional reduction technique on the context similarity vector
- Recalculate the context property vector during training (not flexible to store) and apply autoencoder for dimensional reduction. <br>

Set parameters in the cells tagged "set_parameters" <br>
Download data from table-linker-datasets/context_vector_{train/dev}_data in to Experiments/context_vector_train/dev_data
<br>
It stores result while creating the similarity vector temporarily in temp_c_{N_PCA_Components} which can be erased once the pos and neg pickle files are generated.

In [1]:
###Importing all the libraries
import glob
import boto3
import time
import os
import pandas as pd
import sklearn.metrics
from sklearn.preprocessing import MinMaxScaler
import pickle
from argparse import ArgumentParser, Namespace
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from itertools import chain
import copy
import shutil
import pickle
from tqdm import tqdm
import scipy.sparse as sp
import os
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense
## Importing context_similarity function
import context_property_similarity_code

In [2]:
# Setup the variables
N_PCA_COMPONENTS = 10
features = ["monge_elkan","monge_elkan_aliases","jaro_winkler",
            "levenshtein","singleton","context_score_3","pgt_centroid_score","pgt_class_count_tf_idf_score",
            "pgt_property_count_tf_idf_score", "num_occurences"]
features = ["monge_elkan","monge_elkan_aliases","jaro_winkler",
            "levenshtein","singleton","context_score_3", "num_occurences"]
BATCH_SIZE = 32
LEARNING_RATE = 0.00001
autoencoder_path = 'saved_2000_100/saved_2000_1000'
ENCODER_COMPONENTS = 100


In [3]:
experiment_name = f"Experiment_test_semtab_plain_version_n_{N_PCA_COMPONENTS}"
experiment_data = '2t_data,limaye_data,biotab_data,t2dv2_data,biodiv_data'
### Creating the directories for the results.
experiment_train_data = experiment_data
experiment_dev_data =  experiment_data
experiment_test_data = 'semtab_data'
experiment_store_path = f"../Experiments/{experiment_name}"
processed_dev_data = f"{experiment_store_path}/dev/features/"
processed_test_data = f"{experiment_store_path}/test/features/"
dev_predictions = f"{experiment_store_path}/dev/dev_predictions/"
test_predictions = f"{experiment_store_path}/test/test_predictions/"
dev_output_pred = f"{experiment_store_path}/dev/dev_output/"
dev_predictions_top_k = f"{experiment_store_path}/dev/dev_predictions_top_k/"
dev_metrics = f"{experiment_store_path}/dev/dev_metrics/"
dev_predictions_colorized = f"{experiment_store_path}/dev/dev_predictions_colorized/"
model_save_path = f'{experiment_store_path}/model_save_path/'
best_model_path = ''

training_data_path = f'{experiment_store_path}/model_training_data'

pos_output = f'{training_data_path}/tl_pipeline_pos_features_{BATCH_SIZE}.pkl'
neg_output = f'{training_data_path}/tl_pipeline_neg_features_{BATCH_SIZE}.pkl'
min_max_scaler_path = f'{training_data_path}/tl_pipeline_normalization_factor.pkl'

final_score_column = 'siamese_prediction'

extra_feat = ['column-id', 'column', 'row', 'evaluation_label', 'dataset_id', 'table_id', 'context_property_vector']
for f in features:
    extra_feat.append(f)


!mkdir -p $experiment_store_path 
!mkdir -p $dev_predictions 
!mkdir -p $dev_output_pred
!mkdir -p $dev_predictions_top_k 
!mkdir -p $dev_metrics 
!mkdir -p $dev_predictions_colorized 
!mkdir -p $model_save_path 
!mkdir -p $training_data_path
!mkdir -p $processed_dev_data
!mkdir -p $processed_test_data
!mkdir -p $test_predictions
temp_csv_cfolder = f'temp_contextscore_c{N_PCA_COMPONENTS}'
!mkdir -p $temp_csv_cfolder

In [4]:
train_files_path = [f'../Experiments/context_vector_train_data/' + i for i in experiment_train_data.split(',')]
train_files = []
for train_path in train_files_path:
    set_of_files = glob.glob(train_path + '/*.csv')
    train_files.extend(set_of_files)
print(len(train_files))
dev_files_path = ['../Experiments/context_vector_dev_data/' + i for i in experiment_train_data.split(',')]
dev_files = []
for dev_path in dev_files_path:
    set_of_files = glob.glob(dev_path + '/*.csv')
    dev_files.extend(set_of_files)
print(len(dev_files))
test_files = []
test_files_location = ['../Experiments/context_vector_dev_data/semtab_data', '../Experiments/context_vector_train_data/semtab_data']
for k in test_files_location:
    test_files.extend(glob.glob(k + '/*.csv'))
print(len(test_files))
all_prop = pickle.load(open('all_properties.pkl', 'rb'))

453
161
345


In [5]:
class AutoEncoders(Model):

    def __init__(self, layer_1_unit = 2000, layer_2_unit=100, layer_3_unit = None, output_units = None):
        super().__init__()
        self.encoder = Sequential(
            [
                Dense(layer_1_unit, activation="relu"),
                Dense(layer_2_unit, activation="relu"),
                #Dense(layer_3_unit, activation="relu")
            ]
        )

        self.decoder = Sequential(
            [
                #Dense(layer_2_unit, activation="relu"),
                Dense(layer_1_unit, activation="relu"),
                Dense(output_units, activation="sigmoid")
            ]
        )


    def call(self, inputs):
        encoded = self.encoder(inputs)
        decoded = self.decoder(encoded)
        return decoded

In [6]:
a_model = AutoEncoders(output_units = len(all_prop))
print(a_model.layers)
a_model.load_weights(autoencoder_path)
encoder_layer = a_model.get_layer('sequential')

[<tensorflow.python.keras.engine.sequential.Sequential object at 0x7f65b4f572e0>, <tensorflow.python.keras.engine.sequential.Sequential object at 0x7f65b4f57c10>]


In [7]:
# Helpher functions for data preprocessing
def merge_files(args):
    # datapath = args.train_path
    df_list = []
    for fn in args.train_files:
        fid = fn.split('/')[-1][:-4]
        dataset_id = fn.split('/')[-2]
        df = read_file(fn)
        if not isinstance(df, pd.DataFrame) :
            continue

        df['table_id'] = fid
        print(dataset_id, fid)
        df['dataset_id'] = dataset_id
        df['context_score'].fillna(0.0, inplace=True)
        if 'column-id' not in df.columns:
            df['column-id'] = fn.split('/')[-1] + df['column'].astype('str')

        df = df[extra_feat]
        df_list.append(df)
    return pd.concat(df_list)


def compute_normalization_factor(args, all_data):
    min_max_scaler_path = args.min_max_scaler_path
    all_data_features = all_data[features]
    scaler = MinMaxScaler()
    scaler.fit(all_data_features)
    return scaler


def read_file(key):
    #resp = s3.get_object(Bucket = bucket, Key = key)
    try:
        df = pd.read_csv(key, sep = ',')
    except pd.errors.EmptyDataError:
        df = ''
        print('Empty csv file!')
    return df

In [22]:

def generate_train_data(args, all_data, scaler, shuffle_by = None):
    num_cells_1 = 0
    scaler_path = args.min_max_scaler_path
    #scaler = pickle.load(open('./tmp/min_max_scaler_path.pkl', 'rb'))
    final_list = []
    sfeatures = copy.deepcopy(features) + ['context_property_vector'] #+ [f'csp_{i}' for i in range(N_PCA_COMPONENTS)] 
    normalize_features = features
    evaluation_label = ['evaluation_label']
    positive_features_final = []
    negative_features_final = []
    super_groups = all_data.groupby(['column-id'])
    if shuffle_by == 'dataset':
        super_groups = all_data.groupby(['dataset_id'])
        for i, s_group in super_groups:
            pos_features_dataset = []
            neg_features_dataset = []
            grouped_obj = s_group.groupby(['column', 'row', 'column-id'])

            for cell in grouped_obj:
                num_cells_1 += 1
                cell[1][normalize_features] = scaler.transform(cell[1][normalize_features])
                pos_features = []
                neg_features = []
                a = cell[1][cell[1]['evaluation_label'] == 1]
                if a.empty:
                    continue
                pos_rows = cell[1][cell[1]['evaluation_label'].astype(int) == 1][sfeatures].to_numpy()
                if len(pos_rows) < 1:
                    continue
                if len(pos_rows) > 1:
                    print("here")
                for i in range(len(pos_rows)):
                    pos_features.append(pos_rows[i])
                neg_rows = cell[1][cell[1]['evaluation_label'].astype(int) == -1][sfeatures].to_numpy()
                for i in range(min(BATCH_SIZE, len(neg_rows))):
                    neg_features.append(neg_rows[i])

                for k in range(len(neg_features) - len(pos_features)):
                    pos_features.append(random.choice(pos_rows))
                if len(pos_features) != len(neg_features):
                    continue
                pos_features_dataset.append(pos_features)
                neg_features_dataset.append(neg_features)
            if len(pos_features_dataset) > 0:
                c = list(zip(pos_features_dataset, neg_features_dataset))
                random.shuffle(c)
                pos_features_dataset, neg_features_dataset = zip(*c)
                positive_features_final.extend(pos_features_dataset)
                negative_features_final.extend(neg_features_dataset)
    elif shuffle_by == 'table':
        super_groups = all_data.groupby(['table_id'])
        for i, s_group in super_groups:
            pos_features_table = []
            neg_features_table = []
            file_name = i.split('-')[0]
            # Setting the return context property to false for now. After generating dataset, we will run again to get the 
            # context property vector. Reduces memory used for saving files.
            # s_group_n = context_property_similarity_code.return_a_table(s_group, pca_components = N_PCA_COMPONENTS, return_context_property = False)
            #print("entering ", file_name)
            ds_id = s_group['dataset_id'].values[0]
            print("File: ", file_name, ds_id)
            '''
            if os.path.exists(f'temp_csv_c{N_PCA_COMPONENTS}/neg_{ds_id}_{file_name}.pkl'):
                print("Already Exists!")
                pos_features_table = pickle.load(open(f'temp_csv_c{N_PCA_COMPONENTS}/pos_{ds_id}_{file_name}.pkl', 'rb'))
                neg_features_table = pickle.load(open(f'temp_csv_c{N_PCA_COMPONENTS}/neg_{ds_id}_{file_name}.pkl', 'rb'))
                positive_features_final.extend(pos_features_table)
                negative_features_final.extend(neg_features_table) 
                continue
            '''
            grouped_obj = s_group.groupby(['column', 'row'])
    
            for cell in grouped_obj:
                num_cells_1 += 1
                cell[1][normalize_features] = scaler.transform(cell[1][normalize_features])
                pos_features = []
                neg_features = []
                a = cell[1][cell[1]['evaluation_label'] == 1]
                if a.empty:
                    continue
                pos_rows = cell[1][cell[1]['evaluation_label'].astype(int) == 1][sfeatures].to_numpy()
                if len(pos_rows) < 1:
                    continue
                if len(pos_rows) > 1:
                    print("here")
                for i in range(len(pos_rows)):
                    pos_features.append(pos_rows[i])
                neg_rows = cell[1][cell[1]['evaluation_label'].astype(int) == -1][sfeatures].to_numpy()
                for i in range(min(BATCH_SIZE, len(neg_rows))):
                    neg_features.append(neg_rows[i])

                for k in range(len(neg_features) - len(pos_features)):
                    pos_features.append(random.choice(pos_rows))
                if len(pos_features) != len(neg_features):
                    continue
                random.shuffle(pos_features)

                random.shuffle(neg_features)
                pos_features_table.append(pos_features)
                neg_features_table.append(neg_features)
            if len(pos_features_table) > 0:
                c = list(zip(pos_features_table, neg_features_table))
                random.shuffle(c)
                pos_features_table, neg_features_table = zip(*c)
                positive_features_final.extend(pos_features_table)
                negative_features_final.extend(neg_features_table)
                
                pickle.dump(pos_features_table, open(f'temp_csv_c{N_PCA_COMPONENTS}/pos_{ds_id}_{file_name}.pkl', 'wb'))
                pickle.dump(neg_features_table, open(f'temp_csv_c{N_PCA_COMPONENTS}/neg_{ds_id}_{file_name}.pkl', 'wb'))
    else:
        for i, s_group in super_groups:
            file_name = i.split('-')[0]
            #print("entering ", file_name)
            grouped_obj = s_group.groupby(['column', 'row'])
            for cell in grouped_obj:
                num_cells_1 += 1
                cell[1][normalize_features] = scaler.transform(cell[1][normalize_features])
                pos_features = []
                neg_features = []
                a = cell[1][cell[1]['evaluation_label'] == 1]
                if a.empty:
                    continue
                pos_rows = cell[1][cell[1]['evaluation_label'].astype(int) == 1][sfeatures].to_numpy()
                if len(pos_rows) < 1:
                    continue
                if len(pos_rows) > 1:
                    print("here")
                for i in range(len(pos_rows)):
                    pos_features.append(pos_rows[i])
                neg_rows = cell[1][cell[1]['evaluation_label'].astype(int) == -1][sfeatures].to_numpy()
                for i in range(min(BATCH_SIZE, len(neg_rows))):
                    neg_features.append(neg_rows[i])

                for k in range(len(neg_features) - len(pos_features)):
                    pos_features.append(random.choice(pos_rows))
                random.shuffle(pos_features)
                random.shuffle(neg_features)
                if len(pos_features) != len(neg_features):
                    print("HHHERRRERR")
                else:
                    positive_features_final.append(pos_features)
                    negative_features_final.append(neg_features)
                #print(len(positive_features_final), len(positive_features_final[3]))
                #print(len(negative_features_final), len(negative_features_final[3]))
    if shuffle_by == 'complete_shuffle':
        c = list(zip(positive_features_final, negative_features_final))
        random.shuffle(c)
        positive_features_final, positive_features_final = zip(*c)
    print(len(positive_features_final), len(positive_features_final[3]))
    print(len(negative_features_final), len(negative_features_final[3]))

    print(len(positive_features_final), len(positive_features_final[3]))
    print(len(negative_features_final), len(negative_features_final[3]))
    return positive_features_final, negative_features_final

def generate_dataloader(positive_feat_path, negative_feat_path):
    pos_features = pickle.load(open(positive_feat_path, 'rb'))
    neg_features = pickle.load(open(negative_feat_path, 'rb'))
    print(pos_features[10])
    pos_features_flatten = list(chain.from_iterable(pos_features))
    neg_features_flatten = list(chain.from_iterable(neg_features))
    return pos_features_flatten, neg_features_flatten

In [23]:
# Setting the scaler and merging files
gen_training_data_args = Namespace(train_files=train_files, pos_output=pos_output, neg_output=neg_output,min_max_scaler_path=min_max_scaler_path)
all_data = merge_files(gen_training_data_args)
scaler = compute_normalization_factor(gen_training_data_args, all_data)
pickle.dump(scaler, open(min_max_scaler_path, 'wb'))

2t_data 3DOM5NIW
2t_data 3JXMPC7N
2t_data 3LG8J4MX
2t_data 3N6S2FCX
2t_data 4J75OL3W
2t_data 6I7ET24J
2t_data 71SY0Z5S
2t_data AUU9A6KL
2t_data B20WIQKU
2t_data B38A9Q5R
2t_data BID0NRU0
2t_data CHZGO92A
2t_data CNQ5Z0BG
2t_data DXTA1MV8
2t_data E22XXKVQ
2t_data E3ZK744Q
2t_data E45F8QW2
2t_data EQZ21058
2t_data ET9REW9Y
2t_data EV6LDIB8
2t_data F98SUVJH
2t_data FF00TEZG
2t_data FVKKTA8O
2t_data FWSGRDQ3
2t_data GINQPZQC
2t_data GNDO9OXJ
2t_data GY7KNULP
2t_data HB00DX4L
2t_data HCLZSPEJ
2t_data HGIUTSCG
2t_data HGYG2DVU
2t_data HLJ9HHEE
2t_data HQCEC5NO


  exec(code_obj, self.user_global_ns, self.user_ns)


2t_data HQRATPBV
2t_data HTF6MVY9
2t_data I0ENR9U5
2t_data I6BBMPNU
2t_data IP6ZRIGH
2t_data IPTLCCCU
2t_data IYZYF533
2t_data IZF82AX9
2t_data J9EJV2S3
2t_data JA4L7KWX
2t_data JN0T2O5I
2t_data JZ22O0DD
2t_data JZLRN9PL


  exec(code_obj, self.user_global_ns, self.user_ns)


2t_data K2VEUQT0
2t_data K3X3L22Y
2t_data K71YYDXO
2t_data KOQM4YU9
2t_data LV5N8XDB
2t_data MANO2PKR
2t_data MSCT8MJD
2t_data MZ0BI8NN
2t_data N6QAC84T
2t_data NDSTZH1I
2t_data NV4GY44T
2t_data NYFX4M8T
2t_data O0BECB72
2t_data O0UJJVBR
2t_data OD9USH5H
2t_data OHGI1JNY
2t_data OIDIXPNZ
2t_data OKW6UUW5
2t_data ORPKCFRL
2t_data OZYCQ769
2t_data P86ZJFXK
2t_data P8B3IAOY
2t_data PWNRGOJ5
2t_data PZXDACJ9
2t_data QIW5EC2H
2t_data QNP7O8L5
2t_data QOAVEFGY
2t_data QOL4ZIHL
2t_data QPFX5Z8J
2t_data QW492LGU
2t_data RYTFLT5K


  exec(code_obj, self.user_global_ns, self.user_ns)


2t_data S7DQFOD4
2t_data S8UOQYBG
2t_data SFGT3EDA
2t_data SNUO09BH
2t_data SPPNJXB2
2t_data SRVLBA90
2t_data T7RPWH6N
2t_data TNI3UKH2
2t_data TW1HGRFI
2t_data U1FDHL7N
2t_data U5L6L4Q4
2t_data UURPYBGQ
2t_data VGUZX5R3
2t_data VKWTT7F7
2t_data WH6JINCM
2t_data WI5LYQ0H
2t_data WRGS0WCX
2t_data WRPAQPNC
2t_data WWBIR8H6
2t_data WZL61XYS
2t_data X1LRVWSR
2t_data X23TMJ3R
2t_data X3RACWMT
2t_data XBHX2VRT
2t_data Y85GTOSS
2t_data YCOUS57M
2t_data Z4M8AT89
2t_data ZZNW93IV
limaye_data file110008_0_cols1_rows66
limaye_data file110816_0_cols1_rows40
limaye_data file137293_0_cols1_rows6
limaye_data file13997_0_cols1_rows20
limaye_data file171157_0_cols1_rows21
limaye_data file183712_0_cols1_rows19
limaye_data file198804_0_cols1_rows39
limaye_data file205708_0_cols1_rows20
limaye_data file222751_0_cols1_rows13
limaye_data file229323_0_cols1_rows7
limaye_data file234482_0_cols1_rows68
limaye_data file236160_0_cols1_rows28
limaye_data file240410_0_cols1_rows60
limaye_data file256909_0_cols1_ro

  exec(code_obj, self.user_global_ns, self.user_ns)


biotab_data 167909
biotab_data C16A53
biotab_data 1C383C
biotab_data AD61AB
biotab_data 735B90
biotab_data A5771B
biotab_data A5BFC9
biotab_data 66F041
biotab_data D82C8D
biotab_data 98F137
biotab_data 34173C
biotab_data EA5D2F
biotab_data 9A1158
biotab_data A1D0C6
biotab_data 642E92
biotab_data 19CA14
biotab_data 6364D3
biotab_data 6512BD
biotab_data FBD793
biotab_data F0935E
biotab_data A97DA6
biotab_data 093F65
biotab_data 92CC22
biotab_data 44F683
biotab_data 1F0E3D
biotab_data ECCBC8


  exec(code_obj, self.user_global_ns, self.user_ns)


biotab_data 9F6140
biotab_data 93DB85
t2dv2_data 39173938_0_7916056990138658530


  exec(code_obj, self.user_global_ns, self.user_ns)


t2dv2_data 46671561_0_6122315295162029872
t2dv2_data 58891288_0_1117541047012405958
t2dv2_data 29414811_13_8724394428539174350
t2dv2_data 77694908_0_6083291340991074532
t2dv2_data 50270082_0_444360818941411589
t2dv2_data 14380604_4_3329235705746762392
t2dv2_data 39759273_0_1427898308030295194


  exec(code_obj, self.user_global_ns, self.user_ns)


t2dv2_data 69537082_0_7789694313271016902
t2dv2_data 29414811_12_251152470253168163
t2dv2_data 14067031_0_559833072073397908
t2dv2_data 84548468_0_5955155464119382182
t2dv2_data 1438042989043_35_20150728002309-00287-ip-10-236-191-2_875026214_2
t2dv2_data 35188621_0_6058553107571275232
t2dv2_data 39650055_5_7135804139753401681
t2dv2_data 21362676_0_6854186738074119688
t2dv2_data 10579449_0_1681126353774891032
t2dv2_data 39107734_2_2329160387535788734
t2dv2_data 1438042989018_40_20150728002309-00067-ip-10-236-191-2_57714692_2
t2dv2_data 41480166_0_6681239260286218499
t2dv2_data 9834884_0_3871985887467090123
t2dv2_data 16767252_0_2409448375013995751
t2dv2_data 45073662_0_3179937335063201739
t2dv2_data 28086084_0_3127660530989916727
t2dv2_data 43237185_1_3636357855502246981
t2dv2_data 8468806_0_4382447409703007384
t2dv2_data 54719588_0_8417197176086756912
t2dv2_data 38428277_0_1311643810102462607
t2dv2_data 53989675_0_8697482470743954630
t2dv2_data 37856682_0_6818907050314633217
t2dv2_data

  exec(code_obj, self.user_global_ns, self.user_ns)


biodiv_data cb1601379dca4472b6df915185124990
biodiv_data b785d9ed72b347ae9cb0997df17258d8
biodiv_data 0bc67e05a4d14011a2cf3fca2f869495
biodiv_data 0dc3add04e344228bb140d5392399521


  exec(code_obj, self.user_global_ns, self.user_ns)


biodiv_data 1d09a099d3964602aca9425adcde89cd
biodiv_data dffeec8c3593402bafa69b50f5920fa5


In [24]:
# Creating final data
positive_features_final, negative_features_final = generate_train_data(gen_training_data_args, all_data, scaler, shuffle_by = 'table')
pickle.dump(positive_features_final, open(pos_output, 'wb'))
pickle.dump(negative_features_final, open(neg_output, 'wb'))

File:  093F65 biotab_data
File:  0bc67e05a4d14011a2cf3fca2f869495 biodiv_data
File:  0dc3add04e344228bb140d5392399521 biodiv_data
File:  10579449_0_1681126353774891032 t2dv2_data
File:  14067031_0_559833072073397908 t2dv2_data
File:  1438042986423_95_20150728002306 t2dv2_data
File:  1438042989018_40_20150728002309 t2dv2_data
File:  1438042989043_35_20150728002309 t2dv2_data
File:  14380604_4_3329235705746762392 t2dv2_data
File:  14BFA6 biotab_data
File:  16767252_0_2409448375013995751 t2dv2_data
File:  167909 biotab_data
File:  19CA14 biotab_data
File:  1C383C biotab_data
File:  1F0E3D biotab_data
File:  1FF1DE biotab_data
File:  1d09a099d3964602aca9425adcde89cd biodiv_data
File:  20135078_0_7570343137119682530 t2dv2_data
File:  21362676_0_6854186738074119688 t2dv2_data
File:  24036779_0_5608105867560183058 t2dv2_data
File:  25404227_0_2240631045609013057 t2dv2_data
File:  26657D biotab_data
File:  2723D0 biotab_data
File:  28086084_0_3127660530989916727 t2dv2_data
File:  283802 biotab

In [None]:
# To get the features faster - if all the preprocessing has been done for all the files.
'''
positive_features_final= [] 
negative_features_final = []
for i in train_files:
    file_name = i.split('/')[-1][:-4]
    ds_id = i.split('/')[-2]
    print(file_name, ds_id)
    pos_location = f'temp_csv_c{N_PCA_COMPONENTS}/pos_{ds_id}_{file_name}.pkl'
    neg_location = f'temp_csv_c{N_PCA_COMPONENTS}/neg_{ds_id}_{file_name}.pkl'
    if os.path.exists(pos_location):
        pos_features_table = pickle.load(open(f'temp_csv_c{N_PCA_COMPONENTS}/pos_{ds_id}_{file_name}.pkl', 'rb'))
        neg_features_table = pickle.load(open(f'temp_csv_c{N_PCA_COMPONENTS}/neg_{ds_id}_{file_name}.pkl', 'rb'))
        positive_features_final.extend(pos_features_table)
        negative_features_final.extend(neg_features_table) 
pickle.dump(positive_features_final, open(pos_output, 'wb'))
pickle.dump(negative_features_final, open(neg_output, 'wb'))
'''

3DOM5NIW 2t_data
3JXMPC7N 2t_data
3LG8J4MX 2t_data
3N6S2FCX 2t_data
4J75OL3W 2t_data
6I7ET24J 2t_data
71SY0Z5S 2t_data
AUU9A6KL 2t_data
B20WIQKU 2t_data
B38A9Q5R 2t_data
BID0NRU0 2t_data
CHZGO92A 2t_data
CNQ5Z0BG 2t_data
DXTA1MV8 2t_data
E22XXKVQ 2t_data
E3ZK744Q 2t_data
E45F8QW2 2t_data
EQZ21058 2t_data
ET9REW9Y 2t_data
EV6LDIB8 2t_data
F98SUVJH 2t_data
FF00TEZG 2t_data
FVKKTA8O 2t_data
FWSGRDQ3 2t_data
GINQPZQC 2t_data
GNDO9OXJ 2t_data
GY7KNULP 2t_data
HB00DX4L 2t_data
HCLZSPEJ 2t_data
HGIUTSCG 2t_data
HGYG2DVU 2t_data
HLJ9HHEE 2t_data
HQCEC5NO 2t_data
HQRATPBV 2t_data
HTF6MVY9 2t_data
I0ENR9U5 2t_data
I6BBMPNU 2t_data
IP6ZRIGH 2t_data
IPTLCCCU 2t_data
IYZYF533 2t_data
IZF82AX9 2t_data
J9EJV2S3 2t_data
JA4L7KWX 2t_data
JN0T2O5I 2t_data
JZ22O0DD 2t_data
JZLRN9PL 2t_data
K2VEUQT0 2t_data
K3X3L22Y 2t_data
K71YYDXO 2t_data
KOQM4YU9 2t_data
LV5N8XDB 2t_data
MANO2PKR 2t_data
MSCT8MJD 2t_data
MZ0BI8NN 2t_data
N6QAC84T 2t_data
NDSTZH1I 2t_data
NV4GY44T 2t_data
NYFX4M8T 2t_data
O0BECB72 2t_da

In [25]:
#load scaler and saved pos and neg pairs
scaler = pickle.load(open(min_max_scaler_path, 'rb'))
pos, neg = generate_dataloader(pos_output, neg_output)

[array([1.0, 0.941798941798942, 1.0, 1.0, 1.0, 0.7619047619047619, 1.0,
       "{'_P680': 1.0}"], dtype=object), array([1.0, 0.941798941798942, 1.0, 1.0, 1.0, 0.7619047619047619, 1.0,
       "{'_P680': 1.0}"], dtype=object), array([1.0, 0.941798941798942, 1.0, 1.0, 1.0, 0.7619047619047619, 1.0,
       "{'_P680': 1.0}"], dtype=object), array([1.0, 0.941798941798942, 1.0, 1.0, 1.0, 0.7619047619047619, 1.0,
       "{'_P680': 1.0}"], dtype=object), array([1.0, 0.941798941798942, 1.0, 1.0, 1.0, 0.7619047619047619, 1.0,
       "{'_P680': 1.0}"], dtype=object), array([1.0, 0.941798941798942, 1.0, 1.0, 1.0, 0.7619047619047619, 1.0,
       "{'_P680': 1.0}"], dtype=object), array([1.0, 0.941798941798942, 1.0, 1.0, 1.0, 0.7619047619047619, 1.0,
       "{'_P680': 1.0}"], dtype=object), array([1.0, 0.941798941798942, 1.0, 1.0, 1.0, 0.7619047619047619, 1.0,
       "{'_P680': 1.0}"], dtype=object), array([1.0, 0.941798941798942, 1.0, 1.0, 1.0, 0.7619047619047619, 1.0,
       "{'_P680': 1.0}"], dtype=

In [15]:
N_PCA_COMPONENTS

10

In [26]:
features

['monge_elkan',
 'monge_elkan_aliases',
 'jaro_winkler',
 'levenshtein',
 'singleton',
 'context_score_3',
 'num_occurences']

In [27]:
len(pos[0])

8

In [None]:
# Saving files for dev and test to reduce training time
processed_dev_files = []
processed_test_files = []
for dev_file in dev_files:
    file_name = dev_file.split('/')[-1]
    dev_result_file = f'{processed_dev_data}/{file_name}'
    processed_dev_files.append(dev_result_file)
    if not os.path.exists(dev_result_file):
        print(dev_file, dev_result_file)
        dev_df = pd.read_csv(dev_file)
        try:
            d_result_df = context_property_similarity_code.return_a_table(dev_df, pca_components = N_PCA_COMPONENTS, 
                                                                        autoencoder = encoder_layer, return_context_property = False)
        except MemoryError:
            processed_dev_files.remove(dev_result_file)
            continue
            #print("entering ", file_name)
        d_result_df.to_csv(dev_result_file, index = False)
for dev_file in test_files:

    file_name = dev_file.split('/')[-1]
    dev_result_file = f'{processed_test_data}/{file_name}'
    processed_test_files.append(dev_result_file)

    if not os.path.exists(dev_result_file):
        print(dev_file, dev_result_file)
        dev_df = pd.read_csv(dev_file)
        d_result_df = context_property_similarity_code.return_a_table(dev_df, pca_components = N_PCA_COMPONENTS, 
                                                                        autoencoder = encoder_layer, return_context_property = False)
        d_result_df.to_csv(dev_result_file, index = False)

../Experiments/context_vector_dev_data/2t_data/0D70DN48.csv ../Experiments/Experiment_test_semtab_plain_version_n_10/dev/features//0D70DN48.csv
../Experiments/context_vector_dev_data/2t_data/0IR0XIUW.csv ../Experiments/Experiment_test_semtab_plain_version_n_10/dev/features//0IR0XIUW.csv
../Experiments/context_vector_dev_data/2t_data/1C9LFOKN.csv ../Experiments/Experiment_test_semtab_plain_version_n_10/dev/features//1C9LFOKN.csv
../Experiments/context_vector_dev_data/2t_data/1MQL5T7F.csv ../Experiments/Experiment_test_semtab_plain_version_n_10/dev/features//1MQL5T7F.csv
../Experiments/context_vector_dev_data/2t_data/24W5SSRB.csv ../Experiments/Experiment_test_semtab_plain_version_n_10/dev/features//24W5SSRB.csv
../Experiments/context_vector_dev_data/2t_data/29BNEL1Q.csv ../Experiments/Experiment_test_semtab_plain_version_n_10/dev/features//29BNEL1Q.csv
../Experiments/context_vector_dev_data/2t_data/2BEBH437.csv ../Experiments/Experiment_test_semtab_plain_version_n_10/dev/features//2BEBH

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


../Experiments/context_vector_dev_data/biotab_data/3C59DC.csv ../Experiments/Experiment_test_semtab_plain_version_n_10/dev/features//3C59DC.csv
../Experiments/context_vector_dev_data/biotab_data/70EFDF.csv ../Experiments/Experiment_test_semtab_plain_version_n_10/dev/features//70EFDF.csv
../Experiments/context_vector_dev_data/biotab_data/A684EC.csv ../Experiments/Experiment_test_semtab_plain_version_n_10/dev/features//A684EC.csv


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


../Experiments/context_vector_dev_data/biotab_data/812B4B.csv ../Experiments/Experiment_test_semtab_plain_version_n_10/dev/features//812B4B.csv
../Experiments/context_vector_dev_data/biotab_data/E4DA3B.csv ../Experiments/Experiment_test_semtab_plain_version_n_10/dev/features//E4DA3B.csv


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/nas/home/hrathod/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3418, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-18-3d950978d01e>", line 12, in <module>
    d_result_df = context_property_similarity_code.return_a_table(dev_df, pca_components = N_PCA_COMPONENTS,
  File "/nas/ckgfs/kgtk/hrathod/scratch/context_property_vector/context_property_similarity_code.py", line 112, in return_a_table
    context_similarity_result_array = create_context_similarity(df, apply_pca_components = pca_components, concatenate=True, return_context_property= return_context_property, autoencoder = autoencoder)
  File "/nas/ckgfs/kgtk/hrathod/scratch/context_property_vector/context_property_similarity_code.py", line 75, in create_context_similarity
    context_property_similarity = pca_kernel.fit_transform(context_property_similarity)
  File "/nas/home/hrathod/anaconda3/lib/python3.8/site-pa

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/nas/home/hrathod/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3418, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-18-3d950978d01e>", line 12, in <module>
    d_result_df = context_property_similarity_code.return_a_table(dev_df, pca_components = N_PCA_COMPONENTS,
  File "/nas/ckgfs/kgtk/hrathod/scratch/context_property_vector/context_property_similarity_code.py", line 112, in return_a_table
    context_similarity_result_array = create_context_similarity(df, apply_pca_components = pca_components, concatenate=True, return_context_property= return_context_property, autoencoder = autoencoder)
  File "/nas/ckgfs/kgtk/hrathod/scratch/context_property_vector/context_property_similarity_code.py", line 75, in create_context_similarity
    context_property_similarity = pca_kernel.fit_transform(context_property_similarity)
  File "/nas/home/hrathod/anaconda3/lib/python3.8/site-pa

In [22]:
processed_dev_files = glob.glob(processed_dev_data + '/*.csv')
processed_test_files = glob.glob(processed_test_data + '/*.csv')

In [18]:
len(pos[0])

11

In [19]:
len(test_files)

ERROR! Session/line number was not unique in database. History logging moved to new session 2120


345

In [None]:

X_data = context_property_similarity_code.convert_to_matrix_vector(pos, list(all_prop), encoder_layer)
print(X_data[0])
Y_data = context_property_similarity_code.convert_to_matrix_vector(neg, list(all_prop), encoder_layer)
print(Y_data[0])

In [None]:
len(X_data[0])

In [22]:
## Deleting the context property vector string before feeding data


In [11]:
## Functions to train model
class T2DV2Dataset(Dataset):
    def __init__(self, pos_features, neg_features):
        self.pos_features = pos_features
        self.neg_features = neg_features

    def __len__(self):
        return len(self.pos_features)

    def __getitem__(self, idx):
        return self.pos_features[idx], self.neg_features[idx]
    
class PairwiseNetwork(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        # original 12x24, 24x12, 12x12, 12x1
        self.fc1 = nn.Linear(hidden_size, 2*hidden_size)
        
        self.fc2 = nn.Linear(2*hidden_size, 2*hidden_size)
        #self.fc_x = nn.Linear(2*hidden_size, hidden_size)
        self.fc3 = nn.Linear(2*hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, 1)

    def forward(self, pos_features, neg_features):
        # Positive pass
        x = F.relu(self.fc1(pos_features))
        x = F.relu(self.fc2(x))
        #x = F.relu(self.fc_x(x))
        x = F.relu(self.fc3(x))
        pos_out = torch.sigmoid(self.fc4(x))

        # Negative Pass
        x = F.relu(self.fc1(neg_features))
        x = F.relu(self.fc2(x))
        #x = F.relu(self.fc_x(x))
        x = F.relu(self.fc3(x))
        neg_out = torch.sigmoid(self.fc4(x))

        return pos_out, neg_out

    def predict(self, test_feat):
        x = F.relu(self.fc1(test_feat))
        x = F.relu(self.fc2(x))
        #x = F.relu(self.fc_x(x))
        x = F.relu(self.fc3(x))
        test_out = torch.sigmoid(self.fc4(x))
        return test_out


# Pairwise Loss
class PairwiseLoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.m = 0

    def forward(self, pos_out, neg_out):
        distance = (1 - pos_out) + neg_out
        loss = torch.mean(torch.max(torch.tensor(0), distance))
        return loss
    
def infer_scores(min_max_scaler_path, input_table_path, output_table_path, model, test = 0):
    #scaler = pickle.loads(s3_1.Bucket("table-linker-datasets").Object(min_max_scaler_path).get()['Body'].read())
    #pca = pickle.load(open('pca_var_500.pkl', 'rb'))
    normalize_features = features
    sfeatures = copy.deepcopy(features) # + [f'csp_{i}' for i in range(N_PCA_COMPONENTS)]
    print(features)
    number_of_cells_top_1 = 0
    number_of_cells_total = 0
    for file in input_table_path:
        file_name = file.split('/')[-1]
        try:
            d_sample = pd.read_csv(file)
        except pd.errors.EmptyDataError:
            continue
        if not isinstance(d_sample, pd.DataFrame):
            continue
        #print(d_sample.columns)
        #d_sample[normalize_features] = scaler.transform(d_sample[normalize_features])
        sorted_df = d_sample
        new_features = sfeatures
        sorted_df_features = sorted_df[sfeatures]
        arr = sorted_df_features.to_numpy()
        #test_inp = context_property_similarity_code.convert_to_matrix_vector(arr, list(all_prop), encoder_layer)
        #print(arr)
        test_inp = arr
        test_tensor = torch.tensor(test_inp).float()
        scores = model.predict(test_tensor)
        scores_list = torch.squeeze(scores).tolist()
        test_df = d_sample
        test_df[final_score_column] = scores_list
        test_df['table_id'] = file_name
        test_df['dataset_id'] = " "
        #df_input_table.append(test_df)
        num_of_cells_with_correct_top_1, num_of_cells = parse_eval_files_stats(test_df, 'siamese_prediction')
        number_of_cells_top_1 += num_of_cells_with_correct_top_1
        number_of_cells_total += num_of_cells
        if test:
            test_df.to_csv(test_predictions + file_name, index=False)
    # print(number_of_cells_top_1, number_of_cells_total, input_table_path)
    return number_of_cells_top_1 / number_of_cells_total

def train(args):
    if torch.cuda.is_available():
        device = torch.device('cuda')

    else:
        device = torch.device('cpu')
    device = torch.device('cpu')
    train_dataset = T2DV2Dataset(X_data, Y_data)
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
    criterion = PairwiseLoss()
    EPOCHS = args.num_epochs
    model = PairwiseNetwork(len(X_data[0])).to(device=device)
    optimizer = Adam(model.parameters(), lr=args.lr)
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.01)
    top1_max_prec = 0
    for epoch in range(EPOCHS):
        train_epoch_loss = 0
        avg_loss = 0
        model.train()
        for bid_1, batch_1 in tqdm(enumerate(train_dataloader), position=0, leave=True):
            # print("--------------")
            positive_feat = torch.tensor(batch_1[0].float())
            negative_feat = torch.tensor(batch_1[1].float())
            optimizer.zero_grad()
            # print(positive_feat.is_cuda, negative_feat.is_cuda)
            pos_out, neg_out = model(positive_feat, negative_feat)
            ##print(pos_out.is_cuda, neg_out.is_cuda, model.is_cuda)
            loss = criterion(pos_out, neg_out)
            # print(loss.is_cuda)
            loss.backward()
            optimizer.step()
            train_epoch_loss += loss
        avg_loss = train_epoch_loss / bid_1
        # scheduler.step()
        # Evaluation
        model.eval()
        print("Running on dev files")
        top1_precision = infer_scores(args.min_max_scaler_path, dev_files, args.dev_output, model)
        # eval_data = merge_eval_files(args.dev_output)
        # res, candidate_eval_data = parse_eval_files_stats(eval_data, final_score_column)
        # top1_precision = res['num_tasks_with_model_score_top_one_accurate'] / res['num_tasks_with_gt']
        if top1_precision > top1_max_prec:
            #copy_files(args.dev_output, dev_predictions)
            top1_max_prec = top1_precision
            model_save_name = 'top1_{}_epoch_{}_loss_{}_batch_size_{}_learning_rate_{}.pth'.format(top1_max_prec, epoch,
                                                                                                   avg_loss, BATCH_SIZE,
                                                                                                   LEARNING_RATE)
            best_model_path = args.model_save_path + model_save_name
            torch.save(model.state_dict(), best_model_path)
            print("Running on test files")
        print("Test Dataset", infer_scores(args.min_max_scaler_path, test_files, args.dev_output, model, test = 1))
        #s3_1.Bucket('table-linker-datasets').upload_file('/tmp/'+model_save_name, best_model_path)

        print("Epoch {}, Avg Loss is {}, epoch top1 {}, max top1 {}".format(epoch, avg_loss, top1_precision,
                                                                            top1_max_prec))
    return best_model_path, model
def parse_eval_files_stats(eval_data, method):
    res = {}
    candidate_eval_data = eval_data.groupby(['table_id', 'column', 'row'])['table_id'].count().reset_index(name="count")
    res['num_tasks_with_gt'] = len(eval_data[pd.notna(eval_data['GT_kg_id'])].groupby(['table_id', 'column', 'row']))
    num_tasks_with_model_score_top_one_accurate = []
    num_tasks_with_model_score_top_five_accurate = []
    num_tasks_with_model_score_top_ten_accurate = []
    has_gt_list = []
    has_gt_in_candidate = []
    for i, row in candidate_eval_data.iterrows():
        table_id, row_idx, col_idx = row['table_id'], row['row'], row['column']
        c_e_data = eval_data[
            (eval_data['table_id'] == table_id) & (eval_data['row'] == row_idx) & (eval_data['column'] == col_idx)]
        assert len(c_e_data) > 0
        if np.nan not in set(c_e_data['GT_kg_id']):
            has_gt_list.append(1)
        else:
            has_gt_list.append(0)
        if 1 in set(c_e_data['evaluation_label']):
            has_gt_in_candidate.append(1)
        else:
            has_gt_in_candidate.append(0)

        # rank on model score
        s_data = c_e_data.sort_values(by=[method], ascending=False)
        if s_data.iloc[0]['evaluation_label'] == 1:
            num_tasks_with_model_score_top_one_accurate.append(1)
        else:
            num_tasks_with_model_score_top_one_accurate.append(0)

    res['num_tasks_with_model_score_top_one_accurate'] = sum(num_tasks_with_model_score_top_one_accurate)
    # print(sum(num_tasks_with_model_score_top_one_accurate))
    return res['num_tasks_with_model_score_top_one_accurate'], res['num_tasks_with_gt']

In [20]:
X_data = pos
Y_data = neg
train_dataset = T2DV2Dataset(X_data, Y_data)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
LEARNING_RATE = 0.000009
training_args = Namespace(num_epochs=12, lr=LEARNING_RATE, positive_feat_path=pos_output, negative_feat_path=neg_output,
                         dev_path=dev_files, dev_output=dev_predictions,
                         model_save_path=model_save_path, min_max_scaler_path=min_max_scaler_path)
best_model_path, model = train(training_args)

  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [01:22, 417.89it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'context_score_3', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Running on test files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'context_score_3', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Test Dataset 0.8477771897327335
Epoch 0, Avg Loss is 0.8075975775718689, epoch top1 0.5797761194029851, max top1 0.5797761194029851


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [01:26, 399.42it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'context_score_3', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'context_score_3', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Test Dataset 0.8497618417570786
Epoch 1, Avg Loss is 0.2851220965385437, epoch top1 0.5795522388059702, max top1 0.5797761194029851


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [01:24, 409.16it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'context_score_3', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Running on test files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'context_score_3', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Test Dataset 0.8483064302725589
Epoch 2, Avg Loss is 0.21584253013134003, epoch top1 0.58, max top1 0.58


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [01:24, 411.67it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'context_score_3', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Running on test files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'context_score_3', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Test Dataset 0.8478433448002117
Epoch 3, Avg Loss is 0.2031344473361969, epoch top1 0.5815671641791045, max top1 0.5815671641791045


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [01:09, 496.25it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'context_score_3', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'context_score_3', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Test Dataset 0.8491002910822969
Epoch 4, Avg Loss is 0.19503015279769897, epoch top1 0.5802238805970149, max top1 0.5815671641791045


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [01:07, 510.10it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'context_score_3', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'context_score_3', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Test Dataset 0.8558481079650702
Epoch 5, Avg Loss is 0.1891176849603653, epoch top1 0.5780597014925373, max top1 0.5815671641791045


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [01:32, 373.28it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'context_score_3', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'context_score_3', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Test Dataset 0.8469833289229955
Epoch 6, Avg Loss is 0.1846804916858673, epoch top1 0.5682835820895522, max top1 0.5815671641791045


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
26333it [01:00, 434.82it/s]


KeyboardInterrupt: 