## Purpose: Train a model with context similarity as a feature
To do this, 
- convert a the context vector string to context property vector and than context property vector to context similarity vector which is then PCA'ed to lower dimensions. <br>

Set parameters in the cells tagged "set_parameters" <br>
Download data from table-linker-datasets/context_vector_{train/dev}_data in to Experiments/context_vector_train/dev_data
<br>
It stores result while creating the similarity vector temporarily in temp_c_{N_PCA_Components} which can be erased once the pos and neg pickle files are generated.


In [1]:
###Importing all the libraries
import glob
import boto3
import time
import os
import pandas as pd
import sklearn.metrics
from sklearn.preprocessing import MinMaxScaler
import pickle
from argparse import ArgumentParser, Namespace
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from itertools import chain
import copy
import shutil
import pickle
from tqdm import tqdm
import scipy.sparse as sp
import os
## Importing context_similarity function
import context_property_similarity_code

In [2]:
# Setup the variables
N_PCA_COMPONENTS = 50
features = ["monge_elkan","monge_elkan_aliases","jaro_winkler",
            "levenshtein","singleton", "num_occurences"]
BATCH_SIZE = 32
LEARNING_RATE = 0.00001

In [3]:
# Setup the paths

In [3]:
experiment_name = f"Experiment_test_semtab_context_similarity_n_{N_PCA_COMPONENTS}"
experiment_data = '2t_data,limaye_data,biotab_data,t2dv2_data,biodiv_data'
### Creating the directories for the results.
experiment_train_data = experiment_data
experiment_dev_data =  experiment_data
experiment_test_data = 'semtab_data'
experiment_store_path = f"../Experiments/{experiment_name}"
processed_dev_data = f"{experiment_store_path}/dev/features/"
processed_test_data = f"{experiment_store_path}/test/features/"
dev_predictions = f"{experiment_store_path}/dev/dev_predictions/"
test_predictions = f"{experiment_store_path}/test/test_predictions/"
dev_output_pred = f"{experiment_store_path}/dev/dev_output/"
dev_predictions_top_k = f"{experiment_store_path}/dev/dev_predictions_top_k/"
dev_metrics = f"{experiment_store_path}/dev/dev_metrics/"
dev_predictions_colorized = f"{experiment_store_path}/dev/dev_predictions_colorized/"
model_save_path = f'{experiment_store_path}/model_save_path/'
best_model_path = ''

training_data_path = f'{experiment_store_path}/model_training_data'

pos_output = f'{training_data_path}/tl_pipeline_pos_features_{BATCH_SIZE}.pkl'
neg_output = f'{training_data_path}/tl_pipeline_neg_features_{BATCH_SIZE}.pkl'
min_max_scaler_path = f'{training_data_path}/tl_pipeline_normalization_factor.pkl'

final_score_column = 'siamese_prediction'

extra_feat = ['column-id', 'column', 'row', 'evaluation_label', 'dataset_id', 'table_id', 'context_property_vector']
for f in features:
    extra_feat.append(f)


!mkdir -p $experiment_store_path 
!mkdir -p $dev_predictions 
!mkdir -p $dev_output_pred
!mkdir -p $dev_predictions_top_k 
!mkdir -p $dev_metrics 
!mkdir -p $dev_predictions_colorized 
!mkdir -p $model_save_path 
!mkdir -p $training_data_path
!mkdir -p $processed_dev_data
!mkdir -p $processed_test_data
!mkdir -p $test_predictions
!mkdir -p f'temp_{N_PCA_COMPONENTS}'

In [4]:
train_files_path = [f'../Experiments/context_vector_train_data/' + i for i in experiment_train_data.split(',')]
train_files = []
for train_path in train_files_path:
    set_of_files = glob.glob(train_path + '/*.csv')
    train_files.extend(set_of_files)
print(len(train_files))
dev_files_path = ['../Experiments/context_vector_dev_data/' + i for i in experiment_train_data.split(',')]
dev_files = []
for dev_path in dev_files_path:
    set_of_files = glob.glob(dev_path + '/*.csv')
    dev_files.extend(set_of_files)
print(len(dev_files))
test_files = []
test_files_location = ['../Experiments/context_vector_dev_data/semtab_data', '../Experiments/context_vector_train_data/semtab_data']
for k in test_files_location:
    test_files.extend(glob.glob(k + '/*.csv'))
print(len(test_files))

453
161
345


In [5]:
# Helpher functions for data preprocessing
def merge_files(args):
    # datapath = args.train_path
    df_list = []
    for fn in args.train_files:
        fid = fn.split('/')[-1][:-4]
        dataset_id = fn.split('/')[-2]
        df = read_file(fn)
        if not isinstance(df, pd.DataFrame) :
            continue

        df['table_id'] = fid
        print(dataset_id, fid)
        df['dataset_id'] = dataset_id
        df['context_score'].fillna(0.0, inplace=True)
        if 'column-id' not in df.columns:
            df['column-id'] = fn.split('/')[-1] + df['column'].astype('str')

        df = df[extra_feat]
        df_list.append(df)
    return pd.concat(df_list)


def compute_normalization_factor(args, all_data):
    min_max_scaler_path = args.min_max_scaler_path
    all_data_features = all_data[features]
    scaler = MinMaxScaler()
    scaler.fit(all_data_features)
    return scaler


def read_file(key):
    #resp = s3.get_object(Bucket = bucket, Key = key)
    try:
        df = pd.read_csv(key, sep = ',')
    except pd.errors.EmptyDataError:
        df = ''
        print('Empty csv file!')
    return df

In [6]:

def generate_train_data(args, all_data, scaler, shuffle_by = None):
    num_cells_1 = 0
    scaler_path = args.min_max_scaler_path
    #scaler = pickle.load(open('./tmp/min_max_scaler_path.pkl', 'rb'))
    final_list = []
    sfeatures = copy.deepcopy(features) + ['context_property_vector'] + [f'csp_{i}' for i in range(N_PCA_COMPONENTS)]
    normalize_features = features
    evaluation_label = ['evaluation_label']
    positive_features_final = []
    negative_features_final = []
    super_groups = all_data.groupby(['column-id'])
    if shuffle_by == 'dataset':
        super_groups = all_data.groupby(['dataset_id'])
        for i, s_group in super_groups:
            pos_features_dataset = []
            neg_features_dataset = []
            grouped_obj = s_group.groupby(['column', 'row', 'column-id'])

            for cell in grouped_obj:
                num_cells_1 += 1
                cell[1][normalize_features] = scaler.transform(cell[1][normalize_features])
                pos_features = []
                neg_features = []
                a = cell[1][cell[1]['evaluation_label'] == 1]
                if a.empty:
                    continue
                pos_rows = cell[1][cell[1]['evaluation_label'].astype(int) == 1][sfeatures].to_numpy()
                if len(pos_rows) < 1:
                    continue
                if len(pos_rows) > 1:
                    print("here")
                for i in range(len(pos_rows)):
                    pos_features.append(pos_rows[i])
                neg_rows = cell[1][cell[1]['evaluation_label'].astype(int) == -1][sfeatures].to_numpy()
                for i in range(min(BATCH_SIZE, len(neg_rows))):
                    neg_features.append(neg_rows[i])

                for k in range(len(neg_features) - len(pos_features)):
                    pos_features.append(random.choice(pos_rows))
                if len(pos_features) != len(neg_features):
                    continue
                pos_features_dataset.append(pos_features)
                neg_features_dataset.append(neg_features)
            if len(pos_features_dataset) > 0:
                c = list(zip(pos_features_dataset, neg_features_dataset))
                random.shuffle(c)
                pos_features_dataset, neg_features_dataset = zip(*c)
                positive_features_final.extend(pos_features_dataset)
                negative_features_final.extend(neg_features_dataset)
    elif shuffle_by == 'table':
        super_groups = all_data.groupby(['table_id'])
        for i, s_group in super_groups:
            pos_features_table = []
            neg_features_table = []
            file_name = i.split('-')[0]
            s_group_n = context_property_similarity_code.return_a_table(s_group, pca_components = N_PCA_COMPONENTS)
            #print("entering ", file_name)
            ds_id = s_group_n['dataset_id'].values[0]
            print("File: ", file_name, ds_id)
            if os.path.exists(f'temp_{N_PCA_COMPONENTS}/neg_{ds_id}_{file_name}.pkl'):
                print("Already Exists!")
                pos_features_table = pickle.load(open(f'temp_{N_PCA_COMPONENTS}/pos_{ds_id}_{file_name}.pkl', 'rb'))
                neg_features_table = pickle.load(open(f'temp_{N_PCA_COMPONENTS}/neg_{ds_id}_{file_name}.pkl', 'rb'))
                positive_features_final.extend(pos_features_table)
                negative_features_final.extend(neg_features_table) 
                continue
            grouped_obj = s_group_n.groupby(['column', 'row'])
    
            for cell in grouped_obj:
                num_cells_1 += 1
                cell[1][normalize_features] = scaler.transform(cell[1][normalize_features])
                pos_features = []
                neg_features = []
                a = cell[1][cell[1]['evaluation_label'] == 1]
                if a.empty:
                    continue
                pos_rows = cell[1][cell[1]['evaluation_label'].astype(int) == 1][sfeatures].to_numpy()
                if len(pos_rows) < 1:
                    continue
                if len(pos_rows) > 1:
                    print("here")
                for i in range(len(pos_rows)):
                    pos_features.append(pos_rows[i])
                neg_rows = cell[1][cell[1]['evaluation_label'].astype(int) == -1][sfeatures].to_numpy()
                for i in range(min(BATCH_SIZE, len(neg_rows))):
                    neg_features.append(neg_rows[i])

                for k in range(len(neg_features) - len(pos_features)):
                    pos_features.append(random.choice(pos_rows))
                if len(pos_features) != len(neg_features):
                    continue
                random.shuffle(pos_features)

                random.shuffle(neg_features)
                pos_features_table.append(pos_features)
                neg_features_table.append(neg_features)
            if len(pos_features_table) > 0:
                c = list(zip(pos_features_table, neg_features_table))
                random.shuffle(c)
                pos_features_table, neg_features_table = zip(*c)
                positive_features_final.extend(pos_features_table)
                negative_features_final.extend(neg_features_table)
                
                pickle.dump(pos_features_table, open(f'temp_{N_PCA_COMPONENTS}/pos_{ds_id}_{file_name}.pkl', 'wb'))
                pickle.dump(neg_features_table, open(f'temp_{N_PCA_COMPONENTS}/neg_{ds_id}_{file_name}.pkl', 'wb'))
    else:
        for i, s_group in super_groups:
            file_name = i.split('-')[0]
            #print("entering ", file_name)
            grouped_obj = s_group.groupby(['column', 'row'])
            for cell in grouped_obj:
                num_cells_1 += 1
                cell[1][normalize_features] = scaler.transform(cell[1][normalize_features])
                pos_features = []
                neg_features = []
                a = cell[1][cell[1]['evaluation_label'] == 1]
                if a.empty:
                    continue
                pos_rows = cell[1][cell[1]['evaluation_label'].astype(int) == 1][sfeatures].to_numpy()
                if len(pos_rows) < 1:
                    continue
                if len(pos_rows) > 1:
                    print("here")
                for i in range(len(pos_rows)):
                    pos_features.append(pos_rows[i])
                neg_rows = cell[1][cell[1]['evaluation_label'].astype(int) == -1][sfeatures].to_numpy()
                for i in range(min(BATCH_SIZE, len(neg_rows))):
                    neg_features.append(neg_rows[i])

                for k in range(len(neg_features) - len(pos_features)):
                    pos_features.append(random.choice(pos_rows))
                random.shuffle(pos_features)
                random.shuffle(neg_features)
                if len(pos_features) != len(neg_features):
                    print("HHHERRRERR")
                else:
                    positive_features_final.append(pos_features)
                    negative_features_final.append(neg_features)
                #print(len(positive_features_final), len(positive_features_final[3]))
                #print(len(negative_features_final), len(negative_features_final[3]))
    if shuffle_by == 'complete_shuffle':
        c = list(zip(positive_features_final, negative_features_final))
        random.shuffle(c)
        positive_features_final, positive_features_final = zip(*c)
    print(len(positive_features_final), len(positive_features_final[3]))
    print(len(negative_features_final), len(negative_features_final[3]))

    print(len(positive_features_final), len(positive_features_final[3]))
    print(len(negative_features_final), len(negative_features_final[3]))
    return positive_features_final, negative_features_final
def generate_dataloader(positive_feat_path, negative_feat_path):
    pos_features = pickle.load(open(positive_feat_path, 'rb'))
    neg_features = pickle.load(open(negative_feat_path, 'rb'))
    print(pos_features[10])
    pos_features_flatten = list(chain.from_iterable(pos_features))
    neg_features_flatten = list(chain.from_iterable(neg_features))
    return pos_features_flatten, neg_features_flatten

In [8]:
# Setting the scaler and merging files
gen_training_data_args = Namespace(train_files=train_files, pos_output=pos_output, neg_output=neg_output,min_max_scaler_path=min_max_scaler_path)
all_data = merge_files(gen_training_data_args)
scaler = compute_normalization_factor(gen_training_data_args, all_data)
pickle.dump(scaler, open(min_max_scaler_path, 'wb'))

2t_data 3DOM5NIW
2t_data 3JXMPC7N
2t_data 3LG8J4MX
2t_data 3N6S2FCX
2t_data 4J75OL3W
2t_data 6I7ET24J
2t_data 71SY0Z5S
2t_data AUU9A6KL


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/nas/home/hrathod/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3418, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-8-3ff59215c2d3>", line 3, in <module>
    all_data = merge_files(gen_training_data_args)
  File "<ipython-input-6-a37e2e5c3dc7>", line 8, in merge_files
    df = read_file(fn)
  File "<ipython-input-6-a37e2e5c3dc7>", line 35, in read_file
    df = pd.read_csv(key, sep = ',')
  File "/nas/home/hrathod/anaconda3/lib/python3.8/site-packages/pandas/io/parsers.py", line 688, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "/nas/home/hrathod/anaconda3/lib/python3.8/site-packages/pandas/io/parsers.py", line 460, in _read
    data = parser.read(nrows)
  File "/nas/home/hrathod/anaconda3/lib/python3.8/site-packages/pandas/io/parsers.py", line 1198, in read
    ret = self._engine.read(nrows)
  File "/nas/home/hrathod/anaconda3/lib/python3.8/site-packag

TypeError: object of type 'NoneType' has no len()

In [25]:
# Creating final data
positive_features_final, negative_features_final = generate_train_data(gen_training_data_args, all_data, scaler, shuffle_by = 'table')
pickle.dump(positive_features_final, open(pos_output, 'wb'))
pickle.dump(negative_features_final, open(neg_output, 'wb'))

File:  093F65 biotab_data
Already Exists!
File:  0bc67e05a4d14011a2cf3fca2f869495 biodiv_data
Already Exists!
File:  0dc3add04e344228bb140d5392399521 biodiv_data
Already Exists!
File:  10579449_0_1681126353774891032 t2dv2_data
Already Exists!
File:  14067031_0_559833072073397908 t2dv2_data
Already Exists!
File:  1438042986423_95_20150728002306 t2dv2_data
Already Exists!
File:  1438042989018_40_20150728002309 t2dv2_data
Already Exists!
File:  1438042989043_35_20150728002309 t2dv2_data
Already Exists!
File:  14380604_4_3329235705746762392 t2dv2_data
Already Exists!
File:  14BFA6 biotab_data
Already Exists!
File:  16767252_0_2409448375013995751 t2dv2_data
Already Exists!
File:  167909 biotab_data
Already Exists!
File:  19CA14 biotab_data
Already Exists!
File:  1C383C biotab_data
Already Exists!
File:  1F0E3D biotab_data
Already Exists!
File:  1FF1DE biotab_data
Already Exists!
File:  1d09a099d3964602aca9425adcde89cd biodiv_data
Already Exists!
File:  20135078_0_7570343137119682530 t2dv2_d

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/nas/home/hrathod/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3418, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-25-af79fa27a3c4>", line 2, in <module>
    positive_features_final, negative_features_final = generate_train_data(gen_training_data_args, all_data, scaler, shuffle_by = 'table')
  File "<ipython-input-23-fca69efbc40d>", line 56, in generate_train_data
    s_group_n = context_property_similarity_code.return_a_table(s_group, pca_components = N_PCA_COMPONENTS)
  File "/nas/ckgfs/kgtk/hrathod/scratch/context_property_vector/context_property_similarity_code.py", line 108, in return_a_table
    context_similarity_result_array = create_context_similarity(df, apply_pca_components = pca_components, concatenate=True)
  File "/nas/ckgfs/kgtk/hrathod/scratch/context_property_vector/context_property_similarity_code.py", line 71, in create_context_similarity
    context

TypeError: object of type 'NoneType' has no len()

In [9]:
# To get the features faster - if all the preprocessing has been done for all the files.
'''
positive_features_final= [] 
negative_features_final = []
for i in train_files:
    file_name = i.split('/')[-1][:-4]
    ds_id = i.split('/')[-2]
    print(file_name, ds_id)
    pos_location = f'temp_{N_PCA_COMPONENTS}/pos_{ds_id}_{file_name}.pkl'
    neg_location = f'temp_{N_PCA_COMPONENTS}/neg_{ds_id}_{file_name}.pkl'
    if os.path.exists(pos_location):
        pos_features_table = pickle.load(open(f'temp_{N_PCA_COMPONENTS}/pos_{ds_id}_{file_name}.pkl', 'rb'))
        neg_features_table = pickle.load(open(f'temp_{N_PCA_COMPONENTS}/neg_{ds_id}_{file_name}.pkl', 'rb'))
        positive_features_final.extend(pos_features_table)
        negative_features_final.extend(neg_features_table) 
pickle.dump(positive_features_final, open(pos_output, 'wb'))
pickle.dump(negative_features_final, open(neg_output, 'wb'))
'''

3DOM5NIW 2t_data
3JXMPC7N 2t_data
3LG8J4MX 2t_data
3N6S2FCX 2t_data
4J75OL3W 2t_data
6I7ET24J 2t_data
71SY0Z5S 2t_data
AUU9A6KL 2t_data
B20WIQKU 2t_data
B38A9Q5R 2t_data
BID0NRU0 2t_data
CHZGO92A 2t_data
CNQ5Z0BG 2t_data
DXTA1MV8 2t_data
E22XXKVQ 2t_data
E3ZK744Q 2t_data
E45F8QW2 2t_data
EQZ21058 2t_data
ET9REW9Y 2t_data
EV6LDIB8 2t_data
F98SUVJH 2t_data
FF00TEZG 2t_data
FVKKTA8O 2t_data
FWSGRDQ3 2t_data
GINQPZQC 2t_data
GNDO9OXJ 2t_data
GY7KNULP 2t_data
HB00DX4L 2t_data
HCLZSPEJ 2t_data
HGIUTSCG 2t_data
HGYG2DVU 2t_data
HLJ9HHEE 2t_data
HQCEC5NO 2t_data
HQRATPBV 2t_data
HTF6MVY9 2t_data
I0ENR9U5 2t_data
I6BBMPNU 2t_data
IP6ZRIGH 2t_data
IPTLCCCU 2t_data
IYZYF533 2t_data
IZF82AX9 2t_data
J9EJV2S3 2t_data
JA4L7KWX 2t_data
JN0T2O5I 2t_data
JZ22O0DD 2t_data
JZLRN9PL 2t_data
K2VEUQT0 2t_data
K3X3L22Y 2t_data
K71YYDXO 2t_data
KOQM4YU9 2t_data
LV5N8XDB 2t_data
MANO2PKR 2t_data
MSCT8MJD 2t_data
MZ0BI8NN 2t_data
N6QAC84T 2t_data
NDSTZH1I 2t_data
NV4GY44T 2t_data
NYFX4M8T 2t_data
O0BECB72 2t_da

In [7]:
#load scaler and saved pos and neg pairs
scaler = pickle.load(open(min_max_scaler_path, 'rb'))
pos, neg = generate_dataloader(pos_output, neg_output)

[array([0.9916666666666668, 0.9925, 0.992, 0.96, 0.0, 0.3333333333333333,
       "{'P25': 1.0, 'P2949': 0.9555555555555556, 'P106': 0.8387878787878787, 'P4553': 0.8049999999999999, 'P1290': 0.8222222222222222, 'P138': 0.8222222222222222, 'P569': 0.8666666666666667, 'P1636': 0.8666666666666667, 'P166': 0.9014814814814814, '_P138': 0.8274801587301587, '_P921': 0.8274801587301587, '_P180': 0.8274801587301587, '_P542': 0.8274801587301587, '_P1308': 0.8274801587301587, 'P27': 1.0, 'P937': 0.9557291666666666, 'P4200': 0.8049999999999999, '_P40': 0.9476190476190476}",
       -8.48029134824029, 46.93309180692728, 35.13601427756792,
       0.47698237853218783, 2.583875617278776, -7.526190599341469,
       -0.21120526871961218, -0.042333541366722016, -0.6121388081731824,
       0.3406960957300511, -0.3721066485286115, -1.2366335742594121,
       0.7377212464692905, 0.28104156060468827, -0.2717225257291474,
       -0.4916340878223085, 0.0013478524622008619, -0.17416169635136194,
       -0.3188566

In [8]:
pos[0]

array([0.9061728395061728, 1.0, 0.7777777777777777, 0.6666666666666667,
       0.0, 0.6666666666666667,
       "{'_P25': 1.0, '_P1290': 0.8592592592592592, '_P1038': 1.0, '_P40': 1.0, 'P138': 0.8307760141093475, 'P25': 0.8307760141093475, 'P569': 0.8, 'P40': 0.9476190476190476, 'P1559': 0.9061728395061728, 'P268': 0.8333333333333334, 'P2924': 0.8083333333333333, 'P22': 0.8222222222222222, 'P39': 0.8762265512265512, 'P7982': 0.8049999999999999, 'P27': 1.0, 'P2949': 0.975, 'P734': 1.0, '_P35': 1.0, '_P1308': 1.0, '_P1037': 0.9703703703703703, '_P180': 1.0, '_P542': 0.9703703703703703, '_P488': 1.0}",
       -7.54991810635107, 77.44845662482207, 29.97113395137799,
       -1.95616472214946, 0.14340021778202527, -4.059618864843214,
       0.004259078421035359, -0.15500458342642515, 0.18418773860481405,
       0.1685294592183763, 0.6406026077105562, -0.7949385782873237,
       -0.9123871039927044, -0.011550898589083565, -0.047683092675211335,
       -0.002426097076379793, 0.4660525458830826,

In [11]:
## Deleting the context property vector string before feeding data
X_data = []
Y_data = []
for i in pos:
    val = list(i)
    del val[len(features)]
    X_data.append(np.array(val))
for i in neg:
    val = list(i)
    del val[len(features)]
    Y_data.append(np.array(val))

In [12]:
len(X_data[0])

56

In [15]:
# Saving files for dev and test to reduce training time
processed_dev_files = []
processed_test_files = []
for dev_file in dev_files:
    file_name = dev_file.split('/')[-1]
    dev_result_file = f'{processed_dev_data}/{file_name}'
    processed_dev_files.append(dev_result_file)
    if not os.path.exists(dev_result_file):
        print(dev_file, dev_result_file)
        dev_df = pd.read_csv(dev_file)
        d_result_df = context_property_similarity_code.return_a_table(dev_df, pca_components = N_PCA_COMPONENTS)

        d_result_df.to_csv(dev_result_file, index = False)
for dev_file in test_files:

    file_name = dev_file.split('/')[-1]
    dev_result_file = f'{processed_test_data}/{file_name}'
    processed_test_files.append(dev_result_file)

    if not os.path.exists(dev_result_file):
        print(dev_file, dev_result_file)
        dev_df = pd.read_csv(dev_file)
        d_result_df = context_property_similarity_code.return_a_table(dev_df, pca_components = N_PCA_COMPONENTS)
        d_result_df.to_csv(dev_result_file, index = False)

../Experiments/context_vector_dev_data/t2dv2_data/88523363_0_8180214313099580515.csv ../Experiments/Experiment_test_semtab_context_similarity_n_50/dev/features//88523363_0_8180214313099580515.csv
../Experiments/context_vector_dev_data/t2dv2_data/91959037_0_7907661684242014480.csv ../Experiments/Experiment_test_semtab_context_similarity_n_50/dev/features//91959037_0_7907661684242014480.csv


In [16]:
## Functions to train model
class T2DV2Dataset(Dataset):
    def __init__(self, pos_features, neg_features):
        self.pos_features = pos_features
        self.neg_features = neg_features

    def __len__(self):
        return len(self.pos_features)

    def __getitem__(self, idx):
        return self.pos_features[idx], self.neg_features[idx]
    
class PairwiseNetwork(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        # original 12x24, 24x12, 12x12, 12x1
        self.fc1 = nn.Linear(hidden_size, 2*hidden_size)
        
        self.fc2 = nn.Linear(2*hidden_size, 2*hidden_size)
        #self.fc_x = nn.Linear(2*hidden_size, hidden_size)
        self.fc3 = nn.Linear(2*hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, 1)

    def forward(self, pos_features, neg_features):
        # Positive pass
        x = F.relu(self.fc1(pos_features))
        x = F.relu(self.fc2(x))
        #x = F.relu(self.fc_x(x))
        x = F.relu(self.fc3(x))
        pos_out = torch.sigmoid(self.fc4(x))

        # Negative Pass
        x = F.relu(self.fc1(neg_features))
        x = F.relu(self.fc2(x))
        #x = F.relu(self.fc_x(x))
        x = F.relu(self.fc3(x))
        neg_out = torch.sigmoid(self.fc4(x))

        return pos_out, neg_out

    def predict(self, test_feat):
        x = F.relu(self.fc1(test_feat))
        x = F.relu(self.fc2(x))
        #x = F.relu(self.fc_x(x))
        x = F.relu(self.fc3(x))
        test_out = torch.sigmoid(self.fc4(x))
        return test_out


# Pairwise Loss
class PairwiseLoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.m = 0

    def forward(self, pos_out, neg_out):
        distance = (1 - pos_out) + neg_out
        loss = torch.mean(torch.max(torch.tensor(0), distance))
        return loss
    
def infer_scores(min_max_scaler_path, input_table_path, output_table_path, model, test = 0):
    #scaler = pickle.loads(s3_1.Bucket("table-linker-datasets").Object(min_max_scaler_path).get()['Body'].read())
    #pca = pickle.load(open('pca_var_500.pkl', 'rb'))
    normalize_features = features
    sfeatures = copy.deepcopy(features)  + [f'csp_{i}' for i in range(N_PCA_COMPONENTS)]
    print(features)
    number_of_cells_top_1 = 0
    number_of_cells_total = 0
    for file in input_table_path:
        file_name = file.split('/')[-1]
        try:
            d_sample = pd.read_csv(file)
        except pd.errors.EmptyDataError:
            continue
        if not isinstance(d_sample, pd.DataFrame):
            continue
        #print(d_sample.columns)
        d_sample[normalize_features] = scaler.transform(d_sample[normalize_features])
        sorted_df = d_sample
        new_features = sfeatures
        sorted_df_features = sorted_df[sfeatures]
        arr = sorted_df_features.to_numpy()
        #test_inp = convert_to_matrix_vector(arr, list(all), pca)
        #print(arr)
        test_tensor = torch.tensor(arr).float()
        scores = model.predict(test_tensor)
        scores_list = torch.squeeze(scores).tolist()
        test_df = d_sample
        test_df[final_score_column] = scores_list
        test_df['table_id'] = file_name
        test_df['dataset_id'] = " "
        #df_input_table.append(test_df)
        num_of_cells_with_correct_top_1, num_of_cells = parse_eval_files_stats(test_df, 'siamese_prediction')
        number_of_cells_top_1 += num_of_cells_with_correct_top_1
        number_of_cells_total += num_of_cells
        if test:
            test_df.to_csv(test_predictions + file_name, index=False)
    # print(number_of_cells_top_1, number_of_cells_total, input_table_path)
    return number_of_cells_top_1 / number_of_cells_total

def train(args):
    if torch.cuda.is_available():
        device = torch.device('cuda')

    else:
        device = torch.device('cpu')
    device = torch.device('cpu')
    train_dataset = T2DV2Dataset(X_data, Y_data)
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
    criterion = PairwiseLoss()
    EPOCHS = args.num_epochs
    model = PairwiseNetwork(len(X_data[0])).to(device=device)
    optimizer = Adam(model.parameters(), lr=args.lr)
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.01)
    top1_max_prec = 0
    for epoch in range(EPOCHS):
        train_epoch_loss = 0
        avg_loss = 0
        model.train()
        for bid_1, batch_1 in tqdm(enumerate(train_dataloader), position=0, leave=True):
            # print("--------------")
            positive_feat = torch.tensor(batch_1[0].float())
            negative_feat = torch.tensor(batch_1[1].float())
            optimizer.zero_grad()
            # print(positive_feat.is_cuda, negative_feat.is_cuda)
            pos_out, neg_out = model(positive_feat, negative_feat)
            ##print(pos_out.is_cuda, neg_out.is_cuda, model.is_cuda)
            loss = criterion(pos_out, neg_out)
            # print(loss.is_cuda)
            loss.backward()
            optimizer.step()
            train_epoch_loss += loss
        avg_loss = train_epoch_loss / bid_1
        # scheduler.step()
        # Evaluation
        model.eval()
        print("Running on dev files")
        top1_precision = infer_scores(args.min_max_scaler_path, processed_dev_files, args.dev_output, model)
        # eval_data = merge_eval_files(args.dev_output)
        # res, candidate_eval_data = parse_eval_files_stats(eval_data, final_score_column)
        # top1_precision = res['num_tasks_with_model_score_top_one_accurate'] / res['num_tasks_with_gt']
        if top1_precision > top1_max_prec:
            #copy_files(args.dev_output, dev_predictions)
            top1_max_prec = top1_precision
            model_save_name = 'top1_{}_epoch_{}_loss_{}_batch_size_{}_learning_rate_{}.pth'.format(top1_max_prec, epoch,
                                                                                                   avg_loss, BATCH_SIZE,
                                                                                                   LEARNING_RATE)
            best_model_path = args.model_save_path + model_save_name
            torch.save(model.state_dict(), best_model_path)
            print("Running on test files")
            print("Test Dataset", infer_scores(args.min_max_scaler_path, processed_test_files, args.dev_output, model, test = 1))
        #s3_1.Bucket('table-linker-datasets').upload_file('/tmp/'+model_save_name, best_model_path)

        print("Epoch {}, Avg Loss is {}, epoch top1 {}, max top1 {}".format(epoch, avg_loss, top1_precision,
                                                                            top1_max_prec))
    return best_model_path, model
def parse_eval_files_stats(eval_data, method):
    res = {}
    candidate_eval_data = eval_data.groupby(['table_id', 'column', 'row'])['table_id'].count().reset_index(name="count")
    res['num_tasks_with_gt'] = len(eval_data[pd.notna(eval_data['GT_kg_id'])].groupby(['table_id', 'column', 'row']))
    num_tasks_with_model_score_top_one_accurate = []
    num_tasks_with_model_score_top_five_accurate = []
    num_tasks_with_model_score_top_ten_accurate = []
    has_gt_list = []
    has_gt_in_candidate = []
    for i, row in candidate_eval_data.iterrows():
        table_id, row_idx, col_idx = row['table_id'], row['row'], row['column']
        c_e_data = eval_data[
            (eval_data['table_id'] == table_id) & (eval_data['row'] == row_idx) & (eval_data['column'] == col_idx)]
        assert len(c_e_data) > 0
        if np.nan not in set(c_e_data['GT_kg_id']):
            has_gt_list.append(1)
        else:
            has_gt_list.append(0)
        if 1 in set(c_e_data['evaluation_label']):
            has_gt_in_candidate.append(1)
        else:
            has_gt_in_candidate.append(0)

        # rank on model score
        s_data = c_e_data.sort_values(by=[method], ascending=False)
        if s_data.iloc[0]['evaluation_label'] == 1:
            num_tasks_with_model_score_top_one_accurate.append(1)
        else:
            num_tasks_with_model_score_top_one_accurate.append(0)

    res['num_tasks_with_model_score_top_one_accurate'] = sum(num_tasks_with_model_score_top_one_accurate)
    # print(sum(num_tasks_with_model_score_top_one_accurate))
    return res['num_tasks_with_model_score_top_one_accurate'], res['num_tasks_with_gt']

In [17]:
train_dataset = T2DV2Dataset(X_data, Y_data)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)

In [38]:

training_args = Namespace(num_epochs=12, lr=LEARNING_RATE, positive_feat_path=pos_output, negative_feat_path=neg_output,
                         dev_path=processed_dev_files, dev_output=dev_predictions,
                         model_save_path=model_save_path, min_max_scaler_path=min_max_scaler_path)
best_model_path, model = train(training_args)

  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [01:08, 503.60it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Running on test files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Test Dataset 0.5099894151892035
Epoch 0, Avg Loss is 0.9572699666023254, epoch top1 0.3086567164179104, max top1 0.3086567164179104


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [01:06, 522.51it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Running on test files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Test Dataset 0.5250066155067478
Epoch 1, Avg Loss is 0.8831175565719604, epoch top1 0.3352238805970149, max top1 0.3352238805970149


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [01:06, 517.80it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Running on test files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Test Dataset 0.5332759989415189
Epoch 2, Avg Loss is 0.8203123211860657, epoch top1 0.34044776119402986, max top1 0.34044776119402986


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [01:06, 518.77it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Running on test files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Test Dataset 0.5476978036517597
Epoch 3, Avg Loss is 0.7629157304763794, epoch top1 0.3452238805970149, max top1 0.3452238805970149


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [01:05, 526.59it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Epoch 4, Avg Loss is 0.6985363960266113, epoch top1 0.3435074626865672, max top1 0.3452238805970149


34643it [01:06, 519.68it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Epoch 5, Avg Loss is 0.6235359311103821, epoch top1 0.32649253731343286, max top1 0.3452238805970149


34643it [01:07, 511.88it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Epoch 6, Avg Loss is 0.5466070175170898, epoch top1 0.3191044776119403, max top1 0.3452238805970149


34643it [01:31, 378.91it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Epoch 7, Avg Loss is 0.48198825120925903, epoch top1 0.31022388059701494, max top1 0.3452238805970149


34643it [01:05, 526.24it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Epoch 8, Avg Loss is 0.4340038597583771, epoch top1 0.315, max top1 0.3452238805970149


34643it [01:06, 523.10it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


KeyboardInterrupt: 

In [41]:

training_args = Namespace(num_epochs=12, lr=LEARNING_RATE, positive_feat_path=pos_output, negative_feat_path=neg_output,
                         dev_path=processed_dev_files, dev_output=dev_predictions,
                         model_save_path=model_save_path, min_max_scaler_path=min_max_scaler_path)
best_model_path, model = train(training_args)

  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [01:08, 508.15it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Running on test files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Test Dataset 0.6342947869806828
Epoch 0, Avg Loss is 0.5954132676124573, epoch top1 0.34097014925373137, max top1 0.34097014925373137


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [01:07, 513.09it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Running on test files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Test Dataset 0.6551336332363059
Epoch 1, Avg Loss is 0.3167226314544678, epoch top1 0.34432835820895524, max top1 0.34432835820895524


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [01:07, 513.01it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Running on test files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Test Dataset 0.6673723207197672
Epoch 2, Avg Loss is 0.2961110770702362, epoch top1 0.35582089552238805, max top1 0.35582089552238805


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [01:07, 510.73it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Running on test files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Test Dataset 0.6784863720560995
Epoch 3, Avg Loss is 0.2856811583042145, epoch top1 0.3742537313432836, max top1 0.3742537313432836


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [01:12, 477.24it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Running on test files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Test Dataset 0.6765678750992326
Epoch 4, Avg Loss is 0.27606165409088135, epoch top1 0.4014925373134328, max top1 0.4014925373134328


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [01:32, 373.75it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Running on test files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Test Dataset 0.6938343477110347
Epoch 5, Avg Loss is 0.2643102705478668, epoch top1 0.40701492537313433, max top1 0.40701492537313433


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [01:13, 472.86it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Running on test files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Test Dataset 0.710439269648055
Epoch 6, Avg Loss is 0.2524220645427704, epoch top1 0.4274626865671642, max top1 0.4274626865671642


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [01:13, 469.90it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Epoch 7, Avg Loss is 0.24449339509010315, epoch top1 0.42656716417910445, max top1 0.4274626865671642


34643it [01:15, 461.42it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Running on test files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Test Dataset 0.7331966128605452
Epoch 8, Avg Loss is 0.23989710211753845, epoch top1 0.4470149253731343, max top1 0.4470149253731343


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [01:18, 443.37it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Running on test files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Test Dataset 0.7381582429214077
Epoch 9, Avg Loss is 0.23186497390270233, epoch top1 0.4685820895522388, max top1 0.4685820895522388


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [01:15, 460.31it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Running on test files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Test Dataset 0.7517200317544324
Epoch 10, Avg Loss is 0.2250904142856598, epoch top1 0.4690298507462687, max top1 0.4690298507462687


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [01:18, 440.90it/s]


Running on dev files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Running on test files
['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Test Dataset 0.7633633236305901
Epoch 11, Avg Loss is 0.21904990077018738, epoch top1 0.4773134328358209, max top1 0.4773134328358209
