## Purpose
For leave one out experiments, autoencoder is used to reduce the dimensionality of the context property vector while training. 
This notebook consists of two sections
- Preparing data for loading in the model 
- During model training, converting the context property string into context property vector for each candidate therough function convert_to_matrix_vector with autoencoder and training the model.

It uses the general framework of table-linker-pipeline with some enhanced changes to work with context property vector and large data together.

For data: Download from AWS S3 - table-linker-datasets/Experiments/context_vector_{train/dev}data into Experiments/ folders
Alternatively recreate the data by running the datasets with context_property_vector.py

Train_autoencoder through autoencoder_code.py or use the existing saved model by defining path in autoencoder_saved_path.

Get properties by running calculate_all_properties.

Define the experiment train, dev, test data and more in Setup Tagged cell.

Both forward and reverse properties are considered.

In [15]:

import pandas as pd
import glob
import os
import numpy as np
from ast import literal_eval
import scipy.sparse as sp
import pickle
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense
import time
import sklearn.metrics
from sklearn.preprocessing import MinMaxScaler
from argparse import ArgumentParser, Namespace
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from itertools import chain
import copy
import shutil
from tqdm import tqdm

In [16]:
all_train_files = '../Experiments/context_vector_train_data/'
all_dev_files = '../Experiments/context_vector_dev_data/'
autoencoder_saved_path = 'saved_2000_100/saved_2000_1000'
features = ["monge_elkan","monge_elkan_aliases","jaro_winkler",
            "levenshtein","singleton", "num_occurences", 'context_score_3']
experiment_train_data = "2t_data,limaye_data,biotab_data,biodiv_data,t2dv2_data"
experiment_test_data = "semtab_data"

In [3]:
all_properties = pickle.load(open('all_properties.pkl', 'rb'))
len(all_properties)

4546

In [4]:
class AutoEncoders(Model):

    def __init__(self, layer_1_unit = 2000, layer_2_unit=100, layer_3_unit = None, output_units = None):
        super().__init__()
        self.encoder = Sequential(
            [
                Dense(layer_1_unit, activation="relu"),
                Dense(layer_2_unit, activation="relu"),
                #Dense(layer_3_unit, activation="relu")
            ]
        )

        self.decoder = Sequential(
            [
                #Dense(layer_2_unit, activation="relu"),
                Dense(layer_1_unit, activation="relu"),
                Dense(output_units, activation="sigmoid")
            ]
        )


    def call(self, inputs):
        encoded = self.encoder(inputs)
        decoded = self.decoder(encoded)
        return decoded

In [5]:
a_model = AutoEncoders(output_units = len(all))
print(a_model.layers)
a_model.load_weights(autoencoder_saved_path)
encoder_layer = a_model.get_layer('sequential')

[<tensorflow.python.keras.engine.sequential.Sequential object at 0x7f63dc41a070>, <tensorflow.python.keras.engine.sequential.Sequential object at 0x7f63dc41a9a0>]


In [6]:

def read_file(key):
    #resp = s3.get_object(Bucket = bucket, Key = key)
    try:
        df = pd.read_csv(key, sep = ',')
    except pd.errors.EmptyDataError:
        df = ''
        print('Empty csv file!')
    return df

def merge_files(args):
    # datapath = args.train_path
    df_list = []
    for fn in args.train_files:
        fid = fn.split('/')[-1][:-4]
        dataset_id = fn.split('/')[-2]
        df = read_file(fn)
        if not isinstance(df, pd.DataFrame) :
            continue

        df['table_id'] = fid
        df['dataset_id'] = dataset_id
        df['context_score'].fillna(0.0, inplace=True)
        if 'column-id' not in df.columns:
            df['column-id'] = fn.split('/')[-1] + df['column'].astype('str')

        df = df[extra_feat]
        df_list.append(df)
    return pd.concat(df_list)


def compute_normalization_factor(args, all_data):
    min_max_scaler_path = args.min_max_scaler_path
    all_data_features = all_data[features]
    scaler = MinMaxScaler()
    scaler.fit(all_data_features)
    #pickle.dump(scaler, open('./tmp/min_max_scaler_path.pkl', 'wb'))
    #s3_1.Bucket('table-linker-datasets').upload_file('./tmp/min_max_scaler_path.pkl', min_max_scaler_path)

    #save_file(min_max_scaler_path, scaler)
    return scaler

def generate_train_data(args, all_data, scaler, shuffle_by = None):
    num_cells_1 = 0
    scaler_path = args.min_max_scaler_path
    #scaler = pickle.load(open('./tmp/min_max_scaler_path.pkl', 'rb'))
    final_list = []
    sfeatures = copy.deepcopy(features) + ['context_property_vector']
    normalize_features = features
    evaluation_label = ['evaluation_label']
    positive_features_final = []
    negative_features_final = []
    super_groups = all_data.groupby(['column-id'])
    if shuffle_by == 'dataset':
        super_groups = all_data.groupby(['dataset_id'])
        for i, s_group in super_groups:
            pos_features_dataset = []
            neg_features_dataset = []
            grouped_obj = s_group.groupby(['column', 'row', 'column-id'])

            for cell in grouped_obj:
                num_cells_1 += 1
                cell[1][normalize_features] = scaler.transform(cell[1][normalize_features])
                pos_features = []
                neg_features = []
                a = cell[1][cell[1]['evaluation_label'] == 1]
                if a.empty:
                    continue
                pos_rows = cell[1][cell[1]['evaluation_label'].astype(int) == 1][sfeatures].to_numpy()
                if len(pos_rows) < 1:
                    continue
                if len(pos_rows) > 1:
                    print("here")
                for i in range(len(pos_rows)):
                    pos_features.append(pos_rows[i])
                neg_rows = cell[1][cell[1]['evaluation_label'].astype(int) == -1][sfeatures].to_numpy()
                for i in range(min(batch_size, len(neg_rows))):
                    neg_features.append(neg_rows[i])

                for k in range(len(neg_features) - len(pos_features)):
                    pos_features.append(random.choice(pos_rows))
                if len(pos_features) != len(neg_features):
                    continue
                pos_features_dataset.append(pos_features)
                neg_features_dataset.append(neg_features)
            if len(pos_features_dataset) > 0:
                c = list(zip(pos_features_dataset, neg_features_dataset))
                random.shuffle(c)
                pos_features_dataset, neg_features_dataset = zip(*c)
                positive_features_final.extend(pos_features_dataset)
                negative_features_final.extend(neg_features_dataset)
    elif shuffle_by == 'table':
        super_groups = all_data.groupby(['table_id'])
        for i, s_group in super_groups:
            pos_features_table = []
            neg_features_table = []
            file_name = i.split('-')[0]
            #print("entering ", file_name)
            grouped_obj = s_group.groupby(['column', 'row'])

            for cell in grouped_obj:
                num_cells_1 += 1
                cell[1][normalize_features] = scaler.transform(cell[1][normalize_features])
                pos_features = []
                neg_features = []
                a = cell[1][cell[1]['evaluation_label'] == 1]
                if a.empty:
                    continue
                pos_rows = cell[1][cell[1]['evaluation_label'].astype(int) == 1][sfeatures].to_numpy()
                if len(pos_rows) < 1:
                    continue
                if len(pos_rows) > 1:
                    print("here")
                for i in range(len(pos_rows)):
                    pos_features.append(pos_rows[i])
                neg_rows = cell[1][cell[1]['evaluation_label'].astype(int) == -1][sfeatures].to_numpy()
                for i in range(min(batch_size, len(neg_rows))):
                    neg_features.append(neg_rows[i])

                for k in range(len(neg_features) - len(pos_features)):
                    pos_features.append(random.choice(pos_rows))
                if len(pos_features) != len(neg_features):
                    continue
                random.shuffle(pos_features)

                random.shuffle(neg_features)
                pos_features_table.append(pos_features)
                neg_features_table.append(neg_features)
            if len(pos_features_table) > 0:
                c = list(zip(pos_features_table, neg_features_table))
                random.shuffle(c)
                pos_features_table, neg_features_table = zip(*c)
                positive_features_final.extend(pos_features_table)
                negative_features_final.extend(neg_features_table)
    else:
        for i, s_group in super_groups:
            file_name = i.split('-')[0]
            #print("entering ", file_name)
            grouped_obj = s_group.groupby(['column', 'row'])
            for cell in grouped_obj:
                num_cells_1 += 1
                cell[1][normalize_features] = scaler.transform(cell[1][normalize_features])
                pos_features = []
                neg_features = []
                a = cell[1][cell[1]['evaluation_label'] == 1]
                if a.empty:
                    continue
                pos_rows = cell[1][cell[1]['evaluation_label'].astype(int) == 1][sfeatures].to_numpy()
                if len(pos_rows) < 1:
                    continue
                if len(pos_rows) > 1:
                    print("here")
                for i in range(len(pos_rows)):
                    pos_features.append(pos_rows[i])
                neg_rows = cell[1][cell[1]['evaluation_label'].astype(int) == -1][sfeatures].to_numpy()
                for i in range(min(batch_size, len(neg_rows))):
                    neg_features.append(neg_rows[i])

                for k in range(len(neg_features) - len(pos_features)):
                    pos_features.append(random.choice(pos_rows))
                random.shuffle(pos_features)
                random.shuffle(neg_features)
                if len(pos_features) != len(neg_features):
                    print("HHHERRRERR")
                else:
                    positive_features_final.append(pos_features)
                    negative_features_final.append(neg_features)
                #print(len(positive_features_final), len(positive_features_final[3]))
                #print(len(negative_features_final), len(negative_features_final[3]))
    if shuffle_by == 'complete_shuffle':
        c = list(zip(positive_features_final, negative_features_final))
        random.shuffle(c)
        positive_features_final, positive_features_final = zip(*c)
    print(len(positive_features_final), len(positive_features_final[3]))
    print(len(negative_features_final), len(negative_features_final[3]))
    pickle.dump(positive_features_final, open('pos_semtab_c_feat_32.pkl', 'wb'))
    pickle.dump(negative_features_final, open('neg_semtab_c_feat_32.pkl', 'wb'))
    print(len(positive_features_final), len(positive_features_final[3]))
    print(len(negative_features_final), len(negative_features_final[3]))

    #save_file(args.pos_output, positive_features_final)
class T2DV2Dataset(Dataset):
    def __init__(self, pos_features, neg_features):
        self.pos_features = pos_features
        self.neg_features = neg_features

    def __len__(self):
        return len(self.pos_features)

    def __getitem__(self, idx):
        return self.pos_features[idx], self.neg_features[idx]


# Model
class PairwiseNetwork(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        # original 12x24, 24x12, 12x12, 12x1
        self.fc1 = nn.Linear(hidden_size, 2 * hidden_size)
        self.fc2 = nn.Linear(2 * hidden_size, 2 * hidden_size)
        self.fc3 = nn.Linear(2 * hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, 1)

    def forward(self, pos_features, neg_features):
        # Positive pass
        x = F.relu(self.fc1(pos_features))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        pos_out = torch.sigmoid(self.fc4(x))

        # Negative Pass
        x = F.relu(self.fc1(neg_features))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        neg_out = torch.sigmoid(self.fc4(x))

        return pos_out, neg_out

    def predict(self, test_feat):
        x = F.relu(self.fc1(test_feat))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        test_out = torch.sigmoid(self.fc4(x))
        return test_out


# Pairwise Loss
class PairwiseLoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.m = 0

    def forward(self, pos_out, neg_out):
        distance = (1 - pos_out) + neg_out
        loss = torch.mean(torch.max(torch.tensor(0), distance))
        return loss


def generate_dataloader(positive_feat_path, negative_feat_path):
    pos_features = pickle.load(open(positive_feat_path, 'rb'))
    neg_features = pickle.load(open(negative_feat_path, 'rb'))
    print(pos_features[10])
    pos_features_flatten = list(chain.from_iterable(pos_features))
    neg_features_flatten = list(chain.from_iterable(neg_features))
    return pos_features_flatten, neg_features_flatten

In [8]:
train_files_path = [all_train_files_path + i for i in experiment_train_data.split(',')]
train_files = []
for train_path in train_files_path:
    set_of_files = glob.glob(train_path + '/*.csv')
    train_files.extend(set_of_files)
len(train_files)

453

In [9]:
dev_files_path = [all_dev_files_path + i for i in experiment_train_data.split(',')]
dev_files = []
for dev_path in dev_files_path:
    set_of_files = glob.glob(dev_path + '/*.csv')
    dev_files.extend(set_of_files)
len(dev_files)

161

In [27]:
test_files = []
test_files_location = [f'{all_dev_files_path}/{test_data}', f'{all_train_files_path}/{test_data}']
for k in test_files_location:
    test_files.extend(glob.glob(k + '/*.csv'))
len(test_files)

345

In [10]:
properties_list = list(all_properties)
col_indices = {properties_list[i]:i for i in range(len(properties_list))}
def convert_to_matrix_vector(data: pd.DataFrame, properties_list:list, pca_model=None, val = None):


    col_used_up = set()
    col = list(range(0, len(properties_list)))
    row = list(range(0, len(data)))
    rows, cols, vals = [], [], []

    #print(len(properties_list))
    features_vals = []
    for rows_ind in range(len(data)):
        feature_range = []
        for cols_ind in range(len(data[rows_ind])):
            if isinstance(data[rows_ind][cols_ind], str):
                props = literal_eval(data[rows_ind][cols_ind])
                #print(props)
                for prop in props:
                    rows.append(rows_ind)
                    cols.append(col_indices[prop])
                    col_used_up.add(cols_ind)
                    vals.append(props[prop])
            else:
                feature_range.append(data[rows_ind][cols_ind])

        features_vals.append(feature_range)
            
                    
    Y = np.array(features_vals)
    X = sp.csr_matrix((vals, (rows, cols)),  shape=(len(data), len(properties_list))).toarray()
    #print(Y[0], X[0])
    if val is not None:
        tmp = np.array([[val]]*len(X))
        tmp = np.concatenate((X, tmp), axis = 1)
        X = tmp
    X = encoder_layer.predict(X)
    complete = np.concatenate((Y, X), axis = 1)
    #print(complete[0])
    return complete  

In [11]:
extra_feat = ['column-id', 'column', 'row', 'evaluation_label', 'dataset_id', 'table_id', 'context_property_vector']
for f in features:
    extra_feat.append(f)
gen_training_data_args = Namespace(train_files=train_files, pos_output=pos_output, neg_output=neg_output,min_max_scaler_path=min_max_scaler_path)


In [25]:
all_data = merge_files(gen_training_data_args)
scaler = compute_normalization_factor(gen_training_data_args, all_data)

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [26]:
pickle.dump(scaler, open('min_max_scaler_semtab_c.pkl', 'wb'))


In [12]:
scaler = pickle.load(open('min_max_scaler_semtab_c.pkl', 'rb'))

In [None]:
batch_size = 32
generate_train_data(gen_training_data_args, all_data, scaler, shuffle_by = 'table')

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/nas/home/hrathod/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3418, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-14-0fc8a8ddca03>", line 2, in <module>
    generate_train_data(gen_training_data_args, all_data, scaler, shuffle_by = 'table')
NameError: name 'all_data' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/nas/home/hrathod/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2045, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'NameError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/nas/home/hrathod/anaconda3/lib/python3.8/site-packages/IPython/core/ultratb.py", line 1170, in get_records
    return _fixed_getinnerframes(e

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/nas/home/hrathod/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3418, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-14-0fc8a8ddca03>", line 2, in <module>
    generate_train_data(gen_training_data_args, all_data, scaler, shuffle_by = 'table')
NameError: name 'all_data' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/nas/home/hrathod/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2045, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'NameError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/nas/home/hrathod/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3338, in run_ast_nodes
    if (await self.run_

In [14]:
pos, neg = generate_dataloader('pos_semtab_c_feat_32.pkl', 'neg_semtab_c_feat_32.pkl')

[array([1.0, 0.9540343915343916, 1.0, 1.0, 1.0, 1.0, 0.7619047619047619,
       "{'_P703': 1.0}"], dtype=object), array([1.0, 0.9540343915343916, 1.0, 1.0, 1.0, 1.0, 0.7619047619047619,
       "{'_P703': 1.0}"], dtype=object), array([1.0, 0.9540343915343916, 1.0, 1.0, 1.0, 1.0, 0.7619047619047619,
       "{'_P703': 1.0}"], dtype=object), array([1.0, 0.9540343915343916, 1.0, 1.0, 1.0, 1.0, 0.7619047619047619,
       "{'_P703': 1.0}"], dtype=object), array([1.0, 0.9540343915343916, 1.0, 1.0, 1.0, 1.0, 0.7619047619047619,
       "{'_P703': 1.0}"], dtype=object), array([1.0, 0.9540343915343916, 1.0, 1.0, 1.0, 1.0, 0.7619047619047619,
       "{'_P703': 1.0}"], dtype=object), array([1.0, 0.9540343915343916, 1.0, 1.0, 1.0, 1.0, 0.7619047619047619,
       "{'_P703': 1.0}"], dtype=object), array([1.0, 0.9540343915343916, 1.0, 1.0, 1.0, 1.0, 0.7619047619047619,
       "{'_P703': 1.0}"], dtype=object), array([1.0, 0.9540343915343916, 1.0, 1.0, 1.0, 1.0, 0.7619047619047619,
       "{'_P703': 1.0}"

In [16]:

X_data = convert_to_matrix_vector(pos, list(all_properties), None, val=None)
print(X_data[0])
Y_data = convert_to_matrix_vector(neg, list(all_properties), None, val=None)
print(Y_data[0])

[1.         0.94636752 1.         1.         0.         1.
 0.50793651 1.4038142  1.48716819 0.78522557 1.19253445 1.39397407
 1.16565919 0.41754904 1.30857992 0.         1.08605814 1.45972729
 1.48519599 1.23276186 1.14797711 0.96738166 0.8668105  1.1433475
 1.14105797 1.33078563 1.12263727 1.12242019 0.         1.30930769
 0.         0.         1.29394495 0.         1.06381869 0.75672853
 0.65772074 1.47513938 1.01385379 1.32386243 1.14659166 0.
 1.24977136 1.31023633 0.         1.15629745 0.         1.25701642
 0.94261926 1.52466464 1.1084944  0.96437758 0.95448083 1.50748622
 1.08037639 1.32863319 1.31787014 1.52895212 1.46686018 1.57697868
 1.31908131 1.16493917 1.42814982 0.9749952  1.05988431 0.9735831
 0.82459557 0.9995544  0.52528316 0.         0.         0.
 1.45780265 1.14379609 1.54156792 1.21080244 1.34652817 0.71420246
 0.         1.25069368 0.         1.21468377 0.         0.
 0.75891441 1.02969909 1.41393042 1.40723419 1.08606851 1.2677027
 0.         1.61113811 1.19295

In [17]:
print(len(X_data[0]))

107


In [18]:
train_dataset = T2DV2Dataset(X_data, Y_data)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)

In [19]:
class PairwiseNetwork(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        # original 12x24, 24x12, 12x12, 12x1
        self.fc1 = nn.Linear(hidden_size, 2*hidden_size)
        
        self.fc2 = nn.Linear(2*hidden_size, 2*hidden_size)
        #self.fc_x = nn.Linear(2*hidden_size, hidden_size)
        self.fc3 = nn.Linear(2*hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, 1)

    def forward(self, pos_features, neg_features):
        # Positive pass
        x = F.relu(self.fc1(pos_features))
        x = F.relu(self.fc2(x))
        #x = F.relu(self.fc_x(x))
        x = F.relu(self.fc3(x))
        pos_out = torch.sigmoid(self.fc4(x))

        # Negative Pass
        x = F.relu(self.fc1(neg_features))
        x = F.relu(self.fc2(x))
        #x = F.relu(self.fc_x(x))
        x = F.relu(self.fc3(x))
        neg_out = torch.sigmoid(self.fc4(x))

        return pos_out, neg_out

    def predict(self, test_feat):
        x = F.relu(self.fc1(test_feat))
        x = F.relu(self.fc2(x))
        #x = F.relu(self.fc_x(x))
        x = F.relu(self.fc3(x))
        test_out = torch.sigmoid(self.fc4(x))
        return test_out


# Pairwise Loss
class PairwiseLoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.m = 0

    def forward(self, pos_out, neg_out):
        distance = (1 - pos_out) + neg_out
        loss = torch.mean(torch.max(torch.tensor(0), distance))
        return loss

In [20]:
def train(args):
    if torch.cuda.is_available():
        device = torch.device('cuda')

    else:
        device = torch.device('cpu')
    device = torch.device('cpu')
    train_dataset = T2DV2Dataset(X_data, Y_data)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
    criterion = PairwiseLoss()
    EPOCHS = args.num_epochs
    model = PairwiseNetwork(len(X_data[0])).to(device=device)
    optimizer = Adam(model.parameters(), lr=args.lr)
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.01)
    top1_max_prec = 0
    for epoch in range(EPOCHS):
        train_epoch_loss = 0
        avg_loss = 0
        model.train()
        for bid_1, batch_1 in tqdm(enumerate(train_dataloader), position=0, leave=True):
            # print("--------------")
            positive_feat = torch.tensor(batch_1[0].float())
            negative_feat = torch.tensor(batch_1[1].float())
            optimizer.zero_grad()
            # print(positive_feat.is_cuda, negative_feat.is_cuda)
            pos_out, neg_out = model(positive_feat, negative_feat)
            ##print(pos_out.is_cuda, neg_out.is_cuda, model.is_cuda)
            loss = criterion(pos_out, neg_out)
            # print(loss.is_cuda)
            loss.backward()
            optimizer.step()
            train_epoch_loss += loss
        avg_loss = train_epoch_loss / bid_1
        # scheduler.step()
        # Evaluation
        model.eval()
        top1_precision = infer_scores(args.min_max_scaler_path, dev_files, args.dev_output, model)
        # eval_data = merge_eval_files(args.dev_output)
        # res, candidate_eval_data = parse_eval_files_stats(eval_data, final_score_column)
        # top1_precision = res['num_tasks_with_model_score_top_one_accurate'] / res['num_tasks_with_gt']
        if top1_precision > top1_max_prec:
            #copy_files(args.dev_output, dev_predictions)
            top1_max_prec = top1_precision
            model_save_name = 'top1_{}_epoch_{}_loss_{}_batch_size_{}_learning_rate_{}.pth'.format(top1_max_prec, epoch,
                                                                                                   avg_loss, batch_size,
                                                                                                   learning_rate)
            best_model_path = args.model_save_path + model_save_name
            torch.save(model.state_dict(), './tmp/'+model_save_name)
            print("Test Dataset", infer_scores(args.min_max_scaler_path, test_files, args.dev_output, model, test = 1))
        #s3_1.Bucket('table-linker-datasets').upload_file('/tmp/'+model_save_name, best_model_path)

        print("Epoch {}, Avg Loss is {}, epoch top1 {}, max top1 {}".format(epoch, avg_loss, top1_precision,
                                                                            top1_max_prec))
    return best_model_path

In [22]:
!mkdir -p dev_feature_path_2
!mkdir -p dev_output_predictions_2

In [23]:
def parse_eval_files_stats(eval_data, method):
    res = {}
    candidate_eval_data = eval_data.groupby(['table_id', 'column', 'row'])['table_id'].count().reset_index(name="count")
    res['num_tasks_with_gt'] = len(eval_data[pd.notna(eval_data['GT_kg_id'])].groupby(['table_id', 'column', 'row']))
    num_tasks_with_model_score_top_one_accurate = []
    num_tasks_with_model_score_top_five_accurate = []
    num_tasks_with_model_score_top_ten_accurate = []
    has_gt_list = []
    has_gt_in_candidate = []
    for i, row in candidate_eval_data.iterrows():
        table_id, row_idx, col_idx = row['table_id'], row['row'], row['column']
        c_e_data = eval_data[
            (eval_data['table_id'] == table_id) & (eval_data['row'] == row_idx) & (eval_data['column'] == col_idx)]
        assert len(c_e_data) > 0
        if np.nan not in set(c_e_data['GT_kg_id']):
            has_gt_list.append(1)
        else:
            has_gt_list.append(0)
        if 1 in set(c_e_data['evaluation_label']):
            has_gt_in_candidate.append(1)
        else:
            has_gt_in_candidate.append(0)

        # rank on model score
        s_data = c_e_data.sort_values(by=[method], ascending=False)
        if s_data.iloc[0]['evaluation_label'] == 1:
            num_tasks_with_model_score_top_one_accurate.append(1)
        else:
            num_tasks_with_model_score_top_one_accurate.append(0)

    res['num_tasks_with_model_score_top_one_accurate'] = sum(num_tasks_with_model_score_top_one_accurate)
    # print(sum(num_tasks_with_model_score_top_one_accurate))
    return res['num_tasks_with_model_score_top_one_accurate'], res['num_tasks_with_gt']

In [24]:
features

['monge_elkan',
 'monge_elkan_aliases',
 'jaro_winkler',
 'levenshtein',
 'singleton',
 'num_occurences',
 'context_score_3']

In [25]:
def infer_scores(min_max_scaler_path, input_table_path, output_table_path, model, test = 0):
    normalize_features = features
    sfeatures = copy.deepcopy(features) + ['context_property_vector']
    print(features)
    number_of_cells_top_1 = 0
    number_of_cells_total = 0
    for file in input_table_path:
        file_name = file.split('/')[-1]
        try:
            d_sample = pd.read_csv(file)
        except pd.errors.EmptyDataError:
            continue
        if not isinstance(d_sample, pd.DataFrame):
            continue
        #print(d_sample.columns)
        d_sample[normalize_features] = scaler.transform(d_sample[normalize_features])
        sorted_df = d_sample
        sorted_df_features = sorted_df[sfeatures]
        arr = sorted_df_features.to_numpy()
        test_inp = convert_to_matrix_vector(arr, list(all_properties))
        test_tensor = torch.tensor(test_inp).float()
        scores = model.predict(test_tensor)
        scores_list = torch.squeeze(scores).tolist()
        test_df = d_sample
        test_df[final_score_column] = scores_list
        test_df['table_id'] = file_name
        test_df['dataset_id'] = " "
        #df_input_table.append(test_df)
        num_of_cells_with_correct_top_1, num_of_cells = parse_eval_files_stats(test_df, 'siamese_prediction')
        number_of_cells_top_1 += num_of_cells_with_correct_top_1
        number_of_cells_total += num_of_cells
        if test:
            test_df.to_csv('dev_output_predictions_2/'+file_name, index=False)
    # print(number_of_cells_top_1, number_of_cells_total, input_table_path)
    return number_of_cells_top_1 / number_of_cells_total

In [28]:
for lr in [0.00001, 0.000001]:
    training_args = Namespace(num_epochs=12, lr=lr, positive_feat_path=pos_output, negative_feat_path=neg_output,
                             dev_path=dev_files, dev_output='dev_output_predictions_2/',
                             model_save_path=model_save_path, min_max_scaler_path=min_max_scaler_path)
    best_model = train(training_args)

  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
53it [00:00, 24.65it/s]

ERROR! Session/line number was not unique in database. History logging moved to new session 2063


34643it [03:25, 168.84it/s]


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Test Dataset 0.9440328129134692
Epoch 0, Avg Loss is 0.23636767268180847, epoch top1 0.6487313432835821, max top1 0.6487313432835821


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [04:22, 131.76it/s]


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Epoch 1, Avg Loss is 0.1578371524810791, epoch top1 0.6385074626865672, max top1 0.6487313432835821


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [03:11, 181.16it/s]


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Epoch 2, Avg Loss is 0.15182624757289886, epoch top1 0.6294029850746269, max top1 0.6487313432835821


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [03:08, 183.87it/s]


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Epoch 3, Avg Loss is 0.1483161300420761, epoch top1 0.6221641791044776, max top1 0.6487313432835821


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [03:00, 192.03it/s]


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Epoch 4, Avg Loss is 0.14506036043167114, epoch top1 0.5979850746268657, max top1 0.6487313432835821


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [02:55, 197.03it/s]


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Epoch 5, Avg Loss is 0.1426839381456375, epoch top1 0.6014179104477612, max top1 0.6487313432835821


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [02:55, 197.91it/s]


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Epoch 6, Avg Loss is 0.14213426411151886, epoch top1 0.5936567164179104, max top1 0.6487313432835821


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [02:55, 197.91it/s]


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Epoch 7, Avg Loss is 0.14062510430812836, epoch top1 0.580820895522388, max top1 0.6487313432835821


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [03:02, 189.50it/s]


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Epoch 8, Avg Loss is 0.13937774300575256, epoch top1 0.5714179104477612, max top1 0.6487313432835821


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [02:57, 194.89it/s]


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
0it [00:00, ?it/s]

Epoch 9, Avg Loss is 0.13896061480045319, epoch top1 0.5677611940298507, max top1 0.6487313432835821


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [03:00, 192.29it/s]


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Epoch 10, Avg Loss is 0.13895872235298157, epoch top1 0.5662686567164179, max top1 0.6487313432835821


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [02:59, 193.05it/s]


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Epoch 11, Avg Loss is 0.13848242163658142, epoch top1 0.5619402985074626, max top1 0.6487313432835821


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [02:31, 228.91it/s]


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Test Dataset 0.8706668430801799
Epoch 0, Avg Loss is 0.8333515524864197, epoch top1 0.5540298507462686, max top1 0.5540298507462686


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [02:34, 224.35it/s]


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Test Dataset 0.9061259592484784
Epoch 1, Avg Loss is 0.361972451210022, epoch top1 0.6144776119402985, max top1 0.6144776119402985


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [03:00, 192.29it/s]


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Test Dataset 0.9292140777983594
Epoch 2, Avg Loss is 0.24543015658855438, epoch top1 0.6338805970149254, max top1 0.6338805970149254


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [03:04, 187.87it/s]


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Test Dataset 0.9390711828526065
Epoch 3, Avg Loss is 0.2149815410375595, epoch top1 0.6399253731343284, max top1 0.6399253731343284


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [03:00, 191.41it/s]


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Test Dataset 0.9430404869012966
Epoch 4, Avg Loss is 0.20209573209285736, epoch top1 0.6438805970149254, max top1 0.6438805970149254


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [02:58, 194.06it/s]


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Test Dataset 0.9437020375760783
Epoch 5, Avg Loss is 0.19498853385448456, epoch top1 0.6440298507462686, max top1 0.6440298507462686


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [03:03, 189.23it/s]


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Test Dataset 0.9435035723736438
Epoch 6, Avg Loss is 0.19028590619564056, epoch top1 0.6444029850746269, max top1 0.6444029850746269


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [03:06, 185.53it/s]


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Test Dataset 0.9432389521037311
Epoch 7, Avg Loss is 0.18677401542663574, epoch top1 0.6451492537313432, max top1 0.6451492537313432


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [03:11, 181.08it/s]


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Test Dataset 0.9433712622386875
Epoch 8, Avg Loss is 0.1839444786310196, epoch top1 0.6462686567164179, max top1 0.6462686567164179


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [03:07, 184.57it/s]


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Epoch 9, Avg Loss is 0.1815253049135208, epoch top1 0.6462686567164179, max top1 0.6462686567164179


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [03:10, 182.13it/s]


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
0it [00:00, ?it/s]

Epoch 10, Avg Loss is 0.1793840080499649, epoch top1 0.6455970149253731, max top1 0.6462686567164179


  positive_feat = torch.tensor(batch_1[0].float())
  negative_feat = torch.tensor(batch_1[1].float())
34643it [03:00, 191.68it/s]


['monge_elkan', 'monge_elkan_aliases', 'jaro_winkler', 'levenshtein', 'singleton', 'num_occurences', 'context_score_3']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Epoch 11, Avg Loss is 0.17743109166622162, epoch top1 0.6458955223880597, max top1 0.6462686567164179
