In [1]:
import glob
import time
import os
import pandas as pd
import sklearn.metrics
from sklearn.preprocessing import MinMaxScaler
import pickle
from argparse import ArgumentParser, Namespace
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from itertools import chain
from tqdm import tqdm
import copy
import shutil
import pickle

In [2]:
es_url = 'http://ckg07:9200'
es_index = 'wikidatadwd-augmented'

# Input Paths

# GDrive Path: /table-linker-dataset/2019-iswc_challenge_data/t2dv2/canonical-with-context/t2dv2-train-canonical/
train_path = "/Users/amandeep/Github/table-linker/data/SemTab2020/Round4/train1-canonical"

# GDrive Path: /table-linker-dataset/2019-iswc_challenge_data/t2dv2/canonical-with-context/t2dv2-dev-canonical/
dev_path = "/Users/amandeep/Github/table-linker/data/SemTab2020/Round4/dev-canonical"

# GDrive Path: /table-linker-dataset/2019-iswc_challenge_data/t2dv2/ground_truth/Xinting_GT_csv
ground_truth_files = "/Users/amandeep/Github/table-linker/data/SemTab2020/Round4/round4_gt_with_labels"


# OUTPUT PATHS
output_path = "/Users/amandeep/Github/table-linker/data/SemTab2020/Round4/table-linker"
train_output_path = f'{output_path}/train1-output'
dev_output_path = f'{output_path}/dev-output'

# increase version to create a new folder for an experiment
PREVIOUS_VERSION = "v18"
VERSION = "v19"

train_candidate_path = f'{train_output_path}/{VERSION}/candidates'
train_string_feature_path = f'{train_output_path}/{VERSION}/string_features'
train_feature_path = f'{train_output_path}/{VERSION}/features'
train_context_feature_path = f'{train_output_path}/{VERSION}/context_features'
train_missing_candidates_path = f'{train_output_path}/{VERSION}/train_missing_candidates_path'

dev_candidate_path = f'{dev_output_path}/{VERSION}/candidates'
dev_feature_path = f'{dev_output_path}/{VERSION}/features'
dev_string_feature_path = f'{dev_output_path}/{VERSION}/string_features'
dev_context_feature_path = f'{dev_output_path}/{VERSION}/context_features'
dev_output_predictions = f'{dev_output_path}/{VERSION}/dev_predictions'
dev_predictions_top_k = f'{dev_output_path}/{VERSION}/dev_predictions_top_k'
dev_colorized_path = f'{dev_output_path}/{VERSION}/dev_predictions_colorized'
dev_metrics_path = f'{dev_output_path}/{VERSION}/dev_predictions_metrics'
dev_missing_candidates_path = f'{dev_output_path}/{VERSION}/dev_missing_candidates_path'

aux_field = 'graph_embedding_complex,class_count,property_count,context'


train_prop_count = f'{train_output_path}/{VERSION}/train_prop_count' 
train_class_count = f'{train_output_path}/{VERSION}/train_class_count'
train_context_path = f'{train_output_path}/{VERSION}/train_context'
train_graph_embedding = f'{train_output_path}/{VERSION}/train_graph_embedding'

dev_prop_count = f'{dev_output_path}/{VERSION}/dev_prop_count'
dev_class_count = f'{dev_output_path}/{VERSION}/dev_class_count'
dev_context_path = f'{dev_output_path}/{VERSION}/dev_context'
dev_graph_embedding = f'{dev_output_path}/{VERSION}/dev_graph_embedding'

temp_dir = f'{output_path}/temp'

tl_log_file =f'{temp_dir}/tl_log.txt'

pos_output = f'{temp_dir}/training_data/pos_features.pkl'
neg_output = f'{temp_dir}/training_data/neg_features.pkl'
min_max_scaler_path = f'{temp_dir}/training_data/normalization_factor.pkl'

final_score_column = 'siamese_prediction'
threshold = final_score_column+":median"

model_save_path = f'{dev_output_path}/{VERSION}/saved_models'
best_model_path = ''

copy_candidates_from_previous_version = True
copy_string_features_from_previous_version = True

In [3]:
!mkdir -p "$temp_dir"

!mkdir -p "$train_prop_count"
!mkdir -p "$dev_prop_count"
!mkdir -p "$train_class_count"
!mkdir -p "$dev_class_count"
!mkdir -p "$train_graph_embedding"
!mkdir -p "$dev_graph_embedding"
!mkdir -p "$train_context_path"
!mkdir -p "$dev_context_path"

!mkdir -p "$train_candidate_path"
!mkdir -p "$dev_candidate_path"

!mkdir -p "$train_feature_path"
!mkdir -p "$train_string_feature_path"
!mkdir -p "$train_context_feature_path"
!mkdir -p "$dev_feature_path"
!mkdir -p "$dev_string_feature_path"
!mkdir -p "$dev_context_feature_path"

!mkdir -p "$temp_dir/training_data"
!mkdir -p "$dev_output_predictions"
!mkdir -p "$model_save_path"
!mkdir -p "$dev_predictions_top_k"
!mkdir -p "$dev_colorized_path"
!mkdir -p "$dev_metrics_path"
!mkdir -p "$dev_missing_candidates_path"
!mkdir -p "$train_missing_candidates_path"

In [4]:
if copy_candidates_from_previous_version:
    !cp $dev_output_path/$PREVIOUS_VERSION/candidates/*csv $dev_output_path/$VERSION/candidates
    !cp $dev_output_path/$PREVIOUS_VERSION/dev_prop_count/* $dev_output_path/$VERSION/dev_prop_count
    !cp $dev_output_path/$PREVIOUS_VERSION/dev_class_count/* $dev_output_path/$VERSION/dev_class_count
    !cp $dev_output_path/$PREVIOUS_VERSION/dev_context/* $dev_output_path/$VERSION/dev_context
    !cp $dev_output_path/$PREVIOUS_VERSION/dev_graph_embedding/* $dev_output_path/$VERSION/dev_graph_embedding
    
    !cp $train_output_path/$PREVIOUS_VERSION/candidates/*csv $train_output_path/$VERSION/candidates
    !cp $train_output_path/$PREVIOUS_VERSION/train_prop_count/* $train_output_path/$VERSION/train_prop_count
    !cp $train_output_path/$PREVIOUS_VERSION/train_class_count/* $train_output_path/$VERSION/train_class_count
    !cp $train_output_path/$PREVIOUS_VERSION/train_context/* $train_output_path/$VERSION/train_context
    !cp $train_output_path/$PREVIOUS_VERSION/train_graph_embedding/* $train_output_path/$VERSION/train_graph_embedding

In [5]:
features = ["monge_elkan","monge_elkan_aliases","jaro_winkler","levenshtein","singleton","pgr_rts","context_score","smc_class_score","smc_property_score"]

## Candidate Generation

In [6]:
def candidate_generation(path, gt_path, output_path, class_count_path, prop_count_path, context_path, graph_embedding):
    file_list = glob.glob(path + '/*.csv')
    for i, file in tqdm(enumerate(file_list)):
        st = time.time()
        filename = file.split('/')[-1]
        gt_file = f"{ground_truth_files}/{filename}"
        output_file = f"{output_path}/{filename}"
#         if filename != '3OCW1LDZ.csv':
#             continue
        
        !tl --log-file $tl_log_file clean -c label -o label_clean "$file" / \
        --url $es_url --index $es_index \
        get-fuzzy-augmented-matches -c label_clean \
        --auxiliary-fields {aux_field} \
        --auxiliary-folder "$temp_dir" / \
        --url $es_url --index $es_index \
        get-exact-matches -c label_clean \
        --auxiliary-fields {aux_field} \
        --auxiliary-folder "$temp_dir" / \
        / get-ngram-matches -c label_clean  \
        --auxiliary-fields {aux_field} \
        --auxiliary-folder "$temp_dir" \
        / ground-truth-labeler --gt-file "$gt_file" > "$output_file"

        for field in aux_field.split(','):
            aux_list = []
            for f in glob.glob(f'{temp_dir}/*{field}.tsv'):
                aux_list.append(pd.read_csv(f, sep='\t', dtype=object))
            aux_df = pd.concat(aux_list).drop_duplicates(subset=['qnode'])
            if field == 'class_count':
                class_count_file = f"{class_count_path}/{filename.strip('.csv')}_class_count.tsv"
                aux_df.to_csv(class_count_file, sep='\t', index=False)
            elif field == 'property_count':
                prop_count_file = f"{prop_count_path}/{filename.strip('.csv')}_prop_count.tsv"
                aux_df.to_csv(prop_count_file, sep='\t', index=False)
            elif field == 'context':
                context_file = f"{context_path}/{filename.strip('.csv')}_context.tsv"
                aux_df.to_csv(context_file, sep='\t', index=False)
            else:
                graph_embedding_file = f"{graph_embedding}/{filename.strip('.csv')}_graph_embedding_complex.tsv"
                aux_df.to_csv(graph_embedding_file, sep='\t', index=False)
        

In [7]:
if not copy_candidates_from_previous_version:
    candidate_generation(train_path, ground_truth_files, train_candidate_path, train_class_count, train_prop_count, train_context_path,train_graph_embedding)

In [8]:
if not copy_candidates_from_previous_version:
    candidate_generation(dev_path, ground_truth_files, dev_candidate_path, dev_class_count, dev_prop_count, dev_context_path, dev_graph_embedding)

## Feature Generation

In [9]:
def string_feature_generation(candidate_dir, output_path):
    file_list = glob.glob(candidate_dir + '/*.csv')
    for i, file in tqdm(enumerate(file_list)):
        if os.path.getsize(file) == 0:
            continue
        filename = file.split('/')[-1]
        output_file = f"{output_path}/{filename}"
        !tl --log-file $tl_log_file string-similarity -i --method symmetric_monge_elkan:tokenizer=word -o monge_elkan --threshold 0.85 $file \
            / string-similarity -i --method symmetric_monge_elkan:tokenizer=word -c label_clean kg_aliases -o monge_elkan_aliases --threshold 0.85  \
            / string-similarity -i --method jaro_winkler -o jaro_winkler --threshold 0.85  \
            / string-similarity -i --method levenshtein -o levenshtein --threshold 0.85  \
            / create-singleton-feature -o singleton \
            > $output_file

In [10]:
if not copy_string_features_from_previous_version:
    string_feature_generation(train_candidate_path, train_string_feature_path)
else:
    !cp $dev_output_path/$PREVIOUS_VERSION/string_features/*csv $dev_output_path/$VERSION/string_features

In [11]:
if not copy_string_features_from_previous_version:
    string_feature_generation(dev_candidate_path, dev_string_feature_path)
else:
    !cp $train_output_path/$PREVIOUS_VERSION/string_features/*csv $train_output_path/$VERSION/string_features

In [12]:
def pre_pseudo_gt_feature_generation(string_features_path, context_path, output_path):
    file_list = glob.glob(string_features_path + '/*.csv')
    for i, file in tqdm(enumerate(file_list)):
        if os.path.getsize(file) == 0:
            continue
        filename = file.split('/')[-1]
        context_file = f"{context_path}/{filename[:-4]}_context.tsv"
        output_file = f"{output_path}/{filename}"
        !tl --log-file $tl_log_file pick-hc-candidates -o ignore_candidate $file \
        -s monge_elkan,monge_elkan_aliases,jaro_winkler,levenshtein \
        / context-match --debug --context-file $context_file --ignore-column-name ignore_candidate -o context_score \
        / kth-percentile -c context_score -o kth_percenter --ignore-column ignore_candidate --k-percentile 0.75  --minimum-cells 10 > $output_file

In [13]:
pre_pseudo_gt_feature_generation(train_string_feature_path,train_context_path, train_context_feature_path)

36it [19:30, 22.65s/it] 

  kp = KthPercentile(input_file=kwargs['input_file'],


118it [54:58, 20.13s/it]

  kp = KthPercentile(input_file=kwargs['input_file'],


258it [1:53:42, 19.25s/it] 

  kp = KthPercentile(input_file=kwargs['input_file'],


293it [2:07:19, 26.07s/it]


In [14]:
pre_pseudo_gt_feature_generation(dev_string_feature_path,dev_context_path, dev_context_feature_path)

51it [18:34, 21.85s/it]


In [15]:
dev_string_feature_path
dev_context_path
dev_context_feature_path

'/Users/amandeep/Github/table-linker/data/SemTab2020/Round4/table-linker/dev-output/v19/context_features'

In [16]:
def rest_of_features_generation(context_features_path, class_count_dir, property_count_dir, output_path):
    file_list = glob.glob(context_features_path + '/*.csv')
    for i, file in tqdm(enumerate(file_list)):
        filename = file.split('/')[-1]
        if os.path.getsize(file) == 0:
            continue

        class_count_file = f"{class_count_dir}/{filename[:-4]}_class_count.tsv"
        property_count_file = f"{property_count_dir}/{filename[:-4]}_prop_count.tsv"
        output_file = f"{output_path}/{filename}"
        !tl  --log-file $tl_log_file pgt-semantic-tf-idf $file \
            -o smc_class_score \
            --pagerank-column pagerank \
            --retrieval-score-column retrieval_score \
            --feature-file "$class_count_file" \
            --feature-name class_count \
            --high-confidence-column kth_percenter \
            / pgt-semantic-tf-idf \
            -o smc_property_score \
            --pagerank-column pagerank \
            --retrieval-score-column retrieval_score \
            --feature-file "$property_count_file" \
            --feature-name property_count \
            --high-confidence-column kth_percenter \
            > $output_file

In [17]:
rest_of_features_generation(train_context_feature_path, train_class_count, train_prop_count, train_feature_path)

36it [04:50,  4.96s/it]

  tfidf_unit = SemanticsFeature(kwargs['output_column_name'],
  tfidf_unit = SemanticsFeature(kwargs['output_column_name'],


118it [12:36,  7.10s/it]

  tfidf_unit = SemanticsFeature(kwargs['output_column_name'],
  tfidf_unit = SemanticsFeature(kwargs['output_column_name'],


258it [25:27,  4.25s/it]

  tfidf_unit = SemanticsFeature(kwargs['output_column_name'],
  tfidf_unit = SemanticsFeature(kwargs['output_column_name'],


293it [28:51,  5.91s/it]


In [18]:
rest_of_features_generation(dev_context_feature_path, dev_class_count, dev_prop_count, dev_feature_path)

51it [04:40,  5.50s/it]


### Generate Training Data

In [19]:
def merge_files(args):
    datapath = args.train_path
    df_list  = []
    for fn in glob.glob(f"{datapath}/*csv"):
        if os.path.getsize(fn) == 0:
            continue
        fid = fn.split('/')[-1][:-4]
        df = pd.read_csv(fn)
        df['table_id'] = fid
        df['context_score'].fillna(0.0, inplace=True)
        df_list.append(df)            
    return pd.concat(df_list) 

def compute_normalization_factor(args, all_data):
    min_max_scaler_path = args.min_max_scaler_path
    all_data_features = all_data[features]
    scaler = MinMaxScaler()
    scaler.fit(all_data_features)
    pickle.dump(scaler, open(min_max_scaler_path, 'wb'))
    return scaler

In [20]:
def generate_train_data(args):
    scaler_path = args.min_max_scaler_path
    scaler = pickle.load(open(scaler_path, 'rb'))
    final_list = []
    sfeatures = copy.deepcopy(features) + ['evaluation_label']
    normalize_features = features
    evaluation_label = ['evaluation_label']
    positive_features_final = []
    negative_features_final = []
    for i,file in enumerate(glob.glob(args.train_path + '/*.csv')):
        file_name = file.split('/')[-1]
        if os.path.getsize(file) == 0:
                continue
        d_sample = pd.read_csv(file)
        d_sample['context_score'].fillna(0.0, inplace=True)
        grouped_obj = d_sample.groupby(['column', 'row'])
        for cell in grouped_obj:
            cell[1][normalize_features] = scaler.transform(cell[1][normalize_features])
            pos_features = []
            neg_features = []
            a = cell[1][cell[1]['evaluation_label'] == 1]
            if a.empty:
                continue
            pos_rows = cell[1][(cell[1]['evaluation_label'].astype(int) == 1) & (cell[1]['ignore_candidate'].astype(int) == 0)][features].to_numpy()
            for i in range(len(pos_rows)):
                pos_features.append(pos_rows[i])
            neg_rows = cell[1][(cell[1]['evaluation_label'].astype(int) == -1) & (cell[1]['ignore_candidate'].astype(int) == 0)][features].to_numpy()
            for i in range(min(50,len(neg_rows))):
                neg_features.append(neg_rows[i])
            random.shuffle(pos_features)
            random.shuffle(neg_features)
            positive_features_final.append(pos_features)
            negative_features_final.append(neg_features)
            
    print(len(positive_features_final), len(positive_features_final[3]))
    print(len(negative_features_final), len(negative_features_final[3]))
    pickle.dump(positive_features_final,open(args.pos_output,'wb'))
    pickle.dump(negative_features_final,open(args.neg_output,'wb'))

In [21]:
gen_training_data_args = Namespace(train_path=train_feature_path, pos_output=pos_output, neg_output=neg_output, 
                 min_max_scaler_path=min_max_scaler_path)
all_data = merge_files(gen_training_data_args)
scaler = compute_normalization_factor(gen_training_data_args, all_data)

  all_data = merge_files(gen_training_data_args)
  all_data = merge_files(gen_training_data_args)


In [22]:
len(all_data)

1715832

In [23]:
all_data.head()

Unnamed: 0,column,row,label,context,filename,column-id,label_clean,kg_id,kg_labels,kg_aliases,...,reverse_context_property,reverse_context_similarity,reverse_context_property_similarity_q_node,kth_percenter,pgr_rts,smc_class_score,top5_smc_class_score,smc_property_score,top5_smc_property_score,table_id
0,0,1,Shalygino,2363|160|12,042AKDN1.csv,042AKDN1.csv-0,Shalygino,Q4519780,Shalygino|Schalyhyne,,...,,,,1,1.4455e-07,0.480665,Q7216840:0.102|Q2989457:0.090|Q203323:0.066|Q1...,0.345722,P1077:0.092|P1376:0.068|P2046:0.060|P1082:0.05...,042AKDN1
1,0,2,Chernelytsia,1564|305|31.4,042AKDN1.csv,042AKDN1.csv-0,Chernelytsia,Q4513379,Tschernelyzja|Chernelytsia,,...,,,,1,1.545824e-07,0.480665,Q7216840:0.102|Q2989457:0.090|Q203323:0.066|Q1...,0.295434,P1077:0.092|P1376:0.068|P2046:0.060|P1082:0.05...,042AKDN1
2,0,4,Stanytsia Luhanska,14543|40|14.6,042AKDN1.csv,042AKDN1.csv-0,Stanytsia Luhanska,Q4439422,Stanytsia Luhanska|Stanyzja Luhanska|Luganska,,...,,,,1,8.201895e-07,0.482599,Q7216840:0.102|Q2989457:0.090|Q203323:0.066|Q1...,0.357818,P1077:0.092|P1376:0.068|P2046:0.060|P1082:0.05...,042AKDN1
3,0,5,Slatyne,6483|129|7.53,042AKDN1.csv,042AKDN1.csv-0,Slatyne,Q4423206,Slatyne,,...,,,,1,1.04034e-07,0.480665,Q7216840:0.102|Q2989457:0.090|Q203323:0.066|Q1...,0.33959,P1077:0.092|P1376:0.068|P2046:0.060|P1082:0.05...,042AKDN1
4,0,8,Mykhailo-Kotsiubynske,3028|133|5.5,042AKDN1.csv,042AKDN1.csv-0,Mykhailo-Kotsiubynske,Q4297269,Mykhailo-Kotsiubynske|Mychajlo-Kozjubynske,,...,,,,1,1.859499e-07,0.480665,Q7216840:0.102|Q2989457:0.090|Q203323:0.066|Q1...,0.310358,P1077:0.092|P1376:0.068|P2046:0.060|P1082:0.05...,042AKDN1


In [24]:
pos = all_data[(all_data['evaluation_label'] == 1) & (all_data['ignore_candidate'] == 0)]
neg = all_data[(all_data['evaluation_label'] == -1) & (all_data['ignore_candidate'] == 0)]
print(len(pos))
print(len(neg))
pos.head()

14073
25764


Unnamed: 0,column,row,label,context,filename,column-id,label_clean,kg_id,kg_labels,kg_aliases,...,reverse_context_property,reverse_context_similarity,reverse_context_property_similarity_q_node,kth_percenter,pgr_rts,smc_class_score,top5_smc_class_score,smc_property_score,top5_smc_property_score,table_id
0,0,1,Shalygino,2363|160|12,042AKDN1.csv,042AKDN1.csv-0,Shalygino,Q4519780,Shalygino|Schalyhyne,,...,,,,1,1.4455e-07,0.480665,Q7216840:0.102|Q2989457:0.090|Q203323:0.066|Q1...,0.345722,P1077:0.092|P1376:0.068|P2046:0.060|P1082:0.05...,042AKDN1
1,0,2,Chernelytsia,1564|305|31.4,042AKDN1.csv,042AKDN1.csv-0,Chernelytsia,Q4513379,Tschernelyzja|Chernelytsia,,...,,,,1,1.545824e-07,0.480665,Q7216840:0.102|Q2989457:0.090|Q203323:0.066|Q1...,0.295434,P1077:0.092|P1376:0.068|P2046:0.060|P1082:0.05...,042AKDN1
2,0,4,Stanytsia Luhanska,14543|40|14.6,042AKDN1.csv,042AKDN1.csv-0,Stanytsia Luhanska,Q4439422,Stanytsia Luhanska|Stanyzja Luhanska|Luganska,,...,,,,1,8.201895e-07,0.482599,Q7216840:0.102|Q2989457:0.090|Q203323:0.066|Q1...,0.357818,P1077:0.092|P1376:0.068|P2046:0.060|P1082:0.05...,042AKDN1
3,0,5,Slatyne,6483|129|7.53,042AKDN1.csv,042AKDN1.csv-0,Slatyne,Q4423206,Slatyne,,...,,,,1,1.04034e-07,0.480665,Q7216840:0.102|Q2989457:0.090|Q203323:0.066|Q1...,0.33959,P1077:0.092|P1376:0.068|P2046:0.060|P1082:0.05...,042AKDN1
4,0,8,Mykhailo-Kotsiubynske,3028|133|5.5,042AKDN1.csv,042AKDN1.csv-0,Mykhailo-Kotsiubynske,Q4297269,Mykhailo-Kotsiubynske|Mychajlo-Kozjubynske,,...,,,,1,1.859499e-07,0.480665,Q7216840:0.102|Q2989457:0.090|Q203323:0.066|Q1...,0.310358,P1077:0.092|P1376:0.068|P2046:0.060|P1082:0.05...,042AKDN1


In [25]:
generate_train_data(gen_training_data_args)

  generate_train_data(gen_training_data_args)
  generate_train_data(gen_training_data_args)
  generate_train_data(gen_training_data_args)


12542 0
12542 0


### Model Definition

In [26]:
# Dataset
class T2DV2Dataset(Dataset):
    def __init__(self, pos_features, neg_features):
        self.pos_features = pos_features
        self.neg_features = neg_features
    
    def __len__(self):
        return len(self.pos_features)
    
    def __getitem__(self, idx):
        return self.pos_features[idx], self.neg_features[idx]

# Model
class PairwiseNetwork(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        #original 12x24, 24x12, 12x12, 12x1
        self.fc1 = nn.Linear(hidden_size, 2*hidden_size)
        self.fc2 = nn.Linear(2*hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, 1)
    
    def forward(self, pos_features, neg_features):
        # Positive pass
        x = F.relu(self.fc1(pos_features))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        pos_out = torch.sigmoid(self.fc4(x))
        
        # Negative Pass
        x = F.relu(self.fc1(neg_features))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        neg_out = torch.sigmoid(self.fc4(x))
        
        return pos_out, neg_out
    
    def predict(self, test_feat):
        x = F.relu(self.fc1(test_feat))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        test_out = torch.sigmoid(self.fc4(x))
        return test_out

# Pairwise Loss
class PairwiseLoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.m = 0
    
    def forward(self, pos_out, neg_out):
        distance = (1 - pos_out) + neg_out
        loss = torch.mean(torch.max(torch.tensor(0), distance))
        return loss

### Training

In [27]:
def generate_dataloader(positive_feat_path, negative_feat_path):
    pos_features = pickle.load(open(positive_feat_path, 'rb'))
    neg_features = pickle.load(open(negative_feat_path, 'rb'))

    pos_features_flatten = list(chain.from_iterable(pos_features))
    neg_features_flatten = list(chain.from_iterable(neg_features))

    train_dataset = T2DV2Dataset(pos_features_flatten, neg_features_flatten)
    train_dataloader = DataLoader(train_dataset, batch_size=64)
    return train_dataloader

def infer_scores(min_max_scaler_path, input_table_path, output_table_path, model):
    scaler = pickle.load(open(min_max_scaler_path, 'rb'))
    normalize_features = features
    for file in glob.glob(input_table_path + '/*.csv'):
        file_name = file.split('/')[-1]
        if os.path.getsize(file) == 0:
                continue
                
        d_sample = pd.read_csv(file)
        d_sample['context_score'].fillna(0.0, inplace=True)
        grouped_obj = d_sample.groupby(['column', 'row'])
        new_df_list = []
        pred = []
        for cell in grouped_obj:
            cell[1][normalize_features] = scaler.transform(cell[1][normalize_features])
            sorted_df = cell[1].sort_values('context_score', ascending=False)
            sorted_df_features = sorted_df[normalize_features]
            new_df_list.append(sorted_df)
            arr = sorted_df_features.to_numpy()
            test_inp = []
            for a in arr:
                test_inp.append(a)
            test_tensor = torch.tensor(test_inp).float()
            scores = model.predict(test_tensor)
            scores_list = torch.squeeze(scores).tolist()
            if not type(scores_list) is list:
                pred.append(scores_list)
            else:
                pred.extend(scores_list)
        test_df = pd.concat(new_df_list)
        test_df[final_score_column] = pred
        test_df.to_csv(f"{output_table_path}/{file_name}", index=False)

def train(args):
    if torch.cuda.is_available():
        device = torch.device('cuda')
    
    else:
        device = torch.device('cpu')
    train_dataloader = generate_dataloader(args.positive_feat_path, args.negative_feat_path)
    criterion = PairwiseLoss()
    EPOCHS = args.num_epochs
    model = PairwiseNetwork(len(features)).to(device=device)
    optimizer = Adam(model.parameters(), lr=args.lr)
    top1_max_prec = 0
    for epoch in range(EPOCHS):
        train_epoch_loss = 0
        avg_loss = 0
        model.train()
        for bid, batch in tqdm(enumerate(train_dataloader), position=0, leave=True):
            positive_feat = torch.tensor(batch[0].float())
            negative_feat = torch.tensor(batch[1].float())
            optimizer.zero_grad()
            pos_out, neg_out = model(positive_feat, negative_feat)
            loss = criterion(pos_out, neg_out)
            loss.backward()
            optimizer.step()
            train_epoch_loss += loss
        avg_loss = train_epoch_loss / bid

        # Evaluation
        model.eval()
        infer_scores(args.min_max_scaler_path, args.dev_path, args.dev_output, model)
        eval_data = merge_eval_files(args.dev_output)
        res, candidate_eval_data = parse_eval_files_stats(eval_data, final_score_column)
        top1_precision = res['num_tasks_with_model_score_top_one_accurate']/res['num_tasks_with_gt']
        if top1_precision > top1_max_prec:
            top1_max_prec = top1_precision
            model_save_name = 'epoch_{}_loss_{}_top1_{}.pth'.format(epoch, avg_loss, top1_max_prec)
            best_model_path = os.path.join(args.model_save_path, model_save_name)
            torch.save(model.state_dict(), best_model_path)
        
        print("Epoch {}, Avg Loss is {}, epoch top1 {}, max top1 {}".format(epoch, avg_loss, top1_precision, top1_max_prec))
    return best_model_path

In [28]:
def merge_eval_files(final_score_path):
    eval_file_names = []
    df_list = []
    for (dirpath, dirnames, filenames) in os.walk(final_score_path):
        for fn in filenames:
            if "csv" not in fn:
                continue
            abs_fn = os.path.join(dirpath, fn)
            assert os.path.isfile(abs_fn)
            if os.path.getsize(abs_fn) == 0:
                continue
            eval_file_names.append(abs_fn)
    
    for fn in eval_file_names:
        fid = fn.split('/')[-1].split('.csv')[0]
        df = pd.read_csv(fn)
        df['table_id'] = fid
        df_list.append(df)
    return pd.concat(df_list)

def parse_eval_files_stats(eval_data, method):
    res = {}
    candidate_eval_data = eval_data.groupby(['table_id', 'column', 'row'])['table_id'].count().reset_index(name="count")
    res['num_tasks_with_gt'] = len(eval_data[pd.notna(eval_data['GT_kg_id'])].groupby(['table_id', 'column', 'row']))
    num_tasks_with_model_score_top_one_accurate = []
    num_tasks_with_model_score_top_five_accurate = []
    num_tasks_with_model_score_top_ten_accurate = []
    has_gt_list = []
    has_gt_in_candidate = []
    for i, row in candidate_eval_data.iterrows():
        table_id, row_idx, col_idx = row['table_id'], row['row'], row['column']
        c_e_data = eval_data[(eval_data['table_id'] == table_id) & (eval_data['row'] == row_idx) & (eval_data['column'] == col_idx)]
        assert len(c_e_data) > 0
        if np.nan not in set(c_e_data['GT_kg_id']):
            has_gt_list.append(1)
        else:
            has_gt_list.append(0)
        if 1 in set(c_e_data['evaluation_label']):
            has_gt_in_candidate.append(1)
        else:
            has_gt_in_candidate.append(0)
                    
        #rank on model score
        s_data = c_e_data.sort_values(by=[method], ascending=False)
        if s_data.iloc[0]['evaluation_label'] == 1:
            num_tasks_with_model_score_top_one_accurate.append(1)
        else:
            num_tasks_with_model_score_top_one_accurate.append(0)
        if 1 in set(s_data.iloc[0:5]['evaluation_label']):
            num_tasks_with_model_score_top_five_accurate.append(1)
        else:
            num_tasks_with_model_score_top_five_accurate.append(0)
        if 1 in set(s_data.iloc[0:10]['evaluation_label']):
            num_tasks_with_model_score_top_ten_accurate.append(1)
        else:
            num_tasks_with_model_score_top_ten_accurate.append(0)
            
    res['num_tasks_with_model_score_top_one_accurate'] = sum(num_tasks_with_model_score_top_one_accurate)
    res['num_tasks_with_model_score_top_five_accurate'] = sum(num_tasks_with_model_score_top_five_accurate)
    res['num_tasks_with_model_score_top_ten_accurate'] = sum(num_tasks_with_model_score_top_ten_accurate)
    return res, candidate_eval_data

In [29]:
training_args = Namespace(num_epochs=20, lr=0.001, positive_feat_path=pos_output, negative_feat_path=neg_output,
                         dev_path=dev_feature_path, dev_output=dev_output_predictions,
                         model_save_path=model_save_path, min_max_scaler_path=min_max_scaler_path)

In [30]:
## Call Training
best_model_path = train(training_args)

  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
220it [00:00, 649.03it/s]


Epoch 0, Avg Loss is 0.7899935841560364, epoch top1 0.7430846605196982, max top1 0.7430846605196982


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
220it [00:00, 623.39it/s]


Epoch 1, Avg Loss is 0.37072333693504333, epoch top1 0.7393126571668064, max top1 0.7430846605196982


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
220it [00:00, 639.10it/s]


Epoch 2, Avg Loss is 0.21430431306362152, epoch top1 0.6106454316848282, max top1 0.7430846605196982


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
220it [00:00, 651.60it/s]


Epoch 3, Avg Loss is 0.18903601169586182, epoch top1 0.6102263202011735, max top1 0.7430846605196982


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
220it [00:00, 667.66it/s]


Epoch 4, Avg Loss is 0.180001363158226, epoch top1 0.6106454316848282, max top1 0.7430846605196982


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
220it [00:00, 658.02it/s]


Epoch 5, Avg Loss is 0.17531432211399078, epoch top1 0.6106454316848282, max top1 0.7430846605196982


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
220it [00:00, 643.70it/s]


Epoch 6, Avg Loss is 0.172470822930336, epoch top1 0.6098072087175188, max top1 0.7430846605196982


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
220it [00:00, 650.31it/s]


Epoch 7, Avg Loss is 0.17050376534461975, epoch top1 0.6098072087175188, max top1 0.7430846605196982


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
220it [00:00, 629.27it/s]


Epoch 8, Avg Loss is 0.16901221871376038, epoch top1 0.6102263202011735, max top1 0.7430846605196982


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
220it [00:00, 657.60it/s]


Epoch 9, Avg Loss is 0.16794174909591675, epoch top1 0.6106454316848282, max top1 0.7430846605196982


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
220it [00:00, 634.61it/s]


Epoch 10, Avg Loss is 0.16709168255329132, epoch top1 0.6110645431684828, max top1 0.7430846605196982


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
220it [00:00, 608.83it/s]


Epoch 11, Avg Loss is 0.16633741557598114, epoch top1 0.6110645431684828, max top1 0.7430846605196982


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
220it [00:00, 653.26it/s]


Epoch 12, Avg Loss is 0.16568487882614136, epoch top1 0.6114836546521375, max top1 0.7430846605196982


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
220it [00:00, 635.04it/s]


Epoch 13, Avg Loss is 0.1650778353214264, epoch top1 0.6114836546521375, max top1 0.7430846605196982


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
220it [00:00, 652.05it/s]


Epoch 14, Avg Loss is 0.16451068222522736, epoch top1 0.6119027661357921, max top1 0.7430846605196982


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
220it [00:00, 638.56it/s]


Epoch 15, Avg Loss is 0.1639711856842041, epoch top1 0.6119027661357921, max top1 0.7430846605196982


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
220it [00:00, 639.54it/s]


Epoch 16, Avg Loss is 0.16347399353981018, epoch top1 0.6114836546521375, max top1 0.7430846605196982


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
220it [00:00, 659.63it/s]


Epoch 17, Avg Loss is 0.16299942135810852, epoch top1 0.6127409891031014, max top1 0.7430846605196982


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
220it [00:00, 620.28it/s]


Epoch 18, Avg Loss is 0.16258816421031952, epoch top1 0.6127409891031014, max top1 0.7430846605196982


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
220it [00:00, 582.33it/s]


Epoch 19, Avg Loss is 0.16222070157527924, epoch top1 0.6131601005867561, max top1 0.7430846605196982


In [31]:
best_model_path

'/Users/amandeep/Github/table-linker/data/SemTab2020/Round4/table-linker/dev-output/v19/saved_models/epoch_0_loss_0.7899935841560364_top1_0.7430846605196982.pth'

In [None]:
best_model_path = '/Users/amandeep/Github/table-linker/data/SemTab2020/Round4/table-linker/dev-output/v17/saved_models/epoch_6_loss_0.18199089169502258_top1_0.7363788767812238.pth'

## Dev Prediction

In [32]:
def dev_prediction(dev_feature_path, dev_predictions_top_k, saved_model, output_column, min_max_scaler_path, k=5):
    for file in glob.glob(dev_feature_path + '/*.csv'):
        filename = file.split("/")[-1]
        print(filename)
        feature_str =  ",".join(features)
        if os.path.getsize(file) == 0:
            continue
        # location where the output generated by the predictions wil be stored.
        dev_output = f"{dev_predictions_top_k}/{filename}"
        !tl --log-file $tl_log_file predict-using-model $file -o $output_column \
            --features $feature_str \
            --ranking-model $saved_model \
            --normalization-factor $min_max_scaler_path \
            / create-pseudo-gt \
            --column-thresholds $threshold \
            --filter smc_class_score:0 \
            / get-kg-links -c pseudo_gt -k $k --k-rows \
            > $dev_output

In [33]:
def add_color(dev_predictions_top_k, dev_colorized_path, score_column, k=5):
    for file in glob.glob(dev_predictions_top_k + '/*.csv'):
        filename = file.split("/")[-1]
        print(filename)
        if os.path.getsize(file) == 0:
                    continue
                
        dev_color_file = f"{dev_colorized_path}/{filename[:-4]}.xlsx"
        !tl add-color $file -c "$score_column,evaluation_label" -k $k --output $dev_color_file

In [34]:
def compute_metrics(dev_predictions_top_k, dev_predictions_metrics, score_column, k=5):
    df_list = []
    for file in glob.glob(dev_predictions_top_k + '/*.csv'):
        filename = file.split("/")[-1]
        if os.path.getsize(file) == 0:
                    continue
                
        dev_metrics_file = f"{dev_predictions_metrics}/{filename}"
        !tl --log-file $tl_log_file metrics $file -k $k -c $score_column --tag $filename> $dev_metrics_file
        df_list.append(pd.read_csv(dev_metrics_file))
    return pd.concat(df_list)

In [35]:
def compute_custom_metrics(dev_predictions_top_k):
    df_list = []
    for file in glob.glob(dev_predictions_top_k+"/*.csv"):
        filename = file.split("/")[-1]
        print(filename)
        if os.path.getsize(file) == 0:
                    continue
        df = pd.read_csv(file)
        col_df = []
        for col, coldf in df.groupby(by=["column"]):
            rows = 0
            pgt_rows = 0
            pgt_recall = 0
            unignored_rows = 0
            unignored_candidates = 0
            unignored_correct = 0
            ignored_correct = 0
            kth_perc_rows = 0
            kth_perc_correct = 0
            kth_perc_candidates = 0
            for row, rowdf in coldf.groupby(by=["row"]):
                rows += 1
                p_count = rowdf[(rowdf["pseudo_gt"] == 1)].shape[0]
                if p_count > 0:
                    pgt_rows += 1
                p_recall = rowdf[((rowdf["pseudo_gt"] == 1) & (rowdf["evaluation_label"] == 1))].shape[0]
                pgt_recall += p_recall
                ignore_0_count = rowdf[rowdf["ignore_candidate"] == 0].shape[0]
                if ignore_0_count > 0:
                    unignored_rows += 1
                unignored_candidates += ignore_0_count
                unignored_correct += rowdf[((rowdf["ignore_candidate"] == 0) & (rowdf["evaluation_label"] == 1))].shape[0]
                ignored_correct += rowdf[((rowdf["ignore_candidate"] == 1) & (rowdf["evaluation_label"] == 1))].shape[0]
                kth_perc_1_count = rowdf[rowdf["kth_percenter"] == 1].shape[0]
                if kth_perc_1_count > 0:
                    kth_perc_rows += 1
                kth_perc_candidates += kth_perc_1_count
                kth_perc_correct += rowdf[((rowdf["kth_percenter"] == 1) & (rowdf["evaluation_label"] == 1))].shape[0]
            unique_rows = coldf["label"].nunique()
            col_df.append(pd.DataFrame([{
                "filename":filename,
                "column": col,
                "rows": rows,
                "unique_rows": unique_rows,
                "pgt_rows": pgt_rows,
                "pgt_recall": pgt_recall,
                "pgt_accuracy": pgt_recall/pgt_rows if pgt_rows!=0 else 0,
                "unignored_rows": unignored_rows,
                "unignored_candidates": unignored_candidates,
                "unignored_correct": unignored_correct,
                "ignored_correct": ignored_correct,
                "ignore_candidate_accuracy": unignored_correct/unignored_rows if unignored_rows != 0 else 0,
                "kth_percenter_rows": kth_perc_rows,
                "kth_percenter_candidates": kth_perc_candidates,
                "kth_percenter_correct": kth_perc_correct,
                "kth_percenter_accuracy": kth_perc_correct/kth_perc_rows
            }]))
        df_list.append(pd.concat(col_df))
    return pd.concat(df_list)

In [36]:
print(dev_feature_path, dev_predictions_top_k, best_model_path, final_score_column, min_max_scaler_path)

/Users/amandeep/Github/table-linker/data/SemTab2020/Round4/table-linker/dev-output/v19/features /Users/amandeep/Github/table-linker/data/SemTab2020/Round4/table-linker/dev-output/v19/dev_predictions_top_k /Users/amandeep/Github/table-linker/data/SemTab2020/Round4/table-linker/dev-output/v19/saved_models/epoch_0_loss_0.7899935841560364_top1_0.7430846605196982.pth siamese_prediction /Users/amandeep/Github/table-linker/data/SemTab2020/Round4/table-linker/temp/training_data/normalization_factor.pkl


In [37]:
dev_prediction(dev_feature_path, dev_predictions_top_k, best_model_path, final_score_column, min_max_scaler_path, k=5)

4PKEEJU4.csv
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_i[output_column] = -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[r

In [38]:
metrics_df = compute_custom_metrics(dev_predictions_top_k)

4PKEEJU4.csv
4DC5O5I4.csv
58E7A5WL.csv
0FQOOJPU.csv
5IXA0RAI.csv
0LF7RI6N.csv
2PKG4E2V.csv
50NWQJ1T.csv
44NDHWR1.csv
39W7XXTI.csv
3WXFYEAX.csv
28D7RBJT.csv
1GUVMENF.csv
0QWF60VG.csv
3OCW1LDZ.csv
4V4O0CTS.csv
29D1VZHF.csv
4FG1UN8O.csv
0LZ0M8W4.csv
3B54GZSX.csv
1ZFRQBQS.csv
0WBTX8LY.csv
1NS33P8C.csv
2TGNKH1P.csv
53OUTCE4.csv
5TJI4XTK.csv
4DPRZWVL.csv
4SOL8H0M.csv
080HU8A5.csv
3OAZEVOY.csv
13BLTPJD.csv
3M5QXPWN.csv
0G9YPQC0.csv
1YPLVLS9.csv
0CETTKTA.csv
1GOKLC0K.csv
2QFYH2N9.csv
00ECUL14.csv
2E6QBLCA.csv
1T91CHXV.csv
59W76Q0Y.csv
5PKTGQ6Q.csv
3JNFST2K.csv
1XIWQBSF.csv
2YCSL7OH.csv
2FSRG0OI.csv
0P8H49LQ.csv
2FXR6BX7.csv
1PTL0CX1.csv
0TQXSY28.csv
0JSF530F.csv


In [None]:
metrics_df = compute_metrics(dev_predictions_top_k, dev_metrics_path, 'pseudo_gt', k=200)

In [40]:
metrics_df

Unnamed: 0,filename,column,rows,unique_rows,pgt_rows,pgt_recall,pgt_accuracy,unignored_rows,unignored_candidates,unignored_correct,ignored_correct,ignore_candidate_accuracy,kth_percenter_rows,kth_percenter_candidates,kth_percenter_correct,kth_percenter_accuracy
0,4PKEEJU4.csv,0,20,20,5,5,1.0,10,10,10,10,1.0,10,10,10,1.0
0,4PKEEJU4.csv,1,20,7,3,3,1.0,7,7,7,13,1.0,5,5,5,1.0
0,4PKEEJU4.csv,2,20,5,3,3,1.0,5,8,5,15,1.0,5,8,5,1.0
0,4DC5O5I4.csv,0,20,20,5,5,1.0,10,10,10,10,1.0,10,10,10,1.0
0,58E7A5WL.csv,0,22,18,10,8,0.8,10,12,10,12,1.0,10,12,10,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,2FXR6BX7.csv,4,20,20,5,5,1.0,10,10,10,10,1.0,10,10,10,1.0
0,1PTL0CX1.csv,0,20,20,5,5,1.0,10,10,10,10,1.0,10,10,10,1.0
0,1PTL0CX1.csv,2,20,20,5,5,1.0,10,10,10,10,1.0,9,9,9,1.0
0,0TQXSY28.csv,0,20,20,5,5,1.0,10,10,10,10,1.0,10,10,10,1.0


In [39]:
metrics_df['recall'].mean()

KeyError: 'recall'

In [None]:
metrics_df.to_csv(f"{dev_metrics_path}/metrics_200_og.csv", index=False)

In [None]:
metrics_df = compute_custom_metrics(dev_predictions_top_k, dev_metrics_path, final_score_column, k=1)

In [None]:
metrics_df = compute_metrics(dev_predictions_top_k, dev_metrics_path, 'pseudo_gt', k=1)

In [None]:
metrics_df

In [None]:
metrics_df['precision'].mean()

In [41]:
metrics_df.to_csv(f"{dev_metrics_path}/metrics_1.csv", index=False)

In [42]:
add_color(dev_predictions_top_k, dev_colorized_path, final_score_column)

4PKEEJU4.csv
add-color Time: 0.14677190780639648s
4DC5O5I4.csv
add-color Time: 0.08221101760864258s
58E7A5WL.csv
add-color Time: 0.10615706443786621s
0FQOOJPU.csv
add-color Time: 0.10671710968017578s
5IXA0RAI.csv
add-color Time: 0.08566594123840332s
0LF7RI6N.csv
add-color Time: 0.11015892028808594s
2PKG4E2V.csv
add-color Time: 0.14330315589904785s
50NWQJ1T.csv
add-color Time: 0.1396961212158203s
44NDHWR1.csv
add-color Time: 0.17977285385131836s
39W7XXTI.csv
add-color Time: 0.12196207046508789s
3WXFYEAX.csv
add-color Time: 0.0745081901550293s
28D7RBJT.csv
add-color Time: 0.09549689292907715s
1GUVMENF.csv
add-color Time: 0.0959320068359375s
0QWF60VG.csv
add-color Time: 0.1136789321899414s
3OCW1LDZ.csv
add-color Time: 0.11468911170959473s
4V4O0CTS.csv
add-color Time: 0.10395312309265137s
29D1VZHF.csv
add-color Time: 0.11680197715759277s
4FG1UN8O.csv
add-color Time: 0.1550600528717041s
0LZ0M8W4.csv
add-color Time: 0.1181497573852539s
3B54GZSX.csv
add-color Time: 0.12894678115844727s
1ZFRQB

In [None]:
def find_missing_correct_candidates(candidates_path, missing_correct_candidates_path):
     for file in tqdm(glob.glob(candidates_path + '/*.csv')):
        filename = file.split("/")[-1]
        if os.path.getsize(file) == 0:
                    continue
        missing_file = f"{missing_correct_candidates_path}/{filename}"
        !tl check-candidates "$file" > "$missing_file"

In [None]:
find_missing_correct_candidates(dev_candidate_path, dev_missing_candidates_path)

In [None]:
def concat_files(files_path):
    df_list = []
    for file in glob.glob(files_path + '/*.csv'):
        filename = file.split("/")[-1]
        print(filename)
        if os.path.getsize(file) == 0:
                    continue
        df = pd.read_csv(file)
        df['filename'] = filename
        df_list.append(df)
    return pd.concat(df_list)

In [None]:
missing_df = concat_files(dev_missing_candidates_path)

In [None]:
missing_df

In [None]:
missing_df.to_csv(f"{dev_missing_candidates_path}/missing_concatenated.csv", index=False)

In [None]:
find_missing_correct_candidates(train_candidates_path, train_missing_candidates_path)

In [None]:
train_missing_df = concat_files(train_missing_candidates_path)

In [None]:
len(train_missing_df)

In [None]:
train_missing_df

In [None]:
train_missing_df.to_csv(f"{train_missing_candidates_path}/missing_concatenated.csv", index=False)