# Package loading 

In [31]:
# Imports 
import os
import json
import argparse
import time
import ipdb
import spacy
import torch
import optuna
import pickle

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
from tqdm import tqdm
from collections import deque

import torch.optim as optim
from torchtext import data
import torch.nn as nn
import torch.nn.functional as F
from torchsummary import summary
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import RobertaTokenizer, BertModel, TransfoXLTokenizer, TransfoXLModel, AdamW
from transformers import BigBirdTokenizer, BigBirdForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import LongformerTokenizer, LongformerForSequenceClassification
from transformers import XLNetTokenizer, XLNetForSequenceClassification

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

# Data Exploration

In [3]:
# train_path = "/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/paperwithcode/5Neg10unk/twofoldwithunk/fold1/train.tsv"
# valid_path = "/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/paperwithcode/5Neg10unk/twofoldwithunk/fold1/dev.tsv"

# train_path = "/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/paperwithcode/new/60Neg800unk/twofoldwithunk/fold1/train.tsv"
# valid_path = "/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/paperwithcode/new/60Neg800unk/twofoldwithunk/fold1/dev.tsv"

train_path = "/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/ibm/exp/few-shot-setup/NLP-TDMS/paperVersion/train.tsv"
valid_path = "/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/ibm/exp/few-shot-setup/NLP-TDMS/paperVersion/test.tsv"
output_path = "/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/ibm/exp/few-shot-setup/NLP-TDMS/paperVersion/torch/SciBert/"

model_pt_path = "/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/ibm/exp/few-shot-setup/NLP-TDMS/paperVersion/torch/SciBert/Model_SciBert_avg_metric_0.9001.pt"

N_EPOCHS = 3
model_name = "SciBert"
max_input_len = 512
# output_path = "/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/paperwithcode/new/60Neg800unk/twofoldwithunk/fold1/"
bs = 16

processors = {
      "Bert": [BertTokenizer, BertForSequenceClassification, "bert-base-uncased"],
      "SciBert": [BertTokenizer, BertForSequenceClassification, "allenai/scibert_scivocab_uncased"],
      "XLNet": [XLNetTokenizer, XLNetForSequenceClassification, "xlnet-base-cased"],
      "BigBird": [BigBirdTokenizer, BigBirdForSequenceClassification, "google/bigbird-roberta-base"],
      "Longformer": [LongformerTokenizer, LongformerForSequenceClassification, "allenai/longformer-base-4096"],
    }

In [4]:
train_df = pd.read_csv(train_path, 
                    sep="\t", names=["label", "title", "TDM", "Context"])

valid_df = pd.read_csv(valid_path, 
                   sep="\t", names=["label", "title", "TDM", "Context"])

In [5]:
train_df.head()

Unnamed: 0,label,title,TDM,Context
0,True,D16-1036.pdf,unknow,Multi-view Response Selection for Human-Comput...
1,False,D16-1036.pdf,question answering; SQuAD; F1,Multi-view Response Selection for Human-Comput...
2,False,D16-1036.pdf,relation prediction; FB15K-237; H@1,Multi-view Response Selection for Human-Comput...
3,False,D16-1036.pdf,word sense disambiguation; SemEval 2013; F1,Multi-view Response Selection for Human-Comput...
4,False,D16-1036.pdf,language modeling; 1B Words / Google Billion W...,Multi-view Response Selection for Human-Comput...


In [6]:
valid_df.head()

Unnamed: 0,label,title,TDM,Context
0,True,1803.11175.pdf,sentiment analysis; SUBJ; Accuracy,Universal Sentence Encoder We present models f...
1,True,1803.11175.pdf,text classification; TREC; Error,Universal Sentence Encoder We present models f...
2,False,1803.11175.pdf,question answering; SQuAD; F1,Universal Sentence Encoder We present models f...
3,False,1803.11175.pdf,relation prediction; FB15K-237; H@1,Universal Sentence Encoder We present models f...
4,False,1803.11175.pdf,word sense disambiguation; SemEval 2013; F1,Universal Sentence Encoder We present models f...


# Model

In [7]:
if model_name in processors.keys():
    selected_processor = processors[model_name]
else:
    print(f"Model not available check selected model only {list(processors.keys())} as supported")
    quit()

if model_name == "SciBert":
    tokenizer = selected_processor[0].from_pretrained("bert-base-uncased")
else:
    tokenizer = selected_processor[0].from_pretrained(selected_processor[2])

init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [8]:
init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [9]:
if model_name == "SciBert":
    max_input_length = tokenizer.max_model_input_sizes["bert-base-uncased"]
else:
    max_input_length = tokenizer.max_model_input_sizes[selected_processor[2]]

if not max_input_length:
    max_input_length = max_input_len

print(f"Maximun sequence lenght {max_input_length}")

Maximun sequence lenght 512


In [10]:
class TransformersNLI(Dataset):
    def __init__(self, tokenizer, max_input_length):
        self.label_dict = {'True': 0, 'False': 1} # Default {'entailment': 0, 'contradiction': 1, 'neutral': 2}
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length        
        
    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        """Truncates a sequence pair in place to the maximum length."""

        # This is a simple heuristic which will always truncate the longer sequence
        # one token at a time. This makes more sense than truncating an equal percent
        # of tokens from each, since if one sequence is very short then each token
        # that's truncated likely contains more information than a longer sequence.
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= max_length:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()
        
    def load_data(self, df):
        MAX_LEN = self.max_input_length
        token_ids = []
        mask_ids = []
        seg_ids = []
        y = []

        premise_list = df['TDM'].to_list()           # df['sentence1'].to_list()
        hypothesis_list = df['Context'].to_list()    # df['sentence2'].to_list()
        label_list = df['label'].to_list()           # df['gold_label'].to_list()

        for (premise, hypothesis, label) in tqdm(zip(premise_list, hypothesis_list, label_list), total=len(label_list)):
            premise_id = self.tokenizer.encode(premise, add_special_tokens = False)
            hypothesis_id = self.tokenizer.encode(hypothesis, add_special_tokens = False)
            # ignore the warning as the ong sequence issuw is taken care of here 
            self._truncate_seq_pair(premise_id, hypothesis_id, MAX_LEN-3) # -3 to account for the special characters 
            
            pair_token_ids = [self.tokenizer.cls_token_id] + premise_id \
                            + [self.tokenizer.sep_token_id] + hypothesis_id \
                            + [self.tokenizer.sep_token_id]
            premise_len = len(premise_id)
            hypothesis_len = len(hypothesis_id)

            segment_ids = torch.tensor([0] * (premise_len + 2) + [1] * (hypothesis_len + 1))  # sentence 0 and sentence 1
            attention_mask_ids = torch.tensor([1] * (premise_len + hypothesis_len + 3))  # mask padded values

            token_ids.append(torch.tensor(pair_token_ids))
            seg_ids.append(segment_ids)
            mask_ids.append(attention_mask_ids)
            # we have str(label) to have the key work proprely 
            y.append(self.label_dict[str(label)]) # y.append(self.label_dict[label]) 
            
        token_ids = pad_sequence(token_ids, batch_first=True)
        mask_ids = pad_sequence(mask_ids, batch_first=True)
        seg_ids = pad_sequence(seg_ids, batch_first=True)
        y = torch.tensor(y)
        dataset = TensorDataset(token_ids, mask_ids, seg_ids, y)

        print(len(dataset))

        return dataset

    def get_train_data(self, train_df, batch_size=32, shuffle=True):
        train_data = self.load_data(train_df)
                    
        train_loader = DataLoader(
            train_data,
            shuffle=shuffle,
            batch_size=batch_size
            )

        return train_loader

    def get_valid_data(self, valid_df, batch_size=32, shuffle=True):
        valid_data = self.load_data(valid_df)
                    
        valid_loader = DataLoader(
            valid_data,
            shuffle=shuffle,
            batch_size=batch_size
            )

        return valid_loader

    def get_inference_data(self, test_df, batch_size=32, shuffle=False):
        test_data = self.load_data(test_df)
                    
        test_loader = DataLoader(
            test_data,
            shuffle=shuffle,
            batch_size=batch_size
            )

        return test_loader

In [11]:
TDM_dataset = TransformersNLI(tokenizer, max_input_length)

In [12]:
# train_loader = TDM_dataset.get_train_data(train_df, batch_size=bs, shuffle=True)
# valid_loader = TDM_dataset.get_valid_data(valid_df, batch_size=bs, shuffle=True)

In [13]:
# if os.path.exists(f'{output_path}train_loader_{bs}_seq_{max_input_length}.pth'):
#     train_loader = torch.load(f'{output_path}train_loader_{bs}_seq_{max_input_length}.pth')
# else:
#     train_loader = TDM_dataset.get_train_data(train_df, batch_size=bs, shuffle=True)
#     # Save dataloader
#     torch.save(train_loader, f'{output_path}train_loader_{bs}_seq_{max_input_length}.pth')

# if os.path.exists(f'{output_path}valid_loader_{bs}_seq_{max_input_length}.pth'):
#     valid_loader = torch.load(f'{output_path}valid_loader_{bs}_seq_{max_input_length}.pth')
# else:
#     valid_loader = TDM_dataset.get_valid_data(valid_df, batch_size=bs, shuffle=True)
#     # Save dataloader
#     torch.save(valid_loader, f'{output_path}valid_loader_{bs}_seq_{max_input_length}.pth')

In [20]:
if os.path.exists(f'{output_path}train_loader_{bs}_seq_{max_input_length}.pth'):
    train_loader = torch.load(f'{output_path}train_loader_{bs}_seq_{max_input_length}.pth')
    # os.remove(f'{output_path}train_loader_{bs}_seq_{max_input_length}.pth')
else:
    train_loader = TDM_dataset.get_train_data(train_df, batch_size=bs, shuffle=True)
    # Save dataloader
    torch.save(train_loader, f'{output_path}train_loader_{bs}_seq_{max_input_length}.pth')

if os.path.exists(f'{output_path}valid_loader_{bs}_seq_{max_input_length}.pth'):
    # valid_loader = torch.load(f'{output_path}valid_loader_{bs}_seq_{max_input_length}.pth')
    os.remove(f'{output_path}valid_loader_{bs}_seq_{max_input_length}.pth')
# else:
#     valid_loader = TDM_dataset.get_valid_data(valid_df, batch_size=bs, shuffle=True)
#     # Save dataloader
#     torch.save(valid_loader, f'{output_path}valid_loader_{bs}_seq_{max_input_length}.pth')

# train_loader = TDM_dataset.get_train_data(train_df, batch_size=bs, shuffle=True)
valid_loader = TDM_dataset.get_valid_data(valid_df, batch_size=bs, shuffle=False)

100%|██████████| 13071/13071 [01:28<00:00, 147.07it/s]


13071


## Build the Model

In [21]:
model = selected_processor[1].from_pretrained(
                                selected_processor[2], num_labels=2)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model)
else:
    print(f"Device: {device}")

model = model.to(device)

param_optimizer = list(model.named_parameters())

no_decay = ['bias', 'gamma', 'beta']

optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
    'weight_decay_rate': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, correct_bias=False)

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

Device: cuda


In [22]:
def count_parameters(model):
    return (sum(p.numel() for p in model.parameters() if p.requires_grad), sum(p.numel() for p in model.parameters() if not p.requires_grad))

print(f'The model has {count_parameters(model)[0]:,} trainable parameters')
print(f'The model has {count_parameters(model)[1]:,} non-trainable parameters')

The model has 109,920,002 trainable parameters
The model has 0 non-trainable parameters


In [23]:
class AverageMeter(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [29]:
def train(model, iterator, optimizer, epoch):
    
    model.train()
    
    train_loss = AverageMeter()
    train_acc = AverageMeter()
    train_macro_p = AverageMeter()
    train_macro_r = AverageMeter()
    train_macro_f1 = AverageMeter()
    train_micro_p = AverageMeter()
    train_micro_r = AverageMeter()
    train_micro_f1 = AverageMeter()
    
    for batch_idx, (pair_token_ids, mask_ids, seg_ids, y) in tqdm(enumerate(iterator), total=len(iterator)):
        
        optimizer.zero_grad()
        
        pair_token_ids = pair_token_ids.to(device)
        mask_ids = mask_ids.to(device)
        seg_ids = seg_ids.to(device)
        labels = y.to(device)
#         ipdb.set_trace()

#         loss, prediction = model(pair_token_ids, 
#                             token_type_ids=seg_ids, 
#                             attention_mask=mask_ids, 
#                             labels=labels).values()          
        
        outputs = model(pair_token_ids, 
                        token_type_ids=seg_ids, 
                        attention_mask=mask_ids, 
                        labels=labels)
        
        loss = outputs.loss
        prediction = outputs.logits

        loss.backward()
        optimizer.step()
                
        prediction = torch.log_softmax(prediction, dim=1).argmax(dim=1)

        train_acc.update(prediction.eq(labels.view_as(prediction)).sum().item()/len(labels)) # accuracy_score(labels.cpu(), prediction.cpu())
        train_loss.update(loss.item())  
        train_macro_p.update(precision_score(labels.cpu(), prediction.cpu(), average ='macro'))
        train_macro_r.update(recall_score(labels.cpu(), prediction.cpu(), average ='macro'))
        train_macro_f1.update(f1_score(labels.cpu(), prediction.cpu(), average ='macro'))
        train_micro_p.update(precision_score(labels.cpu(), prediction.cpu(), average ='micro'))
        train_micro_r.update(recall_score(labels.cpu(), prediction.cpu(), average ='micro'))
        train_micro_f1.update(f1_score(labels.cpu(), prediction.cpu(), average ='micro'))
        
        if (batch_idx + 1) % 1000 == 0:
            print(f"[epoch {epoch+1}] [iter {(batch_idx + 1)}/{len(iterator)}]")
            print('------------------------------------------------------------')
            print(f"Train Accuracy Score: {train_acc.avg}; Train loss : {train_loss.avg}")
            print(f"Macro Precision: {train_macro_p.avg}; Macro Recall : {train_macro_r.avg}; Macro F1 : {train_macro_f1.avg}")
            print(f"Micro Precision: {train_micro_p.avg}; Micro Recall : {train_micro_r.avg}; Micro F1 : {train_micro_f1.avg}")
            print('------------------------------------------------------------')
            
    return train_loss.avg, train_acc.avg, train_macro_p.avg, train_macro_r.avg, train_macro_f1.avg, train_micro_p.avg, train_micro_r.avg, train_micro_f1.avg


def evaluate(model, iterator, optimizer):
        
    model.eval()
    val_loss = AverageMeter()
    val_acc = AverageMeter()
    val_macro_p = AverageMeter()
    val_macro_r = AverageMeter()
    val_macro_f1 = AverageMeter()
    val_micro_p = AverageMeter()
    val_micro_r = AverageMeter()
    val_micro_f1 = AverageMeter()
    
    with torch.no_grad():
    
        for batch_idx, (pair_token_ids, mask_ids, seg_ids, y) in tqdm(enumerate(iterator), total=len(iterator)):
#             optimizer.zero_grad()
            pair_token_ids = pair_token_ids.to(device)
            mask_ids = mask_ids.to(device)
            seg_ids = seg_ids.to(device)
            labels = y.to(device)

            outputs = model(pair_token_ids, 
                        token_type_ids=seg_ids, 
                        attention_mask=mask_ids, 
                        labels=labels)
            
            loss = outputs.loss
            prediction = outputs.logits

            prediction = torch.log_softmax(prediction, dim=1).argmax(dim=1)
            
            ipdb.set_trace()

            val_acc.update(prediction.eq(labels.view_as(prediction)).sum().item()/len(labels)) # accuracy_score(labels.cpu(), prediction.cpu())
            val_macro_p.update(precision_score(labels.cpu(), prediction.cpu(), average ='macro'))
            val_macro_r.update(recall_score(labels.cpu(), prediction.cpu(), average ='macro'))
            val_macro_f1.update(f1_score(labels.cpu(), prediction.cpu(), average ='macro'))
            val_micro_p.update(precision_score(labels.cpu(), prediction.cpu(), average ='micro'))
            val_micro_r.update(recall_score(labels.cpu(), prediction.cpu(), average ='micro'))
            val_micro_f1.update(f1_score(labels.cpu(), prediction.cpu(), average ='micro'))
            val_loss.update(loss.item())        

    
    val_macro_avg_p, val_macro_avg_r, val_macro_avg_f1 = val_macro_p.avg, val_macro_r.avg, val_macro_f1.avg 
    val_micro_avg_p, val_micro_avg_r, val_micro_avg_f1 = val_micro_p.avg, val_micro_r.avg, val_micro_f1.avg 

    print('------------------------------------------------------------')
    print(f"Validation Accuracy Score : {val_acc.avg}; Vadidation loss : {val_loss.avg}")
    print(f"Macro Precision : {val_macro_avg_p}; Macro Recall : {val_macro_avg_r}; Macro F1 : {val_macro_avg_f1}")
    print(f"Micro Precision : {val_micro_avg_p}; Micro Recall : {val_micro_avg_r}; Micro F1 : {val_micro_avg_f1}")
    print('------------------------------------------------------------')
    
    return val_loss.avg, val_acc.avg, val_macro_avg_p, val_macro_avg_r, val_macro_avg_f1, val_micro_avg_p, val_micro_avg_r, val_micro_avg_f1

def predict_TDM_from_pdf(model, tokenizer, iterator, output_path):
    model.eval()
    with torch.no_grad():
    
        for batch_idx, (pair_token_ids, mask_ids, seg_ids, y) in tqdm(enumerate(iterator), total=len(iterator)):
            pair_token_ids = pair_token_ids.to(device)
            mask_ids = mask_ids.to(device)
            seg_ids = seg_ids.to(device)
            labels = y.to(device)

            outputs = model(pair_token_ids, 
                        token_type_ids=seg_ids, 
                        attention_mask=mask_ids, 
                        labels=labels)
        
            loss = outputs.loss
            prediction = outputs.logits

            prediction_scalled = torch.sigmoid(prediction)
            
            with open(f"{output_path}test_results.tsv", "a+", encoding="utf-8") as text_file:
                for true, false in prediction_scalled.cpu():
                    text_file.write(str(true.item())+"\t"+str(false.item())+"\n")

def get_top_n_prediction_label(path_to_test_file, path_to_prediction_file, output_path, n = 5):
    """
    This function return the label with the highest proba
    """
    top5 = deque()
    with open(f"{path_to_test_file}") as f:
        txt_test_files = f.read().splitlines()
    with open(f"{path_to_prediction_file}") as f:
        txt_prediction_files = f.read().splitlines()
    
    for example, prediction in zip(txt_test_files, txt_prediction_files):
        true_prob, false_prob = prediction.split("\t")
        true_prob, false_prob = float(true_prob), float(false_prob)
        if true_prob > false_prob:
            label = example.split("\t")[2]
            top5.append((label, true_prob))
    results = deque(sorted(top5, key=lambda x: x[1] if x else x, reverse=False), n)
    with open(f"{output_path}test_top_{n}_tdm.tsv", "w+", encoding="utf-8") as text_file:
        for tdm in results:
            text_file.write(f"{tdm[0]}\t{tdm[1]}\n")
    return results

def write_evaluation_result(val_macro_avg_p, val_macro_avg_r, val_macro_avg_f1, val_micro_avg_p, val_micro_avg_r, val_micro_avg_f1, output_path):
    with open(f"{output_path}evaluation_tdm_results.tsv", "w+", encoding="utf-8") as text_file:
        text_file.write(f"Macro P\tMacro R\t Macro F1\t Micro P\t Micro R\t Micro F1\n")
        text_file.write(f"{val_macro_avg_p}\t{val_macro_avg_r}\t{val_macro_avg_f1}\t{val_micro_avg_p}\t{val_micro_avg_r}\t{val_micro_avg_f1}\n")

In [25]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [24]:
# best_valid_loss = 0.30 #float('inf')
# best_valid_f1 = 0.5

# for epoch in range(N_EPOCHS):

#     start_time = time.time()
    
# #     train_loss, train_acc, train_macro_avg_p, train_macro_avg_r, train_macro_avg_f1, train_micro_avg_p, train_micro_avg_r, train_micro_avg_f1 = train(model, train_loader, optimizer, epoch)
#     valid_loss, valid_acc, val_macro_avg_p, val_macro_avg_r, val_macro_avg_f1, val_micro_avg_p, val_micro_avg_r, val_micro_avg_f1 = evaluate(model, valid_loader, optimizer)
    
#     end_time = time.time()

#     epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
#     print(f'Epoch: {epoch+1:02} Final | Epoch Time: {epoch_mins}m {epoch_secs}s')
#     print('------------------------------------------------------------')
#     print(f"Train Accuracy Score: {train_acc}; Train loss : {train_loss}")
#     print(f"Macro Precision: {train_macro_avg_p}; Macro Recall : {train_macro_avg_r}; Macro F1 : {train_macro_avg_f1}")
#     print(f"Micro Precision: {train_micro_avg_p}; Micro Recall : {train_micro_avg_r}; Micro F1 : {train_micro_avg_f1}")
#     print('------------------------------------------------------------')
    
#     valid_metric_avg = (val_macro_avg_p + val_macro_avg_r + val_macro_avg_f1+val_micro_avg_p + val_micro_avg_r + val_micro_avg_f1)/6
    
    
    
#     if valid_metric_avg > best_valid_metric_avg : #and abs(valid_loss - best_valid_loss) < 1e-1
#         best_valid_metric_avg = valid_metric_avg
#         print('Saving Model ...')
#         torch.save(model.state_dict(), f'{output_path}Model_{model_name}_avg_metric_{str(best_valid_metric_avg)[:4]}.pt')
#         print('****************************************************************************')
#         print('best record: [epoch %d], [val loss %.5f], [val acc %.5f], [val avg. metric %.5f]' % (epoch, valid_loss, valid_acc, valid_metric_avg))
#         print(f"Macro Precision : {val_macro_avg_p}; Macro Recall : {val_macro_avg_r}; Macro F1 : {val_macro_avg_f1}")
#         print(f"Micro Precision : {val_micro_avg_p}; Micro Recall : {val_micro_avg_r}; Micro F1 : {val_micro_avg_f1}")
#         print('****************************************************************************')

## Inference

We'll then use the model to test the sentiment of some sequences. We tokenize the input sequence, trim it down to the maximum length, add the special tokens to either side, convert it to a tensor, add a fake batch dimension and then pass it through our model.

In [26]:
model_pt_path = "/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/ibm/exp/few-shot-setup/NLP-TDMS/paperVersion/torch/SciBert/Model_SciBert_avg_metric_0.9001.pt"
# model_pt_path = "/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/paperwithcode/new/60Neg800unk/twofoldwithunk/fold1/Model_SciBert_avg_metric_0.95.pt"

In [27]:
# Reload the best model
# model.load_state_dict(torch.load('Model_f1_0.93.pt'))
model.load_state_dict(torch.load(model_pt_path))

<All keys matched successfully>

In [32]:
valid_loss, valid_acc, val_macro_avg_p, val_macro_avg_r, val_macro_avg_f1, val_micro_avg_p, val_micro_avg_r, val_micro_avg_f1 = evaluate(model, valid_loader, optimizer)

  0%|          | 0/817 [00:00<?, ?it/s]

> [0;32m<ipython-input-29-b05de942b1ce>[0m(95)[0;36mevaluate[0;34m()[0m
[0;32m     94 [0;31m[0;34m[0m[0m
[0m[0;32m---> 95 [0;31m            [0mval_acc[0m[0;34m.[0m[0mupdate[0m[0;34m([0m[0mprediction[0m[0;34m.[0m[0meq[0m[0;34m([0m[0mlabels[0m[0;34m.[0m[0mview_as[0m[0;34m([0m[0mprediction[0m[0;34m)[0m[0;34m)[0m[0;34m.[0m[0msum[0m[0;34m([0m[0;34m)[0m[0;34m.[0m[0mitem[0m[0;34m([0m[0;34m)[0m[0;34m/[0m[0mlen[0m[0;34m([0m[0mlabels[0m[0;34m)[0m[0;34m)[0m [0;31m# accuracy_score(labels.cpu(), prediction.cpu())[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     96 [0;31m            [0mval_macro_p[0m[0;34m.[0m[0mupdate[0m[0;34m([0m[0mprecision_score[0m[0;34m([0m[0mlabels[0m[0;34m.[0m[0mcpu[0m[0;34m([0m[0;34m)[0m[0;34m,[0m [0mprediction[0m[0;34m.[0m[0mcpu[0m[0;34m([0m[0;34m)[0m[0;34m,[0m [0maverage[0m [0;34m=[0m[0;34m'macro'[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  label


*** NameError: name 'label' is not defined


ipdb>  n


> [0;32m<ipython-input-29-b05de942b1ce>[0m(96)[0;36mevaluate[0;34m()[0m
[0;32m     95 [0;31m            [0mval_acc[0m[0;34m.[0m[0mupdate[0m[0;34m([0m[0mprediction[0m[0;34m.[0m[0meq[0m[0;34m([0m[0mlabels[0m[0;34m.[0m[0mview_as[0m[0;34m([0m[0mprediction[0m[0;34m)[0m[0;34m)[0m[0;34m.[0m[0msum[0m[0;34m([0m[0;34m)[0m[0;34m.[0m[0mitem[0m[0;34m([0m[0;34m)[0m[0;34m/[0m[0mlen[0m[0;34m([0m[0mlabels[0m[0;34m)[0m[0;34m)[0m [0;31m# accuracy_score(labels.cpu(), prediction.cpu())[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 96 [0;31m            [0mval_macro_p[0m[0;34m.[0m[0mupdate[0m[0;34m([0m[0mprecision_score[0m[0;34m([0m[0mlabels[0m[0;34m.[0m[0mcpu[0m[0;34m([0m[0;34m)[0m[0;34m,[0m [0mprediction[0m[0;34m.[0m[0mcpu[0m[0;34m([0m[0;34m)[0m[0;34m,[0m [0maverage[0m [0;34m=[0m[0;34m'macro'[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     97 [0;31m            [0mval_macro_r[

ipdb>  labels


tensor([0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')


ipdb>  prediction


tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')


ipdb>  classification_report(labels.cpu(), prediction.cpu(), labels=[1, 0], output_dict=True)['macro avg']


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'precision': 0.4375, 'recall': 0.5, 'f1-score': 0.4666666666666667, 'support': 16}


ipdb>  f1_score(labels.cpu(), prediction.cpu(), average ='macro'


*** SyntaxError: unexpected EOF while parsing


ipdb>  f1_score(labels.cpu(), prediction.cpu(), average ='macro')


0.4666666666666667


ipdb>  f1_score(labels.cpu(), prediction.cpu(), average ='macro', zero_division=1)


0.4666666666666667


ipdb>  f1_score(labels.cpu(), prediction.cpu(), average ='macro', zero_division=0)


0.4666666666666667


ipdb>  f1_score(labels.cpu(), prediction.cpu(), average ='macro')


0.4666666666666667


ipdb>  f1_score(labels.cpu(), prediction.cpu(), average ='macro')


0.4666666666666667


ipdb>  precision_score(labels.cpu(), prediction.cpu(), average ='micro')


0.875


ipdb>  f1_score(labels.cpu(), prediction.cpu(), average ='micro')


0.875


ipdb>  q


  0%|          | 0/817 [05:12<?, ?it/s]


BdbQuit: 

In [26]:
# test_df = pd.read_csv("../data/paperwithcode/new/60Neg800unk/twofoldwithunk/fold1/test_results.tsv", 
#                    sep="\t", names=["true", "false"])

# test_df = pd.read_csv("../data/paperwithcode/new/jar/10Neg20unk/testOutput.tsv", 
#                    sep="\t", names=["label", "title", "TDM", "Context"])

test_df = valid_df

test_df.head()

Unnamed: 0,label,title,TDM,Context
0,True,1803.11175.pdf,sentiment analysis; SUBJ; Accuracy,Universal Sentence Encoder We present models f...
1,True,1803.11175.pdf,text classification; TREC; Error,Universal Sentence Encoder We present models f...
2,False,1803.11175.pdf,question answering; SQuAD; F1,Universal Sentence Encoder We present models f...
3,False,1803.11175.pdf,relation prediction; FB15K-237; H@1,Universal Sentence Encoder We present models f...
4,False,1803.11175.pdf,word sense disambiguation; SemEval 2013; F1,Universal Sentence Encoder We present models f...


In [27]:
# test_loader = TDM_dataset.get_inference_data(test_df, batch_size=16, shuffle=False) # this shuffle should be false to preserve the order 

100%|██████████| 2655/2655 [00:39<00:00, 67.23it/s]


2655


In [112]:
with open(valid_path) as f:
    list_prediction_inputs = f.read().splitlines()

In [113]:
len(list_prediction_inputs)

13071

In [114]:
review_text = list_prediction_inputs[0]

In [115]:
review_text[:100]

'true\t1803.11175.pdf\tsentiment analysis; SUBJ; Accuracy\tUniversal Sentence Encoder We present models '

In [119]:
encoded_review = tokenizer.encode_plus(
  list_prediction_inputs[-200],
  max_length=max_input_length,
  add_special_tokens=True,
  return_token_type_ids=False,
  pad_to_max_length=True,
  return_attention_mask=True,
  return_tensors='pt',
)

input_ids = encoded_review['input_ids'].to(device)
attention_mask = encoded_review['attention_mask'].to(device)

outputs = model(input_ids, attention_mask)

prediction_scalled = torch.sigmoid(outputs.logits)
# _, prediction = torch.max(output, dim=1)
# print(f'Review text: {review_text}')

print(f'Output  : {prediction_scalled}')
print(f'Outputs logits  : {outputs.logits}')

Output  : tensor([[0.4158, 0.6538]], device='cuda:0', grad_fn=<SigmoidBackward>)
Outputs logits  : tensor([[-0.3399,  0.6357]], device='cuda:0', grad_fn=<AddmmBackward>)


Output  : tensor([[0.1245, 0.8405]], device='cuda:0', grad_fn=<SigmoidBackward>)
Outputs logits  : tensor([[-1.9507,  1.6618]], device='cuda:0', grad_fn=<AddmmBackward>)


In [30]:
# sample = iter(test_loader)
# sample.next()

In [None]:
# def predict_TDM_from_pdf(model, tokenizer, sentence):
#     model.eval()
#     tokens = tokenizer.tokenize(sentence)
#     tokens = tokens[:max_input_length-2]
#     indexed = [init_token_idx] + tokenizer.convert_tokens_to_ids(tokens) + [eos_token_idx]
#     tensor = torch.LongTensor(indexed).to(device)
#     tensor = tensor.unsqueeze(0)
#     prediction = torch.sigmoid(model(tensor))
#     return prediction.item()

In [49]:
def predict_TDM_from_pdf(model, tokenizer, iterator):
    model.eval()
    with torch.no_grad():
    
        for batch_idx, (pair_token_ids, mask_ids, seg_ids, y) in tqdm(enumerate(iterator), total=len(iterator)):
            pair_token_ids = pair_token_ids.to(device)
            mask_ids = mask_ids.to(device)
            seg_ids = seg_ids.to(device)
            labels = y.to(device)

            loss, prediction = model(pair_token_ids, 
                                 token_type_ids=seg_ids, 
                                 attention_mask=mask_ids, 
                                 labels=labels).values()

            prediction_scalled = torch.sigmoid(prediction)
            
            with open("test_results.tsv", "a+", encoding="utf-8") as text_file:
                for true, false in prediction_scalled.cpu():
                    text_file.write(str(true.item())+"\t"+str(false.item())+"\n")

In [50]:
predict_TDM_from_pdf(model, tokenizer, test_loader)

100%|██████████| 166/166 [00:31<00:00,  5.35it/s]


In [51]:
from collections import deque

def get_top_n_prediction_label(path_to_test_file, path_to_prediction_file, n = 5):
    """
    This function return the label with the highest proba
    """
    top5 = deque()
    with open(f"{path_to_test_file}") as f:
        txt_test_files = f.read().splitlines()
    with open(f"{path_to_prediction_file}") as f:
        txt_prediction_files = f.read().splitlines()
    
    highest = 0
    for example, prediction in zip(txt_test_files, txt_prediction_files):
        true_prob, false_prob = prediction.split("\t")
        true_prob, false_prob = float(true_prob), float(false_prob)
        if true_prob > false_prob:
            label = example.split("\t")[2]
            highest = true_prob
            top5.append((label, true_prob))
    return deque(sorted(top5, key=lambda x: x[1] if x else x, reverse=False), n)

In [52]:
get_top_n_prediction_label(
    path_to_test_file="../data/paperwithcode/new/jar/10Neg20unk/testOutput.tsv",
    path_to_prediction_file="test_results.tsv", 
    n = 1)

deque([('Image Clustering; Extended Yale-B; Accuracy', 0.8984434008598328)])