# Package loading 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np

from tqdm import tqdm
import ipdb
import spacy
# spacy.load("en_core_web_sm")
import torch
from torchtext import data
import torch.nn as nn
import torch.nn.functional as F
from torchsummary import summary

from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pickle

from transformers import RobertaTokenizer, BertTokenizer, BertModel, TransfoXLTokenizer, TransfoXLModel, AdamW
from transformers import BigBirdTokenizer, BigBirdForSequenceClassification
from transformers import LongformerTokenizer, LongformerForSequenceClassification
from transformers import XLNetTokenizer, TFXLNetForSequenceClassification

import optuna
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support

# Data Exploration

In [2]:
train_df = pd.read_csv("../data/paperwithcode/new/60Neg800unk/twofoldwithunk/fold1/train.tsv", 
                    sep="\t", names=["label", "title", "TDM", "Context"])

test_df = pd.read_csv("../data/paperwithcode/new/60Neg800unk/twofoldwithunk/fold1/test.tsv", 
                   sep="\t", names=["label", "title", "TDM", "Context"])

In [3]:
train_df.head()

Unnamed: 0,label,title,TDM,Context
0,True,1810.02575v1.pdf,Semantic Segmentation; Nighttime Driving; mIoU,Dark Model Adaptation: Semantic Image Segmenta...
1,False,1810.02575v1.pdf,Extractive Text Summarization; DebateSum; ROUGE-L,Dark Model Adaptation: Semantic Image Segmenta...
2,False,1810.02575v1.pdf,Action Recognition; Something-Something V1; To...,Dark Model Adaptation: Semantic Image Segmenta...
3,False,1810.02575v1.pdf,Multi-Object Tracking; MOTS20; sMOTSA,Dark Model Adaptation: Semantic Image Segmenta...
4,False,1810.02575v1.pdf,Continuous Control; PyBullet Ant; Return,Dark Model Adaptation: Semantic Image Segmenta...


In [4]:
test_df.head()

Unnamed: 0,label,title,TDM,Context
0,True,1707.03497v2.pdf,Atari Games; Atari 2600 Seaquest; Score,Value Prediction Network This paper proposes a...
1,True,1707.03497v2.pdf,Atari Games; Atari 2600 Amidar; Score,Value Prediction Network This paper proposes a...
2,True,1707.03497v2.pdf,Atari Games; Atari 2600 Krull; Score,Value Prediction Network This paper proposes a...
3,True,1707.03497v2.pdf,Atari Games; Atari 2600 Alien; Score,Value Prediction Network This paper proposes a...
4,True,1707.03497v2.pdf,Atari Games; Atari 2600 Enduro; Score,Value Prediction Network This paper proposes a...


# Model

In [5]:
model_key = 'google/bigbird-roberta-base'
# model_key = 'allenai/longformer-base-4096'
# model_key = 'allenai/scibert_scivocab_uncased'
# model_key = 'xlnet-base-cased'

# tokenizer = RobertaTokenizer.from_pretrained('roberta-base') # roberta-base, bert-base-uncased
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer = BigBirdTokenizer.from_pretrained(model_key)
# tokenizer = LongformerTokenizer.from_pretrained(model_key)
# tokenizer =  XLNetTokenizer.from_pretrained(model_key)

init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] <pad> <unk>


In [6]:
init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

65 66 0 100


In [7]:
max_input_length = tokenizer.max_model_input_sizes[model_key]

print(max_input_length)

def tokenize_and_cut(sentence):
    tokens = tokenizer.tokenize(sentence) 
    tokens = tokens[:max_input_length-2]
    return tokens

4096


In [8]:
class TransformersNLI(Dataset):
    def __init__(self, train_df, val_df, tokenizer, max_input_length, base_path="./"):
        self.label_dict = {'True': 0, 'False': 1} # Default {'entailment': 0, 'contradiction': 1, 'neutral': 2}
        self.train_df = train_df
        self.val_df = val_df
        self.base_path = base_path
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.train_data = None
        self.val_data = None
#         self.init_data()
        
    def init_data(self):
        self.train_data = self.load_data(self.train_df)
        self.val_data = self.load_data(self.val_df)
        
    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        """Truncates a sequence pair in place to the maximum length."""

        # This is a simple heuristic which will always truncate the longer sequence
        # one token at a time. This makes more sense than truncating an equal percent
        # of tokens from each, since if one sequence is very short then each token
        # that's truncated likely contains more information than a longer sequence.
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= max_length:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()
        
    def load_data(self, df):
        MAX_LEN = self.max_input_length
        token_ids = []
        mask_ids = []
        seg_ids = []
        y = []

        premise_list = df['TDM'].to_list()           # df['sentence1'].to_list()
        hypothesis_list = df['Context'].to_list()    # df['sentence2'].to_list()
        label_list = df['label'].to_list()           # df['gold_label'].to_list()

        for (premise, hypothesis, label) in tqdm(zip(premise_list, hypothesis_list, label_list), total=len(label_list)):
            premise_id = self.tokenizer.encode(premise, add_special_tokens = False)
            hypothesis_id = self.tokenizer.encode(hypothesis, add_special_tokens = False)
#             if len(premise_id)+len(hypothesis_id) >= MAX_LEN:
#                 ipdb.set_trace()
#                 pass
            # ignore the warning as the ong sequence issuw is taken care of here 
            self._truncate_seq_pair(premise_id, hypothesis_id, MAX_LEN-3) # -3 to account for the special characters 
            
            pair_token_ids = [self.tokenizer.cls_token_id] + premise_id \
                            + [self.tokenizer.sep_token_id] + hypothesis_id \
                            + [self.tokenizer.sep_token_id]
            premise_len = len(premise_id)
            hypothesis_len = len(hypothesis_id)

            segment_ids = torch.tensor([0] * (premise_len + 2) + [1] * (hypothesis_len + 1))  # sentence 0 and sentence 1
            attention_mask_ids = torch.tensor([1] * (premise_len + hypothesis_len + 3))  # mask padded values

            token_ids.append(torch.tensor(pair_token_ids))
            seg_ids.append(segment_ids)
            mask_ids.append(attention_mask_ids)
            # we have str(label) to have the key work proprely 
            y.append(self.label_dict[str(label)]) # y.append(self.label_dict[label]) 
            
        token_ids = pad_sequence(token_ids, batch_first=True)
        mask_ids = pad_sequence(mask_ids, batch_first=True)
        seg_ids = pad_sequence(seg_ids, batch_first=True)
        y = torch.tensor(y)
        dataset = TensorDataset(token_ids, mask_ids, seg_ids, y)
        print(len(dataset))
        return dataset

    def get_data_loaders(self, batch_size=32, shuffle=True):
        self.init_data()
        train_loader = DataLoader(
          self.train_data,
          shuffle=shuffle,
          batch_size=batch_size
        )

        val_loader = DataLoader(
          self.val_data,
          shuffle=shuffle,
          batch_size=batch_size
        )

        return train_loader, val_loader
    
    def get_inference_data(self, test_df, batch_size=32, shuffle=False):
        test_data = self.load_data(test_df)
                       
        test_loader = DataLoader(
          test_data,
          shuffle=shuffle,
          batch_size=batch_size
        )

        val_loader = DataLoader(
          self.val_data,
          shuffle=shuffle,
          batch_size=batch_size
        )

        return test_loader

In [9]:
TDM_dataset = TransformersNLI(train_df, test_df, tokenizer, max_input_length, base_path="./")

100%|██████████| 256008/256008 [15:06<00:00, 282.54it/s]
  0%|          | 102/108456 [00:00<04:40, 385.73it/s]

256008


100%|██████████| 108456/108456 [06:38<00:00, 272.26it/s]


108456


In [10]:
train_loader, valid_loader = TDM_dataset.get_data_loaders(batch_size=6)

## Build the Model

In [11]:
# model = RobertaModel.from_pretrained('roberta-base') # bert-base-cased, bert-large-cased
# bert = BertModel.from_pretrained('bert-base-uncased')
# model = TransfoXLModel.from_pretrained('transfo-xl-wt103')

In [12]:
# import torch.nn as nn

# class BERTGRUSentiment(nn.Module):
#     def __init__(self,
#                  bert,
#                  hidden_dim,
#                  output_dim,
#                  n_layers,
#                  bidirectional,
#                  dropout):
        
#         super().__init__()
        
#         self.bert = bert
        
#         embedding_dim = bert.config.to_dict()['hidden_size']
        
#         self.rnn = nn.GRU(embedding_dim,
#                           hidden_dim,
#                           num_layers = n_layers,
#                           bidirectional = bidirectional,
#                           batch_first = True,
#                           dropout = 0 if n_layers < 2 else dropout)
        
#         self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        
#         self.dropout = nn.Dropout(dropout)
        
#     def forward(self, text):
        
#         #text = [batch size, sent len]
                
#         with torch.no_grad():
#             embedded = self.bert(text)[0]
                
#         #embedded = [batch size, sent len, emb dim]
        
#         _, hidden = self.rnn(embedded)
        
#         #hidden = [n layers * n directions, batch size, emb dim]
        
#         if self.rnn.bidirectional:
#             hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
#         else:
#             hidden = self.dropout(hidden[-1,:,:])
                
#         #hidden = [batch size, hid dim]
        
#         output = self.out(hidden)
        
#         #output = [batch size, out dim]
        
#         return output

In [13]:
# HIDDEN_DIM = 256
# OUTPUT_DIM = 1
# N_LAYERS = 2
# BIDIRECTIONAL = True
# DROPOUT = 0.25

# model = BERTGRUSentiment(bert,
#                          HIDDEN_DIM,
#                          OUTPUT_DIM,
#                          N_LAYERS,
#                          BIDIRECTIONAL,
#                          DROPOUT)

In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [15]:
model = BigBirdForSequenceClassification.from_pretrained(model_key, num_labels=2)
# LongformerForSequenceClassification.from_pretrained(model_key, num_labels=2)
model = model.to(device)
# criterion = criterion.to(device)

Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForSequenceClassifica

In [16]:
import torch.optim as optim

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

# optimizer = optim.SGD(model.parameters(), lr = 1e-3, momentum=0.9, 
#                       weight_decay=0, dampening=0, nesterov=True)

optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, correct_bias=False)

# optimizer = optim.Adam(model.parameters(), lr = 1e-2, betas=(0.9, 0.999), 
#                       weight_decay=0.0, amsgrad=False)

# optimizer = optim.Adam(model.parameters())
# criterion = nn.CrossEntropyLoss(weight=w)
# criterion = nn.CrossEntropyLoss()

# criterion = nn.CrossEntropyLoss()
# model = BERTGRUSentiment(model,
#                          HIDDEN_DIM,
#                          OUTPUT_DIM,
#                          N_LAYERS,
#                          BIDIRECTIONAL,
#                          DROPOUT)



In [17]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 128,060,930 trainable parameters


In [18]:
# for name, param in model.named_parameters():                
#     if name.startswith('bert'):
#         param.requires_grad = True

In [19]:
# def count_parameters(model):
#     return sum(p.numel() for p in model.parameters() if p.requires_grad)

# print(f'The model has {count_parameters(model):,} trainable parameters')

In [20]:
class AverageMeter(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [21]:
def train(model, iterator, optimizer):
    
    model.train()
    
    train_loss = AverageMeter()
    train_acc = AverageMeter()
    train_f1 = AverageMeter()
    
    for batch_idx, (pair_token_ids, mask_ids, seg_ids, y) in tqdm(enumerate(iterator), total=len(iterator)):
        
        optimizer.zero_grad()
        
        pair_token_ids = pair_token_ids.to(device)
        mask_ids = mask_ids.to(device)
        seg_ids = seg_ids.to(device)
        labels = y.to(device)

        loss, prediction = model(pair_token_ids, 
                             token_type_ids=seg_ids, 
                             attention_mask=mask_ids, 
                             labels=labels).values()          
        
        loss.backward()
        optimizer.step()
                
        prediction = torch.log_softmax(prediction, dim=1).argmax(dim=1)
   
        train_acc.update(prediction.eq(labels.view_as(prediction)).sum().item()/len(labels)) # accuracy_score(labels.cpu(), prediction.cpu())
        train_f1.update(f1_score(labels.cpu(), prediction.cpu(), average ='macro'))
        train_loss.update(loss.item())  
        
        if (batch_idx + 1) % 1000 == 0:
            print('[epoch %d], [iter %d / %d], [train loss %.5f], [train acc %.5f], [train f1 %.5f]' % (
                epoch, batch_idx + 1, len(iterator), train_loss.avg, train_acc.avg, train_f1.avg))
            
    return train_loss.avg, train_acc.avg, train_f1.avg

def evaluate(model, iterator, optimizer):
    
    model.eval()
    val_loss = AverageMeter()
    val_acc = AverageMeter()
    val_f1 = AverageMeter()
    
    with torch.no_grad():
    
        for batch_idx, (pair_token_ids, mask_ids, seg_ids, y) in tqdm(enumerate(iterator), total=len(iterator)):
#             optimizer.zero_grad()
            pair_token_ids = pair_token_ids.to(device)
            mask_ids = mask_ids.to(device)
            seg_ids = seg_ids.to(device)
            labels = y.to(device)

            loss, prediction = model(pair_token_ids, 
                                 token_type_ids=seg_ids, 
                                 attention_mask=mask_ids, 
                                 labels=labels).values()

            prediction = torch.log_softmax(prediction, dim=1).argmax(dim=1)
   
            val_acc.update(prediction.eq(labels.view_as(prediction)).sum().item()/len(labels)) # accuracy_score(labels.cpu(), prediction.cpu())
            val_f1.update(f1_score(labels.cpu(), prediction.cpu(), average ='macro'))
            val_loss.update(loss.item())        

    
    print('------------------------------------------------------------')
    print(f"Accuracy Score : {val_acc.avg}; F1 Score : {val_f1.avg}")
    print('------------------------------------------------------------')
    
    return val_loss.avg, val_acc.avg, val_f1.avg

In [22]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
N_EPOCHS = 2

best_valid_loss = 0.30 #float('inf')
best_valid_f1 = 0.5

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc, train_f1 = train(model, train_loader, optimizer)
    valid_loss, valid_acc, valid_f1 = evaluate(model, valid_loader, optimizer)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_f1*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_f1*100:.2f}%')
    
    if valid_f1 > best_valid_f1 : #and abs(valid_loss - best_valid_loss) < 1e-1
        best_valid_f1 = valid_f1
        print('Saving Model ...')
        torch.save(model.state_dict(), 'Model_f1_'+str(best_valid_f1)[:4]+'.pt')
        print('*****************************************************')
        print('best record: [epoch %d], [val loss %.5f], [val acc %.5f], [val f1 %.5f]' % (epoch, valid_loss, valid_acc, valid_f1))
        print('*****************************************************')

  2%|▏         | 1000/42668 [37:14<25:51:05,  2.23s/it]

[epoch 0], [iter 1000 / 42668], [train loss 0.41968], [train acc 0.85583], [train f1 0.65853]


  5%|▍         | 2000/42668 [1:14:28<25:14:48,  2.23s/it]

[epoch 0], [iter 2000 / 42668], [train loss 0.41917], [train acc 0.85467], [train f1 0.65229]


  7%|▋         | 3000/42668 [1:51:41<24:34:26,  2.23s/it]

[epoch 0], [iter 3000 / 42668], [train loss 0.41910], [train acc 0.85417], [train f1 0.64812]


  9%|▉         | 4000/42668 [2:28:53<23:59:47,  2.23s/it]

[epoch 0], [iter 4000 / 42668], [train loss 0.42317], [train acc 0.85138], [train f1 0.64355]


 12%|█▏        | 5000/42668 [3:06:06<23:20:45,  2.23s/it]

[epoch 0], [iter 5000 / 42668], [train loss 0.42193], [train acc 0.85190], [train f1 0.64573]


 14%|█▍        | 6000/42668 [3:43:18<22:43:01,  2.23s/it]

[epoch 0], [iter 6000 / 42668], [train loss 0.41990], [train acc 0.85294], [train f1 0.64831]


 16%|█▋        | 7000/42668 [4:20:31<22:08:51,  2.24s/it]

[epoch 0], [iter 7000 / 42668], [train loss 0.41941], [train acc 0.85312], [train f1 0.64872]


 19%|█▊        | 8000/42668 [4:57:44<21:29:13,  2.23s/it]

[epoch 0], [iter 8000 / 42668], [train loss 0.41874], [train acc 0.85342], [train f1 0.64872]


 21%|██        | 9000/42668 [5:34:59<20:52:18,  2.23s/it]

[epoch 0], [iter 9000 / 42668], [train loss 0.41854], [train acc 0.85348], [train f1 0.64950]


 23%|██▎       | 10000/42668 [6:12:11<20:15:09,  2.23s/it]

[epoch 0], [iter 10000 / 42668], [train loss 0.41921], [train acc 0.85305], [train f1 0.64924]


 26%|██▌       | 11000/42668 [6:49:25<19:39:50,  2.24s/it]

[epoch 0], [iter 11000 / 42668], [train loss 0.41857], [train acc 0.85339], [train f1 0.64950]


 28%|██▊       | 12000/42668 [7:26:39<19:01:32,  2.23s/it]

[epoch 0], [iter 12000 / 42668], [train loss 0.41868], [train acc 0.85331], [train f1 0.64938]


 30%|███       | 13000/42668 [8:03:52<18:23:55,  2.23s/it]

[epoch 0], [iter 13000 / 42668], [train loss 0.41915], [train acc 0.85300], [train f1 0.64883]


 33%|███▎      | 14000/42668 [8:41:06<17:46:29,  2.23s/it]

[epoch 0], [iter 14000 / 42668], [train loss 0.41878], [train acc 0.85317], [train f1 0.64878]


 35%|███▌      | 15000/42668 [9:18:20<17:08:41,  2.23s/it]

[epoch 0], [iter 15000 / 42668], [train loss 0.41958], [train acc 0.85269], [train f1 0.64809]


 37%|███▋      | 16000/42668 [9:55:34<16:32:24,  2.23s/it]

[epoch 0], [iter 16000 / 42668], [train loss 0.41951], [train acc 0.85271], [train f1 0.64832]


 40%|███▉      | 17000/42668 [10:32:45<15:54:03,  2.23s/it]

[epoch 0], [iter 17000 / 42668], [train loss 0.41975], [train acc 0.85255], [train f1 0.64809]


 42%|████▏     | 18000/42668 [11:09:56<15:17:38,  2.23s/it]

[epoch 0], [iter 18000 / 42668], [train loss 0.41960], [train acc 0.85261], [train f1 0.64821]


 45%|████▍     | 19000/42668 [11:47:07<14:39:18,  2.23s/it]

[epoch 0], [iter 19000 / 42668], [train loss 0.41981], [train acc 0.85248], [train f1 0.64781]


 47%|████▋     | 20000/42668 [12:24:17<14:01:56,  2.23s/it]

[epoch 0], [iter 20000 / 42668], [train loss 0.41972], [train acc 0.85253], [train f1 0.64829]


 49%|████▉     | 21000/42668 [13:01:26<13:25:17,  2.23s/it]

[epoch 0], [iter 21000 / 42668], [train loss 0.41960], [train acc 0.85259], [train f1 0.64862]


 52%|█████▏    | 22000/42668 [13:38:34<12:47:32,  2.23s/it]

[epoch 0], [iter 22000 / 42668], [train loss 0.41997], [train acc 0.85238], [train f1 0.64841]


 54%|█████▍    | 23000/42668 [14:15:43<12:10:25,  2.23s/it]

[epoch 0], [iter 23000 / 42668], [train loss 0.41953], [train acc 0.85262], [train f1 0.64899]


 56%|█████▌    | 24000/42668 [14:52:51<11:36:02,  2.24s/it]

[epoch 0], [iter 24000 / 42668], [train loss 0.41959], [train acc 0.85257], [train f1 0.64906]


 59%|█████▊    | 25000/42668 [15:30:00<10:56:04,  2.23s/it]

[epoch 0], [iter 25000 / 42668], [train loss 0.42006], [train acc 0.85229], [train f1 0.64865]


 61%|██████    | 26000/42668 [16:07:08<10:22:28,  2.24s/it]

[epoch 0], [iter 26000 / 42668], [train loss 0.42006], [train acc 0.85228], [train f1 0.64864]


 63%|██████▎   | 27000/42668 [16:44:16<9:41:50,  2.23s/it] 

[epoch 0], [iter 27000 / 42668], [train loss 0.42004], [train acc 0.85228], [train f1 0.64869]


 66%|██████▌   | 28000/42668 [17:21:25<9:04:51,  2.23s/it]

[epoch 0], [iter 28000 / 42668], [train loss 0.41973], [train acc 0.85245], [train f1 0.64896]


 68%|██████▊   | 29000/42668 [17:58:33<8:27:30,  2.23s/it]

[epoch 0], [iter 29000 / 42668], [train loss 0.42014], [train acc 0.85219], [train f1 0.64879]


 70%|███████   | 30000/42668 [18:35:41<7:50:31,  2.23s/it]

[epoch 0], [iter 30000 / 42668], [train loss 0.42017], [train acc 0.85217], [train f1 0.64903]


 73%|███████▎  | 31000/42668 [19:12:49<7:13:19,  2.23s/it]

[epoch 0], [iter 31000 / 42668], [train loss 0.42009], [train acc 0.85220], [train f1 0.64912]


 75%|███████▍  | 32000/42668 [19:49:57<6:36:12,  2.23s/it]

[epoch 0], [iter 32000 / 42668], [train loss 0.42023], [train acc 0.85211], [train f1 0.64883]


 77%|███████▋  | 33000/42668 [20:27:06<5:59:03,  2.23s/it]

[epoch 0], [iter 33000 / 42668], [train loss 0.42036], [train acc 0.85202], [train f1 0.64873]


 79%|███████▉  | 33627/42668 [20:50:23<5:35:42,  2.23s/it]

## Inference

We'll then use the model to test the sentiment of some sequences. We tokenize the input sequence, trim it down to the maximum length, add the special tokens to either side, convert it to a tensor, add a fake batch dimension and then pass it through our model.

In [24]:
# Reload the best model
model.load_state_dict(torch.load('Model_f1_0.93.pt'))

<All keys matched successfully>

In [26]:
# test_df = pd.read_csv("../data/paperwithcode/new/60Neg800unk/twofoldwithunk/fold1/test_results.tsv", 
#                    sep="\t", names=["true", "false"])

test_df = pd.read_csv("../data/paperwithcode/new/jar/10Neg20unk/testOutput.tsv", 
                   sep="\t", names=["label", "title", "TDM", "Context"])

test_df.head()

Unnamed: 0,label,title,TDM,Context
0,True,1203.1005v3.pdf,Single Image Deraining; Rain100H; PSNR,"Sparse Subspace Clustering: Algorithm, Theory,..."
1,True,1203.1005v3.pdf,Question Answering; YahooCQA; P@1,"Sparse Subspace Clustering: Algorithm, Theory,..."
2,True,1203.1005v3.pdf,Atari Games; Atari 2600 Private Eye; Score,"Sparse Subspace Clustering: Algorithm, Theory,..."
3,True,1203.1005v3.pdf,Speech Recognition; MediaSpeech; WER for Turkish,"Sparse Subspace Clustering: Algorithm, Theory,..."
4,True,1203.1005v3.pdf,3D Point Cloud Classification; ModelNet40; Mea...,"Sparse Subspace Clustering: Algorithm, Theory,..."


In [27]:
test_loader = TDM_dataset.get_inference_data(test_df, batch_size=16, shuffle=False) # this shuffle should be false to preserve the order 

100%|██████████| 2655/2655 [00:39<00:00, 67.23it/s]


2655


In [30]:
# sample = iter(test_loader)
# sample.next()

In [None]:
# def predict_TDM_from_pdf(model, tokenizer, sentence):
#     model.eval()
#     tokens = tokenizer.tokenize(sentence)
#     tokens = tokens[:max_input_length-2]
#     indexed = [init_token_idx] + tokenizer.convert_tokens_to_ids(tokens) + [eos_token_idx]
#     tensor = torch.LongTensor(indexed).to(device)
#     tensor = tensor.unsqueeze(0)
#     prediction = torch.sigmoid(model(tensor))
#     return prediction.item()

In [49]:
def predict_TDM_from_pdf(model, tokenizer, iterator):
    model.eval()
    with torch.no_grad():
    
        for batch_idx, (pair_token_ids, mask_ids, seg_ids, y) in tqdm(enumerate(iterator), total=len(iterator)):
            pair_token_ids = pair_token_ids.to(device)
            mask_ids = mask_ids.to(device)
            seg_ids = seg_ids.to(device)
            labels = y.to(device)

            loss, prediction = model(pair_token_ids, 
                                 token_type_ids=seg_ids, 
                                 attention_mask=mask_ids, 
                                 labels=labels).values()

            prediction_scalled = torch.sigmoid(prediction)
            
            with open("test_results.tsv", "a+", encoding="utf-8") as text_file:
                for true, false in prediction_scalled.cpu():
                    text_file.write(str(true.item())+"\t"+str(false.item())+"\n")

In [50]:
predict_TDM_from_pdf(model, tokenizer, test_loader)

100%|██████████| 166/166 [00:31<00:00,  5.35it/s]


In [51]:
from collections import deque

def get_top_n_prediction_label(path_to_test_file, path_to_prediction_file, n = 5):
    """
    This function return the label with the highest proba
    """
    top5 = deque()
    with open(f"{path_to_test_file}") as f:
        txt_test_files = f.read().splitlines()
    with open(f"{path_to_prediction_file}") as f:
        txt_prediction_files = f.read().splitlines()
    
    highest = 0
    for example, prediction in zip(txt_test_files, txt_prediction_files):
        true_prob, false_prob = prediction.split("\t")
        true_prob, false_prob = float(true_prob), float(false_prob)
        if true_prob > false_prob:
            label = example.split("\t")[2]
            highest = true_prob
            top5.append((label, true_prob))
    return deque(sorted(top5, key=lambda x: x[1] if x else x, reverse=False), n)

In [52]:
get_top_n_prediction_label(
    path_to_test_file="../data/paperwithcode/new/jar/10Neg20unk/testOutput.tsv",
    path_to_prediction_file="test_results.tsv", 
    n = 1)

deque([('Image Clustering; Extended Yale-B; Accuracy', 0.8984434008598328)])