In [1]:
import numpy as np

import pandas as pd
import torch 
from pytorch_pretrained_bert import BertTokenizer
from metal.mmtl.metal_model import MetalModel
from metal.mmtl.BERT_tasks import create_tasks
import metal.mmtl.dataset as dataset
from metal.mmtl.scorer import Scorer
from metal.utils import convert_labels

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:

debug_path = '/dfs/scratch0/jdunnmon/mmtl/sota_quest/debugging/COLA/COLA_19_48_08/'
task_name = 'COLA' # SST2
model_path = '/dfs/scratch0/jdunnmon/mmtl/sota_quest/debugging/COLA/COLA_19_48_08/best_model.pth'
bert_model = 'bert-base-uncased'
max_len = 256
bert_output_dim = 768
dl_kwargs = {"batch_size": 32, 'shuffle': False}

In [3]:
dataset_cls = getattr(dataset, task_name.upper() + "Dataset")
dev_ds = dataset_cls(
    split="dev",
    bert_model=bert_model,
    max_len=max_len,
    max_datapoints=-1,
)
dev_dl = dev_ds.get_dataloader(**dl_kwargs)




In [4]:
tasks = create_tasks(
        task_names=[task_name],
        bert_model=bert_model,
        split_prop=0.8,
        max_len=max_len,
        dl_kwargs={"batch_size": 100},
        bert_output_dim=bert_output_dim,
        max_datapoints=10,
    )

Loading COLA Dataset








In [5]:
model = MetalModel(tasks, verbose=False, device=0)
model.load_state_dict(torch.load(model_path, map_location="cuda:0")['model'])

In [16]:
data = {
    'sentence1': [],
    'sentence2': [],
    'label': [],
    'score' : []
}
max_batches = 100
tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True)
from tqdm import tqdm
count = 0
for x, y in tqdm(list(dev_dl)):
    for tokens_idx in x[0]:
        tokens = tokenizer.convert_ids_to_tokens(tokens_idx.numpy())
        phrases = ' '.join(tokens).replace('[PAD]', '').replace('[CLS]', '').split('[SEP]')
        data['sentence1'] += [phrases[0]]
        if len(phrases) > 1:
            data['sentence2'] += [phrases[1]] 
        else:
            data['sentence2'] += ['NA']
    scores = model.calculate_output(x, [task_name])[task_name].detach().cpu().numpy()[:, 0] # .flatten()
    data['score'] += list(scores)
    #data['label'] += list(y.numpy())
    data['label'] += list(convert_labels(y, 'categorical', 'onezero').numpy())
    count += 1
    if count > max_batches:
        break

100%|██████████| 33/33 [00:00<00:00, 33.96it/s]


In [17]:
df_error = pd.DataFrame(data, columns=['sentence1', 'sentence2', 'score', 'label'])

In [18]:
df_error.head()

Unnamed: 0,sentence1,sentence2,score,label
0,the sailors rode the breeze clear of the rock...,,0.990917,1
1,the weights made the rope stretch over the pu...,,0.961473,1
2,the mechanical doll wr ##ig ##gled itself loo...,,0.076215,1
3,"if you had eaten more , you would want less .",,0.995765,1
4,"as you eat the most , you want the least .",,0.106393,0


In [19]:
df_error['pred'] = 1* (df_error.score > 0.5)
df_error['is_wrong'] = df_error['pred'] != df_error['label']

In [20]:
def print_random_pred(df):
    row = df.iloc[np.random.randint(df.shape[0])]
    print(f'sentence1: \t{row.sentence1}')
    print(f'sentence2: \t{row.sentence2}')
    print('score: \t{:.4f}'.format(row.score))    
    print(f'label: \t{row.label}')    

In [28]:
df_error.head()

Unnamed: 0,sentence1,sentence2,score,label,pred,is_wrong
0,the sailors rode the breeze clear of the rock...,,0.990917,1,1,False
1,the weights made the rope stretch over the pu...,,0.961473,1,1,False
2,the mechanical doll wr ##ig ##gled itself loo...,,0.076215,1,0,True
3,"if you had eaten more , you would want less .",,0.995765,1,1,False
4,"as you eat the most , you want the least .",,0.106393,0,0,False


In [22]:
np.mean(df_error.pred==df_error.label)

0.8207094918504314

## Wrong predictions

In [25]:
for i in range(10):
    print_random_pred(df_error[df_error.is_wrong])
    print()

sentence1: 	 was sunk . 
sentence2: 	            
score: 	0.5685
label: 	0

sentence1: 	 only churchill remembered churchill giving the blood , sweat and tears speech . 
sentence2: 	
score: 	0.9908
label: 	0

sentence1: 	 protect you ! 
sentence2: 	           
score: 	0.9903
label: 	0

sentence1: 	 it isn ' t because sue said anything bad about me that i ' m angry . 
sentence2: 	  
score: 	0.3534
label: 	1

sentence1: 	 what they feared most was to be no one available to help them . 
sentence2: 	                 
score: 	0.9954
label: 	0

sentence1: 	 i won ' t have some money . 
sentence2: 	              
score: 	0.9900
label: 	0

sentence1: 	 leslie told us about us . 
sentence2: 	         
score: 	0.9645
label: 	0

sentence1: 	 the proof this set is rec ##urs ##ive is difficult . 
sentence2: 	                
score: 	0.9915
label: 	0

sentence1: 	 the defendants denies the all ##ega ##tion . 
sentence2: 	            
score: 	0.9855
label: 	0

sentence1: 	 newsweek about crime appear

## Correct predictions

In [26]:
for i in range(10):
    print_random_pred(df_error[df_error.is_wrong==False])
    print()

sentence1: 	 some my jobs are in jeopardy . 
sentence2: 	        
score: 	0.1227
label: 	0

sentence1: 	 something happened i couldn ' t really talk about . 
sentence2: 	        
score: 	0.9910
label: 	1

sentence1: 	 picture of bill , this girl in the red coat will put a picture of bill on your desk before tomorrow . 
sentence2: 	       
score: 	0.2432
label: 	0

sentence1: 	 smith loaned a valuable collection of manuscripts to the library . 
sentence2: 	                  
score: 	0.9933
label: 	1

sentence1: 	 the children amused . 
sentence2: 	           
score: 	0.0494
label: 	0

sentence1: 	 nobody told susan . 
sentence2: 	          
score: 	0.9962
label: 	1

sentence1: 	 there is a nurse available . 
sentence2: 	       
score: 	0.9960
label: 	1

sentence1: 	 cohen proved the independence of the continuum hypothesis . 
sentence2: 	   
score: 	0.9937
label: 	1

sentence1: 	 that is the reason why he resigned . 
sentence2: 	     
score: 	0.9950
label: 	1

sentence1: 	 a pound was w

In [27]:
#df_error.to_csv(f'{debug_path}/dev_error_analysis.tsv', sep='\t')