In [1]:
import torch 

import metal.mmtl.dataset as dataset
import numpy as np
import pandas as pd

from metal.mmtl.BERT_tasks import create_tasks
from metal.mmtl.metal_model import MetalModel
from metal.mmtl.scorer import Scorer
from metal.utils import convert_labels
from pytorch_pretrained_bert import BertTokenizer

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


### Load Model and Data

In [2]:
model_path = '/dfs/scratch0/jdunnmon/mmtl/sota_quest/debugging/COLA/COLA_19_48_08/best_model.pth'
csv_path = "/".join(model_path.split('/')[0:-2]) #set to -1 if permissions exist
task_name = model_path.split('/')[-3]

bert_model = 'bert-base-uncased'
max_len = 256
bert_output_dim = 768
dl_kwargs = {"batch_size": 32, 'shuffle': False}

In [3]:
#Get DataLoader
dataset_cls = getattr(dataset, task_name.upper() + "Dataset")
dev_ds = dataset_cls(
    split="dev",
    bert_model=bert_model,
    max_len=max_len,
    max_datapoints=-1,
)
dev_dl = dev_ds.get_dataloader(**dl_kwargs)

#Load best model for specified task
tasks = create_tasks(
        task_names=[task_name],
        bert_model=bert_model,
        split_prop=0.8,
        max_len=max_len,
        dl_kwargs={"batch_size": 1},
        bert_output_dim=bert_output_dim,
        max_datapoints=10,
    )

model = MetalModel(tasks, verbose=False, device=-1)
model.load_state_dict(torch.load(model_path)['model'])

HBox(children=(IntProgress(value=0, max=1043), HTML(value='')))




100%|██████████| 407873900/407873900 [00:29<00:00, 13768723.88B/s]


Loading COLA Dataset


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




### Generate DataFrame of Predictions and True Labels

In [4]:
data = {
    'sentence1': [],
    'sentence2': [],
    'label': [],
    'score' : []
}
max_batches = 100
tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True)
from tqdm import tqdm
count = 0
for x, y in tqdm(list(dev_dl)):
    for tokens_idx in x[0]:
        tokens = tokenizer.convert_ids_to_tokens(tokens_idx.numpy())
        phrases = ' '.join(tokens).replace('[PAD]', '').replace('[CLS]', '').split('[SEP]')
        data['sentence1'] += [phrases[0]]
        if len(phrases) > 1:
            data['sentence2'] += [phrases[1]] 
        else:
            data['sentence2'] += ['NA']
    scores = model.calculate_output(x, [task_name])[task_name].detach().cpu().numpy()[:, 0] # .flatten()
    data['score'] += list(scores)
    data['label'] += list(convert_labels(y, 'categorical', 'onezero').numpy())
    count += 1
    if count > max_batches:
        break
        

df_error = pd.DataFrame(data, columns=['sentence1', 'sentence2', 'score', 'label'])
df_error['pred'] = 1* (df_error.score > 0.5)
df_error['is_wrong'] = df_error['pred'] != df_error['label']

100%|██████████| 33/33 [00:01<00:00, 32.98it/s]


### Saving and Loading Error DataFrame

In [5]:
def save_dataframe(df,filepath):
    df.to_csv(filepath, sep='\t')
    print('Saved dataframe to: ', filepath)
    
def load_dataframe(filepath):
    df = pd.read_csv(filepath, sep='\t')
    return df

filepath = f'{csv_path}/dev_error_analysis.tsv'
save_dataframe(df_error,filepath)
df_error = load_dataframe(filepath)
df_error.head()

Saved dataframe to:  /dfs/scratch0/jdunnmon/mmtl/sota_quest/debugging/COLA/dev_error_analysis.tsv


Unnamed: 0.1,Unnamed: 0,sentence1,sentence2,score,label,pred,is_wrong
0,0,the sailors rode the breeze clear of the rock...,,0.98593,1,1,False
1,1,the weights made the rope stretch over the pu...,,0.993275,1,1,False
2,2,the mechanical doll wr ##ig ##gled itself loo...,,0.76591,1,1,False
3,3,"if you had eaten more , you would want less .",,0.994813,1,1,False
4,4,"as you eat the most , you want the least .",,0.216188,0,0,False


### Looking at Random Examples

In [6]:
def print_random_pred(df):
    row = df.iloc[np.random.randint(df.shape[0])]
    print(f'sentence1: \t{row.sentence1}')
    print(f'sentence2: \t{row.sentence2}')
    print('score: \t{:.4f}'.format(row.score))    
    print(f'label: \t{row.label}')   

In [9]:
print("INCORRECT PREDICTIONS")
for i in range(3):
    print_random_pred(df_error[df_error.is_wrong==True])
    print()
    
print("CORRECT PREDICTIONS")
for i in range(3):
    print_random_pred(df_error[df_error.is_wrong==False])
    print()

INCORRECT PREDICTIONS
sentence1: 	 the horse raced past the barn fell . 
sentence2: 	     
score: 	0.3070
label: 	1

sentence1: 	 she talked to john or mary but i don ' t know which . 
sentence2: 	                
score: 	0.3192
label: 	1

sentence1: 	 the book that i like - everyone else in the class hates . 
sentence2: 	     
score: 	0.9870
label: 	0

CORRECT PREDICTIONS
sentence1: 	 the idea dismay ##ed the prime minister that the dome was dull . 
sentence2: 	     
score: 	0.9949
label: 	1

sentence1: 	 john made bill master of himself . 
sentence2: 	    
score: 	0.9932
label: 	1

sentence1: 	 which topic did you get bored because mary talked about ? 
sentence2: 	  
score: 	0.3979
label: 	0



## Sandbox for Error Analysis

**1. We want to look at examples that are "barely" wrong and "barely" right since we have hope for boosts here.**

In [26]:
def print_barely_wrong_pred(df,thresh=0.05):
    df_temp = df[df.is_wrong==True]
    idx = np.where(np.abs(df_temp.score - 0.5) <= thresh)[0]
    row = df_temp.iloc[np.random.choice(list(idx))]
    
    print(f'sentence1: \t{row.sentence1}')
    print(f'sentence2: \t{row.sentence2}')
    print('score: \t{:.4f}'.format(row.score))    
    print(f'label: \t{row.label}')  
    
def print_barely_right_pred(df,thresh=0.05):
    df_temp = df[df.is_wrong==False]
    idx = np.where(np.abs(df_temp.score - 0.5) <= thresh)[0]
    row = df_temp.iloc[np.random.choice(list(idx))]
    
    print(f'sentence1: \t{row.sentence1}')
    print(f'sentence2: \t{row.sentence2}')
    print('score: \t{:.4f}'.format(row.score))    
    print(f'label: \t{row.label}')   

In [27]:
print("BARELY WRONG")
for i in range(3):
    print_barely_wrong_pred(df_error)
    print()
    
print("BARELY RIGHT")
for i in range(3):
    print_barely_wrong_pred(df_error)
    print()

BARELY WRONG
sentence1: 	 anson demon ##ized 
sentence2: 	          
score: 	0.5431
label: 	0

sentence1: 	 ellen said about the present conditions . 
sentence2: 	        
score: 	0.5495
label: 	0

sentence1: 	 was sunk . 
sentence2: 	            
score: 	0.5269
label: 	0

BARELY RIGHT
sentence1: 	 john bought a book on the table . 
sentence2: 	       
score: 	0.4643
label: 	1

sentence1: 	 anson demon ##ized 
sentence2: 	          
score: 	0.5431
label: 	0

sentence1: 	 john bought a book on the table . 
sentence2: 	       
score: 	0.4643
label: 	1



**2. We also want to look at examples we got completely wrong since that could point to a systematic bias in the data/model. It could also help us find examples in the dataset that are mislabeled by human annotators**

In [28]:
def print_very_wrong_pred(df,thresh=0.95):
    df_temp = df[df.is_wrong==True]
    idx = np.where(np.abs(df_temp.score - df_temp.label) >= thresh)[0]
    row = df_temp.iloc[np.random.choice(list(idx))]
    
    print(f'sentence1: \t{row.sentence1}')
    print(f'sentence2: \t{row.sentence2}')
    print('score: \t{:.4f}'.format(row.score))    
    print(f'label: \t{row.label}') 

In [29]:
print("VERY WRONG")
for i in range(3):
    print_very_wrong_pred(df_error)
    print()

VERY WRONG
sentence1: 	 john bought a dog for himself to play with . 
sentence2: 	 
score: 	0.9962
label: 	0

sentence1: 	 sandy was trying to work out which students would be able to solve a certain problem , but she wouldn ' t tell us which one . 
sentence2: 	      
score: 	0.9925
label: 	0

sentence1: 	 gould ' s performance of bach on the piano doesn ' t please me anywhere as much as ross ' s on the harpsichord . 
sentence2: 	      
score: 	0.9872
label: 	0



**3. To find systematic errors, we can also look for correlations between certain features and the incorrectness a la Socratic**


We can make this way more sophisticated by perhaps using embeddings instead of this simple BoW featurization.

In [30]:
#Create a vector of correct/incorrect predictions
is_wrong_label = np.array(df_error.is_wrong.astype(float))

#Create BoW featurization [WIP]
