In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# Read the dev-set CSV; later cells use its 'hypothesis', 'evidences'
# and 'label' columns.
dev_df = pd.read_csv(
    './output/dev_hypothesis_evidences.csv',
)
dev_df

Unnamed: 0,hypothesis,evidences,label
0,There were only 3 adverse events in NCT0089450...,"[""Adverse Events 1:"", "" Total: 10/71 (14.08%)...",Contradiction
1,"compared to cohort 1 of NCT00475670, there are...","[""Adverse Events 1:"", "" Total: 0/3 (0.00%)"", ...",Entailment
2,There were 0 observed cases of Tibia or Fibula...,"[""Adverse Events 1:"", "" Total: 16/149 (10.74%...",Contradiction
3,There 55 more Participants With Best Tumor Res...,"[""Outcome Measurement: "", "" Percentage of Par...",Contradiction
4,Women classified as high-risk of developing br...,"[""Inclusion Criteria:"", "" Gail risk >= 1.7% a...",Entailment
...,...,...,...
75,There were 4 cases of Febrile neutropenia and ...,"[""Adverse Events 1:"", "" Febrile neutropenia 4...",Contradiction
76,At least 1 patient in NCT00022516 suffered fro...,"[""Adverse Events 1:"", "" Total: 0/0"", "" Leuko...",Entailment
77,Every patient in the Palbociclib+Letrozole Aus...,"[""Outcome Measurement: "", "" Number of Partici...",Entailment
78,Patients currently prescribed Diuretics are ex...,"[""Exclusion Criteria:"", "" medication(s) known...",Entailment


In [3]:
# Plain Python list of the 'hypothesis' column, in DataFrame row order.
hypothesis_lst = dev_df['hypothesis'].tolist()
len(hypothesis_lst)

80

In [4]:
# Every 'evidences' cell is decoded with json.loads and its items are joined
# with single spaces, giving one evidence text per row.
evidence_lst = [
    ' '.join(json.loads(raw_evidences))
    for raw_evidences in dev_df['evidences']
]
len(evidence_lst)

80

In [5]:
# Map the string labels to integer ids; a label missing from this table
# raises KeyError.
label2id = {
    "Contradiction": 0,
    "Entailment": 1,
}
label_lst = [label2id[label_name] for label_name in dev_df['label']]
len(label_lst)

80

In [6]:
import random
import math
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [7]:
# Alternative checkpoint, kept for reference:
# text_tok=AutoTokenizer.from_pretrained('bert-base-uncased')
# text_clf=AutoModelForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2)

# Active checkpoint: roberta-base, loaded with a two-label classification head.
text_tok = AutoTokenizer.from_pretrained('roberta-base')
text_clf = AutoModelForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=2,
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

In [8]:
class InputSequence:
    """Fixed-size batches over tokenized text pairs and their labels.

    All pairs are tokenized once in the constructor; ``seq[i]`` then slices
    batch ``i`` out of the pre-tokenized tensors, following the current order
    of ``data_idx``.

    Args:
        tok: Callable invoked as ``tok(l_text, l_text2, padding=True,
            truncation=True, max_length=512, return_tensors='pt')``. It must
            return a mapping from field name to a tensor whose first
            dimension indexes the examples.
        l_text: First texts of the pairs; its length defines the dataset size.
        l_text2: Second texts of the pairs, passed to ``tok`` unchanged.
        l_label: One label per example; stored via ``np.array`` and cast to
            int64 when a batch is built.
        batch_size: Maximum number of examples per batch (must be positive).
        gpu: If true, every returned tensor is moved with ``.cuda()``.

    Raises:
        ValueError: If ``batch_size`` is not positive, or the number of
            labels differs from the number of texts.
    """

    def __init__(self,tok,l_text,l_text2,l_label,batch_size=64,gpu=True):
        if batch_size<=0:
            # Fail here rather than with ZeroDivisionError inside __len__.
            raise ValueError('batch_size must be positive, got {}'.format(batch_size))

        self.data_len=len(l_text)
        self.data_idx=list(range(self.data_len))
        self.texts=tok(l_text,l_text2,padding=True, truncation=True, max_length=512, return_tensors='pt')
        self.l_label=np.array(l_label)
        # shape[:1] is () for a 0-d array, so a scalar/None label is rejected too.
        if self.l_label.shape[:1]!=(self.data_len,):
            raise ValueError(
                'expected {} labels, got an array of shape {}'.format(
                    self.data_len,self.l_label.shape
                )
            )
        print('tokenize done')

        self.batch_size=batch_size
        self.gpu=gpu

    def on_epoch_end(self):
        """Shuffle the example order in place (uses the global ``random`` state)."""
        random.shuffle(self.data_idx)

    def __getitem__(self,i):
        """Return batch ``i`` as ``(texts, labels)``.

        ``texts`` is a dict with the same keys as the tokenizer output, each
        value restricted to the examples of this batch; ``labels`` is an
        int64 tensor. Negative ``i`` counts from the last batch.

        Raises:
            IndexError: If ``i`` is outside ``[-len(self), len(self))``. This
                also lets a plain ``for batch in seq`` loop terminate.
        """
        n_batches=len(self)
        if i<0:
            i+=n_batches
        if not 0<=i<n_batches:
            raise IndexError('batch index out of range')

        start=i*self.batch_size
        # Slicing clamps at the end of the list, so the last batch may be short.
        batch_idx=self.data_idx[start:start+self.batch_size]

        return_texts={k:v[batch_idx] for k,v in self.texts.items()}
        return_labels=torch.from_numpy(
            self.l_label[batch_idx].astype(np.int64)
        )

        if self.gpu:
            return_texts={k:v.cuda() for k,v in return_texts.items()}
            return_labels=return_labels.cuda()

        return return_texts,return_labels

    def __len__(self):
        """Number of batches, counting a final partial batch."""
        return math.ceil(1.0*self.data_len/self.batch_size)
    

In [9]:
# Tokenize every (hypothesis, evidence) pair once; batches are served as CUDA tensors.
testing_data = InputSequence(
    tok=text_tok,
    l_text=hypothesis_lst,
    l_text2=evidence_lst,
    l_label=label_lst,
    gpu=True,
)

tokenize done


In [10]:
# Seed applied before every model load. Loading the plain 'roberta-base'
# checkpoint creates a freshly initialised classification head (see the
# "newly initialized" warning in the output), so without a fixed seed the
# baseline scores differ from run to run.
EVAL_SEED=0

# model_names=['bert-base-uncased']+[
#     './output/clf_models/bert-base-uncased_epoch_{}.pt'.format(format(epoch,'05d'))
#     for epoch in range(10)
# ]
#roberta-base
# First entry is the baseline checkpoint, followed by the ten saved
# checkpoints whose file names are numbered 00000-00009.
model_names=['roberta-base']+[
    './output/clf_models/roberta-base_epoch_{:05d}.pt'.format(epoch)
    for epoch in range(10)
]

# scores[k] holds the softmax probabilities of model_names[k]: one row per
# example (in testing_data order) and one column per class.
scores=[]
for model_name in model_names:
    torch.manual_seed(EVAL_SEED)
    clf=AutoModelForSequenceClassification.from_pretrained(model_name).cuda()
    # from_pretrained is documented to return the model in eval mode; the
    # explicit call keeps dropout disabled even if that default changes.
    clf.eval()

    model_scores=[]
    with torch.no_grad():
        for batch in range(len(testing_data)):
            # The labels are not needed for scoring; only the inputs are used.
            batch_texts,batch_labels=testing_data[batch]
            probs=F.softmax(clf(**batch_texts).logits,dim=1)
            model_scores.append(probs.cpu().numpy())
            print('model:',model_name,'batch:',batch,end='\r')

    # Append only after the model was scored, so a failed load does not
    # leave an empty placeholder entry in `scores`.
    scores.append(np.concatenate(model_scores,axis=0))
    # Move the weights off the GPU before the next checkpoint is loaded.
    clf.cpu()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

model: ./output/clf_models/roberta-base_epoch_00009.pt batch: 1

In [11]:
import pandas as pd
from sklearn.metrics import average_precision_score,f1_score,precision_score,recall_score,accuracy_score

# Ground-truth ids in the same row order the scores were produced in.
y_true=label_lst

# One row per scored model. Row 0 is the plain 'roberta-base' entry of
# model_names; row k>=1 is the k-th saved checkpoint, i.e. the file whose
# suffix is k-1 because the checkpoint files are numbered from 0.
results=[]
for epoch,model_scores in enumerate(scores):
    # Column 1 is the probability of label id 1 ("Entailment" in label2id).
    y_prob=model_scores[:,1]
    # Predict the positive class only when its probability exceeds 0.5.
    y_pred=(y_prob>0.5).astype(int).tolist()
    results.append([
        'pretrained' if epoch==0 else epoch,
        average_precision_score(y_true,y_prob),
        # zero_division=0 keeps the value sklearn already reports when a model
        # predicts no positives, but without the UndefinedMetricWarning.
        f1_score(y_true,y_pred,zero_division=0),
        precision_score(y_true,y_pred,zero_division=0),
        recall_score(y_true,y_pred,zero_division=0),
        accuracy_score(y_true,y_pred)
    ])

pd.DataFrame(results,columns=['epoch','AVG_PREC','F1','PREC','REC','ACC'])

Unnamed: 0,epoch,AVG_PREC,F1,PREC,REC,ACC
0,pretrained,0.525703,0.677686,0.5125,1.0,0.5125
1,1,0.445184,0.655462,0.5,0.95122,0.4875
2,2,0.434511,0.0,0.0,0.0,0.475
3,3,0.426259,0.241379,0.411765,0.170732,0.45
4,4,0.423549,0.117647,0.3,0.073171,0.4375
5,5,0.466932,0.644068,0.493506,0.926829,0.475
6,6,0.413002,0.297297,0.333333,0.268293,0.35
7,7,0.407725,0.495238,0.40625,0.634146,0.3375
8,8,0.383538,0.365591,0.326923,0.414634,0.2625
9,9,0.384274,0.386364,0.361702,0.414634,0.325
