In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
train_df=pd.read_csv('./output/train_hypothesis_evidences.csv')
train_df

Unnamed: 0,hypothesis,evidences,label
0,All the primary trial participants do not rece...,"[""INTERVENTION 1: "", "" Diagnostic (FLT PET)"",...",Contradiction
1,"Patients with Platelet count over 100,000/mm¬¨...","["" PATIENT CHARACTERISTICS:"", "" ANC 1,500/m...",Contradiction
2,Heart-related adverse events were recorded in ...,"[""Adverse Events 1:"", "" Supraventricular tach...",Entailment
3,Adult Patients with histologic confirmation of...,"[""Inclusion Criteria:"", "" Patients with histo...",Contradiction
4,Laser Therapy is in each cohort of the primary...,"[""INTERVENTION 1: "", "" Laser Therapy Alone"", ...",Contradiction
...,...,...,...
1695,"Adequate blood, kidney, and hepatic function a...","[""Inclusion Criteria:"", "" Postmenopausal wome...",Entailment
1696,The Ridaforolimus + Dalotuzumab + Exemestane g...,"[""Outcome Measurement: "", "" 1. Progression-fr...",Contradiction
1697,The only difference between the interventions ...,"[""INTERVENTION 1: "", "" Prone"", ""Prone positio...",Entailment
1698,Patients must have a white blood cell count ab...,"["" WBC > 1,500/mm\u00b3""]",Entailment


In [3]:
hypothesis_lst=train_df['hypothesis'].values.tolist()
len(hypothesis_lst)

1700

In [4]:
evidence_lst=train_df['evidences'].apply(lambda l:' '.join(json.loads(l))).values.tolist()
len(evidence_lst)

1700

In [5]:
label2id={"Contradiction":0,"Entailment":1}
label_lst=train_df['label'].apply(lambda x:label2id[x]).values.tolist()
len(label_lst)

1700

In [6]:
class InputSequence:
    
    def __init__(self,tok,l_text,l_text2,l_label,batch_size=64,gpu=True,task_prefix = "Detect entailment: "):
        
        self.data_len=len(l_text)
        self.data_idx=[i for i in range(self.data_len)]
        self.texts=tok([task_prefix+' '.join([a,b]) for a,b in zip(l_text,l_text2)],
                       padding="longest",
                       max_length=512,#max_source_length,
                       truncation=True,
                       return_tensors="pt",
                      )
        self.labels=tok(["Entailment" if lab==1 else "Contradiction" for lab in l_label],
                         padding="longest",
                         max_length=128,#,
                         truncation=True,
                         return_tensors="pt",
                        )
        print('tokenize done')
        
        self.batch_size=batch_size
        self.gpu=gpu
        
    def on_epoch_end(self):
        random.shuffle(self.data_idx)
        
    def __getitem__(self,i):
        start=i*self.batch_size
        batch_idx=self.data_idx[start:min(start+self.batch_size,self.data_len)]
        
        return_texts=dict([(k,self.texts[k][batch_idx]) for k in self.texts])
        return_labels=dict([(k,self.labels[k][batch_idx]) for k in self.labels])
        
        if self.gpu:
            return_texts=dict([(k,return_texts[k].cuda()) for k in return_texts])
            return_labels=dict([(k,return_labels[k].cuda()) for k in return_labels])
        
        return return_texts,return_labels
    
    def __len__(self):
        return math.ceil(1.0*self.data_len/self.batch_size)
    

In [7]:
import random
import math
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import T5Tokenizer, T5ForConditionalGeneration

text_tok = T5Tokenizer.from_pretrained("t5-base")
text_clf = T5ForConditionalGeneration.from_pretrained("t5-base")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [8]:
training_data=InputSequence(text_tok,hypothesis_lst,evidence_lst,label_lst,gpu=True)
len(training_data)

tokenize done


27

In [9]:
class Model(nn.Module):
    def __init__(self,clf):
        super(Model, self).__init__()
        self.clf=clf
        # self.loss=nn.CrossEntropyLoss()
    
    def forward(self, texts, labels, gpu=True):
        
        input_ids, attention_mask = texts['input_ids'], texts['attention_mask']
        labels = labels['input_ids']
        # replace padding token id's of the labels by -100 so it's ignored by the loss
        labels[labels == text_tok.pad_token_id] = -100
        
        loss=self.clf(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
        
        return loss

In [10]:
model=Model(text_clf)

In [11]:
bat_s=32
l_rate=1e-5

training_data.batch_size=bat_s

model.cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=l_rate)
total_epoch_num=10
for epoch in range(total_epoch_num):
    training_data.on_epoch_end()
    loss_sum=0.0
    loss_count=0
    for batch in range(len(training_data)):
        optimizer.zero_grad()
        batch_texts,batch_labels=training_data[batch]
        loss_count+=len(batch_texts['input_ids'])
        loss = model(
            batch_texts,batch_labels
        )
        print('epoch:',epoch,'batch:',batch,'loss:',loss.item(),end='\n' if batch==0 or batch+1==len(training_data) or (batch+1)%1000==0 else '\r')
        loss_sum += 1.0*loss.item()*len(batch_texts['input_ids'])
        loss.backward()
        optimizer.step()
    #T5
    model.clf.save_pretrained('./output/clf_models/t5-base_epoch_{}.pt'.format(format(epoch,'05d')))
_=model.cpu()

epoch: 0 batch: 0 loss: 7.865568161010742
epoch: 0 batch: 53 loss: 1.4363217353820884
epoch: 1 batch: 0 loss: 0.8371789455413818
epoch: 1 batch: 53 loss: 0.20250956714153296
epoch: 2 batch: 0 loss: 0.2891809940338135
epoch: 2 batch: 53 loss: 0.18026919662952423
epoch: 3 batch: 0 loss: 0.24582313001155853
epoch: 3 batch: 53 loss: 0.27415791153907776
epoch: 4 batch: 0 loss: 0.20681999623775482
epoch: 4 batch: 53 loss: 0.20907138288021088
epoch: 5 batch: 0 loss: 0.20660193264484406
epoch: 5 batch: 53 loss: 0.27594047784805367
epoch: 6 batch: 0 loss: 0.19517120718955994
epoch: 6 batch: 53 loss: 0.25715890526771545
epoch: 7 batch: 0 loss: 0.20219534635543823
epoch: 7 batch: 53 loss: 0.15706640481948853
epoch: 8 batch: 0 loss: 0.1940225064754486
epoch: 8 batch: 53 loss: 0.22445161640644073
epoch: 9 batch: 0 loss: 0.19233182072639465
epoch: 9 batch: 53 loss: 0.13230130076408386


# the following 2 hyperparameters are task-specific
max_source_length = 512
max_target_length = 128

# Suppose we have the following 2 training examples:
input_sequence_1 = "Welcome to NYC"
output_sequence_1 = "Bienvenue à NYC"

input_sequence_2 = "HuggingFace is a company"
output_sequence_2 = "HuggingFace est une entreprise"

# encode the inputs
task_prefix = "Detect entailment: "
input_sequences = [input_sequence_1, input_sequence_2]

encoding = tokenizer(
    [task_prefix + sequence for sequence in input_sequences],
    padding="longest",
    max_length=max_source_length,
    truncation=True,
    return_tensors="pt",
)

input_ids, attention_mask = encoding.input_ids, encoding.attention_mask

# encode the targets
target_encoding = tokenizer(
    [output_sequence_1, output_sequence_2],
    padding="longest",
    max_length=max_target_length,
    truncation=True,
    return_tensors="pt",
)
labels = target_encoding.input_ids

# replace padding token id's of the labels by -100 so it's ignored by the loss
labels[labels == tokenizer.pad_token_id] = -100

# forward pass
loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
loss.item()