# NLP Final COPA Project: RoBERTa

In [92]:
import pandas as pd
import torch
from torch import nn, optim
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel
from datetime import datetime

In [93]:
pd.read_json('COPA/train.jsonl', lines=True).head()

Unnamed: 0,premise,choice1,choice2,question,label,idx
0,My body cast a shadow over the grass.,The sun was rising.,The grass was cut.,cause,0,0
1,The woman tolerated her friend's difficult beh...,The woman knew her friend was going through a ...,The woman felt that her friend took advantage ...,cause,0,1
2,The women met for coffee.,The cafe reopened in a new location.,They wanted to catch up with each other.,cause,1,2
3,The runner wore shorts.,The forecast predicted high temperatures.,She planned to run along the beach.,cause,0,3
4,The guests of the party hid behind the couch.,It was a surprise party.,It was a birthday party.,cause,0,4


In [3]:
def load_data(filename):
    raw_data = pd.read_json(filename, lines=True)
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    dataset = []
    
    for i, row in raw_data.iterrows(): # yeah, yeah, I know...
        prem = torch.tensor(tokenizer(row['premise'])['input_ids'])
        h1 = torch.tensor(tokenizer(row['choice1'])['input_ids'])
        h2 = torch.tensor(tokenizer(row['choice2'])['input_ids'])
        dataset.append((torch.cat((prem, h1, h2)), torch.tensor(row['label']).to(torch.float32)))
        
    return dataset

In [91]:
train_data = load_data('COPA/train.jsonl')
print(f'Training data loaded (length {len(train_data)})')
dev_data = load_data('COPA/dev.jsonl')
print(f'Dev data loaded (length {len(dev_data)})')
test_data = load_data('COPA/test.jsonl')
print(f'Test data loaded (length {len(test_data)})')

Training data loaded (length 350)
Dev data loaded (length 50)
Test data loaded (length 100)


In [94]:
train_data[6]

(tensor([    0,   133, 32551,   376,    66,     9,     5,  6399,     4,     2,
             0,   100, 38471,     5,  6399,     4,     2,     0,   100, 13819,
         12552,     5,  6399,     4,     2]),
 tensor(1.))

In [87]:
class COPAClassifier(nn.Module):
    def __init__(self, output_size=1):
        super().__init__()
        
        # default pretrained layer
        self.roberta = RobertaModel.from_pretrained('roberta-base')
#         self.roberta.requires_grad = False
        self.classifier = nn.Linear(768, output_size)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        return self.classifier(self.roberta(input_ids=x)[1])
        
    def predict(self, x):
        return 1 if self.sigmoid(self.forward(x)) > 0.5 else 0

In [88]:
copa = COPAClassifier()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [89]:
# 3) Now we train our model. 
start_time = datetime.now()
epochs = 1
bce = nn.BCELoss()
sigmoid = nn.Sigmoid()
optimizer = optim.Adam(copa.parameters(), lr=0.05)

for i in range(epochs):
    print('### Epoch: ' + str(i+1) + ' ###')
    av_loss = 0
    copa.train()
    for (x, y) in train_data:
        optimizer.zero_grad()
        
        # a) calculate probs / get an output
        out = copa(x.unsqueeze(0))
        y_hat = sigmoid(out.squeeze(0).squeeze(0))
        
        # b) compute loss
        loss = bce(y_hat, y)
        av_loss += loss
        
        # c) get the gradient
        loss.backward()

        # d) update the weights
        optimizer.step()
    print(av_loss/len(train_data))

end_time = datetime.now()
print(f'Training completed in {str(end_time - start_time)}')

### Epoch: 1 ###
tensor(50.3668, grad_fn=<DivBackward0>)
Training completed in 0:08:59.028229


In [90]:
correct = 0
for (x, y) in train_data:
    y_hat = copa.predict(x.unsqueeze(0))
    if y_hat == y:
        correct += 1

acc = correct / len(train_data)
print(acc)

0.49142857142857144


## Notes
LSTM on top of BERT is probably a bad idea - overlaps; might be better to just use a fully connected linear layer  
Cause/effect token as extra input to the model