In [None]:
!pip install wget

In [None]:
!pip install transformers

In [3]:
import os
import sys
import wget 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [4]:
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
if torch.cuda.is_available():     
    device = torch.device("cuda")
    print("GPU is available!")
else:
    device = torch.device("cpu")

GPU is available!


Data               | Task                                      | Metric
-------------------|-------------------------------------------|----------------
CoLA               | Sentence is grammatical or not grammatical|Matthews
SST-2              | Review is positive negative or neutral    |Accuracy
RTE                | Sentence 1 -> Sentence 2?                 |Accuracy

## Data preprocessing 

In [6]:
rte_link = "https://dl.fbaipublicfiles.com/glue/data/RTE.zip"
wget.download(rte_link, "./RTE.zip")

'./RTE.zip'

In [7]:
!unzip RTE.zip


Archive:  RTE.zip
   creating: RTE/
  inflating: RTE/dev.tsv             
  inflating: RTE/test.tsv            
  inflating: RTE/train.tsv           


Data preprocessing: removing first line with columns names, encoding str labels

In [8]:
rte_data = pd.read_csv("/content/RTE/train.tsv", delimiter="\t", names=["index", "sentence1", "sentence2", "label"], index_col=False)
rte_data.drop(axis=1, index=0, inplace=True)
rte_data["label"] = (rte_data["label"] == "entailment").astype(int)
print("Total of pairs of train sentences", rte_data.shape[0])
print("Sample of tokenized sentences: ", rte_data["sentence1"][1], rte_data["sentence2"][1])
rte_data.head(5)

Total of pairs of train sentences 2490
Sample of tokenized sentences:  No Weapons of Mass Destruction Found in Iraq Yet. Weapons of Mass Destruction Found in Iraq.


Unnamed: 0,index,sentence1,sentence2,label
1,0,No Weapons of Mass Destruction Found in Iraq Yet.,Weapons of Mass Destruction Found in Iraq.,0
2,1,"A place of sorrow, after Pope John Paul II die...",Pope Benedict XVI is the new leader of the Rom...,1
3,2,Herceptin was already approved to treat the si...,Herceptin can be used to treat breast cancer.,1
4,3,"Judie Vivian, chief executive at ProMedica, a ...",The previous name of Ho Chi Minh City was Saigon.,1
5,4,A man is due in court later charged with the m...,Paul Stewart Hutchinson is accused of having s...,0


In [9]:
dev_rte_data = pd.read_csv("/content/RTE/dev.tsv", delimiter="\t", names=["index", "sentence1", "sentence2", "label"], index_col=False)
dev_rte_data.drop(axis=1, index=0, inplace=True)
dev_rte_data["label"] = (dev_rte_data["label"] == "entailment").astype(int)
print("Total of pairs of test sentences", dev_rte_data.shape[0])
print("Sample of tokenized sentences: ", dev_rte_data["sentence1"][1], dev_rte_data["sentence2"][1])

Total of pairs of test sentences 277
Sample of tokenized sentences:  Dana Reeve, the widow of the actor Christopher Reeve, has died of lung cancer at age 44, according to the Christopher Reeve Foundation. Christopher Reeve had an accident.


In [10]:
test_rte_data, test_rte_labels = dev_rte_data[["sentence1", "sentence2"]], dev_rte_data["label"]

Train-validation split for fine-tuning models

In [11]:
train_rte_data, val_rte_data, train_rte_labels, val_rte_labels = train_test_split(rte_data[["sentence1", "sentence2"]], rte_data["label"], train_size=0.8)

In [12]:
print(train_rte_data.shape)
print(val_rte_data.shape)
print(train_rte_labels.shape)
print(val_rte_labels.shape)

(1992, 2)
(498, 2)
(1992,)
(498,)


## BERT for sequence classification 

In [28]:
model_class, tokenizer_class, weights = (ppb.BertForSequenceClassification, ppb.BertTokenizer, 'bert-base-uncased')
# Load pretrained model and tokenizer
tokenizer = tokenizer_class.from_pretrained(weights, do_lower_case=True)
model = model_class.from_pretrained(weights)
model.cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

As BERT requires token_type_ids parameter to separate sentences, for each pair of sentences I extract segment_mask: nulls for 1st sentence tokens, ones for 2nd sentence tokens

In [20]:
def prepare_dataset(df, df_labels):
    MAX_LEN = 150
    tokenized = []
    attention_mask = []
    segment_mask = []
    labels = []
    for (sentence1, sentence2, label) in zip(df["sentence1"].values, df["sentence2"].values, df_labels.values):
        sentence1_t = tokenizer.encode(sentence1, add_special_tokens=False)
        sentence2_t = tokenizer.encode(sentence2, add_special_tokens=False)
        seq_t = [tokenizer.cls_token_id] + sentence1_t + [tokenizer.sep_token_id] + sentence2_t + [tokenizer.sep_token_id]
        seq_t = torch.tensor(seq_t)
        if len(seq_t) <= MAX_LEN:
            tokenized.append(seq_t)
            labels.append(label)

            N1 = len(sentence1_t)
            N2 = len(sentence2_t)
        
            seq_attention_mask = torch.tensor([1] * (N1 + N2 + 3))
            attention_mask.append(seq_attention_mask)
            seq_segment_mask = torch.tensor([0] * (N1 + 2) + [1] * (N2 + 1))
            segment_mask.append(seq_segment_mask)

    padded_data = pad_sequence(tokenized, batch_first=True)
    padded_attention_mask = pad_sequence(attention_mask, batch_first=True)
    padded_segment_mask = pad_sequence(segment_mask, batch_first=True)

    return padded_data, padded_attention_mask, padded_segment_mask, labels

In [25]:
train_data, train_att_mask, train_seg_mask, train_labels = prepare_dataset(train_rte_data, train_rte_labels)
test_data, test_att_mask, test_seg_mask, test_labels = prepare_dataset(test_rte_data, test_rte_labels)
val_data, val_att_mask, val_seg_mask, val_labels = prepare_dataset(val_rte_data, val_rte_labels)

Learning rate value and number of epochs are recommended as optimal hyperparameters for fine-tuning in Hugging Face tutorials

In [29]:
train_losses = []
val_losses = []
N_train = train_data.shape[0]
N_val = val_data.shape[0]
batch_size = 16
optimizer = ppb.AdamW(model.parameters(), lr=2e-5)


for epoch in range(3):
    model.train()
    loss_epoch = 0
    metric_epoch = 0
    for i in range(0, N_train, batch_size):
        model.zero_grad() 
        x_batch = torch.tensor(train_data[i:i+batch_size]).to(device)
        att_mask_batch = torch.tensor(train_att_mask[i:i+batch_size]).to(device)
        seg_mask_batch = torch.tensor(train_seg_mask[i:i+batch_size]).to(device)
        labels_batch = torch.tensor(train_labels[i:i+batch_size]).to(device) 

        loss, logits = model(x_batch, attention_mask=att_mask_batch, 
                     token_type_ids=seg_mask_batch, labels=labels_batch).values()

        loss_epoch += loss.item()

        pred_labels = torch.argmax(logits, dim=1)
        pred_labels = pred_labels.detach().cpu().numpy()
        true_labels = train_labels[i:i+batch_size]
        metric_epoch += accuracy_score(pred_labels, true_labels)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    n_batches = N_train // batch_size
    print("Average train loss ", epoch, "epoch: ", loss_epoch / n_batches)
    print("Average train accuracy ", epoch, "epoch", metric_epoch / n_batches)          

    model.eval()
    loss_epoch = 0
    metric_epoch = 0
    for i in range(0, N_val, batch_size):
        x_batch = torch.tensor(val_data[i:i+batch_size]).to(device)
        att_mask_batch = torch.tensor(val_att_mask[i:i+batch_size]).to(device)
        seg_mask_batch = torch.tensor(val_seg_mask[i:i+batch_size]).to(device)
        labels_batch = torch.tensor(val_labels[i:i+batch_size]).to(device) 

        with torch.no_grad():
            loss, logits = model(x_batch, attention_mask=att_mask_batch, 
                     token_type_ids=seg_mask_batch, labels=labels_batch).values()
            
            loss_epoch += loss.item()

            pred_labels = torch.argmax(logits, dim=1)
            pred_labels = pred_labels.detach().cpu().numpy()
            true_labels = val_labels[i:i+batch_size]
            metric_epoch += accuracy_score(pred_labels, true_labels)

    n_batches = N_val // batch_size
    print("Average val loss ", epoch, "epoch: ", loss_epoch / n_batches)
    print("Average  val accuracy ", epoch, "epoch", metric_epoch / n_batches)

Average train loss  0 epoch:  0.6878416359934032
Average train accuracy  0 epoch 0.563034188034188
Average val loss  0 epoch:  0.6225482153481451
Average  val accuracy  0 epoch 0.6551724137931034
Average train loss  1 epoch:  0.5149449875466844
Average train accuracy  1 epoch 0.7590811965811965
Average val loss  1 epoch:  0.6173298944687021
Average  val accuracy  1 epoch 0.6982758620689655
Average train loss  2 epoch:  0.3017148079398351
Average train accuracy  2 epoch 0.8899572649572649
Average val loss  2 epoch:  0.8748351525643776
Average  val accuracy  2 epoch 0.6831896551724138


In [32]:
model.eval()
N_test = test_data.shape[0]
n_batch = N_test // batch_size 
total_metric = 0

for i in range(0, N_test, batch_size):
      x_batch = torch.tensor(test_data[i:i+batch_size]).to(device)
      att_mask_batch = torch.tensor(test_att_mask[i:i+batch_size]).to(device)
      seg_mask_batch = torch.tensor(test_seg_mask[i:i+batch_size]).to(device)
      labels_batch = torch.tensor(test_labels[i:i+batch_size]).to(device) 

      with torch.no_grad():
          logits = model(x_batch, token_type_ids=seg_mask_batch, 
                         attention_mask=att_mask_batch).logits
          pred_labels = torch.argmax(logits, dim=1)
          pred_labels = pred_labels.detach().cpu().numpy()
          true_labels = test_labels[i:i+batch_size]
          total_metric += accuracy_score(pred_labels, true_labels)

print("Average metric value on the test set:", total_metric / n_batch)

Average metric value on the test set: 0.7734375


## RoBERTa for sequence classification

In [49]:
model_class, tokenizer_class, weights = (ppb.RobertaForSequenceClassification, ppb.RobertaTokenizer, 'roberta-base')
# Load pretrained model and tokenizer
tokenizer = tokenizer_class.from_pretrained(weights, do_lower_case=True)
model = model_class.from_pretrained(weights)
model.cuda()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [35]:
print("Total of model parameters", len(list(model.parameters())))

Total of model parameters 201


In [36]:
def prepare_dataset_for_roberta(df, df_labels):
    MAX_LEN = 150
    tokenized = []
    attention_mask = []
    labels = []

    for (sentence1, sentence2, label) in zip(df["sentence1"].values, df["sentence2"].values, df_labels.values):
        sentence1_t = tokenizer.encode(sentence1, add_special_tokens=False)
        sentence2_t = tokenizer.encode(sentence2, add_special_tokens=False)
        seq_t = [tokenizer.cls_token_id] + sentence1_t + [tokenizer.sep_token_id] + sentence2_t + [tokenizer.sep_token_id]
        seq_t = torch.tensor(seq_t)
        if len(seq_t) <= MAX_LEN:
            tokenized.append(seq_t)
            labels.append(label)
            seq_attention_mask = torch.tensor([1] * (len(seq_t)))
            attention_mask.append(seq_attention_mask)

    padded_data = pad_sequence(tokenized, batch_first=True)
    padded_attention_mask = pad_sequence(attention_mask, batch_first=True)

    return padded_data, padded_attention_mask, labels

In [38]:
train_data, train_att_mask, train_labels = prepare_dataset_for_roberta(train_rte_data, train_rte_labels)
test_data, test_att_mask, test_labels = prepare_dataset_for_roberta(test_rte_data, test_rte_labels)
val_data, val_att_mask, val_labels = prepare_dataset_for_roberta(val_rte_data, val_rte_labels)

In [40]:
print(train_data.shape)
print(test_data.shape)
print(val_data.shape)

torch.Size([1871, 150])
torch.Size([259, 146])
torch.Size([465, 150])


In [46]:
train_losses = []
val_losses = []
N_train = train_data.shape[0]
N_val = val_data.shape[0]
batch_size = 16
optimizer = ppb.AdamW(model.parameters(), lr=2e-5)


for epoch in range(4):
    model.train()
    loss_epoch = 0
    metric_epoch = 0
    for i in range(0, N_train, batch_size):
        model.zero_grad() 
        x_batch = torch.tensor(train_data[i:i+batch_size]).to(device)
        att_mask_batch = torch.tensor(train_att_mask[i:i+batch_size]).to(device)
        labels_batch = torch.tensor(train_labels[i:i+batch_size]).to(device) 

        loss, logits = model(x_batch, attention_mask=att_mask_batch, 
                             labels=labels_batch).values()

        loss_epoch += loss.item()

        pred_labels = torch.argmax(logits, dim=1)
        pred_labels = pred_labels.detach().cpu().numpy()
        true_labels = train_labels[i:i+batch_size]
        metric_epoch += accuracy_score(pred_labels, true_labels)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    n_batches = N_train // batch_size
    print("Average train loss ", epoch, "epoch: ", loss_epoch / n_batches)
    print("Average train accuracy ", epoch, "epoch", metric_epoch / n_batches)          

    model.eval()
    loss_epoch = 0
    metric_epoch = 0
    for i in range(0, N_val, batch_size):
        x_batch = torch.tensor(val_data[i:i+batch_size]).to(device)
        att_mask_batch = torch.tensor(val_att_mask[i:i+batch_size]).to(device)
        labels_batch = torch.tensor(val_labels[i:i+batch_size]).to(device) 

        with torch.no_grad():
            loss, logits = model(x_batch, attention_mask=att_mask_batch, 
                                 labels=labels_batch).values()
            
            loss_epoch += loss.item()

            pred_labels = torch.argmax(logits, dim=1)
            pred_labels = pred_labels.detach().cpu().numpy()
            true_labels = val_labels[i:i+batch_size]
            metric_epoch += accuracy_score(pred_labels, true_labels)

    n_batches = N_val // batch_size
    print("Average val loss ", epoch, "epoch: ", loss_epoch / n_batches)
    print("Average  val accuracy ", epoch, "epoch", metric_epoch / n_batches)

Average train loss  0 epoch:  0.6912930186452537
Average train accuracy  0 epoch 0.5542385057471265
Average val loss  0 epoch:  0.6451064738734015
Average  val accuracy  0 epoch 0.7068965517241379
Average train loss  1 epoch:  0.5655121284312216
Average train accuracy  1 epoch 0.7218749999999999
Average val loss  1 epoch:  0.5858345427389803
Average  val accuracy  1 epoch 0.7521551724137931
Average train loss  2 epoch:  0.426902521979706
Average train accuracy  2 epoch 0.8248922413793104
Average val loss  2 epoch:  0.7371087765385365
Average  val accuracy  2 epoch 0.7629310344827587
Average train loss  3 epoch:  0.2767002041580091
Average train accuracy  3 epoch 0.915948275862069
Average val loss  3 epoch:  0.9309791931561355
Average  val accuracy  3 epoch 0.7887931034482759


In [48]:
model.eval()
N_test = test_data.shape[0]
n_batch = N_test // batch_size 
total_metric = 0

for i in range(0, N_test, batch_size):
      x_batch = torch.tensor(test_data[i:i+batch_size]).to(device)
      att_mask_batch = torch.tensor(test_att_mask[i:i+batch_size]).to(device)
      labels_batch = torch.tensor(test_labels[i:i+batch_size]).to(device) 

      with torch.no_grad():
          logits = model(x_batch, attention_mask=att_mask_batch).logits
          pred_labels = torch.argmax(logits, dim=1)
          pred_labels = pred_labels.detach().cpu().numpy()
          true_labels = test_labels[i:i+batch_size]
          total_metric += accuracy_score(pred_labels, true_labels)

print("Average metric value on the test set:", total_metric / n_batch)

Average metric value on the test set: 0.7955729166666666


Average metric values for train, validation and test have improved!