In [1]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss

from transformers import get_scheduler
from transformers import Trainer, TrainingArguments
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AutoTokenizer


from sklearn.metrics import f1_score, accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


# RoBERTa classifier

---

In [2]:
label_to_id = {"entailment": 0, "neutral": 1, "contradiction": 2}
id_to_label = {v: k for k, v in label_to_id.items()}

In [3]:
def renameColumns(df):
    return df.rename(columns={'Sentence1': 'premise', 'Sentence2': 'hypothesis', 'Explanation_1': 'explanation'})

def filterNan(df):
    return df.dropna()

def convert_to_tensors(df):
    return torch.tensor(df.values)

def encode_labels(df):
    return df.apply(lambda x: label_to_id[x])


In [4]:
class eSNLIDataset(Dataset):
    def __init__(self, df, tokenizer, train=True):
        self.df = df
        self.train = train
        self.tokenizer = tokenizer
        self.premise_template = '"{}"'
        self.hypothesis_explanation_template = '"{}" because {}'

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        example = self.df.iloc[idx,:]
        premise = example["premise"]
        hypothesis = example["hypothesis"]
        explanation = example["explanation"]

        premise = self.premise_template.format(premise)
        hypothesis = self.hypothesis_explanation_template.format(hypothesis, explanation)

        if self.train:
            label = example["gold_label"]
            return premise, hypothesis, label
        
        return premise, hypothesis

In [5]:
def calc_f1_score(predicted_classes, actual_labels):
    return f1_score(predicted_classes, actual_labels, average='weighted'), f1_score(predicted_classes, actual_labels, average='micro'), f1_score(predicted_classes, actual_labels, average='macro')

## Train using Trainer

In [6]:
df_train1 = pd.read_csv('../data/esnli_train_1.csv')
df_train2 = pd.read_csv('../data/esnli_train_2.csv')
df_train = pd.concat((df_train1, df_train2), axis=0)
df_dev = pd.read_csv('../data/esnli_dev.csv')
df_test = pd.read_csv('../data/esnli_test.csv')

In [7]:
df_train_renamed = renameColumns(df_train)
df_train_cleaned = filterNan(df_train_renamed)
df_train_cleaned.loc[:, "gold_label"] = encode_labels(df_train_cleaned["gold_label"])
df_dev_renamed = renameColumns(df_dev)
df_dev_cleaned = filterNan(df_dev_renamed)
df_dev_cleaned.loc[:, "gold_label"] = encode_labels(df_dev_cleaned["gold_label"])
df_test_renamed = renameColumns(df_test)

In [8]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3) # <- removed problem_type = multi_label_classification because it is multi-class, not multi-label

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# for mac
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# for nvidia GPUs
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [10]:
train_batch_size = 16
num_epochs = 1

In [11]:
train_dataset = eSNLIDataset(df_train_cleaned, tokenizer)
dev_dataset = eSNLIDataset(df_dev_cleaned, tokenizer, train=False)
test_dataset = eSNLIDataset(df_test, tokenizer, train=False)

In [12]:
train_dataloader = DataLoader(train_dataset, batch_size=train_batch_size)
dev_dataloader = DataLoader(dev_dataset, batch_size=train_batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=train_batch_size)

In [13]:
optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = num_epochs * len(train_dataloader) # <- number of batches
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)
criterion = CrossEntropyLoss()

In [14]:
for name, param in model.named_parameters():
    if 'classifier' not in name: # classifier layer
        param.requires_grad = False

In [15]:
for name, param in model.named_parameters():
    print("name:", name, "requires_grad:", param.requires_grad)

name: roberta.embeddings.word_embeddings.weight requires_grad: False
name: roberta.embeddings.position_embeddings.weight requires_grad: False
name: roberta.embeddings.token_type_embeddings.weight requires_grad: False
name: roberta.embeddings.LayerNorm.weight requires_grad: False
name: roberta.embeddings.LayerNorm.bias requires_grad: False
name: roberta.encoder.layer.0.attention.self.query.weight requires_grad: False
name: roberta.encoder.layer.0.attention.self.query.bias requires_grad: False
name: roberta.encoder.layer.0.attention.self.key.weight requires_grad: False
name: roberta.encoder.layer.0.attention.self.key.bias requires_grad: False
name: roberta.encoder.layer.0.attention.self.value.weight requires_grad: False
name: roberta.encoder.layer.0.attention.self.value.bias requires_grad: False
name: roberta.encoder.layer.0.attention.output.dense.weight requires_grad: False
name: roberta.encoder.layer.0.attention.output.dense.bias requires_grad: False
name: roberta.encoder.layer.0.atten

In [16]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

y_true = []

model.to(device)
model.train()
for epoch in range(num_epochs):
    torch.cuda.empty_cache()
    for batch in train_dataloader:
        premise, hypothesis, labels = batch
        y_true.extend(labels)

        encoded_input = tokenizer(premise, hypothesis, return_tensors='pt', padding=True, truncation=True).to(device)
        outputs = model(**encoded_input)
        logits = outputs.logits.to(device)

        loss = criterion(logits, labels.to(device))
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/34327 [00:00<?, ?it/s]

 16%|█▌        | 5548/34327 [12:32<45:49, 10.47it/s]   

In [None]:
# predict after training

model.to(device)
model.eval()
predictions = []

with torch.no_grad():
    for batch in validation_dataset:
        batch_input_ids = batch['input_ids'].unsqueeze(0).to(device)
        batch_attention_mask = batch['attention_mask'].unsqueeze(0).to(device)
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
        logits = outputs.logits.cpu()

        predicted_classes = torch.argmax(logits, dim=1)
        predictions.extend(predicted_classes)

predictions = torch.stack(predictions)


print(calc_f1_score(predictions, actual_labels[split:]))

(0.4615384615384615, 0.3, 0.15384615384615383)
