In [8]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader

from transformers import Trainer, TrainingArguments
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AutoTokenizer

from sklearn.metrics import f1_score, accuracy_score

# RoBERTa classifier

---

In [16]:
label_to_id = {"entailment": 0, "neutral": 1, "contradiction": 2}
id_to_label = {v: k for k, v in label_to_id.items()}

In [26]:
def renameColumns(df):
    return df.rename(columns={'Sentence1': 'premise', 'Sentence2': 'hypothesis', 'Explanation_1': 'explanation'})

def filterNan(df):
    return df.dropna()

def convert_to_tensors(df):
    return torch.tensor(df.values)

def encode_labels(df):
    return df.apply(lambda x: label_to_id[x])


In [27]:
class eSNLIDataset(Dataset):
    def __init__(self, df, train=True):
        self.df = df
        self.train = train

        self.premise_template = '"{}"'
        self.hypothesis_explanation_template = '"{}" because {}'

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        example = self.df.iloc[idx,:]
        premise = example["premise"]
        hypothesis = example["hypothesis"]
        explanation = example["explanation"]

        premise = self.premise_template.format(premise)
        hypothesis = self.hypothesis_explanation_template.format(hypothesis, explanation)

        if self.train:
            label = example["gold_label"]
            return premise, hypothesis, label
        
        return premise, hypothesis

In [17]:
def calc_f1_score(predicted_classes, actual_labels):
    return f1_score(predicted_classes, actual_labels, average='weighted'), f1_score(predicted_classes, actual_labels, average='micro'), f1_score(predicted_classes, actual_labels, average='macro')

## Train using Trainer

In [11]:
train_batch_size = 16

In [28]:
df_train1 = pd.read_csv('../data/esnli_train_1.csv')
df_train2 = pd.read_csv('../data/esnli_train_2.csv')
df_train = pd.concat((df_train1, df_train2), axis=0)
df_dev = pd.read_csv('../data/esnli_dev.csv')
df_test = pd.read_csv('../data/esnli_test.csv')

In [41]:
df_train_renamed = renameColumns(df_train)
df_train_cleaned = filterNan(df_train_renamed)
df_train_cleaned.loc[:, "gold_label"] = encode_labels(df_train_cleaned["gold_label"])
df_dev_renamed = renameColumns(df_dev)
df_dev_cleaned = filterNan(df_dev_renamed)
df_dev_cleaned.loc[:, "gold_label"] = encode_labels(df_dev_cleaned["gold_label"])
df_test_renamed = renameColumns(df_test)

In [42]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3) # <- removed problem_type = multi_label_classification because it is multi-class, not multi-label

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
# for mac
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# for nvidia GPUs
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [44]:
training_args = TrainingArguments(
    output_dir='../results',          
    num_train_epochs=10,             
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,   
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs',            
    logging_steps=10,                
    evaluation_strategy='steps',     
    eval_steps=50,                   
    save_strategy='epoch',           
    save_steps=100,
)

In [46]:
train_dataset = eSNLIDataset(df_train_cleaned)
dev_dataset = eSNLIDataset(df_dev_cleaned, train=False)
test_dataset = eSNLIDataset(df_test, train=False)

In [47]:
train_dataloader = DataLoader(train_dataset, batch_size=training_args.train_batch_size)
dev_dataloader = DataLoader(dev_dataset, batch_size=training_args.train_batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=training_args.train_batch_size)

In [52]:
trainer = Trainer(
    model=model,                   
    args=training_args,            
    train_dataset=train_dataset,
    eval_dataset=dev_dataset
)

# Train
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  0%|          | 0/343270 [01:10<?, ?it/s]
  0%|          | 0/343270 [00:36<?, ?it/s]
  0%|          | 0/343270 [00:00<?, ?it/s]

TypeError: vars() argument must have __dict__ attribute

In [37]:
# need to wrap in a dictionary to use the Trainer class
class DictDataset(Dataset):
    def __init__(self, tensor_dataset):
        self.tensor_dataset = tensor_dataset

    def __len__(self):
        return len(self.tensor_dataset)

    def __getitem__(self, idx):
        input_ids, attention_mask, labels = self.tensor_dataset[idx]
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

input_ids = [x['input_ids'].squeeze(0) for x in tokenized_input]
input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True)
attention_masks = [x['attention_mask'].squeeze(0) for x in tokenized_input]
attention_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True)
labels = torch.tensor(actual_labels)

# One-hot encode the target labels
num_classes = 3
labels_one_hot = torch.nn.functional.one_hot(labels, num_classes=num_classes).float()

#split the dataset into training and validation
split = int(len(input_ids) * 0.8)
input_ids_train = input_ids[:split]
attention_masks_train = attention_masks[:split]
labels_one_hot_train = labels_one_hot[:split]

input_ids_val = input_ids[split:]
attention_masks_val = attention_masks[split:]
labels_one_hot_val = labels_one_hot[split:]

tensor_dataset = TensorDataset(input_ids_train, attention_masks_train, labels_one_hot_train)

dataset = DictDataset(tensor_dataset)

validation_tensor_dataset = TensorDataset(input_ids_val, attention_masks_val, labels_one_hot_val)
validation_dataset = DictDataset(validation_tensor_dataset)

training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=10,             
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=64,   
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs',            
    logging_steps=10,                
    evaluation_strategy='steps',     
    eval_steps=50,                   
    save_strategy='epoch',           
    save_steps=100,
)

# Initialize the Trainer with the wrapped dataset
trainer = Trainer(
    model=model,                   
    args=training_args,            
    train_dataset=dataset,
    eval_dataset=validation_dataset
)

# Train
trainer.train()


  labels = torch.tensor(actual_labels)


  0%|          | 0/500 [00:00<?, ?it/s]

In [29]:
# predict after training

model.to(device)
model.eval()
predictions = []

with torch.no_grad():
    for batch in validation_dataset:
        batch_input_ids = batch['input_ids'].unsqueeze(0).to(device)
        batch_attention_mask = batch['attention_mask'].unsqueeze(0).to(device)
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
        logits = outputs.logits.cpu()

        predicted_classes = torch.argmax(logits, dim=1)
        predictions.extend(predicted_classes)

predictions = torch.stack(predictions)


print(calc_f1_score(predictions, actual_labels[split:]))

(0.4615384615384615, 0.3, 0.15384615384615383)


### To delete ??

In [49]:
encoded_input = tokenizer.encode_plus(premise, hypothesis, padding=True, truncation=True, return_tensors='pt')

labels = torch.tensor(df_cleaned['gold_label'].replace(label_to_id).tolist())[0]
output = model(**encoded_input)

predicted_class = torch.argmax(output.logits, dim=1)
print(f"Predicted class: {id_to_label[predicted_class.item()]}")

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [107]:
# test the model
def renameColumnsTest(df):
    return df.rename(columns={'Sentence1': 'premise', 'Sentence2': 'hypothesis', 'Explanation_1': 'explanation'}).drop(["Sentence1_Highlighted_1", "Sentence2_Highlighted_1"], axis=1)

test_data = '../data/esnli_test.csv'
df_test = pd.read_csv(test_data)
df_test_cleaned = renameColumnsTest(df_test)
tokenized_input_test = tokenize(df_test_cleaned)[0]
output = model(**tokenized_input_test)
predicted_class = torch.argmax(output.logits, dim=1)

print(f"Premise: {premise}\nHypothesis: {hypothesis}\nExplanation: {explanation}\n")

print(f"True class: {actual_label}")
print(f"Predicted class: {id_to_label[predicted_class.item()]}")




Premise: A person on a horse jumps over a broken down airplane.
Hypothesis: A person is training his horse for a competition.
Explanation: the person is not necessarily training his horse

True class: 1
Predicted class: entailment
