In [4]:
import pandas as pd
import numpy as np

import torch

from transformers import RobertaTokenizer, RobertaForSequenceClassification, AutoTokenizer

In [5]:
df = pd.read_csv('../data/esnli_train_1.csv') #rmb to train on whole dataset
df.head()

Unnamed: 0,pairID,gold_label,Sentence1,Sentence2,Explanation_1,WorkerId,Sentence1_marked_1,Sentence2_marked_1,Sentence1_Highlighted_1,Sentence2_Highlighted_1
0,3416050480.jpg#4r1n,neutral,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,the person is not necessarily training his horse,AF0PI3RISB5Q7,A person on a horse jumps over a broken down a...,A person is *training* *his* *horse* for a co...,{},345
1,3416050480.jpg#4r1c,contradiction,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",One cannot be on a jumping horse cannot be a d...,A36ZT2WFIA2HMF,A person *on* *a* *horse* *jumps* over a brok...,"A person *is* *at* *a* *diner,* *ordering* an...",4235,25436
2,3416050480.jpg#4r1e,entailment,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",a broken down airplane is outdoors,A2GK75ZQTX2RDZ,A person on a horse jumps over *a* *broken* *...,"A person is *outdoors,* on a horse.",89107,3
3,2267923837.jpg#2r1n,neutral,Children smiling and waving at camera,They are smiling at their parents,Just because they are smiling and waving at a ...,A18TOIDG32QICP,Children smiling and waving at camera,They are smiling *at* *their* *parents*,{},534
4,2267923837.jpg#2r1e,entailment,Children smiling and waving at camera,There are children present,The children must be present to see them smili...,AEX0YE6TUZRHT,*Children* *smiling* *and* *waving* at camera,There are children *present*,0132,3


In [7]:
def renameColumnsTrain(df):
    return df.rename(columns={'Sentence1': 'premise', 'Sentence2': 'hypothesis', 'Explanation_1': 'explanation'}).drop(["WorkerId", "Sentence1_Highlighted_1", "Sentence2_Highlighted_1"], axis=1)

df_cleaned = renameColumnsTrain(df)

In [8]:
label_to_id = {"entailment": 0, "neutral": 1, "contradiction": 2}
id_to_label = {v: k for k, v in label_to_id.items()}

## Test

In [5]:
# cell for testing the model's output for a single example
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base')

# Tokenize input
premise = df_cleaned['premise'][0]
hypothesis = df_cleaned['hypothesis'][0]
explanation = df_cleaned['explanation'][0]
actual_label = df_cleaned['gold_label'][0]
encoded_input = tokenizer.encode_plus(premise, hypothesis, explanation, padding=True, truncation=True, return_tensors='pt')

labels = torch.tensor(df_cleaned['gold_label'].replace(label_to_id).tolist())[0]
print(encoded_input)
output = model(**encoded_input)

predicted_class = torch.argmax(output.logits, dim=1)

print(f"Premise: {premise}\nHypothesis: {hypothesis}\nExplanation: {explanation}\n")
print(f"True class: {actual_label}")
print(f"Predicted class: {id_to_label[predicted_class.item()]}")

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should pr

{'input_ids': tensor([[    0,   250,   621,    15,    10,  5253, 13855,    81,    10,  3187,
           159, 16847,     4,     2,     2,   250,   621,    16,  1058,    39,
          5253,    13,    10,  1465,     4,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1]])}
Premise: A person on a horse jumps over a broken down airplane.
Hypothesis: A person is training his horse for a competition.
Explanation: the person is not necessarily training his horse

True class: neutral
Predicted class: entailment


Concatenate everything together

# RoBERTa classifier

---

In [9]:
def filterNan(df):
    return df.dropna()

# def tokenize(df):
#     return df.apply(lambda x: tokenizer.encode_plus(x['premise'], x['hypothesis'], x['explanation'], padding='max_length', return_tensors='pt'), axis=1)

def convert_to_tensors(df):
    return torch.tensor(df.values)

def encode_labels(df):
    return df.apply(lambda x: label_to_id[x])

template = """
Premise: {}
Hypothesis: {}
Explanation: {}
"""

def tokenize(df):
    tokenized_batch = []
    for _, row in df.iterrows():
        encoded_dict = tokenizer.encode_plus(
            text = template.format("<s>" + row['premise'] + "</s>", "<s>" + row['hypothesis'] + "</s>", "<s>" + row['explanation'] + "</s>"),
            # row['premise'], # two ways to encode
            # row['hypothesis'], 
            # row['explanation'],
            padding=True,
            return_tensors='pt',
            # truncation=True
        )
        tokenized_batch.append(encoded_dict)
    return tokenized_batch

[RoBERTA huggingface](https://huggingface.co/FacebookAI/roberta-base#:~:text=RoBERTa%20is%20a%20transformers%20model%20pretrained%20on%20a,to%20generate%20inputs%20and%20labels%20from%20those%20texts)

In [10]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3) # <- removed problem_type = multi_label_classification because it is multi-class, not multi-label

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
# for mac
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# for nvidia GPUs
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [30]:
df_cleaned = renameColumnsTrain(df)
df_cleaned = df_cleaned[:1000]
df_cleaned = filterNan(df_cleaned)
df_cleaned['gold_label'] = encode_labels(df_cleaned['gold_label'])
tokenized_input = tokenize(df_cleaned)
actual_labels = convert_to_tensors(df_cleaned['gold_label'])

## Predict without training

In [35]:
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

# print(tokenized_input)
input_ids = [x['input_ids'].squeeze(0) for x in tokenized_input]
input_ids = pad_sequence(input_ids, batch_first=True)
attention_masks = [x['attention_mask'].squeeze(0) for x in tokenized_input]
attention_masks = pad_sequence(attention_masks, batch_first=True)

labels = actual_labels

dataset = TensorDataset(input_ids, attention_masks, labels)
loader = DataLoader(dataset, batch_size=16)

model.to(device)
model.eval()
predictions = []

with torch.no_grad():
    for batch in loader:
        batch_input_ids, batch_attention_mask, batch_labels = batch

        batch_input_ids = batch_input_ids.to(device)
        batch_attention_mask = batch_attention_mask.to(device)
        
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
        logits = outputs.logits.cpu()

        predicted_classes = torch.argmax(logits, dim=1)
        predictions.extend(predicted_classes)

predictions = torch.stack(predictions)

In [36]:
from sklearn.metrics import f1_score

def calc_f1_score(predicted_classes, actual_labels):
    return f1_score(predicted_classes, actual_labels, average='weighted'), f1_score(predicted_classes, actual_labels, average='micro'), f1_score(predicted_classes, actual_labels, average='macro')

print(calc_f1_score(predictions, actual_labels))

(0.8940854099251938, 0.893, 0.8918859739567128)


Training Loop using Trainer

In [37]:
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset, TensorDataset
import torch

# need to wrap in a dictionary to use the Trainer class
class DictDataset(Dataset):
    def __init__(self, tensor_dataset):
        self.tensor_dataset = tensor_dataset

    def __len__(self):
        return len(self.tensor_dataset)

    def __getitem__(self, idx):
        input_ids, attention_mask, labels = self.tensor_dataset[idx]
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

input_ids = [x['input_ids'].squeeze(0) for x in tokenized_input]
input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True)
attention_masks = [x['attention_mask'].squeeze(0) for x in tokenized_input]
attention_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True)
labels = torch.tensor(actual_labels)

# One-hot encode the target labels
num_classes = 3
labels_one_hot = torch.nn.functional.one_hot(labels, num_classes=num_classes).float()

#split the dataset into training and validation
split = int(len(input_ids) * 0.8)
input_ids_train = input_ids[:split]
attention_masks_train = attention_masks[:split]
labels_one_hot_train = labels_one_hot[:split]

input_ids_val = input_ids[split:]
attention_masks_val = attention_masks[split:]
labels_one_hot_val = labels_one_hot[split:]

tensor_dataset = TensorDataset(input_ids_train, attention_masks_train, labels_one_hot_train)

dataset = DictDataset(tensor_dataset)

validation_tensor_dataset = TensorDataset(input_ids_val, attention_masks_val, labels_one_hot_val)
validation_dataset = DictDataset(validation_tensor_dataset)

training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=10,             
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=64,   
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs',            
    logging_steps=10,                
    evaluation_strategy='steps',     
    eval_steps=50,                   
    save_strategy='epoch',           
    save_steps=100,
)

# Initialize the Trainer with the wrapped dataset
trainer = Trainer(
    model=model,                   
    args=training_args,            
    train_dataset=dataset,
    eval_dataset=validation_dataset
)

# Train
trainer.train()


  labels = torch.tensor(actual_labels)


  0%|          | 0/500 [00:00<?, ?it/s]

In [29]:
# predict after training

model.to(device)
model.eval()
predictions = []

with torch.no_grad():
    for batch in validation_dataset:
        batch_input_ids = batch['input_ids'].unsqueeze(0).to(device)
        batch_attention_mask = batch['attention_mask'].unsqueeze(0).to(device)
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
        logits = outputs.logits.cpu()

        predicted_classes = torch.argmax(logits, dim=1)
        predictions.extend(predicted_classes)

predictions = torch.stack(predictions)


print(calc_f1_score(predictions, actual_labels[split:]))

(0.4615384615384615, 0.3, 0.15384615384615383)


### To delete ??

In [49]:
encoded_input = tokenizer.encode_plus(premise, hypothesis, padding=True, truncation=True, return_tensors='pt')

labels = torch.tensor(df_cleaned['gold_label'].replace(label_to_id).tolist())[0]
output = model(**encoded_input)

predicted_class = torch.argmax(output.logits, dim=1)
print(f"Predicted class: {id_to_label[predicted_class.item()]}")

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [107]:
# test the model
def renameColumnsTest(df):
    return df.rename(columns={'Sentence1': 'premise', 'Sentence2': 'hypothesis', 'Explanation_1': 'explanation'}).drop(["Sentence1_Highlighted_1", "Sentence2_Highlighted_1"], axis=1)

test_data = '../data/esnli_test.csv'
df_test = pd.read_csv(test_data)
df_test_cleaned = renameColumnsTest(df_test)
tokenized_input_test = tokenize(df_test_cleaned)[0]
output = model(**tokenized_input_test)
predicted_class = torch.argmax(output.logits, dim=1)

print(f"Premise: {premise}\nHypothesis: {hypothesis}\nExplanation: {explanation}\n")

print(f"True class: {actual_label}")
print(f"Predicted class: {id_to_label[predicted_class.item()]}")




Premise: A person on a horse jumps over a broken down airplane.
Hypothesis: A person is training his horse for a competition.
Explanation: the person is not necessarily training his horse

True class: 1
Predicted class: entailment


## Finetune model classification head

## Fewshot training PET framework 

In [6]:
pattern = "{} [mask] {} because {}"