# Assignment 4

## Task 4.1:
Finetune a RoBERTa base model for sentiment analysis task on the following dataset: https://huggingface.co/datasets/google-research-datasets/poem_sentiment

The **Poem Sentiment** is a sentiment dataset of poem verses from Project Gutenberg. The language of the dataset is English and the key details are as follows:

The data fileds are as follows
   * id: index of the example
   * verse_text: The text of the poem verse
   * label: The sentiment label. Here
     - 0 = negative
     - 1 = positive
     - 2 = no impact
     - 3 = mixed (both negative and positive)

Report the model performance on test set.

In [11]:
%pip install -q torch transformers datasets scikit-learn ipywidgets

In [12]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import torch  # required for the Trainer class

# Load the dataset
dataset = load_dataset("google-research-datasets/poem_sentiment")

# Load the tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=4)

# Tokenize the dataset
def preprocess_data(examples):
    return tokenizer(examples['verse_text'], truncation=True, padding='max_length', max_length=128)

tokenized_dataset = dataset.map(preprocess_data, batched=True)
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Split the dataset
train_dataset = tokenized_dataset['train']
test_dataset = tokenized_dataset['test']

# Define evaluation metrics with `zero_division`
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted', zero_division=0)
    precision = precision_score(labels, preds, average='weighted', zero_division=0)
    recall = recall_score(labels, preds, average='weighted', zero_division=0)
    return {"accuracy": accuracy, "f1": f1, "precision": precision, "recall": recall}

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none"
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate the model on the test set
results = trainer.evaluate(test_dataset)
print(results)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.0051,0.790635,0.663462,0.529235,0.440181,0.663462
2,0.6669,0.570061,0.826923,0.817347,0.828307,0.826923
3,0.5609,0.512768,0.836538,0.839306,0.847091,0.836538


{'eval_loss': 0.5127676129341125, 'eval_accuracy': 0.8365384615384616, 'eval_f1': 0.8393060162285118, 'eval_precision': 0.8470907297830375, 'eval_recall': 0.8365384615384616, 'eval_runtime': 0.7236, 'eval_samples_per_second': 143.728, 'eval_steps_per_second': 9.674, 'epoch': 3.0}


## Task 4.2:
Original test set contains only three classes, excluding the mixed class. To address this, merge all datasets, shuffle them, and perform a new balanced split of classes. Train a new model and provide a performance report.

In [13]:
from datasets import concatenate_datasets, DatasetDict
from collections import Counter
import numpy as np

# Combine the datasets using concatenate_datasets
full_dataset = concatenate_datasets([
    tokenized_dataset['train'].shuffle(seed=42),
    tokenized_dataset['validation'],
    tokenized_dataset['test']
])

# Count the distribution of labels
label_counts = Counter(full_dataset['labels'])
print(f"Label distribution before resampling: {label_counts}")

# Remove the mixed class (label 3) and retain classes 0, 1, 2
filtered_dataset = full_dataset.filter(lambda x: x['labels'] != 3)

# Balance the classes by finding the smallest class and resampling
smallest_class_count = min(Counter(filtered_dataset['labels']).values())
balanced_samples = []

for label in [0, 1, 2]:
    class_samples = [example for example in filtered_dataset if example['labels'] == label]
    balanced_samples.extend(
        np.random.choice(class_samples, size=smallest_class_count, replace=False).tolist()
    )

# Shuffle the dataset
np.random.shuffle(balanced_samples)

# Split the balanced dataset into train and test sets
split_index = int(0.8 * len(balanced_samples))
balanced_dataset = DatasetDict({
    'train': balanced_samples[:split_index],
    'test': balanced_samples[split_index:]
})

# Train and evaluate a new model on the balanced dataset
training_args_balanced = TrainingArguments(
    output_dir="./results_balanced",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs_balanced",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none"
)

trainer_balanced = Trainer(
    model=model,
    args=training_args_balanced,
    train_dataset=balanced_dataset['train'],
    eval_dataset=balanced_dataset['test'],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model on the balanced dataset
trainer_balanced.train()

# Evaluate the model on the balanced test set
results_balanced = trainer_balanced.evaluate(balanced_dataset['test'])
print("Performance on balanced test set:")
print(results_balanced)

Label distribution before resampling: Counter({2: 693, 0: 193, 1: 166, 3: 49})


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4272,0.352124,0.88,0.880749,0.892519,0.88
2,0.2376,0.533352,0.83,0.825591,0.830055,0.83
3,0.1315,0.444869,0.88,0.879708,0.885,0.88


Performance on balanced test set:
{'eval_loss': 0.3521239161491394, 'eval_accuracy': 0.88, 'eval_f1': 0.880748717948718, 'eval_precision': 0.8925186314363143, 'eval_recall': 0.88, 'eval_runtime': 0.7328, 'eval_samples_per_second': 136.467, 'eval_steps_per_second': 9.553, 'epoch': 3.0}
