In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler

In [None]:
class EmotionDataset(Dataset):
    """Dataset that tokenizes one text per item and, outside test mode, attaches its label vector.

    Args:
        dataframe: pandas DataFrame with a ``'text'`` column and, unless
            ``is_test`` is true, one column per entry of ``label_columns``.
        tokenizer: callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')`` that
            returns a mapping of tensors.
        max_length: length every text is padded/truncated to.
        is_test: when true, no label columns are read and items carry no
            ``'labels'`` entry.
        label_columns: names of the label columns, in output order. Defaults
            to ``DEFAULT_LABEL_COLUMNS``. Ignored when ``is_test`` is true.

    Raises:
        KeyError: if a requested label column is absent from ``dataframe``.
    """

    # Label columns read when the caller does not pass ``label_columns``.
    DEFAULT_LABEL_COLUMNS = ('anger', 'fear', 'joy', 'sadness', 'surprise')

    def __init__(self, dataframe, tokenizer, max_length=128, is_test=False, label_columns=None):
        # Empty CSV cells arrive from pandas as float NaN, which is not text.
        # Normalise them to '' so every value handed to the tokenizer is a str.
        self.texts = dataframe['text'].fillna('').astype(str).to_numpy()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_test = is_test
        if label_columns is None:
            label_columns = self.DEFAULT_LABEL_COLUMNS
        self.label_columns = list(label_columns)

        if not is_test:
            # Fail at construction with the offending names rather than at the
            # first batch deep inside a DataLoader worker.
            missing = [col for col in self.label_columns if col not in dataframe.columns]
            if missing:
                raise KeyError(f"Label column(s) missing from dataframe: {missing}")
            self.labels = dataframe[self.label_columns].values
        else:
            self.labels = None

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer output for item ``idx``, plus ``'labels'`` outside test mode.

        The tokenizer output is passed through unchanged.
        NOTE(review): with ``return_tensors='pt'`` each tensor presumably keeps
        a leading dimension of 1 -- the loops in this notebook call
        ``.squeeze(1)`` after batching to drop it; confirm before changing.
        """
        text = self.texts[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        if self.is_test:
            return encoding

        # float32 labels, one value per label column
        labels = torch.tensor(self.labels[idx], dtype=torch.float32)
        return {**encoding, 'labels': labels}

In [None]:
def prepare_data(train_path, val_path, test_path, tokenizer, max_length=128):
    """Read the three CSV splits and wrap each one in an ``EmotionDataset``.

    Args:
        train_path, val_path, test_path: CSV paths passed to ``pd.read_csv``.
        tokenizer: tokenizer forwarded to every dataset.
        max_length: padding/truncation length forwarded to every dataset.

    Returns:
        ``(train_dataset, val_dataset, test_dataset)``. The train and
        validation datasets are built with ``is_test=False``; the test dataset
        is built with ``is_test=True``.
    """
    # All three files are read before any dataset is constructed.
    train_df, val_df, test_df = (
        pd.read_csv(csv_path) for csv_path in (train_path, val_path, test_path)
    )

    return (
        EmotionDataset(train_df, tokenizer, max_length, is_test=False),
        EmotionDataset(val_df, tokenizer, max_length, is_test=False),
        EmotionDataset(test_df, tokenizer, max_length, is_test=True),
    )

In [None]:
# Seed torch's RNGs before anything random happens. The classification head
# below is newly initialised and the training DataLoader shuffles, so without
# a fixed seed two runs of this notebook give different results.
SEED = 42
torch.manual_seed(SEED)

# File paths
train_path = 'track_b_data/train/eng.csv'
val_path = 'track_b_data/dev/eng.csv'
test_path = 'track_b_data/test/eng.csv'

# Load tokenizer and model
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=5,  # For five emotions
    problem_type="regression"  # one real-valued output per emotion
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Prepare datasets
train_dataset, val_dataset, test_dataset = prepare_data(train_path, val_path, test_path, tokenizer)

# DataLoaders (only the training split is shuffled)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

# Move the model to the target device. Assigning the result keeps the cell
# from echoing the full module repr as its output.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Number of epochs the linear schedule is sized for. The training cell also
# assigns `epochs`; keep the two values equal so the learning rate reaches
# zero exactly at the end of training.
epochs = 10

# Optimizer and learning rate scheduler.
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are spelled out to match the defaults of the deprecated class,
# so the optimisation itself is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
total_steps = len(train_loader) * epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=total_steps)



RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
# Fine-tune for a fixed number of epochs. After every epoch the model is
# scored on the validation split and the weights with the lowest validation
# loss so far are written to best_model.pt, so the checkpoint is the best
# epoch rather than simply the last one.
epochs = 10
best_val_loss = float("inf")

for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0
    for batch in train_loader:
        # squeeze(1) drops the singleton dimension left by per-item tokenization
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].squeeze(1).to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        lr_scheduler.step()  # the schedule advances once per batch

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Training Loss: {avg_loss}")

    # ---- validation pass ----
    model.eval()
    epoch_val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].squeeze(1).to(device)
            attention_mask = batch['attention_mask'].squeeze(1).to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            epoch_val_loss += outputs.loss.item()

    epoch_val_loss /= len(val_loader)
    print(f"Epoch {epoch+1}, Validation Loss: {epoch_val_loss}")

    # Keep the best-scoring weights on disk.
    if epoch_val_loss < best_val_loss:
        best_val_loss = epoch_val_loss
        torch.save(model.state_dict(), "best_model.pt")
        print("Model saved.")

Epoch 1, Training Loss: 0.44692074330900444
Epoch 2, Training Loss: 0.2725960736329845
Epoch 3, Training Loss: 0.18321043769748224
Epoch 4, Training Loss: 0.13091420804317286
Epoch 5, Training Loss: 0.0998542246575645
Epoch 6, Training Loss: 0.08029991483068191
Epoch 7, Training Loss: 0.06608277174907957
Epoch 8, Training Loss: 0.05372090616150398
Epoch 9, Training Loss: 0.04635146642480627
Epoch 10, Training Loss: 0.04136635495532799


In [None]:
# Evaluate the current model on the validation split: mean batch loss plus a
# tolerance-based "accuracy" (share of predicted values within `threshold` of
# their label).
threshold = 0.5  # maximum absolute error still counted as correct
val_loss = 0
total_correct = 0
total_samples = 0

model.eval()
with torch.no_grad():
    for batch in val_loader:
        labels = batch['labels'].to(device)
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].squeeze(1).to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        val_loss += outputs.loss.item()

        preds = outputs.logits
        correct = (preds - labels).abs() <= threshold  # element-wise hit mask
        total_correct += correct.sum().item()
        total_samples += labels.numel()

avg_val_loss = val_loss / len(val_loader)
print(f"Epoch {epoch+1}, Validation Loss: {avg_val_loss}")  # `epoch` is left over from the training loop
accuracy = (total_correct / total_samples) * 100
print(f"Validation Accuracy: {accuracy:.2f}%")

# Checkpoint (not early stopping): write the weights only when this loss beats
# the best one recorded so far.
if avg_val_loss < best_val_loss:
    best_val_loss = avg_val_loss
    torch.save(model.state_dict(), "best_model.pt")
    print("Model saved.")

Epoch 10, Validation Loss: 0.2978969607502222
Validation Accuracy: 74.14%
Model saved.


In [None]:
# Load the best model.
# map_location puts the tensors on the device in use now, so a checkpoint
# written on a GPU still loads on a CPU-only machine. weights_only=True limits
# unpickling to tensors and plain containers, which is all a state_dict holds.
state_dict = torch.load("best_model.pt", map_location=device, weights_only=True)
model.load_state_dict(state_dict)

# Testing: collect one row of predicted emotion intensities per test text
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        # squeeze(1) drops the singleton dimension left by per-item tokenization
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].squeeze(1).to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        preds = outputs.logits.cpu().numpy()  # Predicted emotion intensities
        predictions.extend(preds)  # appends one array per row of the batch

  model.load_state_dict(torch.load("best_model.pt"))


In [None]:
# Write the collected test predictions to CSV, one column per emotion and no
# index column.
emotion_columns = ['anger', 'fear', 'joy', 'sadness', 'surprise']
predictions_df = pd.DataFrame(predictions, columns=emotion_columns)
predictions_df.to_csv('test_predictions_james.csv', index=False)
print("Predictions saved to test_predictions_james.csv")

Predictions saved to test_predictions_james.csv
