# Fine-tuning RoBERTa

In [None]:
import glob
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.metrics import accuracy_score
from tqdm.auto import tqdm
from torch.utils.data import DataLoader, TensorDataset
from transformers import get_scheduler, RobertaModel, RobertaTokenizer

In [None]:
# Pick the compute device in order of preference: CUDA GPU, then Apple MPS,
# then CPU. The MPS check only runs when CUDA is unavailable.
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

In [None]:
# Create the embedding cache and prediction folders up front; existing
# folders are left untouched.
for folder in ("data/embeddings/", "output/preds/"):
    os.makedirs(folder, exist_ok=True)

# Preprocessing
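To improve efficiency, run a single forward pass through the base model and store the resulting embeddings on disk, so that the classification heads can be trained on them without re-running the base model.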

In [None]:
# Samples per batch, shared by embedding extraction and the classifier dataloaders.
batch_size = 250

class DataPreprocessor:
    """Convert a table of reviews into a dataloader of precomputed embeddings.

    The base model is moved to the module-level ``device`` and put in eval
    mode; it is only ever used for inference inside ``torch.no_grad()``.

    Args:
        tokenizer: Callable tokenizer applied to each batch of texts.
        base_model: Encoder whose output exposes ``last_hidden_state``.
        label2id: Mapping from sentiment label to integer class id.
        batch_size: Batch size for both extraction and the returned dataloader.
    """

    def __init__(self, tokenizer, base_model, label2id, batch_size):
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.batch_size = batch_size
        self.base_model = base_model.to(device)
        self.base_model.eval()

    def extract_embeddings(self, texts):
        """Encode ``texts`` batch by batch and return one embedding per text.

        Each batch is padded to its longest sequence and truncated by the
        tokenizer. The vector at sequence position 0 (the CLS token
        representation) is kept for every text.

        Returns:
            A CPU tensor with all batch results concatenated along dim 0,
            in the same order as ``texts``.
        """
        text_batches = DataLoader(texts, batch_size=self.batch_size, shuffle=False)
        chunks = []

        with torch.no_grad():
            for text_batch in tqdm(text_batches, desc="Extracting Embeddings", leave=True):
                encoded = self.tokenizer(
                    text_batch,
                    padding="longest",
                    truncation=True,
                    return_token_type_ids=False,
                    return_tensors="pt",
                )
                encoded = encoded.to(device)

                hidden_states = self.base_model(**encoded).last_hidden_state
                # Keep only position 0 and park it on the CPU to save device memory.
                chunks.append(hidden_states[:, 0, :].cpu())

        return torch.cat(chunks, dim=0)

    def prepare(self, data):
        """Build a shuffled dataloader of (embedding, label, review id) triples.

        Args:
            data: Table with ``text``, ``sentiment`` and ``review_id`` columns.

        Returns:
            A ``DataLoader`` over a ``TensorDataset`` of embeddings, integer
            labels and review ids.

        Raises:
            KeyError: If a sentiment value is missing from ``label2id``.
        """
        embeddings = self.extract_embeddings(data["text"].tolist())

        # Map string sentiments to class ids.
        labels = torch.tensor(
            [self.label2id[sentiment] for sentiment in data["sentiment"]],
            dtype=torch.long,
        )

        # Review ids travel with each sample so predictions can be joined back later.
        ids = torch.tensor(data["review_id"].tolist(), dtype=torch.long)

        return DataLoader(
            TensorDataset(embeddings, labels, ids),
            batch_size=self.batch_size,
            shuffle=True,
        )

In [None]:
# Class ids and their sentiment names; label2id is the inverse mapping.
id2label = {0: "negative", 1: "positive"}
label2id = {name: idx for idx, name in id2label.items()}

# Tokenizer and encoder come from the same pretrained checkpoint.
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
base_model = RobertaModel.from_pretrained("roberta-large")

preprocessor = DataPreprocessor(
    tokenizer=tokenizer,
    base_model=base_model,
    label2id=label2id,
    batch_size=batch_size,
)

In [None]:
train_savefile = "data/embeddings/train.pt"

if not os.path.exists(train_savefile):
    # No cache yet: embed the training reviews and save the tensors for next time.
    train = pd.read_csv("data/processed/train.csv")
    train_dataloader = preprocessor.prepare(train)
    train_dataset = train_dataloader.dataset
    torch.save(train_dataset.tensors, train_savefile)
else:
    # Cache hit: rebuild the dataset straight from the saved tensors.
    print("Loading embeddings for train samples")
    train_tensors = torch.load(train_savefile)
    train_dataset = TensorDataset(*train_tensors)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [None]:
test_savefile = "data/embeddings/test.pt"

if os.path.exists(test_savefile):
    # Cache hit: rebuild the dataset straight from the saved tensors.
    print(f"Loading embeddings for validation samples")
    test_tensors = torch.load(test_savefile)
    test_dataset = TensorDataset(*test_tensors)
    # Use the shared batch_size (not a hard-coded 250) so this branch matches
    # both the train cell and the dataloader built by preprocessor.prepare.
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
else:
    # No cache yet: embed the test reviews and save the tensors for next time.
    test = pd.read_csv("data/processed/test.csv")
    test_dataloader = preprocessor.prepare(test)
    test_dataset = test_dataloader.dataset
    torch.save(test_dataset.tensors, test_savefile)

# Classification head

In [None]:
class RoBERTaClassifier(nn.Module):
    """Two-class classification head applied to precomputed embeddings.

    ``mod`` selects one of six cumulative variants:

    * ``mod == 1``: input projection (1024 units) followed by the output layer.
    * ``mod >= 2``: 2048-unit input projection plus a 512-unit intermediate layer.
    * ``mod >= 3``: SiLU activation after the input projection.
    * ``mod >= 4``: layer normalization before the activation.
    * ``mod >= 5``: dropout (p=0.1) after the input projection and after the
      intermediate layer.
    * ``mod >= 6``: residual connection that adds the raw input projection
      back after the activation.

    Args:
        embedding_dim: Size of the incoming embedding vectors.
        mod: Variant selector described above.
    """

    def __init__(self, embedding_dim=1024, mod=1):
        super().__init__()
      ### Parameters
        self.mod = mod
        self.hidden_size = 1024 if mod == 1 else 2048
        self.inter_size = 1024 if mod == 1 else 512
      ### Layers
      ### Optional layers are only created for the variants that use them, so
      ### that the trainable-parameter count of each variant is exact.
        self.in_proj = nn.Linear(embedding_dim, self.hidden_size)           # Input layer
        self.dropout = nn.Dropout(0.1)                                      # Dropout layer
        self.silu = nn.SiLU()                                               # Activation
        if mod >= 4:
            self.layer_norm = nn.LayerNorm(self.hidden_size, eps=1e-5)      # Normalization
        if mod >= 2:
            self.inter_proj = nn.Linear(self.hidden_size, self.inter_size)  # Intermediate dense layer
        self.out_proj = nn.Linear(self.inter_size, 2)                       # Output layer

    def forward(self, embeddings):
        """Return the two class logits for each embedding in ``embeddings``."""
        # Computed once and reused for the residual branch instead of running
        # in_proj on the same input a second time.
        projected = self.in_proj(embeddings)

        x = projected
        if self.mod >= 5:
            x = self.dropout(x)
        if self.mod >= 4:
            x = self.layer_norm(x)
        if self.mod >= 3:
            x = self.silu(x)
        if self.mod >= 6:
            # Residual uses the projection before dropout/normalization/activation.
            x = x + projected
        if self.mod >= 2:
            x = self.inter_proj(x)
        if self.mod >= 5:
            x = self.dropout(x)
        return self.out_proj(x)

# Training

In [None]:
# Loss shared by the training and evaluation steps.
loss_fn = nn.CrossEntropyLoss()

def model_train(batch, classifier, optimizer, metrics):
    """Run one optimization step on ``batch`` and record its statistics.

    Besides updating ``metrics``, the detached logits, labels and review ids
    are appended to the module-level lists ``all_logits``, ``all_labels`` and
    ``all_ids``, which the caller must have initialised.

    Args:
        batch: Sequence of (embeddings, labels, review ids) tensors.
        classifier: Model being trained.
        optimizer: Optimizer stepping the classifier's parameters.
        metrics: Dict with ``'batch_train_losses'`` and
            ``'batch_train_accuracy'`` lists to append to.

    Returns:
        The loss tensor for this batch.
    """
    embeddings, b_labels, b_ids = (tensor.to(device) for tensor in batch)

    # Clear gradients left over from the previous step, then run the forward pass.
    classifier.zero_grad()
    logits = classifier(embeddings)

    # Keep CPU copies of this batch's outputs for later analysis.
    for store, tensor in ((all_logits, logits), (all_labels, b_labels), (all_ids, b_ids)):
        store.append(tensor.detach().cpu())

    loss = loss_fn(logits, b_labels)
    metrics['batch_train_losses'].append(loss.item())

    # Fraction of samples whose highest-scoring class matches the label.
    n_correct = (torch.argmax(logits, dim=1) == b_labels).sum().item()
    metrics['batch_train_accuracy'].append(n_correct / b_labels.size(0))

    # Backpropagate and apply the parameter update.
    loss.backward()
    optimizer.step()

    return loss

def model_eval(batch, classifier, optimizer, metrics):
    """Evaluate ``classifier`` on ``batch`` and record its statistics.

    The forward pass and loss run under ``torch.no_grad()``: evaluation never
    backpropagates, so no autograd graph needs to be built or kept in memory.
    This function does not switch the classifier to eval mode; the caller is
    expected to do that.

    Besides updating ``metrics``, the logits, labels and review ids are
    appended (on CPU) to the module-level lists ``all_logits``, ``all_labels``
    and ``all_ids``, which the caller must have initialised.

    Args:
        batch: Sequence of (embeddings, labels, review ids) tensors.
        classifier: Model being evaluated.
        optimizer: Unused; kept so the signature mirrors ``model_train``.
        metrics: Dict with ``'batch_test_losses'`` and
            ``'batch_test_accuracy'`` lists to append to.

    Returns:
        The loss tensor for this batch (detached from any autograd graph).
    """
    # Unpack the batch and move tensors to the device
    embeddings, b_labels, b_ids = [t.to(device) for t in batch]
    with torch.no_grad():
        # Forward pass using embeddings as input
        logits = classifier(embeddings)
        # Calculate loss
        loss = loss_fn(logits, b_labels)
    # Store results for later analysis
    all_logits.append(logits.detach().cpu())
    all_labels.append(b_labels.detach().cpu())
    all_ids.append(b_ids.detach().cpu())
    metrics['batch_test_losses'].append(loss.item())
    # Calculate accuracy
    preds = torch.argmax(logits, dim=1)
    accuracy = (preds == b_labels).sum().item() / b_labels.size(0)
    metrics['batch_test_accuracy'].append(accuracy)

    return loss

## Training parameters

In [None]:
num_epochs = 300
# Total number of training batches over all epochs.
num_training_steps = num_epochs * len(train_dataloader)
print(f"Number of training steps: {num_training_steps}")

def get_optimizer_and_scheduler(classifier):
    """Create the optimizer and learning-rate schedule for ``classifier``.

    Args:
        classifier: Model whose parameters will be optimized.

    Returns:
        A tuple ``(optimizer, lr_scheduler)``: an AdamW optimizer
        (lr=1e-3, weight_decay=0.01, eps=1e-8) and a cosine schedule whose
        warmup covers the first 10% of ``num_training_steps``.
    """
    # Optimizer
    optimizer = torch.optim.AdamW(
        classifier.parameters(),
        lr = 1e-3,
        weight_decay = 0.01,
        eps = 1e-8)

    # Scheduler
    lr_scheduler = get_scheduler(
        "cosine",
        optimizer = optimizer,
        # Step counts are integers; 0.1 * n would otherwise be passed as a float.
        num_warmup_steps = int(0.1 * num_training_steps),
        num_training_steps = num_training_steps)

    return optimizer, lr_scheduler

## Training loop

In [None]:
# metrics[mod_type] holds the trainable-parameter count under 'parameters'
# plus one dict of per-batch and per-epoch statistics for every epoch number.
metrics = {mod_type: {} for mod_type in range(1, 7)}


def _confidence_error(logit_batches):
    """Return (softmax probabilities, mean of 1 - highest class probability)."""
    probabilities = F.softmax(torch.cat(logit_batches, dim=0), dim=1).detach()
    return probabilities, (1 - probabilities.max(dim=1).values).mean().item()


model_progress = tqdm(range(1, 7), desc = "Models", position = 0, unit = "model")
for mod_type in model_progress:
    ### Create model
    classifier = RoBERTaClassifier(embedding_dim=1024, mod=mod_type).to(device)
    optimizer, lr_scheduler = get_optimizer_and_scheduler(classifier)

    metrics[mod_type] = {}
    metrics[mod_type]['parameters'] = sum(p.numel() for p in classifier.parameters() if p.requires_grad)

    # Loop over epochs 1..num_epochs inclusive, so exactly num_epochs epochs run.
    epoch_progress = tqdm(range(1, num_epochs + 1), desc = f"Model {mod_type}", position = 1, unit = "epoch")
    for epoch in epoch_progress:
        epoch_metrics = {
            'batch_train_losses': [],
            'batch_train_accuracy': [],
            'batch_test_losses': [],
            'batch_test_accuracy': []}
        metrics[mod_type][epoch] = epoch_metrics

        ### Training
        # model_train appends to these module-level lists.
        all_logits = []
        all_labels = []
        all_ids = []

        classifier.train()
        for batch in train_dataloader:
            loss = model_train(batch, classifier, optimizer, epoch_metrics)
            # The schedule is sized as num_epochs * len(train_dataloader) steps,
            # so it has to advance once per batch; stepping once per epoch
            # would leave the learning rate stuck in the warmup phase.
            lr_scheduler.step()

        # Loss & accuracy
        avg_train_loss = np.mean(epoch_metrics['batch_train_losses'])
        epoch_metrics['epoch_train_loss'] = avg_train_loss
        avg_train_accuracy = np.mean(epoch_metrics['batch_train_accuracy'])
        epoch_metrics['epoch_train_accuracy'] = avg_train_accuracy
        # Classification error
        probs, classif_error = _confidence_error(all_logits)
        epoch_metrics['train_classif_error'] = classif_error

        ### Validating
        # model_eval appends to these module-level lists.
        all_logits = []
        all_labels = []
        all_ids = []

        classifier.eval()
        # No gradients are needed for validation.
        with torch.no_grad():
            for batch in test_dataloader:
                loss = model_eval(batch, classifier, optimizer, epoch_metrics)

        # Loss & accuracy
        avg_test_loss = np.mean(epoch_metrics['batch_test_losses'])
        epoch_metrics['epoch_test_loss'] = avg_test_loss
        avg_test_accuracy = np.mean(epoch_metrics['batch_test_accuracy'])
        epoch_metrics['epoch_test_accuracy'] = avg_test_accuracy
        # Classification error
        probs, classif_error = _confidence_error(all_logits)
        epoch_metrics['test_classif_error'] = classif_error

        ### Saving predictions
        # Best accuracy so far for this model, the current epoch included.
        best_test_accuracy = max(
            record['epoch_test_accuracy']
            for key, record in metrics[mod_type].items()
            if key != 'parameters')
        # Write the predictions whenever this epoch matches or sets the best accuracy.
        if avg_test_accuracy >= best_test_accuracy:
            probs_array = probs.cpu().numpy()
            labels_array = torch.cat(all_labels, dim=0).cpu().numpy()
            results = pd.DataFrame(probs_array, columns=[f"prob_class_{i}" for i in range(probs_array.shape[1])])
            results['true_label'] = [id2label[label] for label in labels_array]
            results['review_id'] = torch.cat(all_ids, dim=0).detach().cpu().numpy()
            results.to_csv(f"output/preds/mod_{mod_type}_epoch_{epoch}.csv", index=False)

        epoch_progress.set_postfix(acc=f"{avg_test_accuracy:.4f}",
                                   best_acc=f"{best_test_accuracy:.4f}",
                                   losses_ratio=f"{avg_train_loss/avg_test_loss:.4f}")

    epoch_progress.close()
model_progress.close()

# Results

In [None]:
summary = []

# For every model variant, find the epoch with the highest test accuracy.
for mod_type, per_model in metrics.items():
    # Integer keys are epochs; the 'parameters' entry is skipped.
    accuracy_by_epoch = {
        epoch: record['epoch_test_accuracy']
        for epoch, record in per_model.items()
        if isinstance(epoch, int) and 'epoch_test_accuracy' in record
    }
    if not accuracy_by_epoch:
        continue

    # Ties resolve to the earliest epoch in insertion order.
    best_epoch = max(accuracy_by_epoch, key=accuracy_by_epoch.get)
    best_accuracy = accuracy_by_epoch[best_epoch]
    best_error = per_model[best_epoch]['test_classif_error']

    summary.append({
        'mod_type': mod_type,
        'n_parameters': per_model['parameters'],
        'best_epoch': best_epoch,
        'best_test_accuracy': best_accuracy,
        'test_classif_error': best_error,
    })

summary = pd.DataFrame(summary).sort_values(by='mod_type')
# Styled view of the table: index hidden, metrics shown with four decimals.
summary.style.hide(axis="index").format({
    "best_test_accuracy": "{:.4f}",
    "test_classif_error": "{:.4f}"})

In [None]:
# Row of the summary table with the highest best-epoch test accuracy.
best_model_index = summary['best_test_accuracy'].idxmax()
best_model_info = summary.loc[best_model_index]

# Plain ints, so they can be used in file names and comparisons.
best_mod_type, best_epoch = (
    int(best_model_info[field]) for field in ('mod_type', 'best_epoch')
)

In [None]:
# Output column -> key in the per-epoch metrics dict. The two classification
# error columns are pluralised relative to their source keys.
column_sources = {
    "epoch_train_loss": "epoch_train_loss",
    "epoch_test_loss": "epoch_test_loss",
    "epoch_train_accuracy": "epoch_train_accuracy",
    "epoch_test_accuracy": "epoch_test_accuracy",
    "train_classif_errors": "train_classif_error",
    "test_classif_errors": "test_classif_error",
}

# Flatten the nested metrics dict into one row per (model, epoch).
rows = []
for mod_type, subdict in metrics.items():
    rows.extend(
        {
            "mod_type": mod_type,
            "epoch": epoch_key,
            **{column: metric.get(source) for column, source in column_sources.items()},
        }
        for epoch_key, metric in subdict.items()
        if epoch_key != 'parameters' and isinstance(metric, dict)
    )

df = pd.DataFrame(rows)

In [None]:
# Per-epoch curves of the best model, ordered by epoch.
df_subset = df.loc[df["mod_type"] == best_mod_type].sort_values(by="epoch")

(
    epoch_train_losses,
    epoch_test_losses,
    epoch_train_accuracy,
    epoch_test_accuracy,
    train_classif_errors,
    test_classif_errors,
) = (
    df_subset[column].tolist()
    for column in (
        "epoch_train_loss",
        "epoch_test_loss",
        "epoch_train_accuracy",
        "epoch_test_accuracy",
        "train_classif_errors",
        "test_classif_errors",
    )
)

In [None]:
# Learning curves of the best model: loss, accuracy and classification error
# side by side, train vs. test.
plt.figure(figsize=(12, 4))

# (title, train series, test series, train color, test color, test-line alpha)
panels = [
    ('Loss', epoch_train_losses, epoch_test_losses, '#97BC62FF', '#2C5F2D', 0.8),
    ('Accuracy', epoch_train_accuracy, epoch_test_accuracy, '#9CC3D5FF', '#0063B2FF', None),
    ('Classification Error', train_classif_errors, test_classif_errors, '#F5C7B8FF', '#FFA177FF', None),
]

for position, (title, train_series, test_series, train_color, test_color, test_alpha) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    # Epochs are numbered from 1 on the x axis.
    plt.plot(np.arange(1, len(train_series) + 1), train_series, label='Train', color=train_color)
    plt.plot(np.arange(1, len(test_series) + 1), test_series, label='Test', color=test_color, alpha=test_alpha)
    # Roughly ten ticks, whatever the number of epochs.
    plt.xticks(np.arange(0, len(test_series) + 2, max(1, len(test_series)//10)))
    plt.xlabel('Epochs')
    plt.ylabel('')
    plt.title(title)
    plt.legend()

# Final Layout
plt.tight_layout()
# Name the file after the model that is actually plotted (best_mod_type), not
# after whatever value the last loop left in mod_type.
plt.savefig(f"output/mod_type_{best_mod_type}_learning_curves.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Zoomed learning curves of the best model: same three panels stacked
# vertically, restricted to fixed x and y windows.
plt.figure(figsize=(12, 10))

# (title, train series, test series, train color, test color, test-line alpha,
#  y-axis window, x-axis label)
panels = [
    ('Loss', epoch_train_losses, epoch_test_losses, '#97BC62FF', '#2C5F2D', 0.8, (0.15, 0.265), ''),
    ('Accuracy', epoch_train_accuracy, epoch_test_accuracy, '#9CC3D5FF', '#0063B2FF', None, (0.9, 0.945), ''),
    ('Classification Error', train_classif_errors, test_classif_errors, '#F5C7B8FF', '#FFA177FF', None, (0.055, 0.09), 'Epochs'),
]

for position, (title, train_series, test_series, train_color, test_color, test_alpha, y_window, x_label) in enumerate(panels, start=1):
    plt.subplot(3, 1, position)
    # Epochs are numbered from 1 on the x axis.
    plt.plot(np.arange(1, len(train_series) + 1), train_series, label='Train', color=train_color)
    plt.plot(np.arange(1, len(test_series) + 1), test_series, label='Test', color=test_color, alpha=test_alpha)
    plt.xticks(np.arange(0, len(test_series) + 2, max(1, len(test_series)//10)))
    plt.xlim(21, 251)
    plt.ylim(*y_window)
    # Only the bottom panel carries the x-axis label.
    plt.xlabel(x_label)
    plt.ylabel('')
    plt.title(title)
    plt.legend()

# Final Layout
plt.tight_layout()
# Name the file after the model that is actually plotted (best_mod_type), not
# after whatever value the last loop left in mod_type.
plt.savefig(f"output/mod_type_{best_mod_type}_learning_curves_zoom.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Attach the best model's saved predictions to the original test rows,
# matching on review_id.
test = pd.read_csv("data/processed/test.csv")
results = test.merge(
    pd.read_csv(f"output/preds/mod_{best_mod_type}_epoch_{best_epoch}.csv"),
    on='review_id',
)

In [None]:
# Sanity check: after the merge, the label stored with each prediction must
# equal the sentiment in the source data for every row.
print("Do the true labels returned by the model match the original sentiments?")
labels_match = (results['sentiment'] == results['true_label']).all()
print("Yes!" if labels_match else "No :'(")

In [None]:
# Turn the positive-class probability into a sentiment label (threshold 0.5)
# and export one prediction per review.
is_positive = results['prob_class_1'] >= 0.5
results['RoBERTa_ft'] = np.where(is_positive, 'positive', 'negative')
results.loc[:, ['review_id', 'RoBERTa_ft']].to_csv("output/RoBERTa_ft.csv", index=False)