# Fine-tuning RoBERTa

In [None]:
import glob
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import psutil
import seaborn as sns
import time
import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from tqdm.auto import tqdm
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_scheduler

In [None]:
# Pick the compute backend in order of preference: CUDA GPU, Apple MPS, then CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

In [None]:
# Make sure the folders receiving checkpoints and predictions exist
for out_dir in ("output/models/", "output/preds/"):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
# Load the preprocessed train/test splits.
# The rest of the notebook reads the `text`, `sentiment` and `review_id` columns.
train = pd.read_csv("data/processed/train.csv")
test = pd.read_csv("data/processed/test.csv")

In [None]:
# train = train.sample(frac=0.005).reset_index(drop=True)
# test = test.sample(frac=0.005).reset_index(drop=True)

# 1. Build the model

First, we load the **base model** we intend to fine-tune. RoBERTa is initialized with an untrained classifier head consisting of two dense (i.e., linear) layers:
- The first projects the output from the base model to a hidden space of the same dimensionality (1024 for RoBERTa-large).
- The second maps the hidden representation to the label space, reducing the dimensionality accordingly (to 2 for a binary classifier).

Since this classifier head is not pretrained, the warning below appears. It can either be fine-tuned directly or replaced with a custom classifier block.

In [None]:
# Load pretrained RoBERTa-large with its default sequence-classification head;
# only the encoder (`model.roberta`) is reused further down.
model = AutoModelForSequenceClassification.from_pretrained('roberta-large')

Fine-tuning is usually restricted to the final layers of the model responsible for the desired task. This approach speeds up training while preserving the knowledge acquired by the model during pretraining. Since we plan to implement a custom classifier block, we simply freeze all parameters of the base model (they are still trainable by default).

In [None]:
# Report the size of the pretrained model, then freeze all of its weights
total_params = sum(p.numel() for p in model.parameters())
print(f"Number of parameters in base RoBERTa: {total_params / 1e6:.2f} M")

for weight in model.parameters():
    weight.requires_grad = False

trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Number of trainable parameters after freezing: {trainable_params / 1e6:.2f}M")

There are two key decisions for the classifier head:
- Which layers to include
- Which input to use

Regarding the **layers**, a typical approach is to insert an activation layer and a dropout layer between the two projection layers of the initial classification head. The activation layer introduces non-linearity, while the dropout layer helps prevent overfitting. In our case, we expand the hidden space to 3072 (the intermediate size of RoBERTa-base; RoBERTa-large itself uses 4096), choose a ReLU activation function, and set the dropout rate at 15%.

Alternatively, as demonstrated [here](https://github.com/gnkhata1/Finetuning-BERT-on-Movie-Reviews-Sentiment-Analysis/blob/main/BERT%2BBiLSTM-SA.py), one can use an LSTM layer, a dropout layer, then a dense layer. This configuration is effective for capturing long-term dependencies across the sequences, but it is more computationally expensive.

A final Softmax layer is necessary to obtain probabilities, and it is typically placed in the `predict` function (instead of `forward`) to simplify the computation of the loss.

Regarding the **input**:
- The simplest approach is to focus on the `[CLS]` token (named `<s>` in RoBERTa), which marks the beginning of the input sequence and serves as an embedding for its entirety: `x = outputs.last_hidden_state[:, 0, :]`
- Another approach is to pool all tokens from the input sequence, either by taking their mean or maximum: `x = torch.mean(outputs.last_hidden_state, dim=1)` or `x = torch.max(outputs.last_hidden_state, dim=1).values`
- The final option is to consider all tokens in the sequence, but this is only applicable when using an LSTM.

In [None]:
# Add layers for classification to finetune
class RoBERTa_architecture(nn.Module):
    """RoBERTa encoder followed by a small trainable classification head.

    The head is ``Linear -> ReLU -> Dropout -> Linear`` applied to the
    representation of the first token of each sequence.

    Args:
        roberta_model: Encoder called as ``roberta_model(input_ids,
            attention_mask=..., return_dict=True)``; its output must expose
            ``last_hidden_state``.
        hidden_dim: Width of the hidden layer of the head.
        num_labels: Number of output classes.
        dropout: Dropout probability applied after the activation.
    """

    def __init__(self, roberta_model, hidden_dim=3072, num_labels=2, dropout=0.15):
        super().__init__()
        self.roberta = roberta_model
        # Use the encoder's own hidden size when its config exposes it;
        # otherwise fall back to 1024, the value originally hard-coded here.
        config = getattr(roberta_model, "config", None)
        encoder_dim = getattr(config, "hidden_size", 1024)
        # Dense layer 1
        self.fc1 = nn.Linear(encoder_dim, hidden_dim)
        # Relu
        self.relu = nn.ReLU()
        # Dropout layer
        self.dropout = nn.Dropout(dropout)
        # Dense layer 2 (Output layer)
        self.fc2 = nn.Linear(hidden_dim, num_labels)

    def forward(self, sent_id, mask):
        """Return the raw class logits for a batch.

        Args:
            sent_id: Token ids passed to the encoder.
            mask: Attention mask passed to the encoder.

        Returns:
            The output of the final linear layer (no softmax applied).
        """
        # Pass inputs to the base model
        outputs = self.roberta(sent_id, attention_mask=mask, return_dict=True)
        # Keep the representation of the first token of every sequence
        x = outputs.last_hidden_state[:, 0, :]
        # Apply new layers
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)

        return x

    def predict(self, sent_id, mask):
        """Return the predicted class index for each sequence of the batch.

        Side effect: switches the module to evaluation mode (``self.eval()``)
        and leaves it there; call ``.train()`` again before resuming training.
        """
        self.eval()
        # Disable gradient calculation for inference
        with torch.no_grad():
            logits = self(sent_id, mask)
            # Apply Softmax
            probs = F.softmax(logits, dim=1)
        # Retain the class with the highest probability
        # (the original referenced an undefined name `x` here)
        preds = torch.argmax(probs, dim=1)

        return preds

In [None]:
# Create the model
# Wrap the frozen encoder with the custom head and move the result to the device
roberta_classifier = RoBERTa_architecture(model.roberta).to(device)

# Label mappings in both directions, also attached to the model
id2label = {0: "negative", 1: "positive"}
label2id = {label: idx for idx, label in id2label.items()}

roberta_classifier.id2label = id2label
roberta_classifier.label2id = label2id

In [None]:
# Check which layers are available for training
trainable_names = [
    name
    for name, param in roberta_classifier.named_parameters()
    if param.requires_grad
]
for name in trainable_names:
    print(name)

As expected, only the 2 dense layers from the classifier head are trainable. They represent $1024*3072+3072$ parameters for the first layer and $3072*2+2$ for the final layer, that is, $3,154,946$ trainable parameters. This is the value we get indeed.

In [None]:
# Count the parameters that will receive gradient updates
n_trainable = sum(p.numel() for p in roberta_classifier.parameters() if p.requires_grad)
print(f"Number of parameters to fine-tune: {n_trainable}")

# 2. Preprocessing
First, we need a tokenizer. RoBERTa uses Byte-Pair Encoding (BPE) which operates at the subword level with an agglomerative approach. It merges the most frequent pairs iteratively until the desired vocabulary size is reached, which is 50,265 for RoBERTa. 

RoBERTa has a maximum context length of 512 tokens. Sequences longer than this must be truncated before being passed to the model. We opt for left-side truncation to preserve summaries that sometimes appear at the end of reviews. An alternative approach would be to split long sequences into smaller chunks, process them sequentially, and average the results. However, this would be more complex to implement and may not yield reliable results.

In [None]:
# Tokenizer matching the pretrained model; over-long inputs are truncated from the left
tokenizer = AutoTokenizer.from_pretrained('roberta-large', truncation_side = 'left')

RoBERTa requires all input sequences to be of the same length. Longer sequences are truncated, as discussed above, while shorter sequences are padded by adding padding tokens (`<pad>` for RoBERTa) at the end. A mask is also generated to help the model identify the padded tokens. Padding can be applied either to the maximum context length of the model or to the longest sequence passed to the tokenizer, which is the option used here; since each dataset is tokenized in a single call, this is the longest sequence of the whole dataset rather than of each batch. RoBERTa does not use `token_type_ids`, which are typically employed to distinguish between different parts of the input, such as prompts and answers. As a result, they are disabled here. Lastly, we pass the `review_id` variable to the `dataloader` so that we can match the predictions with the original dataset once the model is trained.

`batch_size` defines how many sequences are processed in parallel during a single batch. Larger batch sizes can improve computational efficiency, provided there is enough available memory. They offer more stable gradient estimates by using larger amounts of data, but they result in less frequent updates to the model. As a consequence, larger batch sizes may require more epochs to converge, as well as a lower learning rate to reduce the risk of overfitting. Here, `batch_size` is set to 250, meaning that 1% of the dataset is processed in each batch. This setup consumes up to 25 GB of memory. There is no downside to using a higher `batch_size` for inference, so we set it to 500 to fully utilize the 64 GB of available memory on our machine.

In [None]:
class DataPreprocessor:
    """Tokenize a review DataFrame and wrap it in a ``DataLoader``.

    Args:
        tokenizer: Callable tokenizer; its output must provide ``input_ids``
            and ``attention_mask`` tensors.
        label2id: Mapping from the labels found in ``data['sentiment']`` to
            integer class ids.
        train_batch_size: Batch size used when ``is_train`` is true.
        test_batch_size: Batch size used otherwise.
    """

    def __init__(self, tokenizer, label2id, train_batch_size=250, test_batch_size=500):
        self.tokenizer = tokenizer
        self.train_batch_size = train_batch_size
        self.test_batch_size = test_batch_size
        self.label2id = label2id

    def prepare(self, data, is_train=True):
        """Build a dataloader from the ``text``, ``sentiment`` and ``review_id`` columns.

        Args:
            data: DataFrame holding the three columns above.
            is_train: Selects the training batch size and enables shuffling.

        Returns:
            ``(dataloader, seq_lengths)``. Each batch is a tuple
            ``(input_ids, attention_mask, labels, review_ids)``.
            ``seq_lengths`` holds one entry per row with its length after
            padding.

        Raises:
            KeyError: If a label is missing from ``label2id``.
        """
        # Tokenize the whole dataset in one call and convert to tensors
        tokens = self.tokenizer(
            data['text'].tolist(),
            padding='longest',
            truncation=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )

        input_ids = tokens['input_ids']
        attention_mask = tokens['attention_mask']

        # NOTE: these are PADDED lengths. Every row of `input_ids` has the same
        # width, so all entries are equal; the real token counts would be
        # `attention_mask.sum(dim=1)`.
        n_rows, padded_len = input_ids.shape
        seq_lengths = [padded_len] * n_rows

        # Convert labels
        numeric_labels = [self.label2id[label] for label in data['sentiment']]
        labels = torch.tensor(numeric_labels, dtype=torch.long)

        # Review IDs travel with each batch so predictions can be matched back
        ids = torch.tensor(data['review_id'].tolist(), dtype=torch.long)

        # Choose the appropriate batch size
        batch_size = self.train_batch_size if is_train else self.test_batch_size

        # Create dataset and dataloader (shuffled only for training)
        dataset = TensorDataset(input_ids, attention_mask, labels, ids)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=is_train)

        return dataloader, seq_lengths

In [None]:
# Tokenize both splits and time the whole operation
time0 = time.time()

preprocessor = DataPreprocessor(tokenizer, label2id)
# The training loader is shuffled; the test loader keeps the original order
train_dataloader, train_seq_lengths = preprocessor.prepare(train, is_train=True)
test_dataloader, test_seq_lengths = preprocessor.prepare(test, is_train=False)

time1 = time.time()
print(f"Tokenization duration: {time1-time0:.2f} seconds")

In [None]:
# NOTE: the lengths returned by `prepare` are measured on the padded `input_ids`
# rows, so these averages reflect the padded width of each split.
print(f"Average length of sequences in train: {sum(train_seq_lengths) / len(train_seq_lengths):.0f}")
print(f"Average length of sequences in test: {sum(test_seq_lengths) / len(test_seq_lengths):.0f}")

After tokenization and dataloading, the average length of sequences is 512. Because each dataset is tokenized in a single call, `padding = 'longest'` pads every sequence to the longest one in the whole dataset; since at least one sequence reaches the 512-token limit, it has the same effect as `padding = 'max_length'`. With a `batch_size` of 250, there are exactly $250 \times 512 = 128{,}000$ tokens per full (training) batch.

# 3. Training

We define functions for **training** and **testing**. Since we plan to evaluate the model on the entire test dataset after each epoch, we store the logits returned at that time. This allows us to retrieve predictions directly, avoiding the need to recompute them. Additionally, we save the true labels and review IDs to link the predictions with the original dataset for analysis.

In [None]:
# Cross-entropy computed directly on the logits returned by the model
loss_fn = nn.CrossEntropyLoss()

In [None]:
# Per-batch loss histories, appended to by model_train / model_eval
batch_train_losses = []
batch_test_losses = []

def model_train(batch):
    """Run one optimization step on `batch` and return its loss tensor.

    Relies on the module-level `roberta_classifier`, `loss_fn`, `optimizer`
    and `device`; the scalar loss is also appended to `batch_train_losses`.
    """
    # Move the batch to the device; the review IDs are not needed for training
    input_ids, attention_mask, labels, _ = (tensor.to(device) for tensor in batch)

    # Clear gradients left over from the previous step
    roberta_classifier.zero_grad()

    # Forward pass and loss
    logits = roberta_classifier(input_ids, attention_mask)
    loss = loss_fn(logits, labels)
    batch_train_losses.append(loss.item())

    # Backward pass and parameter update
    loss.backward()
    optimizer.step()

    return loss

In [None]:
# Per-epoch average loss histories, appended to by the training loop
epoch_train_losses = []
epoch_test_losses = []

def model_eval(batch):
    """Evaluate one batch and return its loss tensor.

    Relies on the module-level `roberta_classifier`, `loss_fn` and `device`.
    Logits, true labels and review IDs are appended (on CPU) to the
    module-level lists `all_logits`, `all_labels` and `all_ids`, which must
    exist before the call; the scalar loss is appended to `batch_test_losses`.
    """
    b_input_ids, b_attention_mask, b_labels, b_ids = [t.to(device) for t in batch]

    # No gradients are needed for evaluation: skipping the autograd graph
    # saves memory and time without changing the logits or the loss value
    with torch.no_grad():
        # Forward pass
        logits = roberta_classifier(b_input_ids, b_attention_mask)
        # Calculate loss
        loss = loss_fn(logits, b_labels)

    # Store results
    all_logits.append(logits.detach().cpu())  # Logits
    all_labels.append(b_labels.detach().cpu())  # True labels
    all_ids.append(b_ids.detach().cpu())  # Reviews ID

    batch_test_losses.append(loss.item())

    return loss

We define a **training loop** with the ability to resume after interruption.

The training dataset is processed `num_epochs` times. Typical values for fine-tuning range between 3 and 12. We set it to 12 and compute the test error after every epoch to detect when the model starts overfitting.

For the **optimizer**, we set a relatively low learning rate, as is commonly recommended for fine-tuning RoBERTa models. The `weight_decay` parameter is a regularization term that adds a penalty for large weights during optimization, helping to mitigate overfitting.

We use a **scheduler** to linearly decrease the learning rate as training progresses. The `num_warmup_steps` parameter defines a warm-up phase during which the learning rate is gradually increased from zero over the first training steps (here, the first 5% of all steps) to prevent early instability.

In [None]:
num_epochs = 12
# The schedule is expressed in optimizer steps, i.e. one step per training batch
num_training_steps = num_epochs * len(train_dataloader)
print(f"Number of training steps: {num_training_steps}")

# Optimizer
optimizer = torch.optim.AdamW(
    roberta_classifier.parameters(),
    lr=5e-4,
    weight_decay=0.01,
)

# Scheduler: linear warm-up over the first 5% of the steps, then linear decay.
# `num_warmup_steps` is a step count, so it is cast to an integer.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=int(0.05 * num_training_steps),
    num_training_steps=num_training_steps,
)

# Avoid issues with multithreading
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# Resume training
def get_latest_checkpoint(model_dir):
    """Locate the checkpoint with the highest epoch number in `model_dir`.

    Only files named ``epoch_<integer>.pth`` are considered; other files
    matching ``epoch_*.pth`` (e.g. ``epoch_best.pth``) are ignored instead of
    raising a ``ValueError``.

    Args:
        model_dir: Directory to search.

    Returns:
        ``(path, epoch)`` for the most recent checkpoint, or ``(None, 0)``
        when no checkpoint is found.
    """
    prefix, suffix = "epoch_", ".pth"

    candidates = []
    for path in glob.glob(os.path.join(model_dir, "epoch_*.pth")):
        # Text between "epoch_" and ".pth"
        stem = os.path.basename(path)[len(prefix):-len(suffix)]
        if stem.isdecimal():
            candidates.append((int(stem), path))

    if not candidates:
        return None, 0

    # Return the path that actually exists on disk rather than rebuilding it
    last_epoch, latest_path = max(candidates)

    return latest_path, last_epoch

latest_checkpoint, last_epoch = get_latest_checkpoint("output/models")

if latest_checkpoint:
    # NOTE(security): `weights_only=False` unpickles arbitrary Python objects;
    # only load checkpoints written by this notebook.
    # `map_location` lets a checkpoint saved on another device be restored here.
    checkpoint = torch.load(latest_checkpoint, map_location=device, weights_only=False)
    roberta_classifier.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])

    # Restore the loss histories (the original restored `epoch_train_losses`
    # twice and never restored `epoch_test_losses`)
    batch_train_losses = checkpoint['batch_train_losses']
    epoch_train_losses = checkpoint['epoch_train_losses']
    batch_test_losses = checkpoint['batch_test_losses']
    epoch_test_losses = checkpoint['epoch_test_losses']

    # Restore computation time tracking
    batch_times_train = checkpoint['batch_times_train']
    batch_times_test = checkpoint['batch_times_test']
    epoch_times = checkpoint['epoch_times']

    # The state is restored in both cases so that the reporting and plotting
    # cells below still work when there is nothing left to train
    if last_epoch >= num_epochs:
        print(f"Training already completed up to epoch {last_epoch}.")
    else:
        print(f"Resuming from epoch {last_epoch}")

else:
    # Computation time tracking
    batch_times_train = []
    batch_times_test = []
    epoch_times = []

In [None]:
# Memory tracking
# `device` is a torch.device, so compare its `.type`: comparing the object
# itself to the string 'mps' does not match and the cache was never emptied
if device.type == 'mps':
    torch.mps.empty_cache()
# System memory available before training starts, used as the reference point
baseline_memory = psutil.virtual_memory().available
max_memory_test = 0
max_memory_train = 0

In [None]:
epoch_progress = tqdm(range(last_epoch, num_epochs),
                      desc="Epochs",
                      position=0,
                      unit=" epoch")

# Memory use is reported afterwards as `baseline_memory - max_memory_*`, so
# these variables must hold the LOWEST amount of available memory observed.
# (Taking the max of the available memory, as before, reported the minimum
# usage instead of the peak.)
max_memory_train = baseline_memory
max_memory_test = baseline_memory

for epoch in epoch_progress:
    epoch_st = time.time()

    ##### Training phase #####
    # Set up the train progress bar
    train_progress = tqdm(total=len(train_dataloader),
                          desc="Train",
                          position=1,
                          leave=False,
                          unit=" batch")

    roberta_classifier.train()
    for batch in train_dataloader:
        batch_st = time.time()
        loss = model_train(batch)
        # The schedule was sized as num_epochs * len(train_dataloader) steps,
        # so the learning rate is updated after every batch. Stepping once per
        # epoch left the first epoch at the warm-up start value (lr = 0).
        lr_scheduler.step()
        # Update progress bar with current loss
        train_progress.set_postfix(loss=f"{loss.item():.4f}")
        train_progress.update(1)
        # Track memory use and computation time
        max_memory_train = min(max_memory_train, psutil.virtual_memory().available)
        batch_times_train.append(time.time() - batch_st)

    train_progress.close()
    # Calculate average train loss over the epoch
    avg_train_loss = np.mean(batch_train_losses[-len(train_dataloader):])
    epoch_train_losses.append(avg_train_loss)

    ##### Testing phase #####
    # Set up the test progress bar
    test_progress = tqdm(total=len(test_dataloader),
                         desc="Test",
                         position=2,
                         leave=False,
                         unit=" batch")

    # Filled by model_eval, one entry per test batch
    all_logits = []
    all_labels = []
    all_ids = []

    roberta_classifier.eval()
    for batch in test_dataloader:
        batch_st = time.time()
        loss = model_eval(batch)
        # Update progress bar with current loss
        test_progress.set_postfix(loss=f"{loss.item():.4f}")
        test_progress.update(1)
        # Track memory use and computation time
        max_memory_test = min(max_memory_test, psutil.virtual_memory().available)
        batch_times_test.append(time.time() - batch_st)

    test_progress.close()
    # Calculate average test loss over the epoch
    avg_test_loss = np.mean(batch_test_losses[-len(test_dataloader):])
    epoch_test_losses.append(avg_test_loss)

    ##### Closing phase #####
    # Update epoch progress bar with average losses
    epoch_progress.set_postfix(train_loss=f"{avg_train_loss:.4f}", test_loss=f"{avg_test_loss:.4f}")

    # Save predictions (the stored tensors are already detached and on CPU)
    all_logits = torch.cat(all_logits, dim=0)
    all_labels = torch.cat(all_labels, dim=0)
    all_ids = torch.cat(all_ids, dim=0)

    probs = F.softmax(all_logits, dim=1)  # Get probabilities
    probs_array = probs.numpy()
    ids_array = all_ids.numpy()

    results = pd.DataFrame(probs_array, columns=[f"prob_class_{i}" for i in range(probs_array.shape[1])])
    results['true_label'] = [id2label[label] for label in all_labels.tolist()]
    results['review_id'] = ids_array
    results.to_csv(f"output/preds/epoch_{epoch+1}.csv", index=False)

    # Track computation time
    epoch_times.append(time.time() - epoch_st)

    # Save model checkpoint (files are numbered from 1, `epoch` is 0-based)
    model_save_path = os.path.join("output/models", f"epoch_{epoch+1}.pth")
    checkpoint = {
        'epoch': epoch,
        'model': roberta_classifier.state_dict(),
        'optimizer': optimizer.state_dict(),
        'lr_scheduler': lr_scheduler.state_dict(),
        'batch_train_losses': batch_train_losses,
        'epoch_train_losses': epoch_train_losses,
        'batch_test_losses': batch_test_losses,
        'epoch_test_losses': epoch_test_losses,
        'batch_times_train': batch_times_train,
        'batch_times_test': batch_times_test,
        'epoch_times': epoch_times
    }
    torch.save(checkpoint, model_save_path)

    # Clean memory (`device` is a torch.device, so compare its `.type`)
    if device.type == 'mps':
        torch.mps.empty_cache()

epoch_progress.close()

In [None]:
# Memory figures are the baseline available system memory minus the tracked
# `max_memory_*` values, converted to GB (1e9 bytes)
print(f"Max memory used for training is {(baseline_memory - max_memory_train) / 1e9:.2f} GB")
print(f"Max memory used for evaluating is {(baseline_memory - max_memory_test) / 1e9:.2f} GB")

In [None]:
# Mean wall-clock durations collected during the training loop
# ("validation batches" refers to the batches of the test dataloader)
print(f"Average computation time for training batches: {np.mean(batch_times_train):.2f} seconds")
print(f"Average computation time for validation batches: {np.mean(batch_times_test):.2f} seconds")
print(f"Average computation time per epoch: {np.mean(epoch_times):.2f} seconds")

# 4. Results
## 1. Learning curves

In [None]:
# Two side-by-side panels: per-batch losses (left) and per-epoch averages (right)
plt.figure(figsize=(10, 5))

# Plot batch loss
plt.subplot(1, 2, 1)
plt.plot(batch_train_losses, label='Train', color='cornflowerblue')
# Train and test do not have the same number of batches: spread the test
# losses evenly over the x-range of the train batches so both curves align
test_x = np.linspace(0, len(batch_train_losses) - 1, len(batch_test_losses))
plt.plot(test_x, batch_test_losses, label='Test', color='goldenrod')
plt.xlabel('Batches')
plt.ylabel('Loss')
plt.title('Batch Loss Over Training')
plt.legend()

# Plot epoch loss
plt.subplot(1, 2, 2)
plt.plot(epoch_train_losses, label='Train', color='cornflowerblue')
plt.plot(epoch_test_losses, label='Test', color='goldenrod')
plt.xlabel('Epochs')
# The y-axis label is left empty on this panel
plt.ylabel('')
plt.title('Epoch Loss Over Training')
plt.legend()

# Save the figure before showing it
plt.tight_layout()
plt.savefig("output/RoBERTa_learning_curve.png", dpi=300, bbox_inches='tight')
plt.show()

## 2. Predictions

In [None]:
# Read predictions from the best model and merge them with the original dataset
# NOTE(review): the epoch (10) is hard-coded; presumably it was picked from the
# learning curves above. Update the file name if another epoch performs best.
results = pd.read_csv("output/preds/epoch_10.csv")
# Join on `review_id` to bring back the original columns of the test set
results = pd.merge(test, results, on = 'review_id')

In [None]:
# Check for consistency
# Sanity check: the labels stored with the predictions must equal the original ones
labels_match = (results['sentiment'] == results['true_label']).all()
print("Do the true labels returned by the model match the original sentiments?")
if labels_match:
    print("Yes!")
else:
    print("No :'(")

### Assess certainty

In [None]:
# A tie is a review for which both classes received exactly the same probability
print(f"Number of ties: {(results['prob_class_0'] == results['prob_class_1']).sum()}")

In [None]:
# Mean of (1 - highest class probability) over all reviews.
# NOTE(review): this does not use the true labels, so it measures the model's
# average uncertainty rather than its misclassification rate.
E = (1 - np.maximum(results['prob_class_0'], results['prob_class_1'])).mean()
print(f"Classification error {E:.3f}")

In [None]:
# Histogram (20 bins, with KDE) of the predicted probability of the positive class
sns.histplot(results['prob_class_1'], bins=20, color="cornflowerblue", kde=True, stat="probability")
plt.xlabel("Probability")
plt.ylabel("Share of reviews")
plt.title("Distribution of Predicted Probabilities for Positive Sentiment")
# Display the y-axis shares as percentages
plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x * 100:.1f}%'))
plt.show()

### Get predicted sentiments and save

In [None]:
# Threshold the positive-class probability at 0.5; an exact 0.5 counts as 'positive'
results['RoBERTa_ft'] = np.where(results['prob_class_1'] >= 0.5, 'positive', 'negative')

In [None]:
# NOTE(review): unlike the per-epoch prediction files, this CSV is written
# without `index=False`, so it also contains the DataFrame index as an
# unnamed first column. Confirm whether downstream readers expect it.
results[['review_id', 'RoBERTa_ft']].to_csv("output/RoBERTa_ft.csv")

### Assess performance

In [None]:
# Share of reviews whose predicted sentiment equals the original label
accuracy = accuracy_score(results['sentiment'], results['RoBERTa_ft'])
print(f"Accuracy: {accuracy:.4f}")