<a href="https://colab.research.google.com/github/Hillascher5/nlp-tweets-sentiment-analysis/blob/main/Full_fine_tune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# # Needed for Google Colab
# !pip install --quiet evaluate transformers optuna datasets nltk scikit-learn
# !pip install numpy==1.26.4

In [11]:
%env CUDA_LAUNCH_BLOCKING=1

from wordcloud import WordCloud, STOPWORDS
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
from tqdm import tqdm
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer, get_scheduler, BertTokenizer, RobertaTokenizer, RobertaForSequenceClassification
from datasets import Dataset
from torch.utils.data import DataLoader, TensorDataset, Dataset
from optuna.pruners import MedianPruner
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import os
import re
import string
import time
import glob
import nltk
import evaluate
import transformers
import torch
import optuna
import wandb
wandb.login()

os.environ["WANDB_PROJECT"] = "tweet-sentiment-classification_split_to_test"
os.environ["WANDB_INIT_TIMEOUT"] = "180"

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

env: CUDA_LAUNCH_BLOCKING=1
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
# Load data
df_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/nlp_project/Data/Corona_NLP_train.csv', encoding='latin1')
df_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/nlp_project/Data/Corona_NLP_test.csv', encoding='latin1')

In [4]:
# Merge and shuffle for better stratified splits
df_full = pd.concat([df_train, df_test], ignore_index=True)
df_full = df_full.sample(frac=1.0, random_state=42).reset_index(drop=True)

### Pre-processing the Data

The tweets were cleaned in several ways and also examined as raw data.

In [5]:
# Try without pre-processing
is_preprocessed = "no_preprocess"
df_full["clean_text"] = df_full["OriginalTweet"]

**Encode Sentiment Labels**

Map each unique sentiment label to a numeric ID for model compatibility, and apply this mapping to both training and validation datasets.

In [6]:
# Mapping sentiments to unique numeric IDs
unique_labels = sorted(df_full["Sentiment"].unique())
label2id = {label: idx for idx, label in enumerate(unique_labels)}
df_full["label"] = df_full["Sentiment"].map(label2id)

In [7]:
# Stratified split: 70% train, 15% val, 15% test
train_val_df, test_df = train_test_split(df_full, test_size=0.15, stratify=df_full["label"], random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.1765, stratify=train_val_df["label"], random_state=42)

# Confirm sizes
print("Train size:", len(train_df))
print("Val size:", len(val_df))
print("Test size:", len(test_df))

Train size: 31466
Val size: 6745
Test size: 6744


**Use Small Subsets for Quick Evaluation**

Select shuffled samples from each training and validation dataset for both BERT and RoBERTa. This allows faster experimentation during model development.

In [8]:
train_subset_df, _ = train_test_split(
    train_df[["clean_text", "label"]],
    train_size=2000,
    stratify=train_df["label"],
    random_state=42
)

val_subset_df, _ = train_test_split(
    val_df[["clean_text", "label"]],
    train_size=500,
    stratify=val_df["label"],
    random_state=42
)

In [13]:
# --- Dataset class ---
class TweetDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.texts = df['clean_text'].tolist()
        self.labels = df['label'].tolist()
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=64,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# --- Training loop ---
def train_model(model, train_loader, val_loader, optimizer, criterion, lr_scheduler, epochs, patience, trial):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    best_acc = 0.0
    best_epoch = 0
    best_model = None

    for epoch in range(1, epochs + 1):
        model.train()
        total_loss, total_correct = 0, 0
        total_samples = 0

        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()

            total_loss += loss.item() * input_ids.size(0)
            total_correct += (outputs.logits.argmax(dim=1) == labels).sum().item()
            total_samples += input_ids.size(0)

        train_acc = total_correct / total_samples
        train_loss = total_loss / total_samples

        # Validation
        model.eval()
        val_loss, val_correct, val_total = 0, 0, 0
        all_preds, all_labels = [], []
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                loss = criterion(outputs.logits, labels)

                val_loss += loss.item() * input_ids.size(0)
                val_correct += (outputs.logits.argmax(dim=1) == labels).sum().item()
                val_total += input_ids.size(0)

                all_preds.extend(outputs.logits.argmax(dim=1).cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        val_acc = val_correct / val_total
        val_f1 = f1_score(all_labels, all_preds, average='macro')

        wandb.log({
            "epoch": epoch,
            "train_loss": train_loss,
            "train_acc": train_acc,
            "val_loss": val_loss / val_total,
            "val_acc": val_acc,
            "val_f1": val_f1
        })

        if val_acc > best_acc:
            best_acc = val_acc
            best_epoch = epoch
            best_model = model.state_dict()
        elif epoch - best_epoch > patience:
            break

    if best_model:
        save_path = f"/content/drive/MyDrive/Colab Notebooks/nlp_project/models/w_test_split/Manual_finetune/best_model_trial_{trial.number}.pt"
        torch.save(best_model, save_path)
    return best_acc

# --- Objective for Optuna ---
def objective(trial, model_name, tokenizer_class, dataset_name, train_df, val_df, num_labels, device, drive_save_path=None):
    # Hyperparameter suggestions
    epochs = 4
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-3)
    weight_decay = trial.suggest_loguniform("weight_decay", 1e-6, 1e-4)
    patience = trial.suggest_int("patience", 2, 4)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])
    num_layers = trial.suggest_int("num_layers", 1, 3)

    # Tokenizer and Dataset
    tokenizer = tokenizer_class.from_pretrained(model_name)
    train_dataset = TweetDataset(train_df, tokenizer)
    val_dataset = TweetDataset(val_df, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Model
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device)

    # # Freeze all layers, unfreeze last n
    # for param in model.base_model.parameters():
    #     param.requires_grad = False
    # for param in model.base_model.encoder.layer[-num_layers:].parameters():
    #     param.requires_grad = True
    # for param in model.classifier.parameters():
    #     param.requires_grad = True

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    total_steps = len(train_loader) * epochs
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    # Initialize wandb
    wandb.init(
        project=f"{dataset_name}-{model_name.split('/')[-1]}",
        config={
            "learning_rate": learning_rate,
            "weight_decay": weight_decay,
            "patience": patience,
            "batch_size": batch_size,
            "num_layers": num_layers,
            "model_name": model_name,
        },
        name=f"{model_name.split('/')[-1]}-trial_{trial.number}"
    )

    # Train the model
    best_val_accuracy = train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        criterion=criterion,
        lr_scheduler=lr_scheduler,
        epochs=epochs,
        patience=patience,
        trial=trial
    )

    # Save best model to drive if specified
    if drive_save_path:
        model_save_path = os.path.join(drive_save_path, f"best_model_{model_name.split('/')[-1]}_trial_{trial.number}.pt")
        torch.save(model.state_dict(), model_save_path)

    wandb.finish()
    return best_val_accuracy

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# For BERT
study_bert = optuna.create_study(direction="maximize")
study_bert.optimize(lambda trial: objective(
    trial=trial,
    model_name="bert-base-uncased",
    tokenizer_class=BertTokenizer,
    dataset_name="covid-tweets_manual",
    train_df=train_subset_df,
    val_df=val_subset_df,
    num_labels=5,
    device=device,
    drive_save_path="/content/drive/MyDrive/Colab Notebooks/nlp_project/models/w_test_split/Manual_finetune"
), n_trials=5)
wandb.finish()

[I 2025-08-03 20:32:30,780] A new study created in memory with name: no-name-23922e9a-ee76-40de-bcf0-1b1d4b98546a
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform("weight_decay", 1e-6, 1e-4)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


0,1
epoch,▁▃▆█
train_acc,▄▁▅█
train_loss,█▆▃▁
val_acc,▁▄▄█
val_f1,▄▁▆█
val_loss,█▆▁▁

0,1
epoch,4.0
train_acc,0.2885
train_loss,1.55277
val_acc,0.296
val_f1,0.14344
val_loss,1.551


[I 2025-08-03 20:33:23,900] Trial 0 finished with value: 0.296 and parameters: {'learning_rate': 0.00030992481532662175, 'weight_decay': 1.9353633336995484e-05, 'patience': 3, 'batch_size': 64, 'num_layers': 1}. Best is trial 0 with value: 0.296.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform("weight_decay", 1e-6, 1e-4)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


0,1
epoch,▁▃▆█
train_acc,▁▇▆█
train_loss,█▃▂▁
val_acc,▁▆██
val_f1,▁▆██
val_loss,█▁▄▁

0,1
epoch,4.0
train_acc,0.26
train_loss,1.5915
val_acc,0.276
val_f1,0.08652
val_loss,1.57664


[I 2025-08-03 20:34:16,277] Trial 1 finished with value: 0.276 and parameters: {'learning_rate': 0.0004221042416068645, 'weight_decay': 1.2868976349595918e-05, 'patience': 4, 'batch_size': 64, 'num_layers': 3}. Best is trial 0 with value: 0.296.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform("weight_decay", 1e-6, 1e-4)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


0,1
epoch,▁▃▆█
train_acc,▃▆▁█
train_loss,█▄▃▁
val_acc,█▁█▆
val_f1,█▁█▇
val_loss,▁█▁▁

0,1
epoch,4.0
train_acc,0.2695
train_loss,1.58186
val_acc,0.244
val_f1,0.07846
val_loss,1.57825


[I 2025-08-03 20:35:16,228] Trial 2 finished with value: 0.276 and parameters: {'learning_rate': 0.0003373391885061881, 'weight_decay': 1.671626821164024e-06, 'patience': 4, 'batch_size': 32, 'num_layers': 1}. Best is trial 0 with value: 0.296.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform("weight_decay", 1e-6, 1e-4)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


0,1
epoch,▁▃▆█
train_acc,▁▄▆█
train_loss,█▆▃▁
val_acc,▁▆██
val_f1,▁▆██
val_loss,█▄▁▁

0,1
epoch,4.0
train_acc,0.691
train_loss,0.83864
val_acc,0.51
val_f1,0.51568
val_loss,1.10346


[I 2025-08-03 20:36:08,191] Trial 3 finished with value: 0.518 and parameters: {'learning_rate': 5.413131196034434e-05, 'weight_decay': 1.672754736606803e-05, 'patience': 2, 'batch_size': 64, 'num_layers': 1}. Best is trial 3 with value: 0.518.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform("weight_decay", 1e-6, 1e-4)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


0,1
epoch,▁▃▆█
train_acc,▁▄▆█
train_loss,█▆▃▁
val_acc,▁▄▇█
val_f1,▁▄▇█
val_loss,█▅▂▁

0,1
epoch,4.0
train_acc,0.6005
train_loss,1.0894
val_acc,0.472
val_f1,0.47247
val_loss,1.22546


[I 2025-08-03 20:37:09,363] Trial 4 finished with value: 0.472 and parameters: {'learning_rate': 1.7786307454975333e-05, 'weight_decay': 5.148651603505544e-06, 'patience': 3, 'batch_size': 32, 'num_layers': 1}. Best is trial 3 with value: 0.518.


In [15]:
# For RoBERTa
study_roberta = optuna.create_study(direction="maximize")
study_roberta.optimize(lambda trial: objective(
    trial=trial,
    model_name="roberta-base",
    tokenizer_class=RobertaTokenizer,
    dataset_name="covid-tweets_manual",
    train_df=train_subset_df,
    val_df=val_subset_df,
    num_labels=5,
    device=device,
    drive_save_path="/content/drive/MyDrive/Colab Notebooks/nlp_project/models/w_test_split/Manual_finetune"
), n_trials=5)
wandb.finish()

[I 2025-08-03 20:37:22,217] A new study created in memory with name: no-name-663f9a88-1729-4c2e-9fdf-912c38e4ff18
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform("weight_decay", 1e-6, 1e-4)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


0,1
epoch,▁▃▆█
train_acc,▅▆█▁
train_loss,█▄▁▁
val_acc,███▁
val_f1,███▁
val_loss,▄█▁▁

0,1
epoch,4.0
train_acc,0.243
train_loss,1.58335
val_acc,0.244
val_f1,0.07846
val_loss,1.57607


[I 2025-08-03 20:38:29,339] Trial 0 finished with value: 0.276 and parameters: {'learning_rate': 0.00015290659826012226, 'weight_decay': 1.4692529830996639e-06, 'patience': 3, 'batch_size': 16, 'num_layers': 2}. Best is trial 0 with value: 0.276.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform("weight_decay", 1e-6, 1e-4)
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


0,1
epoch,▁▃▆█
train_acc,▁▂▄█
train_loss,▆█▇▁
val_acc,▁▁▁█
val_f1,▁▁▁█
val_loss,▇█▇▁

0,1
epoch,4.0
train_acc,0.2815
train_loss,1.56496
val_acc,0.332
val_f1,0.20868
val_loss,1.52836


[I 2025-08-03 20:39:44,193] Trial 1 finished with value: 0.332 and parameters: {'learning_rate': 6.870339268983084e-05, 'weight_decay': 1.174325706714089e-05, 'patience': 4, 'batch_size': 16, 'num_layers': 2}. Best is trial 1 with value: 0.332.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform("weight_decay", 1e-6, 1e-4)
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


0,1
epoch,▁▃▆█
train_acc,▁▄▇█
train_loss,█▅▂▁
val_acc,▁▆▇█
val_f1,▁▆██
val_loss,█▂▁▁

0,1
epoch,4.0
train_acc,0.667
train_loss,0.87149
val_acc,0.496
val_f1,0.50776
val_loss,1.18014


[I 2025-08-03 20:40:50,064] Trial 2 finished with value: 0.496 and parameters: {'learning_rate': 1.7397491184875135e-05, 'weight_decay': 3.989353560564995e-05, 'patience': 3, 'batch_size': 16, 'num_layers': 2}. Best is trial 2 with value: 0.496.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform("weight_decay", 1e-6, 1e-4)
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


0,1
epoch,▁▃▆█
train_acc,▁▄▆█
train_loss,█▅▃▁
val_acc,▁▆██
val_f1,▁▇██
val_loss,█▂▁▁

0,1
epoch,4.0
train_acc,0.717
train_loss,0.76763
val_acc,0.518
val_f1,0.53443
val_loss,1.18457


[I 2025-08-03 20:41:51,612] Trial 3 finished with value: 0.518 and parameters: {'learning_rate': 4.337189034515284e-05, 'weight_decay': 3.17823866505434e-05, 'patience': 2, 'batch_size': 32, 'num_layers': 2}. Best is trial 3 with value: 0.518.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-3)
  weight_decay = trial.suggest_loguniform("weight_decay", 1e-6, 1e-4)
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


0,1
epoch,▁▃▆█
train_acc,▁▃█▇
train_loss,█▅▁▁
val_acc,▁█▁█
val_f1,▁█▁█
val_loss,█▅▇▁

0,1
epoch,4.0
train_acc,0.272
train_loss,1.58394
val_acc,0.276
val_f1,0.08652
val_loss,1.57512


[I 2025-08-03 20:42:55,753] Trial 4 finished with value: 0.276 and parameters: {'learning_rate': 0.00034515764427762015, 'weight_decay': 2.2417691241629243e-05, 'patience': 3, 'batch_size': 16, 'num_layers': 1}. Best is trial 3 with value: 0.518.


In [16]:
best_bert_trial = study_bert.best_trial
best_roberta_trial = study_roberta.best_trial

In [21]:
def finetune_on_full_data(model_name, tokenizer_class, best_trial, full_train_df, full_val_df, num_labels, device, save_path):
    epochs = 10
    tokenizer = tokenizer_class.from_pretrained(model_name)

    train_dataset = TweetDataset(full_train_df, tokenizer)
    val_dataset = TweetDataset(full_val_df, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=best_trial.params["batch_size"], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=best_trial.params["batch_size"], shuffle=False)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device)

    # # Freeze all layers and unfreeze last N
    # for param in model.base_model.parameters():
    #     param.requires_grad = False
    # for param in model.base_model.encoder.layer[-best_trial.params["num_layers"]:].parameters():
    #     param.requires_grad = True
    # for param in model.classifier.parameters():
    #     param.requires_grad = True

    optimizer = optim.Adam(
        model.parameters(),
        lr=best_trial.params["learning_rate"],
        weight_decay=best_trial.params["weight_decay"]
    )

    total_steps = len(train_loader) * epochs
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    criterion = nn.CrossEntropyLoss()

    dummy_trial = type('', (), {})()   # fake trial object just for saving
    dummy_trial.number = 999           # so the save path is unique

    acc = train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        criterion=criterion,
        lr_scheduler=lr_scheduler,
        epochs=epochs,
        patience=best_trial.params["patience"],
        trial=dummy_trial
    )

    final_model_path = os.path.join(save_path, f"final_model_{model_name.split('/')[-1]}.pt")
    torch.save(model.state_dict(), final_model_path)
    return final_model_path

In [22]:
best_params_bert = best_bert_trial.params
run_name_bert = f"bert_final_stratify_{is_preprocessed}-lr{best_params_bert['learning_rate']:.1e}-bs{best_params_bert['batch_size']}"
wandb.init(project="covid-tweets_manual-bert-base-uncased", name=run_name_bert, reinit=True)

final_bert_path = finetune_on_full_data(
    model_name="bert-base-uncased",
    tokenizer_class=BertTokenizer,
    best_trial=best_bert_trial,
    full_train_df=train_df,
    full_val_df=val_df,
    num_labels=5,
    device=device,
    save_path="/content/drive/MyDrive/Colab Notebooks/nlp_project/models/w_test_split/Manual_finetune"
)
wandb.finish()


best_params_roberta = best_roberta_trial.params
run_name_roberta = f"roberta_final_stratify_{is_preprocessed}-lr{best_params_roberta['learning_rate']:.1e}-bs{best_params_roberta['batch_size']}"
wandb.init(project="covid-tweets_manual-roberta-base", name=run_name_roberta, reinit=True)

final_roberta_path = finetune_on_full_data(
    model_name="roberta-base",
    tokenizer_class=RobertaTokenizer,
    best_trial=best_roberta_trial,
    full_train_df=train_df,
    full_val_df=val_df,
    num_labels=5,
    device=device,
    save_path="/content/drive/MyDrive/Colab Notebooks/nlp_project/models/w_test_split/Manual_finetune"
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


0,1
epoch,▁▂▃▅▆▇█
train_acc,▁▅▆▇▇██
train_loss,█▄▃▂▂▁▁
val_acc,▁▇▇███▆
val_f1,▁▇▇███▆
val_loss,▄▁▂▂▄▅█

0,1
epoch,7.0
train_acc,0.97664
train_loss,0.07728
val_acc,0.84537
val_f1,0.84918
val_loss,0.64971


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


In [23]:
def evaluate_model(model_name, tokenizer_class, checkpoint_path, test_df, num_labels, device, batch_size=32):
    # Load tokenizer and dataset
    tokenizer = tokenizer_class.from_pretrained(model_name)
    test_dataset = TweetDataset(test_df, tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Load model and weights
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    model.load_state_dict(torch.load(checkpoint_path))
    model.to(device)
    model.eval()

    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = logits.argmax(dim=1)

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    metrics = {
        "Accuracy": accuracy_score(all_labels, all_preds),
        "Precision": precision_score(all_labels, all_preds, average='macro'),
        "Recall": recall_score(all_labels, all_preds, average='macro'),
        "F1 Score": f1_score(all_labels, all_preds, average='macro')
    }

    return metrics

In [24]:
bert_metrics = evaluate_model(
    model_name="bert-base-uncased",
    tokenizer_class=BertTokenizer,
    checkpoint_path=final_bert_path,
    test_df=test_df,
    num_labels=5,
    device=device
)

roberta_metrics = evaluate_model(
    model_name="roberta-base",
    tokenizer_class=RobertaTokenizer,
    checkpoint_path=final_roberta_path,
    test_df=test_df,
    num_labels=5,
    device=device
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


In [25]:
bert_metrics

{'Accuracy': 0.8483096085409253,
 'Precision': 0.8527875341351443,
 'Recall': 0.854444148318707,
 'F1 Score': 0.8520909120276196}

In [26]:
roberta_metrics

{'Accuracy': 0.8130189798339265,
 'Precision': 0.8112250727875885,
 'Recall': 0.8266234042749954,
 'F1 Score': 0.8174109541593682}