<a href="https://colab.research.google.com/github/Hillascher5/nlp-tweets-sentiment-analysis/blob/main/Full_fine_tune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# # Needed for Google Colab
# !pip install --quiet evaluate transformers optuna datasets nltk scikit-learn
# !pip install numpy==1.26.4

In [None]:
%env CUDA_LAUNCH_BLOCKING=1

from wordcloud import WordCloud, STOPWORDS
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
from tqdm import tqdm
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer, get_scheduler, BertTokenizer, RobertaTokenizer, RobertaForSequenceClassification
from datasets import Dataset
from torch.utils.data import DataLoader, TensorDataset, Dataset
from optuna.pruners import MedianPruner
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import os
import re
import random
import string
import time
import glob
import nltk
import evaluate
import transformers
import torch
import optuna
import wandb
wandb.login()
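# SECURITY: W&B API keys must not be stored in the notebook. Supply the key through the
# WANDB_API_KEY environment variable or the interactive `wandb.login()` prompt, and revoke
# any key that was previously committed here.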


# Number of rows in the stratified training subset used for the Optuna search;
# the value is also embedded in the W&B project name below.
num_train_samples = 5000
os.environ.update({
    "WANDB_PROJECT": f"tweet-sentiment-classification_split_to_test_manual_code_{num_train_samples}_samples_optuna",
    "WANDB_INIT_TIMEOUT": "180",
})

# Download the NLTK resources listed here, one after the other.
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

In [None]:
def set_seed_all(seed=42):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    When CUDA is available the seed is additionally applied to all CUDA
    devices.

    Args:
        seed: Integer seed shared by every random number generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


set_seed_all(42)

In [None]:
# Read the train and test CSV files from the mounted Drive folder (latin1-encoded).
train_csv_path = '/content/drive/MyDrive/Colab Notebooks/nlp_project/Data/Corona_NLP_train.csv'
test_csv_path = '/content/drive/MyDrive/Colab Notebooks/nlp_project/Data/Corona_NLP_test.csv'
df_train = pd.read_csv(train_csv_path, encoding='latin1')
df_test = pd.read_csv(test_csv_path, encoding='latin1')

In [None]:
# Concatenate both CSVs, shuffle all rows with a fixed seed and renumber the
# index, so the stratified splits below are drawn from one pooled dataset.
df_full = (
    pd.concat([df_train, df_test], ignore_index=True)
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)

### Pre-processing the Data

The tweets were cleaned in several ways and also examined as raw data.

In [None]:
# # Try without pre-processing
# is_preprocessed = "no_preprocess"
# df_full["clean_text"] = df_full["OriginalTweet"]

In [None]:
# Try minimal pre-processing
def light_preprocess(text):
    """Return `text` with leading and trailing whitespace removed."""
    trimmed = text.strip()
    return trimmed

# Store the trimmed tweets in `clean_text` and record which preprocessing
# variant produced them (the tag is reused in W&B run names).
df_full["clean_text"] = df_full["OriginalTweet"].map(light_preprocess)
is_preprocessed = "minimal_preprocess"

**Encode Sentiment Labels**

Map each unique sentiment label to a numeric ID for model compatibility, and apply this mapping to the full (merged) dataset before it is split into training, validation, and test sets.

In [None]:
# Give every sentiment string an integer ID, assigned in sorted label order,
# and store the encoded value in a new `label` column.
unique_labels = sorted(df_full["Sentiment"].unique())
label2id = dict(zip(unique_labels, range(len(unique_labels))))
df_full["label"] = df_full["Sentiment"].map(label2id)

In [None]:
# Stratified 70% / 15% / 15% split into train, validation and test sets.
# Step 1: hold out 15% of all rows as the test set.
train_val_df, test_df = train_test_split(
    df_full,
    test_size=0.15,
    stratify=df_full["label"],
    random_state=42,
)
# Step 2: 0.1765 of the remaining 85% is roughly 15% of all rows.
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.1765,
    stratify=train_val_df["label"],
    random_state=42,
)

# Report how many rows ended up in each split.
print("Train size:", train_df.shape[0])
print("Val size:", val_df.shape[0])
print("Test size:", test_df.shape[0])

**Use Small Subsets for Quick Evaluation**

Draw stratified random subsets of the training and validation splits. The same subsets are used for the hyperparameter search of every model (BERT, RoBERTa, and DeBERTa), which allows faster experimentation during model development.

In [None]:
def _stratified_subset(frame, n_rows):
    """Return `n_rows` rows of `frame` (text and label columns only), stratified by label."""
    subset, _ = train_test_split(
        frame[["clean_text", "label"]],
        train_size=n_rows,
        stratify=frame["label"],
        random_state=42
    )
    return subset


# Small, label-balanced subsets used by the hyperparameter search.
train_subset_df = _stratified_subset(train_df, num_train_samples)
val_subset_df = _stratified_subset(val_df, 500)

### Manual Fine-Tuning

Manually fine-tune transformer models using a custom `TweetDataset` class, a hand-written training loop, and an Optuna-driven hyperparameter search. This setup gives control over optimization, early stopping, and model saving while logging performance with Weights & Biases.

In [None]:
# Dataset class
class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per item on access.

    Args:
        df: DataFrame with a `clean_text` column (tweet text) and a `label`
            column (integer class ID).
        tokenizer: Callable tokenizer; it is invoked with the text plus
            `padding`, `truncation`, `max_length` and `return_tensors`
            keyword arguments and must return `input_ids` and
            `attention_mask` tensors with a leading batch dimension of 1.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, df, tokenizer, max_length=256):
        self.texts = df['clean_text'].tolist()
        self.labels = df['label'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of tweets."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized tweet and its label as a dict of tensors."""
        encoding = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        # squeeze(0) drops only the batch dimension added by return_tensors='pt';
        # a bare squeeze() would also collapse the sequence dimension when
        # max_length == 1 and hand the DataLoader 0-d tensors.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Training loop
def train_model(model, train_loader, val_loader, optimizer, criterion, lr_scheduler, epochs, patience, trial):
    """Train `model` and keep the weights of the best validation macro-F1 epoch.

    Every epoch makes one pass over `train_loader` (gradient norm clipped to
    1.0, scheduler stepped after each batch), evaluates on `val_loader` and
    logs the epoch metrics with `wandb.log`.

    The weights of the best epoch are snapshotted and loaded back into `model`
    before returning, so a caller that saves `model.state_dict()` afterwards
    persists the best epoch rather than the last one.

    Args:
        model: Model whose forward output exposes `.logits`.
        train_loader: Iterable of batches with `input_ids`, `attention_mask`
            and `labels` tensors.
        val_loader: Same batch format, used for validation.
        optimizer: Optimizer over the model parameters.
        criterion: Loss called as `criterion(logits, labels)`.
        lr_scheduler: Scheduler stepped once per training batch.
        epochs: Maximum number of epochs.
        patience: Training stops once more than `patience` epochs have passed
            since the best epoch.
        trial: Not used by the loop; kept so existing callers keep working.

    Returns:
        Best validation macro-F1 as a float (-1.0 if no epoch was run).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    best_f1 = -1.0
    best_epoch = 0
    best_state = None

    for epoch in range(1, epochs + 1):
        model.train()
        total_loss, total_correct = 0, 0
        total_samples = 0

        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()

            # Weight the batch loss by batch size so the epoch average is per sample.
            total_loss += loss.item() * input_ids.size(0)
            total_correct += (outputs.logits.argmax(dim=1) == labels).sum().item()
            total_samples += input_ids.size(0)

        train_acc = total_correct / total_samples
        train_loss = total_loss / total_samples

        # Validation
        model.eval()
        val_loss, val_correct, val_total = 0, 0, 0
        all_preds, all_labels = [], []
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                loss = criterion(outputs.logits, labels)
                preds = outputs.logits.argmax(dim=1)

                val_loss += loss.item() * input_ids.size(0)
                val_correct += (preds == labels).sum().item()
                val_total += input_ids.size(0)

                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        val_acc = val_correct / val_total
        val_f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)

        wandb.log({
            "epoch": epoch,
            "train_loss": train_loss,
            "train_acc": train_acc,
            "val_loss": val_loss / val_total,
            "val_acc": val_acc,
            "val_f1": val_f1
        })

        # Track the best epoch by macro-F1.
        if val_f1 > best_f1:
            best_f1 = val_f1
            best_epoch = epoch
            # state_dict() returns references to the live parameters, so the
            # tensors are cloned (on the CPU) to freeze this epoch's weights.
            best_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }
        elif epoch - best_epoch > patience:
            break

    # Hand the best-epoch weights back to the caller instead of the last ones.
    if best_state is not None:
        model.load_state_dict(best_state)
    return float(best_f1)

# Objective for Optuna
def objective(trial, model_name, tokenizer_class, dataset_name, train_df, val_df, num_labels, device, drive_save_path=None):
    """Optuna objective: fine-tune `model_name` once and return validation macro-F1.

    Samples the hyperparameters from `trial`, builds the datasets, loaders,
    model, optimizer and linear warmup scheduler (10% warmup steps), then
    trains inside a dedicated Weights & Biases run.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        model_name: Hugging Face checkpoint name.
        tokenizer_class: Object exposing `from_pretrained(model_name)`.
        dataset_name: Prefix of the W&B project name.
        train_df: Training frame with `clean_text` and `label` columns.
        val_df: Validation frame with the same columns.
        num_labels: Number of target classes.
        device: Device the freshly loaded model is moved to.
        drive_save_path: Optional directory; when given, the trial's weights
            are written there (the directory is created if missing).

    Returns:
        The value returned by `train_model` (best validation macro-F1).
    """
    # Hyperparameter suggestions. suggest_float(..., log=True) is the
    # replacement for the deprecated suggest_loguniform.
    epochs = trial.suggest_int("epochs", 2, 5)
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True)
    patience = trial.suggest_int("patience", 2, 4)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])
    # NOTE(review): `num_layers` is sampled and logged, but nothing in this
    # function consumes it; kept so the recorded trial parameters stay the same.
    num_layers = trial.suggest_int("num_layers", 1, 3)

    # Tokenizer and Dataset
    tokenizer = tokenizer_class.from_pretrained(model_name)
    train_dataset = TweetDataset(train_df, tokenizer, max_length=256)
    val_dataset = TweetDataset(val_df, tokenizer, max_length=256)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Model
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    total_steps = len(train_loader) * epochs
    warmup_steps = int(0.1 * total_steps)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    # Initialize wandb
    wandb.init(
        project=f"{dataset_name}-{model_name.split('/')[-1]}",
        config={
            "epochs": epochs,
            "learning_rate": learning_rate,
            "weight_decay": weight_decay,
            "patience": patience,
            "batch_size": batch_size,
            "num_layers": num_layers,
            "model_name": model_name,
        },
        name=f"{model_name}-trial_{trial.number}"
    )

    # try/finally closes the W&B run even when training or saving raises.
    try:
        best_val_f1 = train_model(
            model=model,
            train_loader=train_loader,
            val_loader=val_loader,
            optimizer=optimizer,
            criterion=criterion,
            lr_scheduler=lr_scheduler,
            epochs=epochs,
            patience=patience,
            trial=trial
        )

        # Persist the weights the model holds once train_model has returned.
        if drive_save_path:
            os.makedirs(drive_save_path, exist_ok=True)
            short_name = model_name.split('/')[-1]  # drop any organisation prefix
            model_save_path = os.path.join(
                drive_save_path,
                f"best_model_{short_name}_trial_{trial.number}_{num_train_samples}_samples_opt.pt"
            )
            torch.save(model.state_dict(), model_save_path)
    finally:
        wandb.finish()

    return best_val_f1

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Drive folder handed to the Optuna trials as `drive_save_path`; the study CSV exports are written here too.
manual_dir = f"/content/drive/MyDrive/Colab Notebooks/nlp_project/optuna/Manual_finetune_min_preproc_{num_train_samples}_samples_opt"

In [None]:
# Fine-tune BERT: 5 Optuna trials on the small subsets, maximizing validation macro-F1.
study_bert = optuna.create_study(direction="maximize")
study_bert.optimize(lambda trial: objective(
    trial=trial,
    model_name="bert-base-uncased",
    tokenizer_class=BertTokenizer,
    dataset_name="covid_tweets_manual_trials",
    train_df=train_subset_df,
    val_df=val_subset_df,
    num_labels=len(unique_labels),  # derived from the label mapping instead of a hard-coded 5
    device=device,
    drive_save_path=manual_dir
), n_trials=5)
wandb.finish()

In [None]:
# Fine-tune RoBERTa: 5 Optuna trials on the small subsets, maximizing validation macro-F1.
study_roberta = optuna.create_study(direction="maximize")
study_roberta.optimize(lambda trial: objective(
    trial=trial,
    model_name="roberta-base",
    tokenizer_class=RobertaTokenizer,
    dataset_name="covid_tweets_manual_trials",
    train_df=train_subset_df,
    val_df=val_subset_df,
    num_labels=len(unique_labels),  # derived from the label mapping instead of a hard-coded 5
    device=device,
    drive_save_path=manual_dir
), n_trials=5)
wandb.finish()

In [None]:
# DeBERTa checkpoint name and number of target classes
deberta_model_name = "microsoft/deberta-v3-base"
n_labels = len(unique_labels)

# Tokenizer instance, kept so any cell that references `deberta_tokenizer` still works.
# No model is loaded here: `objective` builds a fresh model for every trial.
deberta_tokenizer = AutoTokenizer.from_pretrained(deberta_model_name)

# Fine-tune DeBERTa: 5 Optuna trials on the small subsets, maximizing validation macro-F1.
study_deberta = optuna.create_study(direction="maximize")
study_deberta.optimize(lambda trial: objective(
    trial=trial,
    model_name=deberta_model_name,
    tokenizer_class=AutoTokenizer,  # pass a class, as the parameter name asks for
    dataset_name="covid_tweets_manual_trials",
    train_df=train_subset_df,
    val_df=val_subset_df,
    num_labels=n_labels,
    device=device,
    drive_save_path=manual_dir
), n_trials=5)
wandb.finish()

In [None]:
# Export the full trial history of every study as CSV.
save_path = manual_dir
# Make sure the target folder exists so the exports cannot fail on a missing directory.
os.makedirs(save_path, exist_ok=True)

for study_tag, study in (
    ("bert", study_bert),
    ("roberta", study_roberta),
    ("deberta", study_deberta),
):
    study.trials_dataframe().to_csv(
        os.path.join(save_path, f"study_{study_tag}_results_{num_train_samples}_samples_opt.csv"),
        index=False,
    )

In [None]:
# Pick the best trial of each study; their parameters drive the final fine-tuning runs.
best_bert_trial, best_roberta_trial, best_deberta_trial = (
    study.best_trial for study in (study_bert, study_roberta, study_deberta)
)

### Fine-Tuning on Full Data
Retrain each model on the full training split using the best hyperparameters found by Optuna, with the full validation split used for early stopping. The function rebuilds the dataloaders, initializes the model, optimizer, and learning-rate scheduler, and finally saves the fine-tuned model weights.

In [None]:
def finetune_on_full_data(model_name, tokenizer_class, best_trial, full_train_df, full_val_df, num_labels, device, save_path):
    """Fine-tune `model_name` with the hyperparameters of `best_trial` and save the weights.

    Args:
        model_name: Hugging Face checkpoint name.
        tokenizer_class: Object exposing `from_pretrained(model_name)`.
        best_trial: Object whose `params` dict holds `epochs`, `batch_size`,
            `learning_rate`, `weight_decay` and `patience`.
        full_train_df: Training frame with `clean_text` and `label` columns.
        full_val_df: Validation frame with the same columns.
        num_labels: Number of target classes.
        device: Device the freshly loaded model is moved to.
        save_path: Directory for the checkpoint (created if missing).

    Returns:
        Path of the saved `state_dict` file.
    """
    from types import SimpleNamespace

    params = best_trial.params
    epochs = params["epochs"]
    batch_size = params["batch_size"]
    tokenizer = tokenizer_class.from_pretrained(model_name)

    train_dataset = TweetDataset(full_train_df, tokenizer, max_length=256)
    val_dataset = TweetDataset(full_val_df, tokenizer, max_length=256)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device)

    optimizer = optim.AdamW(
        model.parameters(),
        lr=params["learning_rate"],
        weight_decay=params["weight_decay"]
    )

    # Linear schedule with 10% warmup, matching the setup used during the search.
    total_steps = len(train_loader) * epochs
    warmup_steps = int(0.1 * total_steps)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

    # train_model expects a trial-like object; this placeholder only carries a number.
    dummy_trial = SimpleNamespace(number=999)

    # The returned best validation F1 is not needed here; only the trained model is.
    train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        criterion=criterion,
        lr_scheduler=lr_scheduler,
        epochs=epochs,
        patience=params["patience"],
        trial=dummy_trial
    )

    # Create the target folder first so a long training run is not lost to a missing directory.
    os.makedirs(save_path, exist_ok=True)
    short_name = model_name.split('/')[-1]  # drop any organisation prefix
    final_model_path = os.path.join(save_path, f"final_model_{short_name}_{num_train_samples}_samples_opt.pt")
    torch.save(model.state_dict(), final_model_path)
    return final_model_path

In [None]:
# Drive folder that receives the final fine-tuned checkpoints and the Hugging Face exports.
manual_dir_best = f"/content/drive/MyDrive/Colab Notebooks/nlp_project/models/w_test_split/Manual_finetune_min_preproc_{num_train_samples}_samples_opt"

In [None]:
def _final_run_name(prefix, params):
    """Build the W&B run name of a final fine-tuning run from its hyperparameters."""
    return f"{prefix}_final_stratify_{is_preprocessed}-ep{params['epochs']}-lr{params['learning_rate']:.1e}-bs{params['batch_size']}"


def _run_final_finetune(model_name, tokenizer_class, best_trial, project, run_name):
    """Fine-tune one model inside its own W&B run and return the checkpoint path."""
    wandb.init(project=project, name=run_name, reinit=True)
    # try/finally closes the W&B run even if fine-tuning raises.
    try:
        return finetune_on_full_data(
            model_name=model_name,
            tokenizer_class=tokenizer_class,
            best_trial=best_trial,
            full_train_df=train_df,
            full_val_df=val_df,
            num_labels=len(unique_labels),  # derived from the label mapping instead of a hard-coded 5
            device=device,
            save_path=manual_dir_best
        )
    finally:
        wandb.finish()


# BERT
best_params_bert = best_bert_trial.params
run_name_bert = _final_run_name("bert", best_params_bert)
final_bert_path = _run_final_finetune(
    model_name="bert-base-uncased",
    tokenizer_class=BertTokenizer,
    best_trial=best_bert_trial,
    project=f"covid_tweets_manual-bert-base-uncased_{num_train_samples}_samples_opt",
    run_name=run_name_bert,
)

# RoBERTa
best_params_roberta = best_roberta_trial.params
run_name_roberta = _final_run_name("roberta", best_params_roberta)
final_roberta_path = _run_final_finetune(
    model_name="roberta-base",
    tokenizer_class=RobertaTokenizer,
    best_trial=best_roberta_trial,
    project=f"covid_tweets_manual-roberta-base_{num_train_samples}_samples_opt",
    run_name=run_name_roberta,
)

# DeBERTa
best_params_deberta = best_deberta_trial.params
run_name_deberta = _final_run_name("deberta", best_params_deberta)
final_deberta_path = _run_final_finetune(
    model_name="microsoft/deberta-v3-base",
    tokenizer_class=AutoTokenizer,  # pass a class, as the parameter name asks for
    best_trial=best_deberta_trial,
    project=f"covid_tweets_manual-deberta-base_{num_train_samples}_samples_opt",
    run_name=run_name_deberta,
)

### Model evaluation

In [None]:
def evaluate_model(model_name, tokenizer_class, checkpoint_path, test_df, num_labels, device, batch_size=32):
    """Score a saved checkpoint on `test_df`.

    Args:
        model_name: Hugging Face checkpoint name used to build the architecture.
        tokenizer_class: Object exposing `from_pretrained(model_name)`.
        checkpoint_path: Path of a `state_dict` file written with `torch.save`.
        test_df: Frame with `clean_text` and `label` columns.
        num_labels: Number of target classes.
        device: Device used for inference.
        batch_size: Evaluation batch size.

    Returns:
        Dict with `Accuracy` and macro-averaged `Precision`, `Recall` and
        `F1 Score`.
    """
    # Load tokenizer and dataset
    tokenizer = tokenizer_class.from_pretrained(model_name)
    test_dataset = TweetDataset(test_df, tokenizer, max_length=256)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Load model and weights. map_location lets a checkpoint saved from a GPU
    # be restored on whichever device is used now.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.to(device)
    model.eval()

    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits
            preds = logits.argmax(dim=1)

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    # zero_division=0 matches the validation metric in the training loop and
    # avoids warnings when a class is never predicted.
    return {
        "Accuracy": accuracy_score(all_labels, all_preds),
        "Precision": precision_score(all_labels, all_preds, average='macro', zero_division=0),
        "Recall": recall_score(all_labels, all_preds, average='macro', zero_division=0),
        "F1 Score": f1_score(all_labels, all_preds, average='macro', zero_division=0)
    }

In [None]:
# Evaluate every final checkpoint on the held-out test split.
# The class count is derived from the label mapping instead of a hard-coded 5.
bert_metrics = evaluate_model(
    model_name="bert-base-uncased",
    tokenizer_class=BertTokenizer,
    checkpoint_path=final_bert_path,
    test_df=test_df,
    num_labels=len(unique_labels),
    device=device
)

roberta_metrics = evaluate_model(
    model_name="roberta-base",
    tokenizer_class=RobertaTokenizer,
    checkpoint_path=final_roberta_path,
    test_df=test_df,
    num_labels=len(unique_labels),
    device=device
)

deberta_metrics = evaluate_model(
    model_name="microsoft/deberta-v3-base",
    tokenizer_class=AutoTokenizer,  # pass a class, as the parameter name asks for
    checkpoint_path=final_deberta_path,
    test_df=test_df,
    num_labels=len(unique_labels),
    device=device
)

In [None]:
# Display the BERT test-set metrics as the cell output.
bert_metrics

In [None]:
# Display the RoBERTa test-set metrics as the cell output.
roberta_metrics

In [None]:
# Display the DeBERTa test-set metrics as the cell output.
deberta_metrics

Save the manually fine-tuned models in Hugging Face format

In [None]:
# Locations of the state_dict files written by the final fine-tuning runs
bert_path = os.path.join(manual_dir_best, f"final_model_bert-base-uncased_{num_train_samples}_samples_opt.pt")
roberta_path = os.path.join(manual_dir_best, f"final_model_roberta-base_{num_train_samples}_samples_opt.pt")
deberta_path = os.path.join(manual_dir_best, f"final_model_deberta-v3-base_{num_train_samples}_samples_opt.pt")

# Inverse of label2id: numeric ID -> sentiment string
id2label = dict((idx, name) for name, idx in label2id.items())


def _load_manual_checkpoint(hf_name, state_path):
    """Rebuild `hf_name` with the label maps, load the saved weights and return (model, tokenizer)."""
    net = AutoModelForSequenceClassification.from_pretrained(
        hf_name, num_labels=len(label2id), label2id=label2id, id2label=id2label).to(device)
    state = torch.load(state_path, map_location=device)
    net.load_state_dict(state)
    net.eval()
    return net, AutoTokenizer.from_pretrained(hf_name)


bert_manual, bert_tok_manual = _load_manual_checkpoint("bert-base-uncased", bert_path)
roberta_manual, roberta_tok_manual = _load_manual_checkpoint("roberta-base", roberta_path)
deberta_manual, deberta_tok_manual = _load_manual_checkpoint("microsoft/deberta-v3-base", deberta_path)

In [None]:
# Output folders, one per model, for the save_pretrained exports
bert_manual_out = os.path.join(manual_dir_best, f"manual2hf_bert-base-uncased_{num_train_samples}_samples_opt")
roberta_manual_out = os.path.join(manual_dir_best, f"manual2hf_roberta-base_{num_train_samples}_samples_opt")
deberta_manual_out = os.path.join(manual_dir_best, f"manual2hf_deberta-base_{num_train_samples}_samples_opt")

# Write each model and then its tokenizer into the model's folder (BERT, RoBERTa, DeBERTa).
for hf_model, hf_tokenizer, export_dir in (
    (bert_manual, bert_tok_manual, bert_manual_out),
    (roberta_manual, roberta_tok_manual, roberta_manual_out),
    (deberta_manual, deberta_tok_manual, deberta_manual_out),
):
    hf_model.save_pretrained(export_dir)
    hf_tokenizer.save_pretrained(export_dir)

In [None]:
# from IPython.display import Javascript

# def disconnect_runtime():
#     display(Javascript('google.colab.kernel.disconnect()'))

# disconnect_runtime()