## **NLP - Text Classification Project**
Group H - August 2025

Classification of tweets from Twitter that have been manually tagged for sentiment analysis.

In [24]:
# # Needed for Google Colab
# !pip install --quiet evaluate transformers optuna datasets nltk scikit-learn
# !pip install numpy==1.26.4

In [25]:
%env CUDA_LAUNCH_BLOCKING=1

from wordcloud import WordCloud, STOPWORDS
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
from tqdm import tqdm
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from nltk.stem import WordNetLemmatizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from datasets import Dataset
from optuna.pruners import MedianPruner
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os
import re
import string
import time
import glob
import nltk
import evaluate
import transformers
import optuna
import wandb
# Log in to Weights & Biases up front so later Trainer runs can report to it.
wandb.login()

# W&B configuration passed through environment variables:
# the project that runs are grouped under, and the init timeout
# (presumably in seconds — TODO confirm against the W&B docs).
os.environ["WANDB_PROJECT"] = "tweet-sentiment-classification"
os.environ["WANDB_INIT_TIMEOUT"] = "180"

# NLTK resources. In the cells visible here they are only referenced by the
# commented-out aggressive preprocessing variant (stopwords + WordNet lemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

env: CUDA_LAUNCH_BLOCKING=1
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [26]:
# Read the two tweet CSVs from the mounted Drive folder; both are decoded as latin1.
# NOTE(review): the file named *_test.csv is what the rest of the notebook uses
# as the validation split (df_val).
_CSV_PATHS = {
    'train': '/content/drive/MyDrive/Colab Notebooks/nlp_project/Data/Corona_NLP_train.csv',
    'val': '/content/drive/MyDrive/Colab Notebooks/nlp_project/Data/Corona_NLP_test.csv',
}
df_train, df_val = (
    pd.read_csv(_CSV_PATHS[_split], encoding='latin1') for _split in ('train', 'val')
)

### Pre-processing the Data

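Three preprocessing variants were tried: aggressive cleaning, whitespace normalization only, and no preprocessing. The aggressive variant lowercases the tweets (to reduce redundancy), removes URLs, stopwords, punctuation, numbers, and short words, and applies lemmatization to reduce words to their base form (e.g. running → run); this is intended to reduce noise and improve model performance. <br>The first two variants are kept below as commented-out cells; the active cell passes the original tweets to the models unchanged.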


In [27]:
# # Try aggressive pre-processing
# lemmatizer = WordNetLemmatizer()

# def preprocess_text(text):
#     text = str(text).lower()
#     text = re.sub(r"http\S+|www.\S+", "", text)              # Remove URLs
#     text = re.sub(r"[^a-z\s]", "", text)                     # Remove punctuation & numbers
#     tokens = text.split()
#     tokens = [
#         lemmatizer.lemmatize(word)
#         for word in tokens
#         if word not in stop_words
#         and word not in domain_stopwords
#         and len(word) > 2
#     ]
#     return " ".join(tokens)

# df_train["clean_text"] = df_train["OriginalTweet"].apply(preprocess_text)
# df_val["clean_text"] = df_val["OriginalTweet"].apply(preprocess_text)

In [28]:
# # Try minimal pre-processing
# def preprocess_text_for_transformers(text):
#     return " ".join(str(text).split())  # Normalize whitespace

# df_train["clean_text"] = df_train["OriginalTweet"].apply(preprocess_text_for_transformers)
# df_val["clean_text"] = df_val["OriginalTweet"].apply(preprocess_text_for_transformers)

In [29]:
# No preprocessing: the model input column is simply the original tweet text.
for _frame in (df_train, df_val):
    _frame["clean_text"] = _frame["OriginalTweet"]

## Fine-Tuning Pretrained Language Models

Apply NLP techniques using transfer learning on our tweets dataset. Specifically, fine-tuning two pretrained transformer-based models from the Hugging Face library — BERT and RoBERTa — on our sentiment classification task. These models will be trained using both standard PyTorch and the Hugging Face API. Model performance will be monitored and tuned using hyperparameter optimization (Optuna) and experiment tracking (Weights & Biases).

**Load Pretrained Models**

Initialize tokenizers and models for BERT and RoBERTa, both widely used transformer architectures for text classification. The classification head is configured based on the number of sentiment labels.

In [30]:
# Hugging Face checkpoints to fine-tune
bert_model_name = "bert-base-uncased"
roberta_model_name = "roberta-base"

# The classification head gets one output per distinct sentiment value in the training data
sentiment_labels = df_train['Sentiment'].unique()
n_labels = len(sentiment_labels)


def _load_checkpoint(checkpoint_name):
    """Return ``(tokenizer, model)`` for ``checkpoint_name`` with an ``n_labels``-way classification head."""
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name, num_labels=n_labels)
    return tokenizer, model


# BERT first, then RoBERTa (same loading order as before)
bert_tokenizer, bert_model = _load_checkpoint(bert_model_name)
roberta_tokenizer, roberta_model = _load_checkpoint(roberta_model_name)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Encode Sentiment Labels**

Map each unique sentiment label to a numeric ID for model compatibility, and apply this mapping to both training and validation datasets.

In [31]:
# Mapping sentiments to unique numeric IDs.
# IDs follow the order in which each sentiment first appears in the training data.
label2id = {label: idx for idx, label in enumerate(df_train["Sentiment"].unique())}
id2label = {idx: label for label, idx in label2id.items()}


def _encode_sentiment(frame, split_name):
    """Return the integer label id for every row of ``frame["Sentiment"]``.

    Args:
        frame: DataFrame with a ``Sentiment`` column.
        split_name: Human-readable split name, used only in the error message.

    Returns:
        An int64 Series aligned with ``frame``.

    Raises:
        ValueError: If any sentiment value has no entry in ``label2id``. Without
            this check ``Series.map`` would silently produce NaN labels, which
            only surface later as a hard-to-read failure inside training.
    """
    encoded = frame["Sentiment"].map(label2id)
    unmapped = frame.loc[encoded.isna(), "Sentiment"].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"{split_name} split contains sentiment values missing from the "
            f"training label mapping: {list(unmapped)}"
        )
    return encoded.astype("int64")


df_train["label"] = _encode_sentiment(df_train, "train")
df_val["label"] = _encode_sentiment(df_val, "validation")

**Tokenization**

Define tokenization functions for BERT and RoBERTa to preprocess text with truncation and fixed padding.

Transform training and validation DataFrames into Dataset objects compatible with Hugging Face workflows.

Apply tokenization to training and validation datasets using each model's tokenizer.

In [32]:
# Tokenize function
def tokenize_function_bert(examples):
    """Encode the ``clean_text`` field with the BERT tokenizer, truncating and padding to 64 tokens."""
    texts = examples["clean_text"]
    encoding_options = {"truncation": True, "padding": 'max_length', "max_length": 64}
    return bert_tokenizer(texts, **encoding_options)

def tokenize_function_roberta(examples):
    """Encode the ``clean_text`` field with the RoBERTa tokenizer, truncating and padding to 64 tokens."""
    texts = examples["clean_text"]
    encoding_options = {"truncation": True, "padding": 'max_length', "max_length": 64}
    return roberta_tokenizer(texts, **encoding_options)

In [33]:
# Build Hugging Face datasets from the text and label columns only
_model_columns = ["clean_text", "label"]
hf_dataset_train = Dataset.from_pandas(df_train[_model_columns])
hf_dataset_val = Dataset.from_pandas(df_val[_model_columns])

In [34]:
_TORCH_COLUMNS = ["input_ids", "attention_mask", "label"]


def _tokenize_split(split, tokenize_fn):
    """Apply ``tokenize_fn`` to ``split`` in batches and expose the model columns as torch tensors."""
    tokenized = split.map(tokenize_fn, batched=True)
    tokenized.set_format("torch", columns=_TORCH_COLUMNS)
    return tokenized


# BERT-tokenized train / validation splits
tokenized_bert_train = _tokenize_split(hf_dataset_train, tokenize_function_bert)
tokenized_bert_val = _tokenize_split(hf_dataset_val, tokenize_function_bert)

# RoBERTa-tokenized train / validation splits
tokenized_roberta_train = _tokenize_split(hf_dataset_train, tokenize_function_roberta)
tokenized_roberta_val = _tokenize_split(hf_dataset_val, tokenize_function_roberta)

Map:   0%|          | 0/41157 [00:00<?, ? examples/s]

Map:   0%|          | 0/3798 [00:00<?, ? examples/s]

Map:   0%|          | 0/41157 [00:00<?, ? examples/s]

Map:   0%|          | 0/3798 [00:00<?, ? examples/s]

In [35]:
# # Define evaluation metric, Set up an evaluation function using accuracy as the metric to assess model performance during training.

# accuracy_metric = evaluate.load("accuracy")

# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=1)
#     return accuracy_metric.compute(predictions=predictions, references=labels)

In [36]:
# Metric objects are loaded once and reused by every evaluation pass.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of ``logits`` along axis 1.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1_macro"``.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=1)
    accuracy = accuracy_metric.compute(predictions=predicted_ids, references=labels)
    macro_f1 = f1_metric.compute(predictions=predicted_ids, references=labels, average="macro")
    return {"accuracy": accuracy["accuracy"], "f1_macro": macro_f1["f1"]}

**Use Small Subsets for Quick Evaluation**

Select shuffled samples from each training and validation dataset for both BERT and RoBERTa. This allows faster experimentation during model development.

In [37]:
USE_SMALL_DATASET = True


def _maybe_subsample(dataset, n_rows):
    """Return the first ``n_rows`` rows of a seed-42 shuffle when subsetting is on, else ``dataset`` unchanged."""
    if not USE_SMALL_DATASET:
        return dataset
    return dataset.shuffle(seed=42).select(range(n_rows))


# Fixed subset sizes for quick experiments; otherwise the full split sizes.
n_samples_train = 2000 if USE_SMALL_DATASET else len(tokenized_bert_train)
n_samples_val = 500 if USE_SMALL_DATASET else len(tokenized_bert_val)

train_dataset_bert = _maybe_subsample(tokenized_bert_train, n_samples_train)
val_dataset_bert = _maybe_subsample(tokenized_bert_val, n_samples_val)

train_dataset_roberta = _maybe_subsample(tokenized_roberta_train, n_samples_train)
val_dataset_roberta = _maybe_subsample(tokenized_roberta_val, n_samples_val)

**Define Training Arguments**

Configure hyperparameters and training settings for both BERT and RoBERTa models, including batch size, number of epochs, learning rate, and evaluation strategy.

In [38]:
# # Define hyperparameters
# run_time = f"run-{int(time.time())}"

# num_epochs_bert = 5
# learning_rate_bert = 4.037872385196561e-05
# batch_size_bert = 16

# bert_training_args = TrainingArguments(
#     output_dir="./results/bert/{run_time}",
#     eval_strategy="epoch",
#     save_strategy="epoch",
#     per_device_train_batch_size=batch_size_bert,
#     per_device_eval_batch_size=batch_size_bert,
#     num_train_epochs=num_epochs_bert,
#     learning_rate=learning_rate_bert,
#     weight_decay=0.01,
#     load_best_model_at_end=True,
#     logging_dir="./logs/bert/{run_time}",
#     run_name = f"bert-ep{num_epochs_bert}-lr{learning_rate_bert}-bs{batch_size_bert}-samples{n_samples_train}-run{int(time.time())}-no_preprocess",
#     report_to="wandb"
# )

In [39]:
# # Define hyperparameters
# num_epochs_roberta = 4
# learning_rate_roberta = 4e-5
# batch_size_roberta = 16

# roberta_training_args = TrainingArguments(
#     output_dir="./results/roberta/{run_time}",
#     eval_strategy="epoch",
#     save_strategy="epoch",
#     per_device_train_batch_size=batch_size_roberta,
#     per_device_eval_batch_size=batch_size_roberta,
#     num_train_epochs=num_epochs_roberta,
#     learning_rate=learning_rate_roberta,
#     weight_decay=0.01,
#     load_best_model_at_end=True,
#     logging_dir="./logs/roberta/{run_time}",
#     run_name=f"roberta-ep{num_epochs_roberta}-lr{learning_rate_roberta}-bs{batch_size_roberta}-samples{n_samples_train}-run{int(time.time())}-no_preprocess",
#     report_to="wandb"
# )

**Initialize Trainers**

Configure Trainer objects for both BERT and RoBERTa using the small training and validation sets, tokenizers, training arguments, and evaluation metric.

In [40]:
# bert_trainer = Trainer(
#     model=bert_model,
#     args=bert_training_args,
#     train_dataset=train_dataset_bert,
#     eval_dataset=val_dataset_bert,
#     tokenizer=bert_tokenizer,
#     compute_metrics=compute_metrics
# )

In [41]:
# roberta_trainer = Trainer(
#     model=roberta_model,
#     args=roberta_training_args,
#     train_dataset=train_dataset_roberta,
#     eval_dataset=val_dataset_roberta,
#     tokenizer=roberta_tokenizer,
#     compute_metrics=compute_metrics
# )

**Model Training**

Train both BERT and RoBERTa models on the small datasets using the Trainer interface.

## Manual Trials

Manually trained BERT and RoBERTa on various sample sizes to understand how accuracy scales:
- 1000, 5000, 10000 train samples for 5 epochs

This section is useful for baseline comparison before running Optuna for automated tuning.


In [42]:
# bert_trainer.train()

In [43]:
# roberta_trainer.train()

# # Safely finish run with specific parameters
# wandb.finish()

**Hyperparameter Tuning with Optuna**

In [44]:
def build_trainer(model_checkpoint, trial, run_prefix, train_dataset, val_dataset, num_labels=None):
    """Build an untrained Hugging Face ``Trainer`` for one Optuna trial.

    Learning rate, batch size and epoch count are sampled from ``trial``; a
    fresh model and tokenizer are loaded from ``model_checkpoint`` so every
    trial starts from the pretrained weights.

    Args:
        model_checkpoint: Checkpoint name passed to ``from_pretrained``.
        trial: Optuna trial used to sample the hyperparameters.
        run_prefix: Short model tag used in the run name and output/log paths.
        train_dataset: Tokenized training dataset.
        val_dataset: Tokenized evaluation dataset.
        num_labels: Size of the classification head. Defaults to the
            notebook-level ``n_labels`` (number of distinct sentiments in the
            training data) instead of a hard-coded constant.

    Returns:
        A configured ``Trainer``; the caller is responsible for ``train()``.
    """
    import inspect

    import torch

    # Sample hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
    num_epochs = trial.suggest_int("num_train_epochs", 2, 5)
    n_samples = len(train_dataset)

    # Keep the head size in sync with the label encoding used for the datasets.
    if num_labels is None:
        num_labels = n_labels

    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    run_name = f"{run_prefix}-ep{num_epochs}-lr{learning_rate}-bs{batch_size}-samples{n_samples}-run{int(time.time())}-no_preprocess"

    args = TrainingArguments(
        output_dir=f"./results/{run_prefix}/{run_name}",
        disable_tqdm=True,
        # Mixed precision is only requested when a CUDA device is present;
        # asking for it unconditionally fails on CPU-only runtimes.
        fp16=torch.cuda.is_available(),
        eval_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        learning_rate=learning_rate,
        weight_decay=0.01,
        # Reload the checkpoint with the best macro-F1 once training finishes.
        load_best_model_at_end=True,
        save_total_limit=1,
        logging_strategy="epoch",
        logging_dir=f"./logs/{run_prefix}/{run_name}",
        run_name=run_name,
        report_to="wandb",
        metric_for_best_model="f1_macro",
        greater_is_better=True
    )

    trainer_kwargs = {
        "model": model,
        "args": args,
        "train_dataset": train_dataset,
        "eval_dataset": val_dataset,
        "compute_metrics": compute_metrics,
    }
    # Newer transformers releases take the tokenizer as `processing_class` and
    # deprecate `tokenizer=`; use whichever keyword the installed Trainer accepts.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"
    trainer_kwargs[tokenizer_kwarg] = tokenizer

    return Trainer(**trainer_kwargs)

In [45]:
def objective_bert(trial):
    """Optuna objective: fine-tune BERT with the trial's hyperparameters.

    Args:
        trial: Optuna trial that ``build_trainer`` samples hyperparameters from.

    Returns:
        The ``eval_f1_macro`` value from evaluating on ``val_dataset_bert``
        after training. ``build_trainer`` sets ``load_best_model_at_end=True``
        with ``f1_macro`` as the selection metric.
    """
    trainer = build_trainer(
        model_checkpoint="bert-base-uncased",
        trial=trial,
        run_prefix="bert",
        train_dataset=train_dataset_bert,
        val_dataset=val_dataset_bert
    )
    try:
        trainer.train()
        eval_result = trainer.evaluate()
        return eval_result["eval_f1_macro"]
    finally:
        # Close the W&B run even when the trial fails. Otherwise the next
        # trial's Trainer finds an active run and logs into it, so its own
        # run_name is never used.
        wandb.finish()

def objective_roberta(trial):
    """Optuna objective: fine-tune RoBERTa with the trial's hyperparameters.

    Args:
        trial: Optuna trial that ``build_trainer`` samples hyperparameters from.

    Returns:
        The ``eval_f1_macro`` value from evaluating on ``val_dataset_roberta``
        after training. ``build_trainer`` sets ``load_best_model_at_end=True``
        with ``f1_macro`` as the selection metric.
    """
    trainer = build_trainer(
        model_checkpoint="roberta-base",
        trial=trial,
        run_prefix="roberta",
        train_dataset=train_dataset_roberta,
        val_dataset=val_dataset_roberta
    )
    try:
        trainer.train()
        eval_result = trainer.evaluate()
        return eval_result["eval_f1_macro"]
    finally:
        # Close the W&B run even when the trial fails. Otherwise the next
        # trial's Trainer finds an active run and logs into it, so its own
        # run_name is never used.
        wandb.finish()

In [46]:
# The BERT study is stored in a SQLite file on Drive and re-opened when it
# already exists, so each execution of this cell appends 5 more trials.
# NOTE(review): objective_bert never calls trial.report(), so the MedianPruner
# configured here has no intermediate values to act on — confirm whether
# pruning is actually wanted.
_bert_study_options = dict(
    direction="maximize",
    pruner=MedianPruner(n_startup_trials=2, n_warmup_steps=1),
    study_name="bert_study",
    storage="sqlite:////content/drive/MyDrive/Colab Notebooks/nlp_project/optuna/bert_study.db",
    load_if_exists=True,
)
study_bert = optuna.create_study(**_bert_study_options)
study_bert.optimize(objective_bert, n_trials=5)

[I 2025-07-26 21:18:00,078] Using an existing study with name 'bert_study' instead of creating a new one.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


{'loss': 1.5524, 'grad_norm': 6.090435028076172, 'learning_rate': 1.221654449298299e-05, 'epoch': 1.0}
{'eval_loss': 1.503250002861023, 'eval_accuracy': 0.3, 'eval_f1_macro': 0.20947062355060098, 'eval_runtime': 0.7299, 'eval_samples_per_second': 685.05, 'eval_steps_per_second': 43.843, 'epoch': 1.0}
{'loss': 1.34, 'grad_norm': 8.667379379272461, 'learning_rate': 6.2052289488167565e-06, 'epoch': 2.0}
{'eval_loss': 1.3349297046661377, 'eval_accuracy': 0.39, 'eval_f1_macro': 0.3888128084689583, 'eval_runtime': 0.7105, 'eval_samples_per_second': 703.739, 'eval_steps_per_second': 45.039, 'epoch': 2.0}
{'loss': 1.1378, 'grad_norm': 10.126534461975098, 'learning_rate': 1.4543505348789274e-07, 'epoch': 3.0}
{'eval_loss': 1.2507128715515137, 'eval_accuracy': 0.464, 'eval_f1_macro': 0.4612105212127416, 'eval_runtime': 0.7065, 'eval_samples_per_second': 707.754, 'eval_steps_per_second': 45.296, 'epoch': 3.0}
{'train_runtime': 38.8347, 'train_samples_per_second': 154.501, 'train_steps_per_second'

[I 2025-07-26 21:18:41,081] Trial 5 finished with value: 0.4612105212127416 and parameters: {'learning_rate': 1.8179381685986592e-05, 'batch_size': 16, 'num_train_epochs': 3}. Best is trial 2 with value: 0.7425764396966665.


{'eval_loss': 1.2507128715515137, 'eval_accuracy': 0.464, 'eval_f1_macro': 0.4612105212127416, 'eval_runtime': 0.7173, 'eval_samples_per_second': 697.034, 'eval_steps_per_second': 44.61, 'epoch': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


{'loss': 1.4788, 'grad_norm': 18.253002166748047, 'learning_rate': 2.48524686309344e-05, 'epoch': 1.0}
{'eval_loss': 1.1820175647735596, 'eval_accuracy': 0.462, 'eval_f1_macro': 0.45481354553139414, 'eval_runtime': 1.3515, 'eval_samples_per_second': 369.968, 'eval_steps_per_second': 46.616, 'epoch': 1.0}
{'loss': 1.0547, 'grad_norm': 16.162446975708008, 'learning_rate': 1.8664104928012486e-05, 'epoch': 2.0}
{'eval_loss': 1.0282154083251953, 'eval_accuracy': 0.576, 'eval_f1_macro': 0.583580368688682, 'eval_runtime': 1.342, 'eval_samples_per_second': 372.582, 'eval_steps_per_second': 46.945, 'epoch': 2.0}
{'loss': 0.6633, 'grad_norm': 15.873087882995605, 'learning_rate': 1.2475741225090575e-05, 'epoch': 3.0}
{'eval_loss': 1.1236027479171753, 'eval_accuracy': 0.594, 'eval_f1_macro': 0.605907714087685, 'eval_runtime': 1.3521, 'eval_samples_per_second': 369.807, 'eval_steps_per_second': 46.596, 'epoch': 3.0}
{'loss': 0.3488, 'grad_norm': 15.529646873474121, 'learning_rate': 6.31213097698035

[I 2025-07-26 21:20:33,792] Trial 6 finished with value: 0.607330096864638 and parameters: {'learning_rate': 3.094181851460956e-05, 'batch_size': 8, 'num_train_epochs': 5}. Best is trial 2 with value: 0.7425764396966665.


{'eval_loss': 1.6186296939849854, 'eval_accuracy': 0.604, 'eval_f1_macro': 0.607330096864638, 'eval_runtime': 1.3477, 'eval_samples_per_second': 371.005, 'eval_steps_per_second': 46.747, 'epoch': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


{'loss': 1.5243, 'grad_norm': 4.9427971839904785, 'learning_rate': 1.0323230326940293e-05, 'epoch': 1.0}
{'eval_loss': 1.5067763328552246, 'eval_accuracy': 0.334, 'eval_f1_macro': 0.24482543696808273, 'eval_runtime': 0.7169, 'eval_samples_per_second': 697.425, 'eval_steps_per_second': 44.635, 'epoch': 1.0}
{'loss': 1.2815, 'grad_norm': 5.954842567443848, 'learning_rate': 6.9277911743127165e-06, 'epoch': 2.0}
{'eval_loss': 1.3025224208831787, 'eval_accuracy': 0.442, 'eval_f1_macro': 0.43521338699660667, 'eval_runtime': 0.7239, 'eval_samples_per_second': 690.716, 'eval_steps_per_second': 44.206, 'epoch': 2.0}
{'loss': 1.0781, 'grad_norm': 12.893775939941406, 'learning_rate': 3.5049694478736274e-06, 'epoch': 3.0}
{'eval_loss': 1.2630776166915894, 'eval_accuracy': 0.456, 'eval_f1_macro': 0.4449665264885837, 'eval_runtime': 0.7105, 'eval_samples_per_second': 703.751, 'eval_steps_per_second': 45.04, 'epoch': 3.0}
{'loss': 0.9508, 'grad_norm': 12.612385749816895, 'learning_rate': 8.2147721434

[I 2025-07-26 21:21:24,921] Trial 7 finished with value: 0.48009736685062776 and parameters: {'learning_rate': 1.3691286905756356e-05, 'batch_size': 16, 'num_train_epochs': 4}. Best is trial 2 with value: 0.7425764396966665.


{'eval_loss': 1.241300344467163, 'eval_accuracy': 0.472, 'eval_f1_macro': 0.48009736685062776, 'eval_runtime': 0.7183, 'eval_samples_per_second': 696.047, 'eval_steps_per_second': 44.547, 'epoch': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


{'loss': 1.4537, 'grad_norm': 7.925464630126953, 'learning_rate': 2.5008757740911875e-05, 'epoch': 1.0}
{'eval_loss': 1.2461833953857422, 'eval_accuracy': 0.456, 'eval_f1_macro': 0.4654606425675423, 'eval_runtime': 0.7112, 'eval_samples_per_second': 703.005, 'eval_steps_per_second': 44.992, 'epoch': 1.0}
{'loss': 1.0433, 'grad_norm': 8.68944263458252, 'learning_rate': 1.2702861074748887e-05, 'epoch': 2.0}
{'eval_loss': 1.181433081626892, 'eval_accuracy': 0.526, 'eval_f1_macro': 0.5329193026556026, 'eval_runtime': 0.7298, 'eval_samples_per_second': 685.132, 'eval_steps_per_second': 43.848, 'epoch': 2.0}
{'loss': 0.7398, 'grad_norm': 11.595009803771973, 'learning_rate': 2.9772330643942706e-07, 'epoch': 3.0}
{'eval_loss': 1.1430803537368774, 'eval_accuracy': 0.54, 'eval_f1_macro': 0.5474120900628854, 'eval_runtime': 0.7342, 'eval_samples_per_second': 681.056, 'eval_steps_per_second': 43.588, 'epoch': 3.0}
{'train_runtime': 36.7827, 'train_samples_per_second': 163.12, 'train_steps_per_seco

[I 2025-07-26 21:22:05,761] Trial 8 finished with value: 0.5474120900628854 and parameters: {'learning_rate': 3.721541330492838e-05, 'batch_size': 16, 'num_train_epochs': 3}. Best is trial 2 with value: 0.7425764396966665.


{'eval_loss': 1.1430803537368774, 'eval_accuracy': 0.54, 'eval_f1_macro': 0.5474120900628854, 'eval_runtime': 0.7299, 'eval_samples_per_second': 685.03, 'eval_steps_per_second': 43.842, 'epoch': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


{'loss': 1.5346, 'grad_norm': 6.407988548278809, 'learning_rate': 1.830372914687604e-05, 'epoch': 1.0}
{'eval_loss': 1.5227910280227661, 'eval_accuracy': 0.356, 'eval_f1_macro': 0.32050566067627756, 'eval_runtime': 0.4187, 'eval_samples_per_second': 1194.258, 'eval_steps_per_second': 38.216, 'epoch': 1.0}
{'loss': 1.3202, 'grad_norm': 7.393751621246338, 'learning_rate': 9.294862457397989e-06, 'epoch': 2.0}
{'eval_loss': 1.282150387763977, 'eval_accuracy': 0.472, 'eval_f1_macro': 0.4747942786821747, 'eval_runtime': 0.4327, 'eval_samples_per_second': 1155.428, 'eval_steps_per_second': 36.974, 'epoch': 2.0}
{'loss': 1.1335, 'grad_norm': 11.143248558044434, 'learning_rate': 2.859957679199381e-07, 'epoch': 3.0}
{'eval_loss': 1.2531943321228027, 'eval_accuracy': 0.49, 'eval_f1_macro': 0.49332396617628704, 'eval_runtime': 0.424, 'eval_samples_per_second': 1179.199, 'eval_steps_per_second': 37.734, 'epoch': 3.0}
{'train_runtime': 22.9775, 'train_samples_per_second': 261.126, 'train_steps_per_s

[I 2025-07-26 21:22:30,376] Trial 9 finished with value: 0.49332396617628704 and parameters: {'learning_rate': 2.7026600068434153e-05, 'batch_size': 32, 'num_train_epochs': 3}. Best is trial 2 with value: 0.7425764396966665.


{'eval_loss': 1.2531943321228027, 'eval_accuracy': 0.49, 'eval_f1_macro': 0.49332396617628704, 'eval_runtime': 0.419, 'eval_samples_per_second': 1193.39, 'eval_steps_per_second': 38.188, 'epoch': 3.0}


In [47]:
# The RoBERTa study is stored in a SQLite file on Drive and re-opened when it
# already exists, so each execution of this cell appends 5 more trials.
# NOTE(review): objective_roberta never calls trial.report(), so the
# MedianPruner configured here has no intermediate values to act on — confirm
# whether pruning is actually wanted.
_roberta_study_options = dict(
    direction="maximize",
    pruner=MedianPruner(n_startup_trials=2, n_warmup_steps=1),
    study_name="roberta_study",
    storage="sqlite:////content/drive/MyDrive/Colab Notebooks/nlp_project/optuna/roberta_study.db",
    load_if_exists=True,
)
study_roberta = optuna.create_study(**_roberta_study_options)
study_roberta.optimize(objective_roberta, n_trials=5)

[I 2025-07-26 21:22:32,970] A new study created in RDB with name: roberta_study
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


{'loss': 1.5704, 'grad_norm': 8.6424560546875, 'learning_rate': 2.9556320201422732e-05, 'epoch': 1.0}
{'eval_loss': 1.485044002532959, 'eval_accuracy': 0.334, 'eval_f1_macro': 0.25037718571490863, 'eval_runtime': 0.4193, 'eval_samples_per_second': 1192.593, 'eval_steps_per_second': 38.163, 'epoch': 1.0}
{'loss': 1.3368, 'grad_norm': 14.77864933013916, 'learning_rate': 2.234178534910695e-05, 'epoch': 2.0}
{'eval_loss': 1.3086049556732178, 'eval_accuracy': 0.442, 'eval_f1_macro': 0.45085084395331865, 'eval_runtime': 0.4479, 'eval_samples_per_second': 1116.302, 'eval_steps_per_second': 35.722, 'epoch': 2.0}
{'loss': 1.0667, 'grad_norm': 20.166650772094727, 'learning_rate': 1.501088703143123e-05, 'epoch': 3.0}
{'eval_loss': 1.2325727939605713, 'eval_accuracy': 0.49, 'eval_f1_macro': 0.47364330361844864, 'eval_runtime': 0.426, 'eval_samples_per_second': 1173.681, 'eval_steps_per_second': 37.558, 'epoch': 3.0}
{'loss': 0.8669, 'grad_norm': 27.653209686279297, 'learning_rate': 7.6799887137555

[I 2025-07-26 21:23:15,032] Trial 0 finished with value: 0.5400658649582297 and parameters: {'learning_rate': 3.6654491588378584e-05, 'batch_size': 32, 'num_train_epochs': 5}. Best is trial 0 with value: 0.5400658649582297.


{'eval_loss': 1.2176650762557983, 'eval_accuracy': 0.53, 'eval_f1_macro': 0.5400658649582297, 'eval_runtime': 0.4301, 'eval_samples_per_second': 1162.461, 'eval_steps_per_second': 37.199, 'epoch': 5.0}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


{'loss': 1.568, 'grad_norm': 6.045679569244385, 'learning_rate': 1.7302165195522193e-05, 'epoch': 1.0}
{'eval_loss': 1.5199531316757202, 'eval_accuracy': 0.306, 'eval_f1_macro': 0.2490897591372871, 'eval_runtime': 0.4244, 'eval_samples_per_second': 1178.005, 'eval_steps_per_second': 37.696, 'epoch': 1.0}
{'loss': 1.3646, 'grad_norm': 14.102045059204102, 'learning_rate': 1.3062108902548373e-05, 'epoch': 2.0}
{'eval_loss': 1.4239639043807983, 'eval_accuracy': 0.364, 'eval_f1_macro': 0.3476544559683213, 'eval_runtime': 0.4198, 'eval_samples_per_second': 1191.114, 'eval_steps_per_second': 38.116, 'epoch': 2.0}
{'loss': 1.1625, 'grad_norm': 14.79910945892334, 'learning_rate': 8.822052609574555e-06, 'epoch': 3.0}
{'eval_loss': 1.3025556802749634, 'eval_accuracy': 0.446, 'eval_f1_macro': 0.4286845544153374, 'eval_runtime': 0.4252, 'eval_samples_per_second': 1175.848, 'eval_steps_per_second': 37.627, 'epoch': 3.0}
{'loss': 0.996, 'grad_norm': 27.249895095825195, 'learning_rate': 4.513608311875

[I 2025-07-26 21:23:56,822] Trial 1 finished with value: 0.4913550339262791 and parameters: {'learning_rate': 2.154222148849601e-05, 'batch_size': 32, 'num_train_epochs': 5}. Best is trial 0 with value: 0.5400658649582297.


{'eval_loss': 1.2509504556655884, 'eval_accuracy': 0.484, 'eval_f1_macro': 0.4913550339262791, 'eval_runtime': 0.4246, 'eval_samples_per_second': 1177.635, 'eval_steps_per_second': 37.684, 'epoch': 5.0}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


{'loss': 1.5518, 'grad_norm': 3.0561962127685547, 'learning_rate': 3.191401532456544e-05, 'epoch': 1.0}
{'eval_loss': 1.4484853744506836, 'eval_accuracy': 0.364, 'eval_f1_macro': 0.2879114031040758, 'eval_runtime': 0.7262, 'eval_samples_per_second': 688.505, 'eval_steps_per_second': 44.064, 'epoch': 1.0}
{'loss': 1.2685, 'grad_norm': 10.72205638885498, 'learning_rate': 1.6146221191875005e-05, 'epoch': 2.0}
{'eval_loss': 1.1982678174972534, 'eval_accuracy': 0.49, 'eval_f1_macro': 0.4989522734721531, 'eval_runtime': 0.714, 'eval_samples_per_second': 700.297, 'eval_steps_per_second': 44.819, 'epoch': 2.0}
{'loss': 0.9582, 'grad_norm': 20.852901458740234, 'learning_rate': 5.045694122460939e-07, 'epoch': 3.0}
{'eval_loss': 1.1523406505584717, 'eval_accuracy': 0.516, 'eval_f1_macro': 0.5226487464931312, 'eval_runtime': 0.7174, 'eval_samples_per_second': 696.998, 'eval_steps_per_second': 44.608, 'epoch': 3.0}
{'train_runtime': 38.0697, 'train_samples_per_second': 157.605, 'train_steps_per_sec

[I 2025-07-26 21:24:39,731] Trial 2 finished with value: 0.5226487464931312 and parameters: {'learning_rate': 4.730338239807131e-05, 'batch_size': 16, 'num_train_epochs': 3}. Best is trial 0 with value: 0.5400658649582297.


{'eval_loss': 1.1523406505584717, 'eval_accuracy': 0.516, 'eval_f1_macro': 0.5226487464931312, 'eval_runtime': 0.7353, 'eval_samples_per_second': 680.012, 'eval_steps_per_second': 43.521, 'epoch': 3.0}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


{'loss': 1.5495, 'grad_norm': 17.7938232421875, 'learning_rate': 1.0211299598711555e-05, 'epoch': 1.0}
{'eval_loss': 1.3879892826080322, 'eval_accuracy': 0.396, 'eval_f1_macro': 0.35512351831415656, 'eval_runtime': 1.3701, 'eval_samples_per_second': 364.948, 'eval_steps_per_second': 45.984, 'epoch': 1.0}
{'loss': 1.2589, 'grad_norm': 23.96282386779785, 'learning_rate': 5.176702579863711e-06, 'epoch': 2.0}
{'eval_loss': 1.2510786056518555, 'eval_accuracy': 0.462, 'eval_f1_macro': 0.46413962053073315, 'eval_runtime': 1.3581, 'eval_samples_per_second': 368.158, 'eval_steps_per_second': 46.388, 'epoch': 2.0}
{'loss': 1.017, 'grad_norm': 24.96974754333496, 'learning_rate': 1.0150397215419041e-07, 'epoch': 3.0}
{'eval_loss': 1.2364977598190308, 'eval_accuracy': 0.488, 'eval_f1_macro': 0.49343937357266954, 'eval_runtime': 1.3597, 'eval_samples_per_second': 367.718, 'eval_steps_per_second': 46.332, 'epoch': 3.0}
{'train_runtime': 67.0963, 'train_samples_per_second': 89.424, 'train_steps_per_se

[I 2025-07-26 21:25:52,126] Trial 3 finished with value: 0.49343937357266954 and parameters: {'learning_rate': 1.5225595823128561e-05, 'batch_size': 8, 'num_train_epochs': 3}. Best is trial 0 with value: 0.5400658649582297.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


{'loss': 1.5893, 'grad_norm': 3.570788621902466, 'learning_rate': 3.706910363093873e-05, 'epoch': 1.0}
{'eval_loss': 1.6233261823654175, 'eval_accuracy': 0.178, 'eval_f1_macro': 0.06044142614601019, 'eval_runtime': 1.3535, 'eval_samples_per_second': 369.4, 'eval_steps_per_second': 46.544, 'epoch': 1.0}
{'loss': 1.5847, 'grad_norm': 3.2867724895477295, 'learning_rate': 2.7820325279906117e-05, 'epoch': 2.0}
{'eval_loss': 1.595849633216858, 'eval_accuracy': 0.252, 'eval_f1_macro': 0.0805111821086262, 'eval_runtime': 1.3544, 'eval_samples_per_second': 369.175, 'eval_steps_per_second': 46.516, 'epoch': 2.0}
{'loss': 1.5775, 'grad_norm': 1.652222990989685, 'learning_rate': 1.8571546928873497e-05, 'epoch': 3.0}
{'eval_loss': 1.5942128896713257, 'eval_accuracy': 0.254, 'eval_f1_macro': 0.0810207336523126, 'eval_runtime': 1.3712, 'eval_samples_per_second': 364.639, 'eval_steps_per_second': 45.944, 'epoch': 3.0}
{'loss': 1.5801, 'grad_norm': 1.6719151735305786, 'learning_rate': 9.32276857784088e

[I 2025-07-26 21:27:48,309] Trial 4 finished with value: 0.0810207336523126 and parameters: {'learning_rate': 4.624389175516309e-05, 'batch_size': 8, 'num_train_epochs': 5}. Best is trial 0 with value: 0.5400658649582297.


{'eval_loss': 1.5942128896713257, 'eval_accuracy': 0.254, 'eval_f1_macro': 0.0810207336523126, 'eval_runtime': 1.3765, 'eval_samples_per_second': 363.244, 'eval_steps_per_second': 45.769, 'epoch': 5.0}


In [49]:
# Pull the best trial out of each hyperparameter-search study and show the
# parameters it used. The `best_trial_*` names are kept at notebook level
# because the final-training cells below pass them to `build_trainer`.
best_trial_bert = study_bert.best_trial
print('Bert best trial on subset:', best_trial_bert.params, sep='\n')

best_trial_roberta = study_roberta.best_trial
print('RoBerta best trial on subset:', best_trial_roberta.params, sep='\n')

Bert best trial on subset:
{'learning_rate': 3.8115643066555684e-05, 'batch_size': 16, 'num_train_epochs': 2}
RoBerta best trial on subset:
{'learning_rate': 3.6654491588378584e-05, 'batch_size': 32, 'num_train_epochs': 5}


In [50]:
# Final BERT run: build a trainer from the best trial selected above and
# fine-tune it on `tokenized_bert_train`, evaluating on `tokenized_bert_val`.
# NOTE(review): `build_trainer` is defined earlier in the notebook; presumably
# it reads the hyperparameters from `trial` - confirm against its definition.
bert_final_kwargs = {
    "model_checkpoint": "bert-base-uncased",
    "trial": best_trial_bert,
    "run_prefix": "bert_final",
    "train_dataset": tokenized_bert_train,
    "val_dataset": tokenized_bert_val,
}
final_trainer_bert = build_trainer(**bert_final_kwargs)

# Kept as the last expression so the returned TrainOutput is displayed.
final_trainer_bert.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


{'loss': 0.6822, 'grad_norm': 30.361814498901367, 'learning_rate': 1.9094855776443948e-05, 'epoch': 1.0}
{'eval_loss': 0.5002578496932983, 'eval_accuracy': 0.8296471827277514, 'eval_f1_macro': 0.8365850196182407, 'eval_runtime': 5.2425, 'eval_samples_per_second': 724.465, 'eval_steps_per_second': 45.398, 'epoch': 1.0}
{'loss': 0.323, 'grad_norm': 6.8882269859313965, 'learning_rate': 4.444109179932649e-08, 'epoch': 2.0}
{'eval_loss': 0.432912141084671, 'eval_accuracy': 0.8604528699315429, 'eval_f1_macro': 0.8652364145812331, 'eval_runtime': 5.335, 'eval_samples_per_second': 711.899, 'eval_steps_per_second': 44.611, 'epoch': 2.0}
{'train_runtime': 417.0563, 'train_samples_per_second': 197.369, 'train_steps_per_second': 12.339, 'train_loss': 0.5026131966120531, 'epoch': 2.0}


TrainOutput(global_step=5146, training_loss=0.5026131966120531, metrics={'train_runtime': 417.0563, 'train_samples_per_second': 197.369, 'train_steps_per_second': 12.339, 'train_loss': 0.5026131966120531, 'epoch': 2.0})

In [51]:
# Final RoBERTa run: build a trainer from the best trial selected above and
# fine-tune it on `tokenized_roberta_train`, evaluating on `tokenized_roberta_val`.
# NOTE(review): `build_trainer` is defined earlier in the notebook; presumably
# it reads the hyperparameters from `trial` - confirm against its definition.
roberta_final_kwargs = {
    "model_checkpoint": "roberta-base",
    "trial": best_trial_roberta,
    "run_prefix": "roberta_final",
    "train_dataset": tokenized_roberta_train,
    "val_dataset": tokenized_roberta_val,
}
final_trainer_roberta = build_trainer(**roberta_final_kwargs)

# Kept as the last expression so the returned TrainOutput is displayed.
final_trainer_roberta.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


{'loss': 0.8645, 'grad_norm': 42.69869613647461, 'learning_rate': 2.934068161176971e-05, 'epoch': 1.0}
{'eval_loss': 0.7531467080116272, 'eval_accuracy': 0.7374934175882043, 'eval_f1_macro': 0.7268239763823591, 'eval_runtime': 3.0254, 'eval_samples_per_second': 1255.374, 'eval_steps_per_second': 39.334, 'epoch': 1.0}
{'loss': 0.5062, 'grad_norm': 34.75946044921875, 'learning_rate': 2.201547940778294e-05, 'epoch': 2.0}
{'eval_loss': 0.5646417737007141, 'eval_accuracy': 0.785676671932596, 'eval_f1_macro': 0.7932429470153, 'eval_runtime': 3.0733, 'eval_samples_per_second': 1235.788, 'eval_steps_per_second': 38.72, 'epoch': 2.0}
{'loss': 0.3591, 'grad_norm': 18.162437438964844, 'learning_rate': 1.469597331748512e-05, 'epoch': 3.0}
{'eval_loss': 0.5656107664108276, 'eval_accuracy': 0.8067403896787783, 'eval_f1_macro': 0.8138838893323017, 'eval_runtime': 3.1066, 'eval_samples_per_second': 1222.552, 'eval_steps_per_second': 38.305, 'epoch': 3.0}
{'loss': 0.2692, 'grad_norm': 17.24826622009277

TrainOutput(global_step=6435, training_loss=0.43948856048465423, metrics={'train_runtime': 599.3786, 'train_samples_per_second': 343.331, 'train_steps_per_second': 10.736, 'train_loss': 0.43948856048465423, 'epoch': 5.0})

In [54]:
# Disabled cell: saves each fine-tuned model and its tokenizer under `models/`
# and copies the resulting folder to Google Drive. Uncomment to persist the models.
# NOTE(review): presumably left commented out so a re-run does not overwrite
# previously saved artifacts - confirm before re-enabling.
# final_trainer_bert.save_model("models/bert_final")
# bert_tokenizer.save_pretrained("models/bert_final")
# !cp -r models/bert_final "/content/drive/MyDrive/Colab Notebooks/nlp_project/models/bert_best_model_no_preprocess"

# final_trainer_roberta.save_model("models/roberta_final")
# roberta_tokenizer.save_pretrained("models/roberta_final")
# !cp -r models/roberta_final "/content/drive/MyDrive/Colab Notebooks/nlp_project/models/roberta_best_model_no_preprocess"