## **NLP - Text Classification Project**
Group H - August 2025

Classification of tweets from Twitter that have been manually tagged for sentiment analysis.

In [1]:
# # Needed for Google Colab
# !pip install --quiet evaluate transformers optuna datasets nltk scikit-learn
# !pip install numpy==1.26.4

In [2]:
%env CUDA_LAUNCH_BLOCKING=1

from wordcloud import WordCloud, STOPWORDS
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
from tqdm import tqdm
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from datasets import Dataset
from torch.utils.data import DataLoader, TensorDataset
from optuna.pruners import MedianPruner
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import torch.nn.functional as F
import os
import re
import string
import time
import glob
import nltk
import evaluate
import transformers
import torch
import optuna
import wandb
wandb.login()
# API key - 0cbd7fe3cffd71df993b30edb4fa0db94f114413

os.environ["WANDB_PROJECT"] = "tweet-sentiment-classification_split_to_test_maxl_256"
os.environ["WANDB_INIT_TIMEOUT"] = "180"

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

env: CUDA_LAUNCH_BLOCKING=1
Mounted at /content/drive


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhillas[0m ([33mhillas-tel-aviv-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [3]:
# Load data
df_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/nlp_project/Data/Corona_NLP_train.csv', encoding='latin1')
df_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/nlp_project/Data/Corona_NLP_test.csv', encoding='latin1')

In [4]:
# Merge and shuffle for better stratified splits
df_full = pd.concat([df_train, df_test], ignore_index=True)
df_full = df_full.sample(frac=1.0, random_state=42).reset_index(drop=True)

### Pre-processing the Data

The tweets were cleaned by lowercasing (to reduce redundancy), removing stopwords, punctuation, numbers, short words, and applying lemmatization to reduce words to their base form (e.g. running → run). <br>This helps reduce noise and improve model performance.

In [5]:
# Try without pre-processing
is_preprocessed = "no_preprocess"
df_full["clean_text"] = df_full["OriginalTweet"]

In [6]:
# # Try minimal pre-processing
# def light_preprocess(text):
#     return text.strip()                             # Remove unnecessary spaces

# is_preprocessed = "minimal_preprocess"
# df_full["clean_text"] = df_full["OriginalTweet"].apply(light_preprocess)

In [7]:
# # Try pre-processing
# def clean_text(text):
#     text = str(text).lower()
#     text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # URLs
#     text = re.sub(r'\@\w+|\#','', text)  # Mentions & hashtags
#     text = re.sub(r'\n', ' ', text)  # Line breaks
#     text = re.sub(r"[^a-zA-Z']", ' ', text)  # Keep letters only
#     text = re.sub(r'\s+', ' ', text).strip()  # Extra whitespace
#     return text

# is_preprocessed = "w_preprocess"
# df_full["clean_text"] = df_full["OriginalTweet"].apply(clean_text)

## Fine-Tuning Pretrained Language Models

Apply NLP techniques using transfer learning on our tweets dataset. Specifically, fine-tuning two pretrained transformer-based models from the Hugging Face library — BERT and RoBERTa — on our sentiment classification task. These models will be trained using both standard PyTorch and the Hugging Face API. Model performance will be monitored and tuned using hyperparameter optimization (Optuna) and experiment tracking (Weights & Biases).

**Load Pretrained Models**

Initialize tokenizers and models for BERT and RoBERTa, both widely used transformer architectures for text classification. The classification head is configured based on the number of sentiment labels.

In [8]:
# Choose pretrained models
bert_model_name = "bert-base-uncased"
roberta_model_name = "roberta-base"

sentiment_labels = df_full['Sentiment'].unique()
n_labels = len(sentiment_labels)

# Load BERT tokenizer and model
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
bert_model = AutoModelForSequenceClassification.from_pretrained(bert_model_name, num_labels=n_labels)

# Load RoBERTa tokenizer and model
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_model_name)
roberta_model = AutoModelForSequenceClassification.from_pretrained(roberta_model_name, num_labels=n_labels)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Encode Sentiment Labels**

Map each unique sentiment label to a numeric ID for model compatibility, and apply this mapping to both training and validation datasets.

In [9]:
# Mapping sentiments to unique numeric IDs
unique_labels = sorted(df_full["Sentiment"].unique())
label2id = {label: idx for idx, label in enumerate(unique_labels)}
df_full["label"] = df_full["Sentiment"].map(label2id)

In [10]:
# Stratified split: 70% train, 15% val, 15% test
train_val_df, test_df = train_test_split(df_full, test_size=0.15, stratify=df_full["label"], random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.1765, stratify=train_val_df["label"], random_state=42)

# Confirm sizes
print("Train size:", len(train_df))
print("Val size:", len(val_df))
print("Test size:", len(test_df))

Train size: 31466
Val size: 6745
Test size: 6744


**Use Small Subsets for Quick Evaluation**

Select shuffled samples from each training and validation dataset for both BERT and RoBERTa. This allows faster experimentation during model development.

In [11]:
train_subset_df, _ = train_test_split(
    train_df[["clean_text", "label"]],
    train_size=2000,
    stratify=train_df["label"],
    random_state=42
)

val_subset_df, _ = train_test_split(
    val_df[["clean_text", "label"]],
    train_size=500,
    stratify=val_df["label"],
    random_state=42
)

**Tokenization**

Define tokenization functions for BERT and RoBERTa to preprocess text with truncation and fixed padding.

Transform training and validation DataFrames into Dataset objects compatible with Hugging Face workflows.

Apply tokenization to training and validation datasets using each model's tokenizer.

In [12]:
# Tokenize function
def tokenize_function_bert(examples):
    return bert_tokenizer(examples["clean_text"], truncation=True, padding='max_length', max_length=256)

def tokenize_function_roberta(examples):
    return roberta_tokenizer(examples["clean_text"], truncation=True, padding='max_length', max_length=256)

In [13]:
# Convert DataFrame to Hugging Face Dataset
hf_subset_train = Dataset.from_pandas(train_subset_df)
hf_subset_val = Dataset.from_pandas(val_subset_df)

hf_train = Dataset.from_pandas(train_df[["clean_text", "label"]])
hf_val = Dataset.from_pandas(val_df[["clean_text", "label"]])
hf_test = Dataset.from_pandas(test_df[["clean_text", "label"]])

In [14]:
# Tokenize subsets
# Tokenize for BERT
tokenized_bert_train_sub = hf_subset_train.map(tokenize_function_bert, batched=True)
tokenized_bert_train_sub.set_format("torch", columns=["input_ids", "attention_mask", "label"])

tokenized_bert_val_sub = hf_subset_val.map(tokenize_function_bert, batched=True)
tokenized_bert_val_sub.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Tokenize for RoBERTa
tokenized_roberta_train_sub = hf_subset_train.map(tokenize_function_roberta, batched=True)
tokenized_roberta_train_sub.set_format("torch", columns=["input_ids", "attention_mask", "label"])

tokenized_roberta_val_sub = hf_subset_val.map(tokenize_function_roberta, batched=True)
tokenized_roberta_val_sub.set_format("torch", columns=["input_ids", "attention_mask", "label"])

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [15]:
# Tokenize full dataset
# Tokenize for BERT
tokenized_bert_train = hf_train.map(tokenize_function_bert, batched=True)
tokenized_bert_train.set_format("torch", columns=["input_ids", "attention_mask", "label"])

tokenized_bert_val = hf_val.map(tokenize_function_bert, batched=True)
tokenized_bert_val.set_format("torch", columns=["input_ids", "attention_mask", "label"])

tokenized_bert_test = hf_test.map(tokenize_function_bert, batched=True)
tokenized_bert_test.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Tokenize for RoBERTa
tokenized_roberta_train = hf_train.map(tokenize_function_roberta, batched=True)
tokenized_roberta_train.set_format("torch", columns=["input_ids", "attention_mask", "label"])

tokenized_roberta_val = hf_val.map(tokenize_function_roberta, batched=True)
tokenized_roberta_val.set_format("torch", columns=["input_ids", "attention_mask", "label"])

tokenized_roberta_test = hf_test.map(tokenize_function_roberta, batched=True)
tokenized_roberta_test.set_format("torch", columns=["input_ids", "attention_mask", "label"])

Map:   0%|          | 0/31466 [00:00<?, ? examples/s]

Map:   0%|          | 0/6745 [00:00<?, ? examples/s]

Map:   0%|          | 0/6744 [00:00<?, ? examples/s]

Map:   0%|          | 0/31466 [00:00<?, ? examples/s]

Map:   0%|          | 0/6745 [00:00<?, ? examples/s]

Map:   0%|          | 0/6744 [00:00<?, ? examples/s]

In [16]:
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"],
        "f1_macro": f1_metric.compute(predictions=predictions, references=labels, average="macro")["f1"]
    }

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

**Hyperparameters Tuning with Optuna**

In [17]:
def build_trainer(model_checkpoint, trial, run_prefix, train_dataset, val_dataset):
    # Sample hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
    num_epochs = trial.suggest_int("num_train_epochs", 2, 5)
    n_samples = len(train_dataset)

    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=5)
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    run_name = f"{run_prefix}-ep{num_epochs}-lr{learning_rate}-bs{batch_size}-samples{n_samples}-run{int(time.time())}-{is_preprocessed}"

    args = TrainingArguments(
        output_dir=f"./results/{run_prefix}/{run_name}",
        disable_tqdm=True,
        fp16=True,
        eval_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        learning_rate=learning_rate,
        weight_decay=0.01,
        label_smoothing_factor=0.1,
        load_best_model_at_end=True,
        save_total_limit=1,
        logging_strategy="epoch",
        logging_dir=f"./logs/{run_prefix}/{run_name}",
        run_name=run_name,
        report_to="wandb",
        metric_for_best_model="f1_macro",
        greater_is_better=True
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    return trainer

In [18]:
def objective_bert(trial):
    trainer = build_trainer(
        model_checkpoint="bert-base-uncased",
        trial=trial,
        run_prefix="bert",
        train_dataset=tokenized_bert_train_sub,
        val_dataset=tokenized_bert_val_sub
    )
    trainer.train()
    eval_result = trainer.evaluate()
    return eval_result["eval_f1_macro"]

def objective_roberta(trial):
    trainer = build_trainer(
        model_checkpoint="roberta-base",
        trial=trial,
        run_prefix="roberta",
        train_dataset=tokenized_roberta_train_sub,
        val_dataset=tokenized_roberta_val_sub
    )
    trainer.train()
    eval_result = trainer.evaluate()

    return eval_result["eval_f1_macro"]

In [19]:
study_bert = optuna.create_study(direction="maximize",
                                 pruner=MedianPruner(n_startup_trials=2, n_warmup_steps=1),
                                 study_name=f"bert_study_stratify_{is_preprocessed}",
                                 storage=f"sqlite:////content/drive/MyDrive/Colab Notebooks/nlp_project/optuna/bert_study_stratify_maxl_256_{is_preprocessed}.db",
                                 load_if_exists=True)
study_bert.optimize(objective_bert, n_trials=5)
wandb.finish()

[I 2025-08-07 20:59:54,986] Using an existing study with name 'bert_study_stratify_no_preprocess' instead of creating a new one.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  return forward_call(*args, **kwargs)


{'loss': 1.5293, 'grad_norm': 3.855067491531372, 'learning_rate': 2.1647282149400416e-05, 'epoch': 1.0}
{'eval_loss': 1.3812739849090576, 'eval_accuracy': 0.42, 'eval_f1_macro': 0.343830610238052, 'eval_runtime': 1.0131, 'eval_samples_per_second': 493.514, 'eval_steps_per_second': 31.585, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.2121, 'grad_norm': 7.0508599281311035, 'learning_rate': 1.625702265004772e-05, 'epoch': 2.0}
{'eval_loss': 1.2129623889923096, 'eval_accuracy': 0.532, 'eval_f1_macro': 0.540160959732994, 'eval_runtime': 0.9841, 'eval_samples_per_second': 508.06, 'eval_steps_per_second': 32.516, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 0.969, 'grad_norm': 8.76950740814209, 'learning_rate': 1.0866763150695028e-05, 'epoch': 3.0}
{'eval_loss': 1.219394326210022, 'eval_accuracy': 0.51, 'eval_f1_macro': 0.527741081675614, 'eval_runtime': 0.9871, 'eval_samples_per_second': 506.524, 'eval_steps_per_second': 32.418, 'epoch': 3.0}


  return forward_call(*args, **kwargs)


{'loss': 0.7707, 'grad_norm': 11.577377319335938, 'learning_rate': 5.5196257273371575e-06, 'epoch': 4.0}
{'eval_loss': 1.2755590677261353, 'eval_accuracy': 0.564, 'eval_f1_macro': 0.5733137445028971, 'eval_runtime': 0.9786, 'eval_samples_per_second': 510.954, 'eval_steps_per_second': 32.701, 'epoch': 4.0}


  return forward_call(*args, **kwargs)


{'loss': 0.6536, 'grad_norm': 12.512775421142578, 'learning_rate': 1.2936622798446462e-07, 'epoch': 5.0}
{'eval_loss': 1.2985506057739258, 'eval_accuracy': 0.55, 'eval_f1_macro': 0.5611899332404842, 'eval_runtime': 1.0058, 'eval_samples_per_second': 497.117, 'eval_steps_per_second': 31.816, 'epoch': 5.0}
{'train_runtime': 81.5624, 'train_samples_per_second': 122.605, 'train_steps_per_second': 7.663, 'train_loss': 1.0269362182617188, 'epoch': 5.0}


  return forward_call(*args, **kwargs)
[I 2025-08-07 21:01:21,231] Trial 5 finished with value: 0.5733137445028971 and parameters: {'learning_rate': 2.6951297496763464e-05, 'batch_size': 16, 'num_train_epochs': 5}. Best is trial 5 with value: 0.5733137445028971.


{'eval_loss': 1.2755590677261353, 'eval_accuracy': 0.564, 'eval_f1_macro': 0.5733137445028971, 'eval_runtime': 0.9967, 'eval_samples_per_second': 501.656, 'eval_steps_per_second': 32.106, 'epoch': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  return forward_call(*args, **kwargs)


{'loss': 1.5508, 'grad_norm': 4.531980991363525, 'learning_rate': 1.1666263929535294e-05, 'epoch': 1.0}
{'eval_loss': 1.4712074995040894, 'eval_accuracy': 0.362, 'eval_f1_macro': 0.29347521922907105, 'eval_runtime': 0.6325, 'eval_samples_per_second': 790.521, 'eval_steps_per_second': 25.297, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.4126, 'grad_norm': 5.779858112335205, 'learning_rate': 3.5896196706262434e-07, 'epoch': 2.0}
{'eval_loss': 1.4230363368988037, 'eval_accuracy': 0.378, 'eval_f1_macro': 0.3301264641354708, 'eval_runtime': 0.643, 'eval_samples_per_second': 777.566, 'eval_steps_per_second': 24.882, 'epoch': 2.0}
{'train_runtime': 20.8573, 'train_samples_per_second': 191.78, 'train_steps_per_second': 6.041, 'train_loss': 1.4817256624736483, 'epoch': 2.0}


  return forward_call(*args, **kwargs)
[I 2025-08-07 21:01:44,047] Trial 6 finished with value: 0.3301264641354708 and parameters: {'learning_rate': 2.2614603924945335e-05, 'batch_size': 32, 'num_train_epochs': 2}. Best is trial 5 with value: 0.5733137445028971.


{'eval_loss': 1.4230363368988037, 'eval_accuracy': 0.378, 'eval_f1_macro': 0.3301264641354708, 'eval_runtime': 0.6749, 'eval_samples_per_second': 740.819, 'eval_steps_per_second': 23.706, 'epoch': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  return forward_call(*args, **kwargs)


{'loss': 1.4218, 'grad_norm': 8.918575286865234, 'learning_rate': 2.9594663150505234e-05, 'epoch': 1.0}
{'eval_loss': 1.2088779211044312, 'eval_accuracy': 0.532, 'eval_f1_macro': 0.5291914328398994, 'eval_runtime': 1.6485, 'eval_samples_per_second': 303.312, 'eval_steps_per_second': 38.217, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.0478, 'grad_norm': 8.231688499450684, 'learning_rate': 1.4915238599756621e-05, 'epoch': 2.0}
{'eval_loss': 1.2032746076583862, 'eval_accuracy': 0.562, 'eval_f1_macro': 0.5523916804169381, 'eval_runtime': 1.6453, 'eval_samples_per_second': 303.905, 'eval_steps_per_second': 38.292, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 0.7551, 'grad_norm': 10.303268432617188, 'learning_rate': 1.768605367560074e-07, 'epoch': 3.0}
{'eval_loss': 1.2347662448883057, 'eval_accuracy': 0.596, 'eval_f1_macro': 0.5938229373061356, 'eval_runtime': 1.6539, 'eval_samples_per_second': 302.321, 'eval_steps_per_second': 38.092, 'epoch': 3.0}
{'train_runtime': 78.454, 'train_samples_per_second': 76.478, 'train_steps_per_second': 9.56, 'train_loss': 1.0748961181640626, 'epoch': 3.0}


  return forward_call(*args, **kwargs)
[I 2025-08-07 21:03:07,690] Trial 7 finished with value: 0.5938229373061356 and parameters: {'learning_rate': 4.4215134189001845e-05, 'batch_size': 8, 'num_train_epochs': 3}. Best is trial 7 with value: 0.5938229373061356.


{'eval_loss': 1.2347662448883057, 'eval_accuracy': 0.596, 'eval_f1_macro': 0.5938229373061356, 'eval_runtime': 1.7276, 'eval_samples_per_second': 289.411, 'eval_steps_per_second': 36.466, 'epoch': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  return forward_call(*args, **kwargs)


{'loss': 1.5907, 'grad_norm': 3.959890842437744, 'learning_rate': 1.1070122616636273e-05, 'epoch': 1.0}
{'eval_loss': 1.565200924873352, 'eval_accuracy': 0.292, 'eval_f1_macro': 0.16277083917673255, 'eval_runtime': 0.6354, 'eval_samples_per_second': 786.939, 'eval_steps_per_second': 25.182, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.5295, 'grad_norm': 4.972585678100586, 'learning_rate': 8.357286244179953e-06, 'epoch': 2.0}
{'eval_loss': 1.4885830879211426, 'eval_accuracy': 0.314, 'eval_f1_macro': 0.20905372944660447, 'eval_runtime': 0.6355, 'eval_samples_per_second': 786.757, 'eval_steps_per_second': 25.176, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 1.3943, 'grad_norm': 8.643104553222656, 'learning_rate': 5.600694446361435e-06, 'epoch': 3.0}
{'eval_loss': 1.3831959962844849, 'eval_accuracy': 0.406, 'eval_f1_macro': 0.4028618571570429, 'eval_runtime': 0.6351, 'eval_samples_per_second': 787.321, 'eval_steps_per_second': 25.194, 'epoch': 3.0}


  return forward_call(*args, **kwargs)


{'loss': 1.2768, 'grad_norm': 7.807661056518555, 'learning_rate': 2.844102648542916e-06, 'epoch': 4.0}
{'eval_loss': 1.336681842803955, 'eval_accuracy': 0.446, 'eval_f1_macro': 0.4522825785824164, 'eval_runtime': 0.638, 'eval_samples_per_second': 783.679, 'eval_steps_per_second': 25.078, 'epoch': 4.0}


  return forward_call(*args, **kwargs)


{'loss': 1.2026, 'grad_norm': 7.692954063415527, 'learning_rate': 8.751085072439742e-08, 'epoch': 5.0}
{'eval_loss': 1.32039213180542, 'eval_accuracy': 0.454, 'eval_f1_macro': 0.46258685953473966, 'eval_runtime': 0.6444, 'eval_samples_per_second': 775.915, 'eval_steps_per_second': 24.829, 'epoch': 5.0}
{'train_runtime': 51.8789, 'train_samples_per_second': 192.757, 'train_steps_per_second': 6.072, 'train_loss': 1.3987757364908855, 'epoch': 5.0}


  return forward_call(*args, **kwargs)
[I 2025-08-07 21:04:01,735] Trial 8 finished with value: 0.46258685953473966 and parameters: {'learning_rate': 1.3782958989092594e-05, 'batch_size': 32, 'num_train_epochs': 5}. Best is trial 7 with value: 0.5938229373061356.


{'eval_loss': 1.32039213180542, 'eval_accuracy': 0.454, 'eval_f1_macro': 0.46258685953473966, 'eval_runtime': 0.6961, 'eval_samples_per_second': 718.326, 'eval_steps_per_second': 22.986, 'epoch': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  return forward_call(*args, **kwargs)


{'loss': 1.4766, 'grad_norm': 5.919511795043945, 'learning_rate': 3.231383169003168e-05, 'epoch': 1.0}
{'eval_loss': 1.2525513172149658, 'eval_accuracy': 0.486, 'eval_f1_macro': 0.43474375357226974, 'eval_runtime': 0.9952, 'eval_samples_per_second': 502.411, 'eval_steps_per_second': 32.154, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.081, 'grad_norm': 5.0565185546875, 'learning_rate': 1.6413374826682757e-05, 'epoch': 2.0}
{'eval_loss': 1.1874380111694336, 'eval_accuracy': 0.544, 'eval_f1_macro': 0.5421852966861691, 'eval_runtime': 1.0267, 'eval_samples_per_second': 487.018, 'eval_steps_per_second': 31.169, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 0.8072, 'grad_norm': 7.888729572296143, 'learning_rate': 3.846884725003771e-07, 'epoch': 3.0}
{'eval_loss': 1.202750325202942, 'eval_accuracy': 0.578, 'eval_f1_macro': 0.5760127646396417, 'eval_runtime': 1.0009, 'eval_samples_per_second': 499.574, 'eval_steps_per_second': 31.973, 'epoch': 3.0}
{'train_runtime': 47.5808, 'train_samples_per_second': 126.101, 'train_steps_per_second': 7.881, 'train_loss': 1.1215946248372395, 'epoch': 3.0}


  return forward_call(*args, **kwargs)
[I 2025-08-07 21:04:53,813] Trial 9 finished with value: 0.5760127646396417 and parameters: {'learning_rate': 4.808605906254714e-05, 'batch_size': 16, 'num_train_epochs': 3}. Best is trial 7 with value: 0.5938229373061356.


{'eval_loss': 1.202750325202942, 'eval_accuracy': 0.578, 'eval_f1_macro': 0.5760127646396417, 'eval_runtime': 1.032, 'eval_samples_per_second': 484.517, 'eval_steps_per_second': 31.009, 'epoch': 3.0}


0,1
eval/accuracy,▄▇▆▇▇▇▃▃▃▇▇██▁▂▄▅▅▅▅▇██
eval/f1_macro,▄▇▇█▇█▃▄▄▇▇██▁▂▅▆▆▆▅▇██
eval/loss,▅▁▂▃▃▃▆▅▅▁▁▂▂█▇▅▄▃▃▂▁▁▁
eval/runtime,▃▃▃▃▃▃▁▁▁▇▇██▁▁▁▁▁▁▃▄▃▄
eval/samples_per_second,▄▄▄▄▄▄██▇▁▁▁▁█████▇▄▄▄▄
eval/steps_per_second,▅▅▅▅▅▅▂▂▁███▇▂▂▂▂▂▁▅▅▅▅
train/epoch,▁▁▃▃▅▅▆████▁▁▃▃▃▁▁▃▃▅▅▅▁▁▃▃▅▆▆███▁▁▃▃▅▅▅
train/global_step,▂▂▃▃▄▄▅▇▇▇▇▁▁▂▂▂▃▃▅▅███▁▁▂▂▂▃▃▄▄▄▂▂▃▃▄▄▄
train/grad_norm,▁▄▅▇█▂▃▅▅▆▁▂▅▄▄▃▂▄
train/learning_rate,▆▅▃▂▁▄▁▇▄▁▃▃▂▂▁█▅▁

0,1
eval/accuracy,0.578
eval/f1_macro,0.57601
eval/loss,1.20275
eval/runtime,1.032
eval/samples_per_second,484.517
eval/steps_per_second,31.009
total_flos,789354427392000.0
train/epoch,3.0
train/global_step,375.0
train/grad_norm,7.88873


In [20]:
study_roberta = optuna.create_study(direction="maximize",
                                    pruner=MedianPruner(n_startup_trials=2, n_warmup_steps=1),
                                    study_name=f"roberta_study_stratify_{is_preprocessed}",
                                    storage=f"sqlite:////content/drive/MyDrive/Colab Notebooks/nlp_project/optuna/roberta_study_stratify_maxl_256_{is_preprocessed}.db",
                                    load_if_exists=True)
study_roberta.optimize(objective_roberta, n_trials=5)
wandb.finish()

[I 2025-08-07 21:05:27,640] Using an existing study with name 'roberta_study_stratify_no_preprocess' instead of creating a new one.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  return forward_call(*args, **kwargs)


{'loss': 1.5538, 'grad_norm': 7.124143123626709, 'learning_rate': 1.6591987912395855e-05, 'epoch': 1.0}
{'eval_loss': 1.4615013599395752, 'eval_accuracy': 0.374, 'eval_f1_macro': 0.2645378140912062, 'eval_runtime': 0.9953, 'eval_samples_per_second': 502.362, 'eval_steps_per_second': 32.151, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.2901, 'grad_norm': 17.27823257446289, 'learning_rate': 1.249356858742158e-05, 'epoch': 2.0}
{'eval_loss': 1.3017396926879883, 'eval_accuracy': 0.468, 'eval_f1_macro': 0.47369367913892046, 'eval_runtime': 0.9964, 'eval_samples_per_second': 501.797, 'eval_steps_per_second': 32.115, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 1.0963, 'grad_norm': 9.05462646484375, 'learning_rate': 8.395149262447305e-06, 'epoch': 3.0}
{'eval_loss': 1.2451032400131226, 'eval_accuracy': 0.522, 'eval_f1_macro': 0.5347328202955637, 'eval_runtime': 1.0469, 'eval_samples_per_second': 477.588, 'eval_steps_per_second': 30.566, 'epoch': 3.0}


  return forward_call(*args, **kwargs)


{'loss': 0.9319, 'grad_norm': 23.759342193603516, 'learning_rate': 4.263678168723237e-06, 'epoch': 4.0}
{'eval_loss': 1.2771320343017578, 'eval_accuracy': 0.54, 'eval_f1_macro': 0.5450590621178856, 'eval_runtime': 0.9842, 'eval_samples_per_second': 508.006, 'eval_steps_per_second': 32.512, 'epoch': 4.0}


  return forward_call(*args, **kwargs)


{'loss': 0.8355, 'grad_norm': 18.477176666259766, 'learning_rate': 1.3220707499917017e-07, 'epoch': 5.0}
{'eval_loss': 1.2826781272888184, 'eval_accuracy': 0.524, 'eval_f1_macro': 0.5355646684375239, 'eval_runtime': 0.9833, 'eval_samples_per_second': 508.498, 'eval_steps_per_second': 32.544, 'epoch': 5.0}
{'train_runtime': 83.1479, 'train_samples_per_second': 120.268, 'train_steps_per_second': 7.517, 'train_loss': 1.141515869140625, 'epoch': 5.0}


  return forward_call(*args, **kwargs)
[I 2025-08-07 21:06:53,245] Trial 5 finished with value: 0.5450590621178856 and parameters: {'learning_rate': 2.0657355468620338e-05, 'batch_size': 16, 'num_train_epochs': 5}. Best is trial 4 with value: 0.5787705726512649.


{'eval_loss': 1.2771320343017578, 'eval_accuracy': 0.54, 'eval_f1_macro': 0.5450590621178856, 'eval_runtime': 1.0227, 'eval_samples_per_second': 488.912, 'eval_steps_per_second': 31.29, 'epoch': 5.0}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  return forward_call(*args, **kwargs)


{'loss': 1.5822, 'grad_norm': 3.8010709285736084, 'learning_rate': 1.0986889515214714e-05, 'epoch': 1.0}
{'eval_loss': 1.5519810914993286, 'eval_accuracy': 0.286, 'eval_f1_macro': 0.1234090354090354, 'eval_runtime': 0.6504, 'eval_samples_per_second': 768.792, 'eval_steps_per_second': 24.601, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.4567, 'grad_norm': 16.68305015563965, 'learning_rate': 8.337876628147136e-06, 'epoch': 2.0}
{'eval_loss': 1.423843502998352, 'eval_accuracy': 0.35, 'eval_f1_macro': 0.30348322122430577, 'eval_runtime': 0.639, 'eval_samples_per_second': 782.523, 'eval_steps_per_second': 25.041, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 1.309, 'grad_norm': 19.621980667114258, 'learning_rate': 5.602010859536356e-06, 'epoch': 3.0}
{'eval_loss': 1.3233089447021484, 'eval_accuracy': 0.452, 'eval_f1_macro': 0.4512080524750096, 'eval_runtime': 0.6431, 'eval_samples_per_second': 777.444, 'eval_steps_per_second': 24.878, 'epoch': 3.0}


  return forward_call(*args, **kwargs)


{'loss': 1.189, 'grad_norm': 18.221874237060547, 'learning_rate': 2.8661450909255777e-06, 'epoch': 4.0}
{'eval_loss': 1.3025363683700562, 'eval_accuracy': 0.464, 'eval_f1_macro': 0.47546957609808455, 'eval_runtime': 0.64, 'eval_samples_per_second': 781.273, 'eval_steps_per_second': 25.001, 'epoch': 4.0}


  return forward_call(*args, **kwargs)


{'loss': 1.1256, 'grad_norm': 14.022656440734863, 'learning_rate': 1.30279322314799e-07, 'epoch': 5.0}
{'eval_loss': 1.296478033065796, 'eval_accuracy': 0.49, 'eval_f1_macro': 0.5043878384687208, 'eval_runtime': 0.6497, 'eval_samples_per_second': 769.57, 'eval_steps_per_second': 24.626, 'epoch': 5.0}
{'train_runtime': 55.0627, 'train_samples_per_second': 181.611, 'train_steps_per_second': 5.721, 'train_loss': 1.3324882628425718, 'epoch': 5.0}


  return forward_call(*args, **kwargs)
[I 2025-08-07 21:07:52,046] Trial 6 finished with value: 0.5043878384687208 and parameters: {'learning_rate': 1.3679328843053893e-05, 'batch_size': 32, 'num_train_epochs': 5}. Best is trial 4 with value: 0.5787705726512649.


{'eval_loss': 1.296478033065796, 'eval_accuracy': 0.49, 'eval_f1_macro': 0.5043878384687208, 'eval_runtime': 0.6958, 'eval_samples_per_second': 718.643, 'eval_steps_per_second': 22.997, 'epoch': 5.0}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  return forward_call(*args, **kwargs)


{'loss': 1.5245, 'grad_norm': 7.666121959686279, 'learning_rate': 2.2995532896872272e-05, 'epoch': 1.0}
{'eval_loss': 1.3507355451583862, 'eval_accuracy': 0.42, 'eval_f1_macro': 0.3956424771379455, 'eval_runtime': 1.0106, 'eval_samples_per_second': 494.759, 'eval_steps_per_second': 31.665, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.2493, 'grad_norm': 12.835816383361816, 'learning_rate': 1.7326654011758632e-05, 'epoch': 2.0}
{'eval_loss': 1.28301203250885, 'eval_accuracy': 0.492, 'eval_f1_macro': 0.4968350532205664, 'eval_runtime': 1.007, 'eval_samples_per_second': 496.516, 'eval_steps_per_second': 31.777, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 1.0459, 'grad_norm': 17.082054138183594, 'learning_rate': 1.1612058361442459e-05, 'epoch': 3.0}
{'eval_loss': 1.2793805599212646, 'eval_accuracy': 0.5, 'eval_f1_macro': 0.5169525944308797, 'eval_runtime': 0.9983, 'eval_samples_per_second': 500.843, 'eval_steps_per_second': 32.054, 'epoch': 3.0}


  return forward_call(*args, **kwargs)


{'loss': 0.8814, 'grad_norm': 17.282516479492188, 'learning_rate': 5.943179476328818e-06, 'epoch': 4.0}
{'eval_loss': 1.2923190593719482, 'eval_accuracy': 0.546, 'eval_f1_macro': 0.5499913781194773, 'eval_runtime': 1.0212, 'eval_samples_per_second': 489.608, 'eval_steps_per_second': 31.335, 'epoch': 4.0}


  return forward_call(*args, **kwargs)


{'loss': 0.7432, 'grad_norm': 6.876336574554443, 'learning_rate': 2.2858382601264685e-07, 'epoch': 5.0}
{'eval_loss': 1.3544284105300903, 'eval_accuracy': 0.54, 'eval_f1_macro': 0.5556627381374681, 'eval_runtime': 1.0199, 'eval_samples_per_second': 490.226, 'eval_steps_per_second': 31.374, 'epoch': 5.0}
{'train_runtime': 82.8767, 'train_samples_per_second': 120.661, 'train_steps_per_second': 7.541, 'train_loss': 1.0888759765625, 'epoch': 5.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.3544284105300903, 'eval_accuracy': 0.54, 'eval_f1_macro': 0.5556627381374681, 'eval_runtime': 1.0993, 'eval_samples_per_second': 454.855, 'eval_steps_per_second': 29.111, 'epoch': 5.0}


[I 2025-08-07 21:09:19,411] Trial 7 finished with value: 0.5556627381374681 and parameters: {'learning_rate': 2.8572978251580856e-05, 'batch_size': 16, 'num_train_epochs': 5}. Best is trial 4 with value: 0.5787705726512649.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  return forward_call(*args, **kwargs)


{'loss': 1.5458, 'grad_norm': 12.845026969909668, 'learning_rate': 9.8703396753916e-06, 'epoch': 1.0}
{'eval_loss': 1.393621563911438, 'eval_accuracy': 0.382, 'eval_f1_macro': 0.3102844025116184, 'eval_runtime': 1.711, 'eval_samples_per_second': 292.231, 'eval_steps_per_second': 36.821, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.2924, 'grad_norm': 15.390912055969238, 'learning_rate': 7.41997618668521e-06, 'epoch': 2.0}
{'eval_loss': 1.297756552696228, 'eval_accuracy': 0.482, 'eval_f1_macro': 0.49015833445287865, 'eval_runtime': 1.7245, 'eval_samples_per_second': 289.936, 'eval_steps_per_second': 36.532, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 1.1005, 'grad_norm': 16.714962005615234, 'learning_rate': 4.959771880755101e-06, 'epoch': 3.0}
{'eval_loss': 1.2529124021530151, 'eval_accuracy': 0.502, 'eval_f1_macro': 0.5181085565453456, 'eval_runtime': 1.6733, 'eval_samples_per_second': 298.819, 'eval_steps_per_second': 37.651, 'epoch': 3.0}


  return forward_call(*args, **kwargs)


{'loss': 0.9691, 'grad_norm': 44.808692932128906, 'learning_rate': 2.499567574824991e-06, 'epoch': 4.0}
{'eval_loss': 1.3215268850326538, 'eval_accuracy': 0.534, 'eval_f1_macro': 0.5519521065849791, 'eval_runtime': 1.6919, 'eval_samples_per_second': 295.526, 'eval_steps_per_second': 37.236, 'epoch': 4.0}


  return forward_call(*args, **kwargs)


{'loss': 0.8637, 'grad_norm': 31.92978286743164, 'learning_rate': 4.920408611860219e-08, 'epoch': 5.0}
{'eval_loss': 1.3209311962127686, 'eval_accuracy': 0.528, 'eval_f1_macro': 0.5467085866181285, 'eval_runtime': 1.6755, 'eval_samples_per_second': 298.417, 'eval_steps_per_second': 37.601, 'epoch': 5.0}
{'train_runtime': 136.7478, 'train_samples_per_second': 73.127, 'train_steps_per_second': 9.141, 'train_loss': 1.1542795776367187, 'epoch': 5.0}


  return forward_call(*args, **kwargs)
[I 2025-08-07 21:11:39,318] Trial 8 finished with value: 0.5519521065849791 and parameters: {'learning_rate': 1.2301021529650548e-05, 'batch_size': 8, 'num_train_epochs': 5}. Best is trial 4 with value: 0.5787705726512649.


{'eval_loss': 1.3215268850326538, 'eval_accuracy': 0.534, 'eval_f1_macro': 0.5519521065849791, 'eval_runtime': 1.7041, 'eval_samples_per_second': 293.414, 'eval_steps_per_second': 36.97, 'epoch': 5.0}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  return forward_call(*args, **kwargs)


{'loss': 1.5552, 'grad_norm': 7.353111267089844, 'learning_rate': 1.4721861048020431e-05, 'epoch': 1.0}
{'eval_loss': 1.4334838390350342, 'eval_accuracy': 0.354, 'eval_f1_macro': 0.25921335977400023, 'eval_runtime': 1.006, 'eval_samples_per_second': 497.014, 'eval_steps_per_second': 31.809, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.2995, 'grad_norm': 15.950003623962402, 'learning_rate': 7.448214285164487e-06, 'epoch': 2.0}
{'eval_loss': 1.2986464500427246, 'eval_accuracy': 0.478, 'eval_f1_macro': 0.49811944420239895, 'eval_runtime': 0.9992, 'eval_samples_per_second': 500.396, 'eval_steps_per_second': 32.025, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 1.1293, 'grad_norm': 11.650919914245605, 'learning_rate': 2.3275669641139022e-07, 'epoch': 3.0}
{'eval_loss': 1.285720944404602, 'eval_accuracy': 0.496, 'eval_f1_macro': 0.5099192149439827, 'eval_runtime': 0.9895, 'eval_samples_per_second': 505.304, 'eval_steps_per_second': 32.339, 'epoch': 3.0}
{'train_runtime': 49.0209, 'train_samples_per_second': 122.397, 'train_steps_per_second': 7.65, 'train_loss': 1.32797998046875, 'epoch': 3.0}


  return forward_call(*args, **kwargs)
[I 2025-08-07 21:12:32,054] Trial 9 finished with value: 0.5099192149439827 and parameters: {'learning_rate': 2.1820940288567834e-05, 'batch_size': 16, 'num_train_epochs': 3}. Best is trial 4 with value: 0.5787705726512649.


{'eval_loss': 1.285720944404602, 'eval_accuracy': 0.496, 'eval_f1_macro': 0.5099192149439827, 'eval_runtime': 1.0443, 'eval_samples_per_second': 478.769, 'eval_steps_per_second': 30.641, 'epoch': 3.0}


0,1
eval/accuracy,▃▆▇█▇█▁▃▅▆▆▆▅▇▇███▄▆▇███▃▆▇▇
eval/f1_macro,▃▇████▁▄▆▇▇▇▅▇▇███▄▇▇███▃▇▇▇
eval/loss,▆▂▁▂▂▂█▅▃▂▂▂▃▂▂▂▃▃▄▂▁▃▃▃▅▂▂▂
eval/runtime,▃▃▄▃▃▃▁▁▁▁▁▁▃▃▃▃▃▄██████▃▃▃▄
eval/samples_per_second,▄▄▄▄▄▄█████▇▄▄▄▄▄▃▁▁▁▁▁▁▄▄▄▄
eval/steps_per_second,▅▅▅▆▆▅▂▂▂▂▂▁▅▅▅▅▅▄█▇████▅▅▅▅
train/epoch,▁▃▃▅▅██▁▁▃▆▆███▁▃▃▅▅▆███▁▃▃▅▅▆████▁▃▃▅▅▅
train/global_step,▁▁▂▂▃▃▄▄▄▄▁▁▁▁▂▂▂▂▂▁▂▂▃▃▃▄▂▂▃▃▅▆▆███▁▃▃▃
train/grad_norm,▂▃▂▄▄▁▃▄▃▃▂▃▃▃▂▃▃▃█▆▂▃▂
train/learning_rate,▆▅▄▂▁▄▄▃▂▁█▆▅▃▁▄▃▂▂▁▅▃▁

0,1
eval/accuracy,0.496
eval/f1_macro,0.50992
eval/loss,1.28572
eval/runtime,1.0443
eval/samples_per_second,478.769
eval/steps_per_second,30.641
total_flos,789354427392000.0
train/epoch,3.0
train/global_step,375.0
train/grad_norm,11.65092


In [21]:
best_trial_bert = study_bert.best_trial
print('Bert best trial on subset:')
print(best_trial_bert.params)
best_trial_roberta = study_roberta.best_trial
print('RoBerta best trial on subset:')
print(best_trial_roberta.params)

Bert best trial on subset:
{'learning_rate': 4.4215134189001845e-05, 'batch_size': 8, 'num_train_epochs': 3}
RoBerta best trial on subset:
{'learning_rate': 3.9023135920177784e-05, 'batch_size': 16, 'num_train_epochs': 5}


In [22]:
best_params_bert = best_trial_bert.params
run_name_bert = f"bert_final_stratify_{is_preprocessed}-ep{best_params_bert['num_train_epochs']}-lr{best_params_bert['learning_rate']:.1e}-bs{best_params_bert['batch_size']}"
wandb.init(project="tweet-sentiment-classification_split_to_test_maxl_256", name=run_name_bert, reinit=True)

final_trainer_bert = build_trainer(
    model_checkpoint="bert-base-uncased",
    trial=best_trial_bert,
    run_prefix=f"bert_final_stratify_{is_preprocessed}",
    train_dataset=tokenized_bert_train,
    val_dataset=tokenized_bert_val
)
final_trainer_bert.train()
final_trainer_bert.evaluate(tokenized_bert_test)
wandb.finish()



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  return forward_call(*args, **kwargs)


{'loss': 0.949, 'grad_norm': 20.738698959350586, 'learning_rate': 2.9495488177428533e-05, 'epoch': 1.0}
{'eval_loss': 0.7834368348121643, 'eval_accuracy': 0.8105263157894737, 'eval_f1_macro': 0.8174129839043711, 'eval_runtime': 22.2296, 'eval_samples_per_second': 303.424, 'eval_steps_per_second': 37.967, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 0.6851, 'grad_norm': 2.0032002925872803, 'learning_rate': 1.4764602934998836e-05, 'epoch': 2.0}
{'eval_loss': 0.699436366558075, 'eval_accuracy': 0.8613787991104522, 'eval_f1_macro': 0.8665421915197182, 'eval_runtime': 21.9741, 'eval_samples_per_second': 306.952, 'eval_steps_per_second': 38.409, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 0.5624, 'grad_norm': 0.16492992639541626, 'learning_rate': 3.371769256914223e-08, 'epoch': 3.0}
{'eval_loss': 0.7147858142852783, 'eval_accuracy': 0.8744255003706449, 'eval_f1_macro': 0.8788204680910514, 'eval_runtime': 21.9778, 'eval_samples_per_second': 306.901, 'eval_steps_per_second': 38.402, 'epoch': 3.0}
{'train_runtime': 1136.6259, 'train_samples_per_second': 83.051, 'train_steps_per_second': 10.383, 'train_loss': 0.7321671843225724, 'epoch': 3.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 0.7161492109298706, 'eval_accuracy': 0.8733689205219455, 'eval_f1_macro': 0.8769232968000985, 'eval_runtime': 22.1349, 'eval_samples_per_second': 304.678, 'eval_steps_per_second': 38.085, 'epoch': 3.0}


0,1
eval/accuracy,▁▇██
eval/f1_macro,▁▇██
eval/loss,█▁▂▂
eval/runtime,█▁▁▅
eval/samples_per_second,▁██▃
eval/steps_per_second,▁██▃
train/epoch,▁▁▅▅████
train/global_step,▁▁▅▅████
train/grad_norm,█▂▁
train/learning_rate,█▅▁

0,1
eval/accuracy,0.87337
eval/f1_macro,0.87692
eval/loss,0.71615
eval/runtime,22.1349
eval/samples_per_second,304.678
eval/steps_per_second,38.085
total_flos,1.2418913206158336e+16
train/epoch,3.0
train/global_step,11802.0
train/grad_norm,0.16493


In [23]:
best_params_roberta = best_trial_roberta.params
run_name_roberta = f"roberta_final_stratify_{is_preprocessed}-ep{best_params_roberta['num_train_epochs']}-lr{best_params_roberta['learning_rate']:.1e}-bs{best_params_roberta['batch_size']}"
wandb.init(project="tweet-sentiment-classification_split_to_test_maxl_256", name=run_name_roberta, reinit=True)

final_trainer_roberta = build_trainer(
    model_checkpoint="roberta-base",
    trial=best_trial_roberta,
    run_prefix=f"roberta_final_stratify_{is_preprocessed}",
    train_dataset=tokenized_roberta_train,
    val_dataset=tokenized_roberta_val
)
final_trainer_roberta.train()
final_trainer_roberta.evaluate(tokenized_roberta_test)
wandb.finish()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  return forward_call(*args, **kwargs)


{'loss': 1.1045, 'grad_norm': 15.890998840332031, 'learning_rate': 3.124231542811183e-05, 'epoch': 1.0}
{'eval_loss': 0.8863462209701538, 'eval_accuracy': 0.7405485544848036, 'eval_f1_macro': 0.7507302485192145, 'eval_runtime': 13.7521, 'eval_samples_per_second': 490.47, 'eval_steps_per_second': 30.686, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 0.7892, 'grad_norm': 4.179565906524658, 'learning_rate': 2.344165602607121e-05, 'epoch': 2.0}
{'eval_loss': 0.804499626159668, 'eval_accuracy': 0.792290585618977, 'eval_f1_macro': 0.8006433994853828, 'eval_runtime': 13.3374, 'eval_samples_per_second': 505.722, 'eval_steps_per_second': 31.64, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 0.6619, 'grad_norm': 16.28511619567871, 'learning_rate': 1.5637028842035652e-05, 'epoch': 3.0}
{'eval_loss': 0.829306960105896, 'eval_accuracy': 0.7943661971830986, 'eval_f1_macro': 0.800023331955587, 'eval_runtime': 13.3251, 'eval_samples_per_second': 506.186, 'eval_steps_per_second': 31.669, 'epoch': 3.0}


  return forward_call(*args, **kwargs)


{'loss': 0.58, 'grad_norm': 9.4111909866333, 'learning_rate': 7.832401658000095e-06, 'epoch': 4.0}
{'eval_loss': 0.7673983573913574, 'eval_accuracy': 0.8406226834692365, 'eval_f1_macro': 0.8462306266201365, 'eval_runtime': 13.3522, 'eval_samples_per_second': 505.16, 'eval_steps_per_second': 31.605, 'epoch': 4.0}


  return forward_call(*args, **kwargs)


{'loss': 0.5203, 'grad_norm': 0.5141141414642334, 'learning_rate': 3.174225595947354e-08, 'epoch': 5.0}
{'eval_loss': 0.8042707443237305, 'eval_accuracy': 0.8465530022238695, 'eval_f1_macro': 0.8515113328830293, 'eval_runtime': 13.369, 'eval_samples_per_second': 504.527, 'eval_steps_per_second': 31.566, 'epoch': 5.0}
{'train_runtime': 1096.9863, 'train_samples_per_second': 143.42, 'train_steps_per_second': 8.965, 'train_loss': 0.7311791284713396, 'epoch': 5.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 0.8232207298278809, 'eval_accuracy': 0.8404507710557533, 'eval_f1_macro': 0.8450049045492734, 'eval_runtime': 13.3989, 'eval_samples_per_second': 503.325, 'eval_steps_per_second': 31.495, 'epoch': 5.0}


0,1
eval/accuracy,▁▄▅███
eval/f1_macro,▁▄▄███
eval/loss,█▃▅▁▃▄
eval/runtime,█▁▁▁▂▂
eval/samples_per_second,▁███▇▇
eval/steps_per_second,▁███▇▇
train/epoch,▁▁▃▃▅▅▆▆████
train/global_step,▁▁▃▃▅▅▆▆████
train/grad_norm,█▃█▅▁
train/learning_rate,█▆▅▃▁

0,1
eval/accuracy,0.84045
eval/f1_macro,0.845
eval/loss,0.82322
eval/runtime,13.3989
eval/samples_per_second,503.325
eval/steps_per_second,31.495
total_flos,2.069818867693056e+16
train/epoch,5.0
train/global_step,9835.0
train/grad_norm,0.51411


In [24]:
final_trainer_bert.save_model(f"models/bert_final_stratify_{is_preprocessed}")
bert_tokenizer.save_pretrained(f"models/bert_final_stratify_{is_preprocessed}")
!cp -r models/bert_final_stratify_{is_preprocessed} "/content/drive/MyDrive/Colab Notebooks/nlp_project/models/w_test_split/bert_best_model_stratify_maxl_256_{is_preprocessed}"

final_trainer_roberta.save_model(f"models/roberta_final_stratify_{is_preprocessed}")
roberta_tokenizer.save_pretrained(f"models/roberta_final_stratify_{is_preprocessed}")
!cp -r models/roberta_final_stratify_{is_preprocessed} "/content/drive/MyDrive/Colab Notebooks/nlp_project/models/w_test_split/roberta_best_model_stratify_maxl_256_{is_preprocessed}"

#### Ensemble Bert and RoBerta

Load best models

In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

best_bert_path = f"/content/drive/MyDrive/Colab Notebooks/nlp_project/models/w_test_split/bert_best_model_stratify_maxl_256_{is_preprocessed}"
bert_model = AutoModelForSequenceClassification.from_pretrained(best_bert_path).to(device)
bert_tokenizer = AutoTokenizer.from_pretrained(best_bert_path)

best_roberta_path = f"/content/drive/MyDrive/Colab Notebooks/nlp_project/models/w_test_split/roberta_best_model_stratify_maxl_256_{is_preprocessed}"
roberta_model = AutoModelForSequenceClassification.from_pretrained(best_roberta_path).to(device)
roberta_tokenizer = AutoTokenizer.from_pretrained(best_roberta_path)


Tokenize testset

In [26]:
bert_inputs = bert_tokenizer(
    list(test_df["clean_text"]),
    padding=True,
    truncation=True,
    return_tensors="pt"
).to(device)

roberta_inputs = roberta_tokenizer(
    list(test_df["clean_text"]),
    padding=True,
    truncation=True,
    return_tensors="pt"
).to(device)

Generate predictions with both models

In [27]:
# Prepare DataLoader
batch_size = 32  # adjust if needed

bert_dataset = TensorDataset(
    bert_inputs["input_ids"],
    bert_inputs["attention_mask"]
)
bert_loader = DataLoader(bert_dataset, batch_size=batch_size)

roberta_dataset = TensorDataset(
    roberta_inputs["input_ids"],
    roberta_inputs["attention_mask"]
)
roberta_loader = DataLoader(roberta_dataset, batch_size=batch_size)

In [28]:
bert_model.eval()
roberta_model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

Ensemble probabilities

In [29]:
bert_probs_all = []
roberta_probs_all = []

In [30]:
with torch.no_grad():
    for (bert_input_ids, bert_attention_mask), (roberta_input_ids, roberta_attention_mask) in zip(bert_loader, roberta_loader):
        bert_input_ids = bert_input_ids.to(device)
        bert_attention_mask = bert_attention_mask.to(device)
        roberta_input_ids = roberta_input_ids.to(device)
        roberta_attention_mask = roberta_attention_mask.to(device)

        # Get model outputs
        bert_logits = bert_model(input_ids=bert_input_ids, attention_mask=bert_attention_mask).logits
        roberta_logits = roberta_model(input_ids=roberta_input_ids, attention_mask=roberta_attention_mask).logits

        # Convert to softmax probabilities
        bert_probs = torch.nn.functional.softmax(bert_logits, dim=1).cpu().numpy()
        roberta_probs = torch.nn.functional.softmax(roberta_logits, dim=1).cpu().numpy()

        bert_probs_all.append(bert_probs)
        roberta_probs_all.append(roberta_probs)

# Concatenate all batches
import numpy as np
bert_probs_all = np.concatenate(bert_probs_all, axis=0)
roberta_probs_all = np.concatenate(roberta_probs_all, axis=0)

# Ensemble prediction
ensemble_probs = (bert_probs_all + roberta_probs_all) / 2
ensemble_preds = np.argmax(ensemble_probs, axis=1)

  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


In [31]:
# Evaluate performance
true_labels = test_df["label"].values

acc = accuracy_score(true_labels, ensemble_preds)
f1 = f1_score(true_labels, ensemble_preds, average="macro")

print(f"✅ Ensemble Accuracy: {acc:.4f}")
print(f"✅ Ensemble F1 Macro: {f1:.4f}")


✅ Ensemble Accuracy: 0.8743
✅ Ensemble F1 Macro: 0.8775


In [32]:
print(classification_report(true_labels, ensemble_preds, target_names=unique_labels))

                    precision    recall  f1-score   support

Extremely Negative       0.85      0.92      0.88       911
Extremely Positive       0.88      0.93      0.91      1084
          Negative       0.84      0.85      0.85      1644
           Neutral       0.92      0.86      0.89      1250
          Positive       0.89      0.85      0.87      1855

          accuracy                           0.87      6744
         macro avg       0.87      0.88      0.88      6744
      weighted avg       0.88      0.87      0.87      6744

