In [None]:
# ✅ INSTALL DEPENDENCIES
!pip install evaluate
!pip install optuna
!pip install -U transformers

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5
Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.4-py3-none-any.whl (247 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.0/247.0 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.

In [None]:
# ✅ IMPORTS
import os
import re
import wandb
import optuna
import evaluate
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
from torch import nn
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, AutoConfig,
    TrainingArguments, Trainer, EarlyStoppingCallback,
    default_data_collator
)
from transformers.integrations import WandbCallback
from datasets import Dataset
from datetime import datetime

from optuna import trial


In [None]:
# ✅ MOUNT DRIVE
from google.colab import drive

# Directory under which Google Drive is mounted in this runtime.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)



Mounted at /content/drive


In [None]:
import os
import wandb

# SECURITY: never store the W&B API key in the notebook. The key that used to
# be hard-coded here has been exposed and should be revoked and regenerated.
# The key is read from the WANDB_API_KEY environment variable; when it is not
# set, key=None lets wandb fall back to stored credentials or a prompt.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmaximilianon[0m ([33mmaximilianon-tel-aviv-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# ✅ CONSTANTS
MODEL_NAME_TWITTER = "cardiffnlp/twitter-roberta-base"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Optional: safer SDPA fallback on some Colab combos.
# torch.backends.cuda.sdp_kernel(...) is a deprecated *context manager*; calling
# it as a plain statement never enters the context, so it changed nothing. The
# global toggles below actually apply the intended math-only attention backend.
# NOTE(review): the math backend is presumably slower than the fused kernels —
# drop these three lines if the fallback is not needed.
if torch.cuda.is_available():
    torch.backends.cuda.enable_flash_sdp(False)
    torch.backends.cuda.enable_mem_efficient_sdp(False)
    torch.backends.cuda.enable_math_sdp(True)


  self.gen = func(*args, **kwds)


In [None]:
# ✅ LOAD & CLEAN DATA
label2id = {
    "Extremely Negative": 0,
    "Negative": 1,
    "Neutral": 2,
    "Positive": 3,
    "Extremely Positive": 4
}

train_path = "/content/drive/MyDrive/Colab Notebooks/Corona_NLP_train_clean.csv"
test_path  = "/content/drive/MyDrive/Colab Notebooks/Corona_NLP_test_clean.csv"


def _load_labeled_split(csv_path, split_name):
    """Read one CSV split and return its ``cleaned_tweets`` / ``label`` columns.

    Rows without tweet text are dropped. The ``Sentiment`` strings are mapped
    to class ids through ``label2id``.

    Args:
        csv_path: path of the CSV file to read (ISO-8859-1 encoded).
        split_name: short name used only in the error message.

    Returns:
        A DataFrame with exactly the columns ``cleaned_tweets`` and ``label``.

    Raises:
        ValueError: if a remaining row has a sentiment missing from ``label2id``.
    """
    raw = pd.read_csv(csv_path, encoding="ISO-8859-1")
    with_text = raw.dropna(subset=["cleaned_tweets"])
    labels = with_text["Sentiment"].map(label2id)
    # Series.map yields NaN for values absent from the mapping; fail here with a
    # clear message instead of letting a later integer cast fail obscurely.
    unknown = with_text.loc[labels.isna(), "Sentiment"].unique().tolist()
    if unknown:
        raise ValueError(
            f"Unmapped Sentiment values in the {split_name} split: {unknown}"
        )
    return with_text.assign(label=labels)[["cleaned_tweets", "label"]]


train_df = _load_labeled_split(train_path, "train")
test_df  = _load_labeled_split(test_path, "test")

def normalize_tweet(t: str) -> str:
    """Replace URLs, @-mentions and digit runs with placeholder tokens.

    The input is coerced with ``str`` first. Substitutions run in the order
    URL -> mention -> number, then surrounding whitespace is stripped.
    """
    placeholder_rules = (
        (r"http\S+", "<url>"),
        (r"@\w+", "<user>"),
        (r"\d+", "<number>"),
    )
    normalized = str(t)
    for pattern, placeholder in placeholder_rules:
        normalized = re.sub(pattern, placeholder, normalized)
    return normalized.strip()

# Normalize the tweet text of both frames and keep only rows whose normalized
# text is at least 3 characters long.
train_df, test_df = (
    frame.assign(
        cleaned_tweets=frame["cleaned_tweets"].astype(str).map(normalize_tweet)
    ).loc[lambda f: f["cleaned_tweets"].str.len() >= 3]
    for frame in (train_df, test_df)
)

# Labels must be plain integers for the stratified split and for training.
train_df = train_df.astype({"label": int})
test_df = test_df.astype({"label": int})

# Split: stratified 80/20 train/validation; the test frame is kept whole.
train_df, val_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df["label"],
)
test_df_final = test_df.copy()

# Hard checks: expected columns are present and labels lie in 0..4.
required_columns = {"cleaned_tweets", "label"}
assert required_columns <= set(train_df.columns)
assert all(split["label"].between(0, 4).all() for split in (train_df, val_df))


In [None]:
# ✅ TOKENIZER + SPECIAL TOKENS (ADD ONCE, OUTSIDE OBJECTIVE)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_TWITTER, use_fast=True)

# A pad token is usually present for RoBERTa; add one only if it is missing.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "<pad>"})

# Register the placeholder tokens that the vocabulary does not contain yet.
# NOTE(review): the cardiffnlp model card appears to use "@user" / "http" as its
# own mention/link placeholders — confirm that new tokens are preferable.
custom_specials = ["<url>", "<user>", "<number>"]
known_tokens = tokenizer.get_vocab()
to_add = [token for token in custom_specials if token not in known_tokens]
num_added = (
    tokenizer.add_special_tokens({"additional_special_tokens": to_add})
    if to_add
    else 0
)

print(f"Added {num_added} new tokens: {to_add if to_add else '[]'}")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Added 3 new tokens: ['<url>', '<user>', '<number>']


In [None]:
# ✅ DATASETS
# Convert each pandas split to a Hugging Face Dataset; the index is reset first
# so that the old row index is not carried over.
train_dataset_hf, val_dataset_hf, test_dataset_hf = (
    Dataset.from_pandas(split_frame.reset_index(drop=True))
    for split_frame in (train_df, val_df, test_df_final)
)


In [None]:
# ✅ METRICS
# Load the four `evaluate` metrics once; compute_metrics reuses these objects.
acc_metric, f1_metric, prec_metric, rec_metric = (
    evaluate.load(metric_id)
    for metric_id in ("accuracy", "f1", "precision", "recall")
)

def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` evaluation pair into the reported metrics.

    Predictions are the arg-max over the last axis of the logits. Returns a
    dict with ``accuracy``, ``f1_macro`` (macro-averaged F1) and the
    ``precision_weighted`` / ``recall_weighted`` (weighted-average) scores.
    """
    logits, labels = eval_pred
    scored = {"predictions": np.argmax(logits, axis=-1), "references": labels}

    accuracy = acc_metric.compute(**scored)["accuracy"]
    f1_macro = f1_metric.compute(average="macro", **scored)["f1"]
    # The weighted variants are kept for inspection alongside the macro F1.
    precision_weighted = prec_metric.compute(average="weighted", **scored)["precision"]
    recall_weighted = rec_metric.compute(average="weighted", **scored)["recall"]

    return {
        "accuracy": accuracy,
        "f1_macro": f1_macro,
        "precision_weighted": precision_weighted,
        "recall_weighted": recall_weighted,
    }


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# ✅ MODEL FACTORY (RESIZES EMBEDDINGS TO MATCH TOKENIZER)
def build_model(dropout: float | None = None):
    """Create a 5-class sequence classifier from ``MODEL_NAME_TWITTER``.

    Args:
        dropout: when given, used for the hidden, attention and classifier
            dropout of the config. When ``None``, the checkpoint's own dropout
            settings are left untouched.

    Returns:
        The model, with its token embeddings resized to ``len(tokenizer)`` and
        ``pad_token_id`` aligned with the tokenizer.
    """
    label_names = [
        "Extremely Negative",
        "Negative",
        "Neutral",
        "Positive",
        "Extremely Positive",
    ]

    # Only pass dropout overrides when a value was supplied: keyword arguments
    # given to from_pretrained replace the config attributes, so forwarding
    # None would overwrite the checkpoint's dropout probabilities with None.
    dropout_overrides = {}
    if dropout is not None:
        dropout_overrides = {
            "hidden_dropout_prob": dropout,
            "attention_probs_dropout_prob": dropout,
            "classifier_dropout": dropout,
        }

    config = AutoConfig.from_pretrained(
        MODEL_NAME_TWITTER,
        num_labels=5,
        problem_type="single_label_classification",
        id2label=dict(enumerate(label_names)),
        label2id={name: idx for idx, name in enumerate(label_names)},
        **dropout_overrides,
    )
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME_TWITTER, config=config)
    # keep tokenizer/model in sync (because <url>/<user>/<number> were added)
    model.resize_token_embeddings(len(tokenizer))
    if tokenizer.pad_token_id is not None:
        model.config.pad_token_id = tokenizer.pad_token_id
    return model


In [None]:
# ✅ OPTUNA OBJECTIVE — logs to Weights & Biases per trial
def objective(trial):
    """Optuna objective: fine-tune one configuration and return its accuracy.

    Samples the hyperparameters from ``trial``, opens a dedicated Weights &
    Biases run, tokenizes the train/validation datasets at the sampled
    ``max_length``, fine-tunes a fresh model with early stopping and evaluates
    it on the validation set.

    Args:
        trial: the Optuna trial that supplies the hyperparameter suggestions.

    Returns:
        The ``eval_accuracy`` of the final validation run, i.e. the same metric
        that is used to select the best checkpoint.
    """
    # ---- Hyperparams to search
    learning_rate = trial.suggest_float("learning_rate", 1.8e-5, 6.5e-5, log=True)
    max_length    = trial.suggest_categorical("max_length", [128, 192, 256])
    batch_size    = trial.suggest_categorical("batch_size", [16, 32])
    patience      = trial.suggest_int("patience", 2, 4)
    weight_decay  = trial.suggest_categorical("weight_decay", [0.0, 0.01, 0.02])
    epochs        = trial.suggest_int("epochs", 7, 10)  # higher epoch range
    warmup_ratio  = trial.suggest_categorical("warmup_ratio", [0.05, 0.1])
    dropout       = trial.suggest_float("dropout", 0.1, 0.16)

    # ---- Start a fresh W&B run for this trial; trial.params holds exactly the
    # values suggested above.
    wandb.init(
        project="hf-robertatwitter-attempt3",
        name=f"trial-{trial.number}",
        reinit=True,
        config={**trial.params, "model_name": MODEL_NAME_TWITTER},
    )

    try:
        # ---- Tokenize for this trial's max_length
        def preprocess(examples):
            """Tokenize a batch of tweets and attach the integer labels."""
            enc = tokenizer(
                examples["cleaned_tweets"],
                truncation=True,
                padding="max_length",
                max_length=max_length,
            )
            enc["labels"] = examples["label"]
            return enc

        print(f"[DEBUG] Trial #{trial.number} → max_length = {max_length}")

        train_tok = train_dataset_hf.map(preprocess, batched=True, remove_columns=train_dataset_hf.column_names)
        val_tok   = val_dataset_hf.map(preprocess,   batched=True, remove_columns=val_dataset_hf.column_names)
        train_tok.set_format(type="torch")
        val_tok.set_format(type="torch")

        # ---- Build model (receives the sampled dropout)
        model = build_model(dropout=dropout)

        # ---- Training args with W&B reporting
        args = TrainingArguments(
            output_dir=f"./hf_roberta_optuna/{trial.number}",
            run_name=f"trial-{trial.number}",        # W&B run name
            report_to=["wandb"],                     # enable W&B logging

            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=epochs,
            weight_decay=weight_decay,
            warmup_ratio=warmup_ratio,

            # Evaluate and checkpoint on the same step cadence so that the
            # best checkpoint can be reloaded when training ends.
            eval_strategy="steps",
            save_strategy="steps",
            eval_steps=300,
            save_steps=300,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            greater_is_better=True,

            save_total_limit=1,                      # keep disk usage tiny
            logging_steps=100,
            seed=42,
            fp16=torch.cuda.is_available(),
        )

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_tok,
            eval_dataset=val_tok,
            # `tokenizer=` is deprecated for Trainer (it raised the
            # FutureWarning seen in earlier runs); `processing_class` is the
            # replacement argument.
            processing_class=tokenizer,
            data_collator=default_data_collator,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=patience)],
        )

        trainer.train()
        metrics = trainer.evaluate()

        # record best checkpoint dir for this trial
        trial.set_user_attr("best_checkpoint", trainer.state.best_model_checkpoint)

        # save the model currently held by the trainer into output_dir
        trainer.save_model()

        # Log final eval metrics explicitly too
        wandb.log({
            "final_eval/accuracy": metrics.get("eval_accuracy"),
            "final_eval/f1_macro": metrics.get("eval_f1_macro"),
        })

        # Return the SAME metric used for model selection
        return metrics["eval_accuracy"]

    finally:
        # Ensure the run is closed even if an error occurs
        wandb.finish()

In [None]:
# ✅ RUN OPTUNA
# Maximize the value returned by `objective` over 10 trials.
study = optuna.create_study(
    study_name="hf-robertatwitter-attempt3",
    direction="maximize",
)
study.optimize(objective, n_trials=10)

best_trial = study.best_trial
print("Best value:", best_trial.value)
print("Best params:", best_trial.params)


[I 2025-08-19 08:27:51,991] A new study created in memory with name: hf-robertatwitter-attempt3


[DEBUG] Trial #0 → max_length = 128


Map:   0%|          | 0/32100 [00:00<?, ? examples/s]

Map:   0%|          | 0/8026 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Weighted,Recall Weighted
300,1.2975,1.204246,0.488413,0.496838,0.489372,0.488413
600,1.0159,1.000072,0.599427,0.611506,0.605489,0.599427
900,0.941,0.942268,0.631822,0.643377,0.64059,0.631822
1200,0.8625,0.969122,0.626838,0.638506,0.637861,0.626838
1500,0.8176,0.872001,0.676676,0.687181,0.681582,0.676676
1800,0.8076,0.865608,0.671194,0.681375,0.680613,0.671194
2100,0.725,0.808036,0.702965,0.712414,0.705445,0.702965
2400,0.7424,0.837597,0.689509,0.699223,0.694368,0.689509
2700,0.6917,0.801215,0.708697,0.718307,0.711383,0.708697
3000,0.7115,0.782373,0.705333,0.71458,0.710668,0.705333


0,1
eval/accuracy,▁▅▆▅▇▇█▇██████
eval/f1_macro,▁▅▆▅▇▇█▇██████
eval/loss,█▅▄▄▂▂▁▂▁▁▁▂▁▁
eval/precision_weighted,▁▅▆▆▇▇█▇██████
eval/recall_weighted,▁▅▆▅▇▇█▇██████
eval/runtime,▃▁▂▂▄▃▂▅▅▂▆█▅▃
eval/samples_per_second,▆█▇▇▅▆▇▄▄▇▃▁▄▆
eval/steps_per_second,▆█▇▇▅▆▇▄▄▇▃▁▄▆
final_eval/accuracy,▁
final_eval/f1_macro,▁

0,1
eval/accuracy,0.71019
eval/f1_macro,0.71895
eval/loss,0.80217
eval/precision_weighted,0.71297
eval/recall_weighted,0.71019
eval/runtime,3.7762
eval/samples_per_second,2125.389
eval/steps_per_second,66.468
final_eval/accuracy,0.71019
final_eval/f1_macro,0.71895


[I 2025-08-19 08:32:48,751] Trial 0 finished with value: 0.7101918764016945 and parameters: {'learning_rate': 1.8811350713797068e-05, 'max_length': 128, 'batch_size': 32, 'patience': 2, 'weight_decay': 0.01, 'epochs': 10, 'warmup_ratio': 0.05, 'dropout': 0.14479257124768727}. Best is trial 0 with value: 0.7101918764016945.


[DEBUG] Trial #1 → max_length = 256


Map:   0%|          | 0/32100 [00:00<?, ? examples/s]

Map:   0%|          | 0/8026 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Weighted,Recall Weighted
300,1.3216,1.263146,0.451906,0.457657,0.476068,0.451906
600,1.0366,1.118661,0.54772,0.559451,0.583237,0.54772
900,0.9477,1.01412,0.605532,0.617097,0.62097,0.605532
1200,0.8837,0.933873,0.650885,0.658075,0.666506,0.650885
1500,0.8203,0.889746,0.678919,0.68958,0.683956,0.678919
1800,0.8108,0.865583,0.679292,0.688229,0.692142,0.679292
2100,0.7072,0.814881,0.705457,0.715276,0.705651,0.705457
2400,0.6973,0.844604,0.689509,0.699166,0.695993,0.689509
2700,0.6821,0.80305,0.7077,0.717302,0.711131,0.7077
3000,0.701,0.791881,0.705457,0.715292,0.71266,0.705457


0,1
eval/accuracy,▁▄▅▆▇▇█▇███████▇▇█
eval/f1_macro,▁▄▅▆▇▇█▇███████▇▇█
eval/loss,█▆▄▃▂▂▁▂▁▁▂▂▂▃▂▃▄▂
eval/precision_weighted,▁▄▅▇▇▇█▇████████▇█
eval/recall_weighted,▁▄▅▆▇▇█▇███████▇▇█
eval/runtime,▃▂▂▂▂▃▁▁▃▃▃▄▃▂▂▂▃█
eval/samples_per_second,▆▇▇▇▇▆██▆▆▆▅▆▇▇▇▆▁
eval/steps_per_second,▆▇▇▇▇▆██▆▆▆▅▆▇▇▇▆▁
final_eval/accuracy,▁
final_eval/f1_macro,▁

0,1
eval/accuracy,0.71455
eval/f1_macro,0.72368
eval/loss,0.82595
eval/precision_weighted,0.71713
eval/recall_weighted,0.71455
eval/runtime,4.5381
eval/samples_per_second,1768.59
eval/steps_per_second,55.31
final_eval/accuracy,0.71455
final_eval/f1_macro,0.72368


[I 2025-08-19 08:40:30,491] Trial 1 finished with value: 0.714552703712933 and parameters: {'learning_rate': 3.284189147336627e-05, 'max_length': 256, 'batch_size': 32, 'patience': 4, 'weight_decay': 0.0, 'epochs': 9, 'warmup_ratio': 0.1, 'dropout': 0.15904622517062814}. Best is trial 1 with value: 0.714552703712933.


[DEBUG] Trial #2 → max_length = 256


Map:   0%|          | 0/32100 [00:00<?, ? examples/s]

Map:   0%|          | 0/8026 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Weighted,Recall Weighted
300,1.3347,1.239428,0.469848,0.480701,0.474521,0.469848
600,1.0328,1.085724,0.552828,0.564714,0.586318,0.552828
900,0.9289,0.966504,0.624595,0.634881,0.639922,0.624595
1200,0.8487,0.907514,0.654498,0.661808,0.667991,0.654498
1500,0.8041,0.865944,0.680538,0.691198,0.686115,0.680538
1800,0.7946,0.840505,0.68602,0.694853,0.699053,0.68602
2100,0.6892,0.803439,0.706828,0.71635,0.707944,0.706828
2400,0.6896,0.815912,0.70147,0.71112,0.707643,0.70147
2700,0.6579,0.812497,0.705706,0.715389,0.708876,0.705706
3000,0.6779,0.775488,0.710316,0.717423,0.722188,0.710316


0,1
eval/accuracy,▁▃▅▆▇▇█▇████████
eval/f1_macro,▁▃▅▆▇▇██████████
eval/loss,█▆▄▃▂▂▁▂▂▁▁▂▁▃▂▁
eval/precision_weighted,▁▄▆▆▇▇██████████
eval/recall_weighted,▁▃▅▆▇▇█▇████████
eval/runtime,▅▃▃▃▂▁▂▂▂▃▃▃▃▂▃█
eval/samples_per_second,▄▆▆▆▇█▇▇▇▆▆▆▆▇▆▁
eval/steps_per_second,▄▆▆▆▇█▇▇▇▆▆▆▆▇▆▁
final_eval/accuracy,▁
final_eval/f1_macro,▁

0,1
eval/accuracy,0.71941
eval/f1_macro,0.72813
eval/loss,0.79614
eval/precision_weighted,0.72475
eval/recall_weighted,0.71941
eval/runtime,4.5414
eval/samples_per_second,1767.307
eval/steps_per_second,55.27
final_eval/accuracy,0.71941
final_eval/f1_macro,0.72813


[I 2025-08-19 08:47:22,507] Trial 2 finished with value: 0.7194119112883129 and parameters: {'learning_rate': 2.461901621835963e-05, 'max_length': 256, 'batch_size': 32, 'patience': 4, 'weight_decay': 0.02, 'epochs': 8, 'warmup_ratio': 0.1, 'dropout': 0.11995614220404022}. Best is trial 2 with value: 0.7194119112883129.


[DEBUG] Trial #3 → max_length = 128


Map:   0%|          | 0/32100 [00:00<?, ? examples/s]

Map:   0%|          | 0/8026 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Weighted,Recall Weighted
300,1.2479,1.185541,0.494892,0.50364,0.495131,0.494892
600,0.9929,0.98882,0.61226,0.621767,0.629352,0.61226
900,0.9264,0.932122,0.638674,0.648859,0.64902,0.638674
1200,0.8353,0.922328,0.652878,0.662516,0.658926,0.652878
1500,0.7859,0.848745,0.692001,0.70134,0.69473,0.692001
1800,0.7728,0.833267,0.688388,0.697517,0.700779,0.688388
2100,0.678,0.80515,0.708697,0.718001,0.710935,0.708697
2400,0.684,0.83947,0.701719,0.710717,0.705763,0.701719
2700,0.6465,0.778039,0.716172,0.725231,0.717742,0.716172
3000,0.668,0.780248,0.70608,0.714928,0.712455,0.70608


0,1
eval/accuracy,▁▅▅▆▇▇█▇█████▇██
eval/f1_macro,▁▅▆▆▇▇█▇█████▇██
eval/loss,█▅▄▃▂▂▁▂▁▁▂▂▂▃▂▂
eval/precision_weighted,▁▅▆▆▇▇█▇█████▇██
eval/recall_weighted,▁▅▅▆▇▇█▇█████▇██
eval/runtime,▆▂█▅▂▇▁▄▆▅▁▅▄▆▅▇
eval/samples_per_second,▃▇▁▄▇▂█▅▃▄█▄▅▃▄▂
eval/steps_per_second,▃▇▁▄▇▂█▅▃▄█▄▅▃▄▂
final_eval/accuracy,▁
final_eval/f1_macro,▁

0,1
eval/accuracy,0.71891
eval/f1_macro,0.72685
eval/loss,0.81425
eval/precision_weighted,0.72381
eval/recall_weighted,0.71891
eval/runtime,3.8408
eval/samples_per_second,2089.677
eval/steps_per_second,65.351
final_eval/accuracy,0.71891
final_eval/f1_macro,0.72685


[I 2025-08-19 08:52:56,704] Trial 3 finished with value: 0.7189135310241714 and parameters: {'learning_rate': 2.1973496131763074e-05, 'max_length': 128, 'batch_size': 32, 'patience': 4, 'weight_decay': 0.01, 'epochs': 10, 'warmup_ratio': 0.05, 'dropout': 0.11561807252696295}. Best is trial 2 with value: 0.7194119112883129.


[DEBUG] Trial #4 → max_length = 192


Map:   0%|          | 0/32100 [00:00<?, ? examples/s]

Map:   0%|          | 0/8026 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Weighted,Recall Weighted
300,1.5649,1.525567,0.315973,0.157768,0.167569,0.315973
600,1.1972,1.230281,0.472464,0.478806,0.508986,0.472464
900,1.0908,1.021566,0.589708,0.600083,0.605865,0.589708
1200,1.0007,0.950518,0.628084,0.638549,0.63161,0.628084
1500,0.9812,1.014963,0.593695,0.598071,0.627384,0.593695
1800,0.9302,0.952933,0.638799,0.651341,0.648174,0.638799
2100,0.8406,0.882835,0.678171,0.686207,0.683316,0.678171
2400,0.8552,0.876324,0.673935,0.68424,0.67937,0.673935
2700,0.8325,0.866006,0.686145,0.694812,0.690679,0.686145
3000,0.8261,0.862599,0.69063,0.696745,0.702741,0.69063


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0,1
eval/accuracy,▁▄▆▇▆▇█████▇██
eval/f1_macro,▁▅▇▇▇▇████████
eval/loss,█▅▃▂▃▂▂▁▁▁▁▂▁▁
eval/precision_weighted,▁▅▇▇▇▇████████
eval/recall_weighted,▁▄▆▇▆▇█████▇██
eval/runtime,▆▄▁▅▃▄▅▂▄▃▅▇█▃
eval/samples_per_second,▃▅█▄▅▅▄▇▅▆▄▂▁▆
eval/steps_per_second,▃▅█▄▅▅▄▇▅▆▄▂▁▆
final_eval/accuracy,▁
final_eval/f1_macro,▁

0,1
eval/accuracy,0.69599
eval/f1_macro,0.70567
eval/loss,0.83338
eval/precision_weighted,0.69801
eval/recall_weighted,0.69599
eval/runtime,7.6734
eval/samples_per_second,1045.947
eval/steps_per_second,65.421
final_eval/accuracy,0.69599
final_eval/f1_macro,0.70567


[I 2025-08-19 08:58:44,526] Trial 4 finished with value: 0.6959880388736606 and parameters: {'learning_rate': 3.082148638857395e-05, 'max_length': 192, 'batch_size': 16, 'patience': 2, 'weight_decay': 0.0, 'epochs': 8, 'warmup_ratio': 0.1, 'dropout': 0.1555863612948673}. Best is trial 2 with value: 0.7194119112883129.


[DEBUG] Trial #5 → max_length = 192


Map:   0%|          | 0/32100 [00:00<?, ? examples/s]

Map:   0%|          | 0/8026 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Weighted,Recall Weighted
300,1.1772,1.108054,0.5309,0.541685,0.548548,0.5309
600,0.9705,0.928709,0.638051,0.648687,0.645145,0.638051
900,0.9204,0.957813,0.631323,0.642404,0.650658,0.631323
1200,0.802,0.863113,0.683653,0.691638,0.690627,0.683653
1500,0.7516,0.862454,0.696611,0.707494,0.705637,0.696611
1800,0.7525,0.815618,0.706579,0.715264,0.71521,0.706579
2100,0.6391,0.868888,0.700972,0.710626,0.70237,0.700972
2400,0.6281,0.806426,0.710441,0.719716,0.715968,0.710441
2700,0.5908,0.799994,0.709693,0.71671,0.71702,0.709693
3000,0.6133,0.7634,0.722776,0.730937,0.729685,0.722776


0,1
eval/accuracy,▁▅▅▆▇▇▇▇▇███████
eval/f1_macro,▁▅▅▆▇▇▇▇▇███████
eval/loss,█▄▅▃▃▂▃▂▂▁▃▂▃▆▄▃
eval/precision_weighted,▁▅▅▆▇▇▇▇█████▇██
eval/recall_weighted,▁▅▅▆▇▇▇▇▇███████
eval/runtime,▃█▅▅▄▅▁▄▃▇▅▅▂▄▄▅
eval/samples_per_second,▆▁▄▄▅▄█▅▆▂▄▄▇▅▅▄
eval/steps_per_second,▆▁▄▄▅▄█▅▆▂▄▄▇▅▅▄
final_eval/accuracy,▁
final_eval/f1_macro,▁

0,1
eval/accuracy,0.72577
eval/f1_macro,0.73381
eval/loss,0.87111
eval/precision_weighted,0.7285
eval/recall_weighted,0.72577
eval/runtime,4.0646
eval/samples_per_second,1974.613
eval/steps_per_second,61.753
final_eval/accuracy,0.72577
final_eval/f1_macro,0.73381


[I 2025-08-19 09:04:50,068] Trial 5 finished with value: 0.7257662596561176 and parameters: {'learning_rate': 4.4479754385313785e-05, 'max_length': 192, 'batch_size': 32, 'patience': 4, 'weight_decay': 0.02, 'epochs': 7, 'warmup_ratio': 0.1, 'dropout': 0.10214366269172147}. Best is trial 5 with value: 0.7257662596561176.


[DEBUG] Trial #6 → max_length = 256


Map:   0%|          | 0/32100 [00:00<?, ? examples/s]

Map:   0%|          | 0/8026 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Weighted,Recall Weighted
300,1.3673,1.238854,0.463992,0.469817,0.480656,0.463992
600,1.0658,1.091343,0.547845,0.560391,0.567271,0.547845
900,0.9531,0.998126,0.611263,0.622368,0.627189,0.611263
1200,0.8878,0.933217,0.636058,0.641822,0.661002,0.636058
1500,0.83,0.918527,0.663469,0.673061,0.672422,0.663469
1800,0.8198,0.841046,0.683902,0.691872,0.698684,0.683902
2100,0.7094,0.8256,0.699975,0.709901,0.701429,0.699975
2400,0.7237,0.830927,0.694742,0.704194,0.699319,0.694742
2700,0.684,0.809026,0.702218,0.711796,0.704093,0.702218
3000,0.7147,0.794092,0.698605,0.708604,0.707076,0.698605


0,1
eval/accuracy,▁▃▅▆▇▇█▇███████▇█
eval/f1_macro,▁▄▅▆▇▇█▇███████▇█
eval/loss,█▆▄▃▃▂▁▂▁▁▁▂▁▂▂▃▁
eval/precision_weighted,▁▄▅▆▇▇▇▇███████▇█
eval/recall_weighted,▁▃▅▆▇▇█▇███████▇█
eval/runtime,▁▃▃▃▃▄▆▂▁▃▃▄▄▅▅▃█
eval/samples_per_second,█▆▆▆▆▅▃▇█▆▆▅▅▄▄▆▁
eval/steps_per_second,█▆▆▆▆▅▃▇█▆▆▅▅▄▄▆▁
final_eval/accuracy,▁
final_eval/f1_macro,▁

0,1
eval/accuracy,0.71368
eval/f1_macro,0.72312
eval/loss,0.80608
eval/precision_weighted,0.71626
eval/recall_weighted,0.71368
eval/runtime,4.5485
eval/samples_per_second,1764.536
eval/steps_per_second,55.183
final_eval/accuracy,0.71368
final_eval/f1_macro,0.72312


[I 2025-08-19 09:12:05,961] Trial 6 finished with value: 0.7136805382506852 and parameters: {'learning_rate': 2.7828412180000833e-05, 'max_length': 256, 'batch_size': 32, 'patience': 3, 'weight_decay': 0.02, 'epochs': 10, 'warmup_ratio': 0.1, 'dropout': 0.13781554804190327}. Best is trial 5 with value: 0.7257662596561176.


[DEBUG] Trial #7 → max_length = 192


Map:   0%|          | 0/32100 [00:00<?, ? examples/s]

Map:   0%|          | 0/8026 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Weighted,Recall Weighted
300,1.2954,1.288688,0.444181,0.421603,0.534685,0.444181
600,1.0697,1.085971,0.550835,0.55906,0.581316,0.550835
900,1.0168,1.006369,0.613382,0.620843,0.639963,0.613382
1200,0.9676,0.9124,0.657613,0.667419,0.666438,0.657613
1500,0.9457,0.974359,0.6155,0.630531,0.630871,0.6155
1800,0.904,0.98156,0.632694,0.644363,0.642534,0.632694
2100,0.7883,0.904786,0.676302,0.677611,0.702452,0.676302
2400,0.8147,0.829711,0.692873,0.702125,0.70437,0.692873
2700,0.7752,0.861772,0.689509,0.69579,0.696529,0.689509
3000,0.766,0.842899,0.706205,0.715023,0.719169,0.706205


0,1
eval/accuracy,▁▄▅▆▅▆▇▇▇██▇████▇█
eval/f1_macro,▁▄▆▇▆▆▇▇▇██▇████▇█
eval/loss,█▅▄▃▄▄▃▂▂▂▁▂▁▂▂▁▄▁
eval/precision_weighted,▁▃▅▆▅▅▇▇▇██▇████▇█
eval/recall_weighted,▁▄▅▆▅▆▇▇▇██▇████▇█
eval/runtime,▁▂▅▅▇███▆▁▂▂▂▃▂▂▂▂
eval/samples_per_second,█▇▄▃▂▁▁▁▃▇▇▇▇▆▇▇▇▇
eval/steps_per_second,█▇▄▃▂▁▁▁▃▇▇▇▇▆▇▇▇▇
final_eval/accuracy,▁
final_eval/f1_macro,▁

0,1
eval/accuracy,0.71692
eval/f1_macro,0.72666
eval/loss,0.78154
eval/precision_weighted,0.71995
eval/recall_weighted,0.71692
eval/runtime,7.7252
eval/samples_per_second,1038.935
eval/steps_per_second,64.982
final_eval/accuracy,0.71692
final_eval/f1_macro,0.72666


[I 2025-08-19 09:19:53,546] Trial 7 finished with value: 0.7169200099676053 and parameters: {'learning_rate': 5.4096850674953444e-05, 'max_length': 192, 'batch_size': 16, 'patience': 4, 'weight_decay': 0.01, 'epochs': 8, 'warmup_ratio': 0.05, 'dropout': 0.11749815564958319}. Best is trial 5 with value: 0.7257662596561176.


[DEBUG] Trial #8 → max_length = 192


Map:   0%|          | 0/32100 [00:00<?, ? examples/s]

Map:   0%|          | 0/8026 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Weighted,Recall Weighted
300,1.3004,1.246233,0.463743,0.465349,0.482934,0.463743
600,1.0534,1.110903,0.549215,0.560535,0.587245,0.549215
900,0.9529,0.996567,0.616496,0.627234,0.635555,0.616496
1200,0.8721,0.993452,0.623723,0.631607,0.641591,0.623723
1500,0.815,0.879088,0.680164,0.690117,0.686584,0.680164
1800,0.8158,0.899646,0.665711,0.675308,0.678927,0.665711
2100,0.7094,0.872674,0.693122,0.703588,0.695534,0.693122
2400,0.7061,0.850211,0.694368,0.70348,0.699654,0.694368
2700,0.6781,0.792069,0.711313,0.721354,0.714053,0.711313
3000,0.6974,0.775035,0.707825,0.717772,0.714354,0.707825


0,1
eval/accuracy,▁▃▅▅▇▇▇▇███▇████
eval/f1_macro,▁▄▅▅▇▇▇▇████████
eval/loss,█▆▄▄▃▃▂▂▁▁▁▂▂▃▂▁
eval/precision_weighted,▁▄▆▆▇▇▇▇████████
eval/recall_weighted,▁▃▅▅▇▇▇▇███▇████
eval/runtime,▃▂█▇▆▂▁▁▃▂▃▁▂▂▂▃
eval/samples_per_second,▆▇▁▂▃▇▇█▆▇▆█▇▇▇▅
eval/steps_per_second,▆▇▁▂▃▇▇█▆▇▆█▇▇▇▅
final_eval/accuracy,▁
final_eval/f1_macro,▁

0,1
eval/accuracy,0.71979
eval/f1_macro,0.72752
eval/loss,0.8086
eval/precision_weighted,0.71964
eval/recall_weighted,0.71979
eval/runtime,4.0956
eval/samples_per_second,1959.651
eval/steps_per_second,61.285
final_eval/accuracy,0.71979
final_eval/f1_macro,0.72752


[I 2025-08-19 09:26:03,054] Trial 8 finished with value: 0.7197856964864191 and parameters: {'learning_rate': 3.3043881342260845e-05, 'max_length': 192, 'batch_size': 32, 'patience': 4, 'weight_decay': 0.01, 'epochs': 8, 'warmup_ratio': 0.1, 'dropout': 0.15758643081662502}. Best is trial 5 with value: 0.7257662596561176.


[DEBUG] Trial #9 → max_length = 256


Map:   0%|          | 0/32100 [00:00<?, ? examples/s]

Map:   0%|          | 0/8026 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Weighted,Recall Weighted
300,1.5524,1.500401,0.329679,0.24474,0.318594,0.329679
600,1.1984,1.21386,0.463867,0.468985,0.518181,0.463867
900,1.0677,1.019648,0.589584,0.598279,0.612277,0.589584
1200,1.0097,0.956223,0.621106,0.628744,0.644247,0.621106
1500,0.979,0.946149,0.624844,0.637677,0.638112,0.624844
1800,0.9259,0.992672,0.622103,0.634744,0.629452,0.622103
2100,0.8654,0.867126,0.673187,0.68281,0.681042,0.673187
2400,0.8596,0.901794,0.657613,0.668401,0.663823,0.657613
2700,0.8371,0.888124,0.673063,0.683845,0.678667,0.673063
3000,0.8187,0.892365,0.670571,0.678587,0.677886,0.670571


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0,1
eval/accuracy,▁▄▆▇▇▇█████
eval/f1_macro,▁▅▇▇▇▇█████
eval/loss,█▅▃▂▂▂▁▁▁▁▁
eval/precision_weighted,▁▅▇▇▇▇█████
eval/recall_weighted,▁▄▆▇▇▇█████
eval/runtime,▃▁▃▃█▇▂▂▂▅▆
eval/samples_per_second,▆█▆▆▁▂▇▇▇▄▃
eval/steps_per_second,▆█▆▆▁▂▇▇▇▄▃
final_eval/accuracy,▁
final_eval/f1_macro,▁

0,1
eval/accuracy,0.67319
eval/f1_macro,0.68281
eval/loss,0.86713
eval/precision_weighted,0.68104
eval/recall_weighted,0.67319
eval/runtime,7.3033
eval/samples_per_second,1098.957
eval/steps_per_second,68.736
final_eval/accuracy,0.67319
final_eval/f1_macro,0.68281


[I 2025-08-19 09:30:30,247] Trial 9 finished with value: 0.6731871417891852 and parameters: {'learning_rate': 3.135530469379381e-05, 'max_length': 256, 'batch_size': 16, 'patience': 3, 'weight_decay': 0.01, 'epochs': 8, 'warmup_ratio': 0.1, 'dropout': 0.1562360771456489}. Best is trial 5 with value: 0.7257662596561176.


Best value: 0.7257662596561176
Best params: {'learning_rate': 4.4479754385313785e-05, 'max_length': 192, 'batch_size': 32, 'patience': 4, 'weight_decay': 0.02, 'epochs': 7, 'warmup_ratio': 0.1, 'dropout': 0.10214366269172147}


Take the model from the study's best trial (trial 5 in this run).

In [None]:
import shutil

# Snapshot the output folder of trial 5 into a separate directory so the
# compression experiments work on a copy. `dirs_exist_ok=True` lets the cell be
# re-run when the destination already exists.
safe_dir = "./best_model_for_compression"
shutil.copytree(src="./hf_roberta_optuna/5", dst=safe_dir, dirs_exist_ok=True)
print("✅ Model copied to:", safe_dir)


✅ Model copied to: ./best_model_for_compression


In [None]:
# Reload the copied checkpoint and place it on the target device in one step.
model = AutoModelForSequenceClassification.from_pretrained(safe_dir).to(DEVICE)
# Bare expression: the notebook renders the module tree as the cell output.
model


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50268, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.10214366269172147, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.10214366269172147, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_fe

In [None]:
# Re-tokenize test set using best max_length = 192
def preprocess_test(examples):
    """Tokenize a batch of test examples and attach their labels.

    Reads the ``cleaned_tweets`` and ``label`` entries of ``examples``. Every
    text is truncated and padded to exactly 192 tokens. Returns the tokenizer
    output with an additional ``labels`` entry.
    """
    tokenized = tokenizer(
        examples["cleaned_tweets"],
        max_length=192,
        padding="max_length",
        truncation=True,
    )
    tokenized["labels"] = examples["label"]
    return tokenized

# Tokenize the whole test split in batches. All original columns are dropped,
# so only the tokenizer outputs and `labels` remain, returned as torch tensors.
test_tok = test_dataset_hf.map(
    preprocess_test,
    batched=True,
    remove_columns=test_dataset_hf.column_names,
)
test_tok.set_format(type="torch")


Map:   0%|          | 0/3796 [00:00<?, ? examples/s]

In [None]:
# Evaluate the reloaded best checkpoint on the tokenized test split.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword of
# `Trainer.__init__` is deprecated and scheduled for removal in transformers 5,
# and this notebook installs transformers unpinned (`pip install -U`).
trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics
)

# No TrainingArguments are given, so evaluation runs with the Trainer defaults.
test_metrics = trainer.evaluate(test_tok)
print(test_metrics)

  trainer = Trainer(


{'eval_loss': 0.9647884368896484, 'eval_model_preparation_time': 0.0038, 'eval_accuracy': 0.690200210748156, 'eval_f1_macro': 0.7014725308466996, 'eval_precision_weighted': 0.6931109649746451, 'eval_recall_weighted': 0.690200210748156, 'eval_runtime': 7.4631, 'eval_samples_per_second': 508.635, 'eval_steps_per_second': 63.646}


Compression techniques

In [18]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import os, torch, copy
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch import nn
import torch.nn.utils.prune as prune
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, classification_report

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch


# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Saved checkpoint folder (where config.json + pytorch_model.bin live).
# NOTE(review): no visible cell writes to this Drive folder; confirm it holds
# the same trial-5 checkpoint that was copied to ./best_model_for_compression.
BASE_DIR = "/content/drive/MyDrive/Colab Notebooks/best_roberta_trial5"  # or your local path

tokenizer = AutoTokenizer.from_pretrained(BASE_DIR)
model = AutoModelForSequenceClassification.from_pretrained(BASE_DIR)
model = model.to(DEVICE)




In [19]:
class _TweetDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one tweet per item.

    The ``cleaned_tweets`` and ``label`` columns of ``dataframe`` are read once
    at construction. Each item is truncated/padded to ``max_length`` tokens.
    """

    def __init__(self, dataframe, tokenizer, max_length=128):
        self.texts = dataframe["cleaned_tweets"].tolist()
        self.labels = dataframe["label"].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # The tokenizer returns a batch of size 1; index 0 drops that batch axis
        # so the DataLoader can stack items itself.
        return {
            "input_ids": enc["input_ids"][0],
            "attention_mask": enc["attention_mask"][0],
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }


def evaluate_on(df, model, tokenizer, max_length=192, batch_size=64, device=DEVICE, desc="Eval"):
    """Print accuracy and a per-class classification report for ``model`` on ``df``.

    Args:
        df: DataFrame with a ``cleaned_tweets`` text column and a ``label``
            column whose values can be converted to ``torch.long``.
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        tokenizer: Tokenizer used to encode each tweet.
        max_length: Fixed token length every tweet is truncated/padded to.
        batch_size: Number of tweets per forward pass.
        device: Device the model and every batch are moved to.
        desc: Prefix of the printed accuracy line.

    Returns:
        None. Results are printed. As a side effect ``model`` is moved to
        ``device`` and switched to eval mode.
    """
    # Build the dataset from the `df` argument (not a notebook global) so the
    # caller decides which split is evaluated.
    ds = _TweetDataset(df, tokenizer, max_length=max_length)
    # Pinned host memory only helps host-to-GPU copies, so skip it on CPU.
    dl = DataLoader(ds, batch_size=batch_size, pin_memory=str(device).startswith("cuda"))

    model = model.to(device).eval()
    all_y, all_p = [], []
    with torch.no_grad():
        for b in dl:
            ids = b["input_ids"].to(device)
            att = b["attention_mask"].to(device)
            y = b["labels"].to(device)
            logits = model(ids, attention_mask=att).logits
            # Predicted class = index of the largest logit for each example.
            p = logits.argmax(dim=1)
            all_y.extend(y.cpu().numpy().tolist())
            all_p.extend(p.cpu().numpy().tolist())

    print(f"{desc} accuracy:", accuracy_score(all_y, all_p))
    print(classification_report(all_y, all_p, digits=4))

In [22]:
def compressed_models(base_model, prune_amount=0.40, device=None):
    """Build three compressed variants of ``base_model`` without modifying it.

    Each variant is created from its own deep copy and returned in eval mode.

    Args:
        base_model: The ``nn.Module`` to compress.
        prune_amount: Amount passed to ``prune.global_unstructured``; the
            default removes 40% of all ``nn.Linear`` weights by L1 magnitude.
        device: Device for the pruned and FP16 variants. ``None`` (default)
            uses the notebook-level ``DEVICE``.

    Returns:
        dict with keys ``"quantized_cpu"`` (int8 dynamic quantization of the
        ``nn.Linear`` layers, kept on CPU), ``"pruned"`` and ``"fp16"``.
    """
    if device is None:
        # Resolved lazily so the global is only needed when the caller does not
        # choose a device.
        device = DEVICE

    compressed = {}

    # 1) Dynamic quantization (CPU-only module)
    cpu_model = copy.deepcopy(base_model).to("cpu")
    qmodel = torch.quantization.quantize_dynamic(
        cpu_model,
        {nn.Linear},
        dtype=torch.qint8
    )
    compressed["quantized_cpu"] = qmodel.eval()

    # 2) Pruning (global unstructured L1 across all Linear layers)
    pruned = copy.deepcopy(base_model).to(device)
    params_to_prune = [
        (module, "weight")
        for module in pruned.modules()
        if isinstance(module, nn.Linear)
    ]
    if params_to_prune:
        prune.global_unstructured(
            params_to_prune,
            pruning_method=prune.L1Unstructured,
            amount=prune_amount,
        )
        # Bake the zeroed weights in by dropping the weight_orig/weight_mask
        # reparametrization. Every listed module was just pruned, so a failure
        # here is a real error and is deliberately not swallowed.
        for module, param_name in params_to_prune:
            prune.remove(module, param_name)
    compressed["pruned"] = pruned.eval()

    # 3) FP16 (half-precision copy)
    half_model = copy.deepcopy(base_model).half().to(device)
    compressed["fp16"] = half_model.eval()

    return compressed


In [24]:
# Score the uncompressed FP32 model first as the reference point.
evaluate_on(test_df_final, model, tokenizer, max_length=192, desc="Base FP32")

# Build all compressed variants from the same base model.
compressed_models_dict = compressed_models(model)

# (dictionary key, label printed in the report, extra keyword arguments).
# Only the quantized variant overrides the device, because it must run on CPU.
evaluation_plan = [
    ("fp16", "FP16", {}),
    ("pruned", "Pruned", {}),
    ("quantized_cpu", "Quantized CPU", {"device": "cpu"}),
]
for variant_key, variant_desc, extra_kwargs in evaluation_plan:
    evaluate_on(
        test_df_final,
        compressed_models_dict[variant_key],
        tokenizer,
        max_length=192,
        desc=variant_desc,
        **extra_kwargs,
    )

Base FP32 accuracy: 0.690463645943098
              precision    recall  f1-score   support

           0     0.6986    0.7517    0.7242       592
           1     0.6438    0.6302    0.6369      1041
           2     0.7686    0.7374    0.7527       617
           3     0.6399    0.6906    0.6643       947
           4     0.7814    0.6861    0.7307       599

    accuracy                         0.6905      3796
   macro avg     0.7064    0.6992    0.7017      3796
weighted avg     0.6934    0.6905    0.6910      3796



For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  qmodel = torch.quantization.quantize_dynamic(


FP16 accuracy: 0.690463645943098
              precision    recall  f1-score   support

           0     0.6975    0.7517    0.7236       592
           1     0.6434    0.6292    0.6362      1041
           2     0.7686    0.7374    0.7527       617
           3     0.6403    0.6917    0.6650       947
           4     0.7829    0.6861    0.7313       599

    accuracy                         0.6905      3796
   macro avg     0.7065    0.6992    0.7018      3796
weighted avg     0.6934    0.6905    0.6910      3796

Pruned accuracy: 0.5753424657534246
              precision    recall  f1-score   support

           0     0.9172    0.2247    0.3609       592
           1     0.5482    0.6561    0.5973      1041
           2     0.6503    0.8169    0.7241       617
           3     0.4944    0.7856    0.6069       947
           4     0.9600    0.2003    0.3315       599

    accuracy                         0.5753      3796
   macro avg     0.7140    0.5367    0.5241      3796
weighted

In [27]:
OUT_DIR = "/content/drive/MyDrive/Colab Notebooks/best_roberta_trial5_compressed"
os.makedirs(OUT_DIR, exist_ok=True)

# One sub-folder per compressed variant.
fp16_dir, pruned_dir, q_dir = (
    os.path.join(OUT_DIR, subfolder) for subfolder in ("fp16", "pruned", "quantized_cpu")
)

# A/B) FP16 and pruned models are saved with save_pretrained, each next to a
# copy of the tokenizer. (After prune.remove, save_pretrained works on the
# pruned model.)
for variant_key, variant_dir in (("fp16", fp16_dir), ("pruned", pruned_dir)):
    os.makedirs(variant_dir, exist_ok=True)
    compressed_models_dict[variant_key].save_pretrained(variant_dir)
    tokenizer.save_pretrained(variant_dir)

# C) The quantized CPU model is stored as a raw state_dict plus a tiny loader script.
os.makedirs(q_dir, exist_ok=True)
quantized_state = compressed_models_dict["quantized_cpu"].state_dict()
torch.save(quantized_state, os.path.join(q_dir, "quantized_state_dict.pt"))

# Source of the helper that rebuilds the quantized model later: it loads a
# checkpoint from `model_dir`, quantizes it the same way, then loads the state.
# NOTE(review): the loader uses strict=False, so key mismatches between the
# saved state and the rebuilt model would go unreported; confirm that is wanted.
loader_source = (
    "import torch\n"
    "from torch import nn\n"
    "from transformers import AutoModelForSequenceClassification\n"
    "def load_quantized(model_dir, state_path):\n"
    "    model = AutoModelForSequenceClassification.from_pretrained(model_dir)\n"
    "    model = torch.quantization.quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)\n"
    "    sd = torch.load(state_path, map_location='cpu')\n"
    "    model.load_state_dict(sd, strict=False)\n"
    "    model.eval()\n"
    "    return model\n"
)
with open(os.path.join(q_dir, "load_quantized.py"), "w") as f:
    f.write(loader_source)

print("Saved to:", OUT_DIR)

Saved to: /content/drive/MyDrive/Colab Notebooks/best_roberta_trial5_compressed
