<a href="https://colab.research.google.com/github/Hillascher5/nlp-tweets-sentiment-analysis/blob/main/Model_compression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Model Compression**

In [None]:
# # Needed for Google Colab
# !pip install --quiet evaluate transformers optuna datasets nltk scikit-learn
# !pip install numpy==1.26.4

In [None]:
%env CUDA_LAUNCH_BLOCKING=1

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from contextlib import nullcontext
from torch.utils.data import DataLoader, TensorDataset
from transformers import Trainer, AutoTokenizer, AutoModelForSequenceClassification, get_scheduler, TrainingArguments, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from copy import deepcopy
from datasets import Dataset
from torch.optim import AdamW
from torch.quantization import quantize_dynamic
from torch import nn
from torch.nn import functional as F
from torch.nn.utils import prune

import time
import json
import gc
import torch
import os
import random
import copy
import numpy as np
import pandas as pd
import evaluate
import wandb
# Authenticate with Weights & Biases. wandb.login() picks up the WANDB_API_KEY
# environment variable (or a cached login) and otherwise prompts interactively.
# SECURITY: API keys must never be stored in the notebook. The keys that were
# previously pasted here as comments have been removed; because they were
# committed, they should be revoked and regenerated in the W&B settings.
wandb.login()

# Mount Google Drive so the dataset and the saved checkpoints are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Quantized-kernel backend used by the dynamic quantization cells below.
torch.backends.quantized.engine = "fbgemm"

In [None]:
def set_seed_all(seed=42):
    """Seed Python's ``random``, NumPy and PyTorch (CPU, plus CUDA when available)."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

set_seed_all(42)

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Both values are interpolated into the checkpoint directory names used further
# down, so together they select which fine-tuned models get loaded.
is_preprocessed = 'minimal_preprocess'
# Also used as the size of the stratified training subset built below.
num_train_samples = 5000

In [None]:
# Load data
# Both CSV files are read from the mounted Drive and decoded as latin1.
df_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/nlp_project/Data/Corona_NLP_train.csv', encoding='latin1')
df_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/nlp_project/Data/Corona_NLP_test.csv', encoding='latin1')

In [None]:
# Merge and shuffle for better stratified splits
# The provided train/test files are pooled here and re-split further down;
# the fixed random_state keeps the shuffle reproducible.
df_full = pd.concat([df_train, df_test], ignore_index=True)
df_full = df_full.sample(frac=1.0, random_state=42).reset_index(drop=True)

In [None]:
# Minimal pre-processing
def light_preprocess(text):
    """Return ``text`` with leading and trailing whitespace removed."""
    trimmed = text.strip()
    return trimmed

df_full["clean_text"] = df_full["OriginalTweet"].apply(light_preprocess)

In [None]:
# Give every sentiment string an integer id; sorting makes the assignment
# deterministic across runs.
unique_labels = sorted(df_full["Sentiment"].unique())
label2id = dict(zip(unique_labels, range(len(unique_labels))))
df_full["label"] = df_full["Sentiment"].map(label2id)

In [None]:
# Stratified split: 70% train, 15% val, 15% test
# Step 1: hold out 15% of the full data as the test set.
train_val_df, test_df = train_test_split(df_full, test_size=0.15, stratify=df_full["label"], random_state=42)
# Step 2: 0.1765 of the remaining 85% is ~15% of the full data
# (0.15 / 0.85 ≈ 0.1765), which becomes validation and leaves ~70% for training.
train_df, val_df = train_test_split(train_val_df, test_size=0.1765, stratify=train_val_df["label"], random_state=42)

# Confirm sizes
print("Train size:", len(train_df))
print("Val size:", len(val_df))
print("Test size:", len(test_df))

In [None]:
# Small stratified subsets that are later passed to the recovery fine-tuning
# after pruning. An integer train_size is an absolute number of rows; the
# remainder of each split is discarded.
train_subset_df, _ = train_test_split(
    train_df[["clean_text", "label"]],
    train_size=num_train_samples,
    stratify=train_df["label"],
    random_state=42
)

# 500 validation rows, stratified by label.
val_subset_df, _ = train_test_split(
    val_df[["clean_text", "label"]],
    train_size=500,
    stratify=val_df["label"],
    random_state=42
)

In [None]:
# Load HF Trainer models
# Directories of the fine-tuned checkpoints. The names encode the max sequence
# length (256 for BERT/RoBERTa, 128 for DeBERTa), the preprocessing tag and the
# training-sample count configured above.
bert_trainer_dir = f"/content/drive/MyDrive/Colab Notebooks/nlp_project/models/w_test_split/HF_Trainer/bert_best_model_stratify_maxl_256_{is_preprocessed}_{num_train_samples}_samples_optuna"
roberta_trainer_dir = f"/content/drive/MyDrive/Colab Notebooks/nlp_project/models/w_test_split/HF_Trainer/roberta_best_model_stratify_maxl_256_{is_preprocessed}_{num_train_samples}_samples_optuna"
deberta_trainer_dir = f"/content/drive/MyDrive/Colab Notebooks/nlp_project/models/w_test_split/HF_Trainer/deberta_best_model_stratify_maxl_128_{is_preprocessed}_{num_train_samples}_samples_optuna"

# Each model is moved to `device`; its tokenizer is loaded from the same
# directory. The printed "size" is the total parameter count (numel), not bytes.
bert_model = AutoModelForSequenceClassification.from_pretrained(bert_trainer_dir).to(device)
bert_tokenizer = AutoTokenizer.from_pretrained(bert_trainer_dir)
print("Original bert model (HF) size:", sum(p.numel() for p in bert_model.parameters()))

roberta_model = AutoModelForSequenceClassification.from_pretrained(roberta_trainer_dir).to(device)
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_trainer_dir)
print("Original roberta model (HF) size:", sum(p.numel() for p in roberta_model.parameters()))

deberta_model = AutoModelForSequenceClassification.from_pretrained(deberta_trainer_dir).to(device)
deberta_tokenizer = AutoTokenizer.from_pretrained(deberta_trainer_dir)
print("Original deberta model (HF) size:", sum(p.numel() for p in deberta_model.parameters()))

In [None]:
# Convert DataFrame to Hugging Face Dataset
# Only the text and label columns are kept.
# NOTE(review): these three datasets are not referenced again in the visible
# part of the notebook (build_tokenized_splits builds its own) — confirm they
# are still needed.
hf_train = Dataset.from_pandas(train_df[["clean_text", "label"]])
hf_val = Dataset.from_pandas(val_df[["clean_text", "label"]])
hf_test = Dataset.from_pandas(test_df[["clean_text", "label"]])

In [None]:
# Metric objects are loaded once and shared by every compute_metrics call.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")

def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into accuracy and macro F1/precision/recall.

    The predicted class is the argmax over axis 1 of the logits. The returned
    dict has the keys ``accuracy``, ``f1_macro``, ``precision_macro`` and
    ``recall_macro``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    scores = {
        "accuracy": accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"],
    }
    macro_metrics = (
        ("f1_macro", f1_metric, "f1"),
        ("precision_macro", precision_metric, "precision"),
        ("recall_macro", recall_metric, "recall"),
    )
    for out_key, metric, result_key in macro_metrics:
        result = metric.compute(predictions=predictions, references=labels, average="macro")
        scores[out_key] = result[result_key]
    return scores

**Comparison between Compression Methods**

### Functions for Compression Evaluation

The following functions are used to evaluate and compare the compressed versions of the three models (BERT, RoBERTa, and DeBERTa). They support all of the quantization, pruning, and knowledge-distillation experiments.  

Specifically, the functions provide:

- **Model size** – measures checkpoint disk usage to compare storage savings.  
- **Dataset prep** – tokenizes tweets and labels into tensors for evaluation.  
- **Accuracy & F1** – evaluates predictive performance on GPU/CPU.  
- **Latency** – benchmarks inference speed (sec/1000 examples, throughput).  
  

Together, these functions formed the backbone of the experimental pipeline, ensuring that baseline and compressed models were evaluated under the same conditions for fairness. This enabled a consistent comparison across techniques and models.


In [None]:
# Comparison parameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def model_disk_size(path_or_file):
    """Return the on-disk size in MB (1 MB = 1024**2 bytes).

    For a directory, the sizes of all files below it (recursively) are summed;
    for anything else the size of that single path is returned.
    """
    bytes_per_mb = 1024**2
    if not os.path.isdir(path_or_file):
        return os.path.getsize(path_or_file) / bytes_per_mb
    total_bytes = sum(
        os.path.getsize(os.path.join(dirpath, fname))
        for dirpath, _subdirs, fnames in os.walk(path_or_file)
        for fname in fnames
    )
    return total_bytes / bytes_per_mb  # MB


def texts_to_dataset(tokenizer, texts, labels=None, max_length=256):
    """Tokenize ``texts`` into a ``TensorDataset``.

    The dataset holds ``(input_ids, attention_mask)`` and, when ``labels`` is
    given, a third int64 label tensor. All texts are padded together and
    truncated to ``max_length``.
    """
    encoded = tokenizer(list(texts), padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    tensors = [encoded["input_ids"], encoded["attention_mask"]]
    if labels is not None:
        tensors.append(torch.tensor(labels, dtype=torch.long))
    return TensorDataset(*tensors)


@torch.no_grad()
def eval_accuracy_f1(model, tokenizer, texts, labels, batch_size=32, max_length=256, amp=False):
    """Evaluate ``model`` on the global ``device`` and return ``(accuracy, macro_f1)``.

    The model is moved to ``device`` and put in eval mode. Autocast is used only
    when ``amp`` is true and ``device`` is CUDA.
    """
    model = model.to(device)
    model.eval()
    loader = DataLoader(texts_to_dataset(tokenizer, texts, labels, max_length), batch_size=batch_size)
    use_autocast = amp and device.type == "cuda"
    autocast_ctx = torch.cuda.amp.autocast() if use_autocast else nullcontext()

    predicted, expected = [], []
    # Batches are (input_ids, attention_mask) plus an optional label tensor.
    for input_ids, attn_mask, *maybe_labels in loader:
        input_ids = input_ids.to(device)
        attn_mask = attn_mask.to(device)
        with autocast_ctx:
            logits = model(input_ids=input_ids, attention_mask=attn_mask).logits
        predicted.extend(logits.argmax(dim=1).cpu().numpy())
        if maybe_labels:
            expected.extend(maybe_labels[0].numpy())

    accuracy = accuracy_score(expected, predicted)
    macro_f1 = f1_score(expected, predicted, average="macro")
    return accuracy, macro_f1

@torch.no_grad()
def eval_accuracy_f1_cpu(model, tokenizer, texts, labels, batch_size=32, max_length=256):
    """CPU-only evaluation returning ``(accuracy, macro_f1)``.

    The model is moved to the CPU and no autocast/AMP is used.
    """
    model.cpu().eval()                 # <-- force CPU
    # IMPORTANT: no autocast / AMP here
    encoded = tokenizer(list(texts), padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    label_tensor = torch.tensor(labels, dtype=torch.long)
    dataset = TensorDataset(encoded["input_ids"], encoded["attention_mask"], label_tensor)

    predicted, expected = [], []
    for input_ids, attn_mask, y in DataLoader(dataset, batch_size=batch_size):
        logits = model(input_ids=input_ids, attention_mask=attn_mask).logits
        predicted.extend(logits.argmax(1).cpu().numpy())
        expected.extend(y.cpu().numpy())

    accuracy = accuracy_score(expected, predicted)
    macro_f1 = f1_score(expected, predicted, average="macro")
    return accuracy, macro_f1


@torch.no_grad()
def measure_latency(model, tokenizer, texts, batch_size=32, warmup=2, runs=5, max_length=256, amp=False):
    """
    Returns (sec_per_1000_examples, throughput_examples_per_sec)

    The model runs on the global ``device``. ``warmup`` untimed batches are
    executed first, then ``runs`` full passes over ``texts`` are timed and
    averaged. On CUDA the device is synchronized before each clock read.
    """
    model = model.to(device)
    model.eval()
    loader = DataLoader(texts_to_dataset(tokenizer, texts, None, max_length), batch_size=batch_size)
    on_cuda = device.type == "cuda"
    autocast_ctx = torch.cuda.amp.autocast() if (amp and on_cuda) else nullcontext()

    def forward(input_ids, attn_mask):
        input_ids, attn_mask = input_ids.to(device), attn_mask.to(device)
        with autocast_ctx:
            _ = model(input_ids=input_ids, attention_mask=attn_mask).logits

    # Warmup: restart the iterator if the data has fewer batches than `warmup`.
    batches = iter(loader)
    for _ in range(warmup):
        batch = next(batches, None)
        if batch is None:
            batches = iter(loader)
            batch = next(batches)
        forward(*batch)
    if on_cuda:
        torch.cuda.synchronize()

    # Timed passes
    elapsed, counts = [], []
    for _ in range(runs):
        n_seen = 0
        start = time.perf_counter()
        for input_ids, attn_mask in loader:
            forward(input_ids, attn_mask)
            n_seen += input_ids.size(0)
        if on_cuda:
            torch.cuda.synchronize()
        elapsed.append(time.perf_counter() - start)
        counts.append(n_seen)

    mean_time = np.mean(elapsed)
    mean_seen = np.mean(counts)
    sec_per_1000 = mean_time * (1000.0 / mean_seen)
    throughput = mean_seen / mean_time
    return sec_per_1000, throughput

@torch.no_grad()
def measure_latency_cpu(model, tokenizer, texts, batch_size=32, runs=3, max_length=256):
    """CPU-only latency benchmark; returns ``(sec_per_1000_examples, throughput)``.

    One untimed warmup batch is run, then ``runs`` full passes over ``texts``
    are timed and averaged.
    """
    model.cpu().eval()
    loader = DataLoader(texts_to_dataset(tokenizer, texts, None, max_length), batch_size=batch_size)

    # Warmup (1 pass)
    first_batch = next(iter(loader), None)
    if first_batch is not None:
        _ = model(input_ids=first_batch[0], attention_mask=first_batch[1]).logits

    # Timed passes
    elapsed, counts = [], []
    for _ in range(runs):
        n_seen = 0
        start = time.perf_counter()
        for input_ids, attn_mask in loader:
            _ = model(input_ids=input_ids, attention_mask=attn_mask).logits
            n_seen += input_ids.size(0)
        elapsed.append(time.perf_counter() - start)
        counts.append(n_seen)

    mean_time = np.mean(elapsed)
    mean_seen = np.mean(counts)
    return mean_time * (1000.0 / mean_seen), mean_seen / mean_time

The following helpers load a baseline FP32 model and apply **dynamic quantization** either to all linear layers (BERT/RoBERTa) or only to the classification head (DeBERTa-v3), producing lighter models intended to speed up CPU inference while preserving accuracy.  


In [None]:
def load_fp32_hf(model_dir_or_id: str, num_labels: int, map_location="cpu"):
    """Load a sequence-classification model and its tokenizer.

    The model is moved to ``map_location`` and put in eval mode.
    Returns ``(model, tokenizer)``.
    """
    fp32_model = AutoModelForSequenceClassification.from_pretrained(model_dir_or_id, num_labels=num_labels)
    tokenizer = AutoTokenizer.from_pretrained(model_dir_or_id)
    fp32_model.to(map_location)
    fp32_model.eval()
    return fp32_model, tokenizer

def quantize_full_linear(model_cpu):
    """Return an eval-mode copy of ``model_cpu`` with its ``nn.Linear`` layers dynamically quantized to int8."""
    # quantize ALL nn.Linear layers (good for BERT/RoBERTa)
    quantized = quantize_dynamic(model_cpu, qconfig_spec={nn.Linear}, dtype=torch.qint8)
    quantized.eval()
    return quantized

def quantize_head_only(model_cpu):
    """Return an eval-mode copy of ``model_cpu`` whose ``classifier`` head is int8-quantized.

    Only the ``classifier`` attribute is touched; every other module stays FP32.
    A model without a ``classifier`` module is returned as an unmodified copy.

    Why the wrapper: ``quantize_dynamic`` replaces the *children* of the module
    it is given, never that module itself. Passing a bare ``nn.Linear`` head as
    the root therefore returns it unquantized. Wrapping the head in an
    ``nn.Sequential`` makes it a child, so it is converted whether it is a
    single ``nn.Linear`` or a container of linear layers.
    """
    # quantize ONLY the classification head (safer for DeBERTa-v3)
    q = copy.deepcopy(model_cpu).eval()
    head = getattr(q, "classifier", None)
    if isinstance(head, nn.Module):
        wrapper = nn.Sequential(head)
        quantize_dynamic(wrapper, {nn.Linear}, dtype=torch.qint8, inplace=True)
        q.classifier = wrapper[0].eval()
    return q

def load_int8_dynamic_from_baseline(baseline_dir: str, num_labels: int, strategy: str):
    """Load the FP32 baseline on CPU and quantize it according to ``strategy``.

    ``strategy`` is ``"full"`` (all linear layers) or ``"head"`` (classifier
    only); any other value raises ``ValueError``.
    Returns ``(quantized_model, tokenizer)``.
    """
    # Load FP32 on CPU, then quantize on CPU
    model_cpu, tok = load_fp32_hf(baseline_dir, num_labels=num_labels, map_location="cpu")
    if strategy not in ("full", "head"):
        raise ValueError("strategy must be 'full' or 'head'")
    quantizer = quantize_full_linear if strategy == "full" else quantize_head_only
    return quantizer(model_cpu), tok

This function saves a given model variant (quantized, pruned, or distilled) together with its tokenizer, either in Hugging Face format or as raw PyTorch weights, ensuring reproducibility and easy reloading for further experiments.  


In [None]:
def save_model_variant(
    model_name: str,
    variant: str,                 # "quantized", "pruned", "distilled"
    model,
    tokenizer,
    base_dir: str,
    save_type: str = "hf",        # "hf" | "torch"
    filename_stub: str = None     # f"{model_name}_{variant}"
):
    """Save a model variant and its tokenizer under ``base_dir``.

    The target folder is ``{base_dir}/{model_name}_{variant}_model``.

    Args:
        model_name: Family name used in the folder and file names.
        variant: Variant tag, e.g. "quantized", "pruned" or "distilled".
        model: Model to save. For ``save_type="torch"`` it is moved to the CPU
            (this also moves the caller's object) and only its ``state_dict``
            is written.
        tokenizer: Tokenizer, always saved with ``save_pretrained``.
        base_dir: Parent directory of the variant folder.
        save_type: "hf" for ``save_pretrained`` or "torch" for a ``.pt``
            state-dict file.
        filename_stub: File name (without extension) of the ``.pt`` file;
            defaults to ``{model_name}_{variant}``.

    Returns:
        The directory the variant was saved to.

    Raises:
        ValueError: If ``save_type`` is neither "hf" nor "torch". This is
            checked before anything is written, so an invalid call leaves no
            empty directory behind.
    """
    if save_type not in ("hf", "torch"):
        raise ValueError("save_type must be one of: 'hf', 'torch'")

    folder = f"{model_name}_{variant}_model"
    save_dir = os.path.join(base_dir, folder)
    os.makedirs(save_dir, exist_ok=True)

    if save_type == "hf":
        # HF format
        model.save_pretrained(save_dir)
        tokenizer.save_pretrained(save_dir)
        print(f"Saved {model_name} ({variant}, HF) -> {save_dir}")
    else:
        stub = filename_stub or f"{model_name}_{variant}"
        # CPU is safer for pickled quantized models
        model = model.to("cpu")
        torch.save(model.state_dict(), os.path.join(save_dir, f"{stub}.pt"))
        tokenizer.save_pretrained(save_dir)  # tokenizer can still be HF
        print(f"Saved {model_name} ({variant}, torch) -> {save_dir}")

    return save_dir

In [None]:
# Root directory under which the compressed model variants are saved.
base_save_dir = "/content/drive/MyDrive/Colab Notebooks/nlp_project/models/w_test_split/HF_Trainer/Compressed_models"

# Held-out test split (texts and integer labels) used for the final comparison.
test_texts  = list(test_df["clean_text"])
test_labels = test_df["label"].values

### Quantization

Applying dynamic quantization to all models (BERT, RoBERTa, DeBERTa) and saving the quantized variants for later evaluation.  


In [None]:
def apply_quantization(model):
    """Return an int8 dynamically-quantized CPU copy of ``model``.

    Dynamic quantization runs on the CPU, so the model is first deep-copied and
    the copy moved to the CPU in eval mode; only then are its ``nn.Linear``
    layers converted. The model passed in is left untouched on its original
    device. (Previously the model was quantized while still on ``device`` and
    moved to the CPU only afterwards.)
    """
    model_cpu = copy.deepcopy(model).to("cpu").eval()
    # inplace=True: model_cpu is already a private copy, so avoid a second one.
    return quantize_dynamic(model_cpu, {nn.Linear}, dtype=torch.qint8, inplace=True)

# Quantize each of the three fine-tuned models loaded earlier.
bert_quantized = apply_quantization(bert_model)
roberta_quantized = apply_quantization(roberta_model)
deberta_quantized = apply_quantization(deberta_model)

# Quantized model must be on CPU
bert_quantized = bert_quantized.to("cpu").eval()
roberta_quantized = roberta_quantized.to("cpu").eval()
deberta_quantized = deberta_quantized.to("cpu").eval()

# NOTE(review): these figures count nn.Parameter elements only. Dynamically
# quantized linear layers keep their weights in packed buffers rather than
# parameters, so the numbers presumably exclude those layers — confirm before
# reporting them as model size.
print("BERT quantized size:", sum(p.numel() for p in bert_quantized.parameters()))
print("RoBERTa quantized size:", sum(p.numel() for p in roberta_quantized.parameters()))
print("DeBERTa quantized size:", sum(p.numel() for p in deberta_quantized.parameters()))

# Save quantized models
# "torch" format: a state_dict .pt file plus the tokenizer files.
bert_quant_dir = save_model_variant("bert", "quantized", bert_quantized, bert_tokenizer, base_save_dir, save_type="torch")
roberta_quant_dir = save_model_variant("roberta", "quantized", roberta_quantized, roberta_tokenizer, base_save_dir, save_type="torch")
deberta_quant_dir = save_model_variant("deberta", "quantized", deberta_quantized, deberta_tokenizer, base_save_dir, save_type="torch")

### Pruning

This section applies global magnitude (L1) pruning to the linear layers of each model (removing 40% of their weights), then performs a short recovery fine-tuning on subsets of the training and validation data to regain accuracy. The pruned models are saved for later evaluation.  


In [None]:
def apply_pruning(model, amount=0.2):
    """Globally L1-prune the weights of all ``nn.Linear`` layers of ``model`` in place.

    ``amount`` is forwarded to ``prune.global_unstructured``. The pruning
    re-parametrization is removed afterwards, so the zeros are baked into the
    plain ``weight`` parameters. Returns the same ``model`` object.
    """
    linear_layers = [layer for layer in model.modules() if isinstance(layer, nn.Linear)]

    prune.global_unstructured(
        [(layer, "weight") for layer in linear_layers],
        pruning_method=prune.L1Unstructured,
        amount=amount,
    )
    # Make pruning permanent
    for layer in linear_layers:
        if not hasattr(layer, "weight_mask"):
            continue
        prune.remove(layer, "weight")
    return model

# Fine-tuning after pruning to restore lost accuracy caused due to the removed weights
def recover_finetune(model, tokenizer, train_texts, train_labels, val_texts, val_labels, max_length=256, epochs=1, lr=1e-5, bs=32):
    """Fine-tune a pruned model while keeping its pruned weights at zero.

    Once pruning has been made permanent there is no mask left on the model, so
    a plain optimizer step would give the zeroed weights non-zero values again
    and undo the sparsity. The zero pattern of every ``nn.Linear`` weight is
    therefore recorded before training and re-imposed after each optimizer step.

    Args:
        model: Model to fine-tune; trained in place on the global ``device``.
        tokenizer: Tokenizer used to encode the texts.
        train_texts, train_labels: Training examples.
        val_texts, val_labels: Accepted for interface compatibility; this
            function does not evaluate, so they are currently unused.
        max_length: Truncation length for tokenization.
        epochs: Number of passes over the training data.
        lr: AdamW learning rate.
        bs: Batch size.

    Returns:
        The same ``model`` object, in eval mode.
    """
    model.train().to(device)
    train_ds = texts_to_dataset(tokenizer, train_texts, train_labels, max_length)
    train_dl = DataLoader(train_ds, batch_size=bs, shuffle=True)

    # Boolean "is pruned" mask per linear weight, taken after the move to
    # `device` so the masks live on the same device as the weights.
    pruned_masks = [
        (module.weight, module.weight.detach() == 0)
        for module in model.modules()
        if isinstance(module, nn.Linear)
    ]

    opt = torch.optim.AdamW(model.parameters(), lr=lr)
    for _ in range(epochs):
        for input_ids, attn_mask, y in train_dl:
            input_ids, attn_mask, y = input_ids.to(device), attn_mask.to(device), y.to(device)
            opt.zero_grad()
            loss = model(input_ids=input_ids, attention_mask=attn_mask, labels=y).loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()
            # Re-apply the sparsity pattern that the update just overwrote.
            with torch.no_grad():
                for weight, is_pruned in pruned_masks:
                    weight.masked_fill_(is_pruned, 0.0)
    model.eval()
    return model

# apply_pruning modifies the model it receives and returns that same object, so
# prune deep copies: the fine-tuned baselines (bert_model, ...) stay intact and
# this cell can be re-run without compounding pruning on already-pruned weights.
bert_pruned = apply_pruning(copy.deepcopy(bert_model), amount=0.4)
roberta_pruned = apply_pruning(copy.deepcopy(roberta_model), amount=0.4)
deberta_pruned = apply_pruning(copy.deepcopy(deberta_model), amount=0.4)
print("Pruned bert model non-zero parameters:", sum((p != 0).sum().item() for p in bert_pruned.parameters()))
print("Pruned roberta model non-zero parameters:", sum((p != 0).sum().item() for p in roberta_pruned.parameters()))
print("Pruned deberta model non-zero parameters:", sum((p != 0).sum().item() for p in deberta_pruned.parameters()))

# Recover fine-tune
# The same stratified subsets are used for all three models; only DeBERTa uses
# max_length=128, matching the length encoded in its checkpoint name.
recovery_train_texts = train_subset_df["clean_text"].to_list()
recovery_train_labels = train_subset_df["label"].to_list()
recovery_val_texts = val_subset_df['clean_text'].to_list()
recovery_val_labels = val_subset_df['label'].to_list()

bert_model_recov = recover_finetune(bert_pruned, bert_tokenizer, recovery_train_texts, recovery_train_labels, recovery_val_texts, recovery_val_labels, max_length=256, epochs=2, lr=1e-5, bs=32)
roberta_model_recov = recover_finetune(roberta_pruned, roberta_tokenizer, recovery_train_texts, recovery_train_labels, recovery_val_texts, recovery_val_labels, max_length=256, epochs=2, lr=1e-5, bs=32)
deberta_model_recov = recover_finetune(deberta_pruned, deberta_tokenizer, recovery_train_texts, recovery_train_labels, recovery_val_texts, recovery_val_labels, max_length=128, epochs=2, lr=1e-5, bs=32)

# Save pruned models
bert_pruned_dir = save_model_variant("bert", "pruned", bert_model_recov, bert_tokenizer, base_save_dir, save_type="hf")
roberta_pruned_dir = save_model_variant("roberta", "pruned", roberta_model_recov, roberta_tokenizer, base_save_dir, save_type="hf")
deberta_pruned_dir = save_model_variant("deberta", "pruned", deberta_model_recov, deberta_tokenizer, base_save_dir, save_type="hf")

### Knowledge Distillation

In [None]:
# Defining students
# Hub ids of the student checkpoint used for each teacher family.
bert_student_id = "distilbert-base-uncased"
roberta_student_id = "distilroberta-base"
deberta_student_id = "microsoft/deberta-v3-small"

# Lookup from family name to student id, used when launching the distillation runs.
student_map ={
    "bert": bert_student_id,
    "roberta": roberta_student_id,
    "deberta": deberta_student_id,
}

The cell below implements `DistillationTrainer`, a `Trainer` subclass that combines **cross-entropy loss** with a **knowledge-distillation loss**. A large teacher model guides a smaller student model by providing softened probability distributions, improving generalization and efficiency. It also defines `distill_one`, a function that handles tokenization, training, evaluation, and saving of the distilled student model.


In [None]:
class DistillationTrainer(Trainer):
    """``Trainer`` whose loss mixes hard-label cross-entropy with soft-label distillation.

    loss = alpha * KD + (1 - alpha) * CE, where KD is the KL divergence between
    the temperature-softened teacher and student distributions, scaled by T^2.
    The teacher is kept frozen and in eval mode.
    """

    def __init__(self, *args, teacher_model=None, temperature=4.0, alpha=0.5, **kwargs):
        super().__init__(*args, **kwargs)
        assert teacher_model is not None, "teacher_model is required"
        self.teacher = teacher_model.eval()
        # Freeze the teacher: none of its parameters should receive gradients.
        self.teacher.requires_grad_(False)
        self.temperature = temperature
        self.alpha = alpha
        self._teacher_device = None  # where the teacher currently is

    def _ensure_teacher_device(self, ref_device):
        """Move the teacher to ``ref_device`` unless it was already moved there."""
        if self._teacher_device == ref_device:
            return
        self.teacher.to(ref_device)
        self._teacher_device = ref_device

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the combined KD/CE loss (and the student outputs if requested)."""
        labels = inputs.get("labels", inputs.get("label", None))
        if labels is None:
            raise ValueError("No labels found in inputs (expected 'labels' or 'label').")

        # The teacher must sit on the same device as the batch.
        self._ensure_teacher_device(labels.device)

        # Student forward pass (with gradients).
        outputs_s = model(**inputs)
        student_logits = outputs_s.logits

        # Teacher forward pass (no gradients).
        with torch.no_grad():
            teacher_logits = self.teacher(**inputs).logits

        # Hard-label term.
        loss_ce = F.cross_entropy(student_logits, labels)

        # Soft-label term on temperature-scaled distributions.
        T = self.temperature
        student_log_probs = F.log_softmax(student_logits / T, dim=-1)
        teacher_probs = F.softmax(teacher_logits / T, dim=-1)
        loss_kd = F.kl_div(student_log_probs, teacher_probs, reduction="batchmean") * (T * T)

        loss = self.alpha * loss_kd + (1.0 - self.alpha) * loss_ce
        if return_outputs:
            return loss, outputs_s
        return loss

# Tokenize helper
def build_tokenized_splits(train_df, val_df, test_df, tokenizer, max_length=256):
    """Tokenize the three DataFrames into torch-formatted HF datasets.

    Each frame is reduced to ``clean_text``/``label``, tokenized with truncation
    and padding to ``max_length``, and formatted so that only ``input_ids``,
    ``attention_mask`` and ``label`` are exposed as tensors.
    Returns ``(train, val, test)``.
    """
    keep_columns = ["input_ids", "attention_mask", "label"]

    def tok(batch):
        return tokenizer(
            batch["clean_text"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )

    def prepare(frame):
        dataset = Dataset.from_pandas(frame[["clean_text", "label"]]).map(tok, batched=True)
        dataset.set_format("torch", columns=keep_columns)
        return dataset

    return prepare(train_df), prepare(val_df), prepare(test_df)

# Distillation runner
def distill_one(
    teacher_id: str,
    student_id: str,
    train_df,
    val_df,
    test_df,
    num_labels: int,
    out_dir: str,
    run_name: str,
    max_length: int = 256,
    learning_rate: float = 5e-5,
    batch_size: int = 32,
    num_train_epochs: int = 3,
    weight_decay: float = 0.01,
    temperature: float = 4.0,
    alpha: float = 0.5,
    use_teacher_tokenizer: bool = True,
    seed: int = 42,
):
    """Distill one teacher into one student and save the resulting student.

    Args:
        teacher_id: Hub id or local directory passed to ``from_pretrained`` for
            the teacher. NOTE(review): the KD targets are only informative if
            this checkpoint already has a fine-tuned classification head for
            this task — confirm what callers pass here.
        student_id: Hub id or local directory of the student checkpoint.
        train_df, val_df, test_df: Frames with ``clean_text`` and ``label`` columns.
        num_labels: Number of classes for both teacher and student heads.
        out_dir: Output directory; checkpoints go to ``student_ckpt`` and the
            final student to ``student_distilled`` below it.
        run_name: Run name passed to ``TrainingArguments``.
        max_length: Tokenization length (padding and truncation).
        learning_rate, batch_size, num_train_epochs, weight_decay: Training
            hyper-parameters forwarded to ``TrainingArguments``.
        temperature, alpha: Distillation temperature and KD/CE mixing weight.
        use_teacher_tokenizer: Load the tokenizer from ``teacher_id`` (default)
            instead of ``student_id``; the same tokenizer feeds both models.
        seed: Seed for torch/NumPy and ``TrainingArguments``.

    Returns:
        Dict with ``val_metrics``, ``test_metrics``, ``save_dir``,
        ``student_id``, ``teacher_id`` and ``max_length``.
    """
    os.makedirs(out_dir, exist_ok=True)
    # Seed before the models are created below.
    torch.manual_seed(seed); np.random.seed(seed)

    # Tokenizer
    tok_id = teacher_id if use_teacher_tokenizer else student_id
    tokenizer = AutoTokenizer.from_pretrained(tok_id, use_fast=True)

    # Build splits
    t_train, t_val, t_test = build_tokenized_splits(train_df, val_df, test_df, tokenizer, max_length=max_length)

    # Models
    teacher = AutoModelForSequenceClassification.from_pretrained(teacher_id, num_labels=num_labels)
    student = AutoModelForSequenceClassification.from_pretrained(student_id, num_labels=num_labels)

    collator = DataCollatorWithPadding(tokenizer=tokenizer)
    # Evaluate and checkpoint once per epoch; the checkpoint with the best
    # validation macro-F1 is restored when training ends.
    args = TrainingArguments(
        output_dir=os.path.join(out_dir, "student_ckpt"),
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        fp16=torch.cuda.is_available(),
        logging_strategy="epoch",
        run_name=run_name,
        seed=seed,
    )

    trainer = DistillationTrainer(
        teacher_model=teacher,
        temperature=temperature,
        alpha=alpha,
        model=student,
        args=args,
        train_dataset=t_train,
        eval_dataset=t_val,
        data_collator=collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    # Evaluated on the validation split (the trainer's default eval_dataset).
    val_metrics = trainer.evaluate()
    print("Best (val):", val_metrics)

    # Evaluate on test
    test_metrics = trainer.evaluate(eval_dataset=t_test)
    print("Test:", test_metrics)

    # Save distilled student
    save_dir = os.path.join(out_dir, "student_distilled")
    trainer.save_model(save_dir)
    tokenizer.save_pretrained(save_dir)
    print(f"Saved student to {save_dir}")

    return {
        "val_metrics": val_metrics,
        "test_metrics": test_metrics,
        "save_dir": save_dir,
        "student_id": student_id,
        "teacher_id": teacher_id,
        "max_length": max_length,
    }

Distills each base model (teacher) into a smaller student version.    
Each run trains the student with guidance from its teacher, evaluates on validation and test sets, logs results to Weights & Biases, and saves the distilled models.


In [None]:
# Hyper-parameters shared by the three distillation runs.
num_labels = len(label2id)
n_epochs = 3
alpha = 0.5
T = 4.0
max_l_bert_roberta = 256
max_l_deberta = 128
lr = 5e-5
bs = 32

# The teachers are the FINE-TUNED checkpoints loaded at the top of the notebook
# (bert_trainer_dir, roberta_trainer_dir, deberta_trainer_dir). Loading a raw hub
# checkpoint such as "bert-base-uncased" as a sequence classifier gives a randomly
# initialised classification head, so its soft targets would carry no task
# knowledge. Each directory also holds the teacher's tokenizer, which distill_one
# loads by default.
# NOTE(review): this assumes the teachers were fine-tuned with the same
# label -> id mapping as `label2id` in this notebook — confirm.

# BERT
res_bert = distill_one(
    teacher_id=bert_trainer_dir,
    student_id=student_map["bert"],
    train_df=train_df, val_df=val_df, test_df=test_df,
    num_labels=num_labels,
    out_dir="/content/drive/MyDrive/Colab Notebooks/nlp_project/models/w_test_split/HF_Trainer/Compressed_models/bert_distill",
    run_name=f"bert_kd_T{T}_a{alpha}_e{n_epochs}",
    max_length=max_l_bert_roberta, learning_rate=lr, batch_size=bs, num_train_epochs=n_epochs,
    temperature=T, alpha=alpha)
wandb.finish()

# RoBERTa
res_roberta = distill_one(
    teacher_id=roberta_trainer_dir,
    student_id=student_map["roberta"],
    train_df=train_df, val_df=val_df, test_df=test_df,
    num_labels=num_labels,
    out_dir="/content/drive/MyDrive/Colab Notebooks/nlp_project/models/w_test_split/HF_Trainer/Compressed_models/roberta_distill",
    run_name=f"roberta_kd_T{T}_a{alpha}_e{n_epochs}",
    max_length=max_l_bert_roberta, learning_rate=lr, batch_size=bs, num_train_epochs=n_epochs,
    temperature=T, alpha=alpha)
wandb.finish()

# DeBERTa
res_deberta = distill_one(
    teacher_id=deberta_trainer_dir,
    student_id=student_map["deberta"],
    train_df=train_df, val_df=val_df, test_df=test_df,
    num_labels=num_labels,
    out_dir="/content/drive/MyDrive/Colab Notebooks/nlp_project/models/w_test_split/HF_Trainer/Compressed_models/deberta_distill",
    run_name=f"deberta_kd_T{T}_a{alpha}_e{n_epochs}",
    max_length=max_l_deberta,
    learning_rate=lr, batch_size=bs, num_train_epochs=n_epochs,
    temperature=T, alpha=alpha)
wandb.finish()

`eval_one_row` evaluates a single model variant on accuracy, macro-F1, size, and latency.  
If the variant is INT8 Dynamic, it loads a quantized CPU-only model; otherwise it loads a Hugging Face model.  
It returns the performance and efficiency metrics as one row for easy comparison across variants.


In [None]:
def load_hf(path_or_dir, num_labels):
    """Load a tokenizer and a sequence-classification model from ``path_or_dir``.

    Returns ``(model, tokenizer)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(path_or_dir)
    model = AutoModelForSequenceClassification.from_pretrained(path_or_dir, num_labels=num_labels)
    return model, tokenizer


def _quantized_size_mb(model, tokenizer):
    """Size in MB of a quantized model as it would be stored on disk.

    The model's ``state_dict`` and the tokenizer files are written to a
    temporary directory (the layout ``save_model_variant`` uses for
    ``save_type="torch"``), which is then measured with ``model_disk_size`` so
    the figure is comparable to the directory sizes of the other variants.
    """
    import tempfile  # only needed here

    with tempfile.TemporaryDirectory() as tmp_dir:
        torch.save(model.state_dict(), os.path.join(tmp_dir, "model_int8.pt"))
        tokenizer.save_pretrained(tmp_dir)
        return model_disk_size(tmp_dir)


def eval_one_row(model_name, variant_name, variant_cfg, max_length, texts, labels,
                 latency_sample=1500, batch_size=32, amp=True, num_labels=5):
    """Evaluate one model variant and return its comparison-table row.

    Args:
        model_name: Family label for the row (e.g. "BERT").
        variant_name: Variant label. "INT8 Dynamic" selects the quantized path;
            anything else is loaded as a Hugging Face checkpoint.
        variant_cfg: For "INT8 Dynamic": ``{"quantize_from": dir, "strategy":
            "full"|"head"}`` (strategy defaults to "full"). Otherwise
            ``{"path": dir}``.
        max_length: Tokenization length.
        texts, labels: Evaluation examples.
        latency_sample: Number of leading texts used for the latency benchmark.
        batch_size: Batch size for latency and accuracy passes.
        amp: Use autocast for the non-quantized path (CUDA only).
        num_labels: Number of classes passed to the model loader.

    Returns:
        Dict with keys Model, Variant, MaxLen, Size_MB, Sec_per_1000, Accuracy
        and MacroF1.

    Notes:
        * Size_MB of the INT8 variant is measured on the quantized model itself;
          it previously reported the size of the FP32 baseline directory.
        * The INT8 variant is timed and evaluated on the CPU, the other variants
          on the global ``device`` (with autocast when ``amp`` applies), so
          their latencies are only comparable when ``device`` is the CPU.
    """
    latency_texts = texts[:latency_sample]

    if variant_name == "INT8 Dynamic":
        strategy = variant_cfg.get("strategy", "full")
        model, tokenizer = load_int8_dynamic_from_baseline(
            variant_cfg["quantize_from"], num_labels=num_labels, strategy=strategy
        )
        size_mb = _quantized_size_mb(model, tokenizer)
        # CPU-only latency and eval
        lat, _ = measure_latency_cpu(model, tokenizer, latency_texts,
                                     batch_size=batch_size, max_length=max_length)
        acc, f1m = eval_accuracy_f1_cpu(model, tokenizer, texts, labels,
                                        batch_size=batch_size, max_length=max_length)
        # Only DeBERTa rows show the strategy in the label.
        variant_label = variant_name + (f" ({strategy})" if model_name=="DeBERTa" else "")
    else:
        model, tokenizer = load_hf(variant_cfg["path"], num_labels=num_labels)
        size_mb = model_disk_size(variant_cfg["path"])
        model = model.to(device).eval()
        lat, _ = measure_latency(model, tokenizer, latency_texts,
                                 batch_size=batch_size, runs=3, max_length=max_length, amp=amp)
        acc, f1m = eval_accuracy_f1(model, tokenizer, texts, labels,
                                    batch_size=batch_size, max_length=max_length, amp=amp)
        variant_label = variant_name

    return {
        "Model": model_name,
        "Variant": variant_label,
        "MaxLen": max_length,
        "Size_MB": round(size_mb, 2),
        "Sec_per_1000": round(lat, 3),
        "Accuracy": round(acc, 4),
        "MacroF1": round(f1m, 4),
    }

Defining configuration for each model family (BERT, RoBERTa, DeBERTa), including their baseline, quantized, pruned, and distilled variants.  
Each variant is evaluated on accuracy, F1, latency, and size, and the results are collected into a comparison DataFrame for analysis.


In [None]:
# Root of the fine-tuned (baseline) checkpoints and of the compressed variants.
BASE = "/content/drive/MyDrive/Colab Notebooks/nlp_project/models/w_test_split/HF_Trainer"
COMP = f"{BASE}/Compressed_models"

# Per-family evaluation config.
#   "maxlen"    - tokenization length used for every variant of the family.
#   "baseline", "pruned", "distilled" - directories loaded as HF checkpoints.
#   "quantized" - has no checkpoint path: eval_one_row quantizes the baseline in
#                 "quantize_from" on the fly using the given "strategy".
cfg = {
    "BERT": {
        "maxlen": 256,
        "baseline":  {"path": f"{BASE}/bert_best_model_stratify_maxl_256_{is_preprocessed}_{num_train_samples}_samples_optuna"},
        "quantized": {"quantize_from": f"{BASE}/bert_best_model_stratify_maxl_256_{is_preprocessed}_{num_train_samples}_samples_optuna",
                      "strategy": "full"},
        "pruned":    {"path": f"{COMP}/bert_pruned_model"},
        "distilled": {"path": f"{COMP}/bert_distill/student_distilled"},
    },
    "RoBERTa": {
        "maxlen": 256,
        "baseline":  {"path": f"{BASE}/roberta_best_model_stratify_maxl_256_{is_preprocessed}_{num_train_samples}_samples_optuna"},
        "quantized": {"quantize_from": f"{BASE}/roberta_best_model_stratify_maxl_256_{is_preprocessed}_{num_train_samples}_samples_optuna",
                      "strategy": "full"},
        "pruned":    {"path": f"{COMP}/roberta_pruned_model"},
        "distilled": {"path": f"{COMP}/roberta_distill/student_distilled"},
    },
    "DeBERTa": {
        "maxlen": 128,
        "baseline":  {"path": f"{BASE}/deberta_best_model_stratify_maxl_128_{is_preprocessed}_{num_train_samples}_samples_optuna"},
        "quantized": {"quantize_from": f"{BASE}/deberta_best_model_stratify_maxl_128_{is_preprocessed}_{num_train_samples}_samples_optuna",
                      "strategy": "head"},
        "pruned":    {"path": f"{COMP}/deberta_pruned_model"},
        "distilled": {"path": f"{COMP}/deberta_distill/student_distilled"},
    },
}

# Evaluate the four variants of every family on the held-out test split and
# collect one result row per variant.
rows = []
for name, fam in cfg.items():
    maxlen = fam["maxlen"]
    rows.append(eval_one_row(name, "Baseline",     fam["baseline"],  maxlen, test_texts, test_labels))
    rows.append(eval_one_row(name, "INT8 Dynamic", fam["quantized"], maxlen, test_texts, test_labels))
    rows.append(eval_one_row(name, "Pruned",       fam["pruned"],    maxlen, test_texts, test_labels))
    rows.append(eval_one_row(name, "Distilled",    fam["distilled"], maxlen, test_texts, test_labels))

# Rows are ordered alphabetically by model, then by variant label.
comp_df = pd.DataFrame(rows).sort_values(["Model","Variant"]).reset_index(drop=True)
comp_df

In [None]:
# from IPython.display import Javascript

# def disconnect_runtime():
#     display(Javascript('google.colab.kernel.disconnect()'))

# disconnect_runtime()