# **02 - DistilBERT Sweep**

In [None]:
pip install langdetect

In [None]:
# Imports & reproducibility
import os
import random
import shutil
import gc

import numpy as np
import pandas as pd
import torch
import joblib

from transformers import (
    DistilBertTokenizerFast as DistilBertTokenizer,
    DistilBertForSequenceClassification,
    DistilBertConfig,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from langdetect import DetectorFactory

# Reproducibility: one shared seed for every random number source used here
RANDOM_SEED = 42
for _seed_fn in (np.random.seed, random.seed, torch.manual_seed):
    _seed_fn(RANDOM_SEED)
# langdetect takes its seed through a class attribute rather than a function
DetectorFactory.seed = RANDOM_SEED

In [None]:
# Filesystem locations and task constants
# BASE is the output folder; the other paths are derived from it.
BASE = "C:/Users/indur/OneDrive - University of Westminster/GitHub/FYP_Project/Models/Review_Score/OutPuts"
TRAIN_P = BASE + "/train_df.pkl"   # pickled training data
OUTPUT = BASE + "/sweep_ckpt"      # checkpoint directory used during the sweep
NUM_L = 5                          # number of target classes

In [None]:
# Hyperparameter grid
# Each row lists the values for one sweep configuration, in the order given
# by _GRID_KEYS; the rows are expanded into one dict per configuration.
_GRID_KEYS = ("learning_rate", "weight_decay", "classifier_dropout", "freeze_layers")
_GRID_ROWS = (
    (2e-5, 0.01, 0.2, "first2"),
    (2e-5, 0.01, 0.3, "first1"),
    (3e-5, 0.00, 0.1, "none"),
    (5e-5, 0.00, 0.1, "none"),
    (3e-5, 0.01, 0.2, "first1"),
)
bert_param_grid = [dict(zip(_GRID_KEYS, row)) for row in _GRID_ROWS]

In [None]:
# Load the training data and carve out a small stratified subset for the sweep
# NOTE(review): joblib.load unpickles the file, so TRAIN_P must be a trusted file.
df = joblib.load(TRAIN_P)
texts = df.text.tolist()
labels = df.label.tolist()

# Step 1: keep a stratified 20% of the data
sss = StratifiedShuffleSplit(n_splits=1, train_size=0.2, random_state=RANDOM_SEED)
sub_idx = next(sss.split(texts, labels))[0]
xsmall = [texts[pos] for pos in sub_idx]
ysmall = [labels[pos] for pos in sub_idx]

# Step 2: split that subset 80/20 into train and validation, again stratified
x_tr, x_val, y_tr, y_val = train_test_split(
    xsmall,
    ysmall,
    test_size=0.2,
    stratify=ysmall,
    random_state=RANDOM_SEED,
)

In [None]:
# Compute device (GPU when CUDA is available, otherwise CPU) and shared tokenizer
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
# Dataset wrapper
class TxtDS(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    ``encodings`` must offer ``.items()`` yielding (name, values) pairs whose
    values can be indexed by sample position. ``labels`` must support
    ``len()`` and indexing. The two are assumed to have the same number of
    samples; this is not checked.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is defined by the labels, not the encodings.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample: every encoding field at ``idx`` plus a ``labels`` tensor."""
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = column[idx]
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Hyperparameter sweep
# Fine-tunes one DistilBERT classifier per entry of bert_param_grid on the
# subsampled train split, scores it on the validation split, and keeps the
# configuration/model with the highest weighted F1.
best_f1, best_cfg, best_model = 0.0, None, None

# Number of leading transformer layers to freeze for each "freeze_layers"
# value; any other value (e.g. "none") freezes nothing.
_FROZEN_LAYER_COUNT = {"first2": 2, "first1": 1}


def enc(xs):
    """Tokenize a list of texts into PyTorch tensors padded/truncated to 256 tokens."""
    return tokenizer(xs, padding="max_length", truncation=True,
                     max_length=256, return_tensors="pt")


def _compute_metrics(p):
    """Return accuracy and weighted F1 for the prediction object the Trainer passes in."""
    preds = p.predictions.argmax(-1)  # computed once, shared by both metrics
    return {
        "accuracy": accuracy_score(p.label_ids, preds),
        "f1":       f1_score(p.label_ids, preds, average="weighted")
    }


# Tokenization does not depend on the swept hyperparameters, so the datasets
# are built once and reused by every configuration.
tr_ds = TxtDS(enc(x_tr), y_tr)
vl_ds = TxtDS(enc(x_val), y_val)

for cfg in bert_param_grid:
    # a) Build & (optionally) freeze
    # The classification head's dropout is controlled by `seq_classif_dropout`;
    # `dropout` would instead change the dropout used throughout the encoder.
    config = DistilBertConfig.from_pretrained(
        "distilbert-base-uncased",
        seq_classif_dropout=cfg["classifier_dropout"],
        num_labels=NUM_L
    )
    model = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", config=config
    ).to(device)

    n_frozen = _FROZEN_LAYER_COUNT.get(cfg["freeze_layers"], 0)
    for p in model.distilbert.transformer.layer[:n_frozen].parameters():
        p.requires_grad = False

    # b) TrainingArguments
    # Start every run from an empty checkpoint directory so checkpoints from
    # a previous configuration cannot be picked up.
    if os.path.isdir(OUTPUT):
        shutil.rmtree(OUTPUT)
    args = TrainingArguments(
        output_dir=OUTPUT,
        num_train_epochs=20,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        learning_rate=cfg["learning_rate"],
        weight_decay=cfg["weight_decay"],
        logging_steps=50,
        report_to=[]
    )

    # c) Trainer (stops early after 3 evaluations without improvement)
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tr_ds,
        eval_dataset=vl_ds,
        compute_metrics=_compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    # d) Train & eval
    trainer.train()
    metrics = trainer.evaluate()
    f = metrics["eval_f1"]
    print(f"Config {cfg} → eval_f1 = {f:.4f}")

    # Always accept the first result so a model is selected even when every
    # configuration scores an F1 of exactly 0.0.
    if best_model is None or f > best_f1:
        best_f1, best_cfg, best_model = f, cfg.copy(), trainer.model

    # e) Cleanup
    # best_model may still reference this iteration's model; only the
    # non-retained objects are actually released here.
    del model, trainer
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Report & save best
# Prints the winning score/configuration, then writes the model, tokenizer
# and configuration dict under BASE.
print("\nBest F1:", best_f1)
print("Best cfg:", best_cfg)

# best_model is still None when the sweep selected nothing (e.g. empty grid);
# fail with an explicit message instead of an AttributeError on None.
if best_model is None:
    raise RuntimeError("Hyperparameter sweep did not select a model; nothing to save.")

best_model.save_pretrained(f"{BASE}/best_distilbert_model")
tokenizer.save_pretrained(f"{BASE}/best_distilbert_tokenizer")
joblib.dump(best_cfg, f"{BASE}/best_bert_cfg.pkl")