# **03 – DistilBERT Final**

In [None]:
# Notebook shell command: install the third-party packages imported in the
# next cell (langdetect for DetectorFactory, transformers for DistilBERT).
!pip install langdetect transformers

In [None]:
# Imports & reproducibility
import os, shutil, gc
import numpy as np
import torch
import joblib

from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizerFast,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.utils import resample
from langdetect import DetectorFactory
import matplotlib.pyplot as plt

# Seeds: one shared value applied to every random source configured in this
# cell, so the notebook starts from the same random state on each run.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)  # NumPy global RNG
torch.manual_seed(RANDOM_SEED)  # PyTorch RNG
DetectorFactory.seed = RANDOM_SEED  # langdetect detector seed
# NOTE(review): Python's built-in `random` module is not seeded here;
# confirm nothing downstream relies on it.

In [None]:
# Paths & constants
# NOTE(review): NUM_L is presumably the number of label classes; it is not
# referenced anywhere else in the visible code -- confirm it is still needed.
NUM_L           = 5
# Root directory under which this notebook writes its outputs.
BASE            = "/kaggle/working"
# Pickled train / validation / test splits, read from the
# `01-data-loading-preprocessing` input.
TRAIN_P         = "/kaggle/input/01-data-loading-preprocessing/train_df.pkl"
VAL_P           = "/kaggle/input/01-data-loading-preprocessing/val_df.pkl"
TEST_P          = "/kaggle/input/01-data-loading-preprocessing/test_df.pkl"
# Artifacts read from the `02-distilbert-sweep` input: the best
# hyper-parameter configuration, plus the model and tokenizer directories
# that are passed to `from_pretrained` further down.
BEST_CFG_P      = "/kaggle/input/02-distilbert-sweep/best_bert_cfg.pkl"
BEST_MODEL_DIR  = "/kaggle/input/02-distilbert-sweep/best_distilbert_model"
TOKENIZER_DIR  = "/kaggle/input/02-distilbert-sweep/best_distilbert_tokenizer"
# Output locations: the final model/tokenizer and the Trainer checkpoints.
FINAL_MODEL_DIR = f"{BASE}/distilbert_model"
CKPT_DIR        = f"{BASE}/checkpoints"

# Create the final-model directory up front; a pre-existing directory is fine.
os.makedirs(FINAL_MODEL_DIR, exist_ok=True)

In [None]:
# 1) Load the splits and the best sweep configuration
# All four artifacts are read with joblib, in the order of the path tuple.
# No subsampling is applied here: every row of each split is used.
train_df, val_df, test_df, best_cfg = map(
    joblib.load, (TRAIN_P, VAL_P, TEST_P, BEST_CFG_P)
)

# Plain Python lists of texts and labels for each split.
train_texts = train_df.text.tolist()
train_labels = train_df.label.tolist()

val_texts = val_df.text.tolist()
val_labels = val_df.label.tolist()

test_texts = test_df.text.tolist()
test_labels = test_df.label.tolist()

In [None]:
# 2) Restore the model and tokenizer saved by the sweep notebook
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the fine-tuned classifier weights, then move them onto the device.
model = DistilBertForSequenceClassification.from_pretrained(BEST_MODEL_DIR)
model = model.to(device)

# The tokenizer is stored in its own directory, separate from the model.
tokenizer = DistilBertTokenizerFast.from_pretrained(TOKENIZER_DIR)

In [None]:
# 3) Tokenize & wrap in Dataset
def tokenize_batch(texts, max_length=256):
    """Encode *texts* with the module-level ``tokenizer``.

    Args:
        texts: The raw strings to encode.
        max_length: Token length every sequence is truncated and padded to.

    Returns:
        Whatever the tokenizer call returns, with PyTorch tensors requested
        via ``return_tensors="pt"``.
    """
    tokenizer_kwargs = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer(texts, **tokenizer_kwargs)

# Encode each split once, using the default maximum length.
enc_train = tokenize_batch(texts=train_texts)
enc_val = tokenize_batch(texts=val_texts)
enc_test = tokenize_batch(texts=test_texts)

class ReviewDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    ``encodings`` is a mapping from field name to an indexable container;
    ``labels`` is an indexable sequence whose length defines the dataset size.
    """

    def __init__(self, encodings, labels):
        """Store the encodings and labels without copying them."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of encoding fields plus ``labels``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        # The label is wrapped in a tensor under the key "labels".
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# One dataset per split, each pairing that split's encodings with its labels.
train_ds = ReviewDataset(encodings=enc_train, labels=train_labels)
val_ds = ReviewDataset(encodings=enc_val, labels=val_labels)
test_ds = ReviewDataset(encodings=enc_test, labels=test_labels)

In [None]:
# 4) Trainer setup
# Hyper-parameters for the final fine-tuning run. Learning rate and weight
# decay come from the best sweep configuration loaded into `best_cfg`.
args = TrainingArguments(
    output_dir=CKPT_DIR,
    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    save_strategy="epoch",
    eval_strategy="epoch",
    # Cap the number of checkpoints kept on disk. Without a limit, one
    # checkpoint per epoch (up to 20) accumulates under CKPT_DIR. With
    # load_best_model_at_end=True the Trainer retains the best checkpoint
    # when pruning, so the best model stays loadable at the end of training.
    save_total_limit=2,
    load_best_model_at_end=True,
    # "f1" is the key returned by the compute_metrics callable; higher is better.
    metric_for_best_model="f1",
    greater_is_better=True,
    # Upper bound only; early stopping may end training sooner.
    num_train_epochs=20,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=best_cfg["learning_rate"],
    weight_decay=best_cfg["weight_decay"],
    logging_steps=50,
    # Tie the Trainer's own seeding to the notebook-wide seed instead of
    # relying on the library default.
    seed=RANDOM_SEED,
    # Empty list: do not report to any external logging integration.
    report_to=[]
)

def _compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for one evaluation pass.

    The predicted class is the arg-max over the last axis of the predictions.
    """
    gold = eval_pred.label_ids
    predicted = eval_pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1":       f1_score(gold, predicted, average="weighted"),
    }


# Early stopping with a patience of 3 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Train on the training split and evaluate on the validation split.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=_compute_metrics,
    callbacks=[early_stopping],
)

In [None]:
# 5) Train & save best
# Run fine-tuning. The Trainer is configured with load_best_model_at_end=True,
# so after train() it should hold the checkpoint with the best validation F1.
trainer.train()
# Write the trainer's current model to the final output directory.
trainer.save_model(FINAL_MODEL_DIR)
# Save the tokenizer into the same directory as the model.
tokenizer.save_pretrained(FINAL_MODEL_DIR)

In [None]:
# 6) Test evaluation
# Run the trained model on the held-out test split; `pred_out` is reused by
# the confusion-matrix cell below.
pred_out = trainer.predict(test_ds)

# Report the two metrics, read under the keys test_accuracy and test_f1.
test_metrics = pred_out.metrics
print(f"Test Accuracy: {test_metrics['test_accuracy']:.4f}")
print(f"Test F1-score: {test_metrics['test_f1']:.4f}")

In [None]:
# 7) Confusion Matrix
# True labels and arg-max predictions from the test-set prediction output.
y_true = pred_out.label_ids
y_pred = pred_out.predictions.argmax(-1)

# Pin the label set to every class index the model can output (the size of
# the last prediction axis). Without `labels=`, a class absent from both
# y_true and y_pred is dropped, silently shrinking the matrix and shifting
# the axes. Assumes labels are the integer ids 0..C-1, which matches the
# arg-max indices used for y_pred.
class_ids = np.arange(pred_out.predictions.shape[-1])

cm = confusion_matrix(y_true, y_pred, labels=class_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_ids)
disp.plot()
plt.title("Test Confusion Matrix")
plt.show()