# Final Model Training with Best Hyperparameters

In [None]:
# 1) Configuration & Data Loading

import os
import json
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt

from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

# Input locations: the preprocessed splits and the hyperparameter-search output.
SPLITS_DIR = "/kaggle/input/01-preprocessing-ai-genuine/splits"
HPO_OUTPUT_DIR = "/kaggle/input/02-distilbert-hyperparameter-tuning/hpo_output"
# Writable directory created below for this notebook's outputs.
FINAL_DIR = "/kaggle/working/distilbert_final"
SEED = 42

# Seed both NumPy and PyTorch with the same value.
torch.manual_seed(SEED)
np.random.seed(SEED)

# Fail fast when either input directory is absent.
for required_dir in (SPLITS_DIR, HPO_OUTPUT_DIR):
    if os.path.isdir(required_dir):
        continue
    raise FileNotFoundError(f"Directory not found: {required_dir}")

os.makedirs(FINAL_DIR, exist_ok=True)

# Read the three splits (train, val, test) from SPLITS_DIR in that order.
train_df, val_df, test_df = (
    pd.read_csv(os.path.join(SPLITS_DIR, f"{split}.csv"))
    for split in ("train", "val", "test")
)

# Each split must expose both required columns and only the labels 0 and 1.
for name, df in zip(("train", "val", "test"), (train_df, val_df, test_df)):
    absent = [col for col in ("clean_review", "label") if col not in df.columns]
    if absent:
        # Report the first missing column, in declaration order.
        raise ValueError(f"{name}.csv missing column: {absent[0]}")
    unexpected = set(df.label.unique()).difference({0, 1})
    if unexpected:
        raise ValueError(f"{name}.csv contains invalid labels: {df.label.unique()}")

# Load the tuned hyperparameters from the search output directory and verify
# that every value the training configuration reads unconditionally is present.
best_params_path = os.path.join(HPO_OUTPUT_DIR, "best_params.json")
if not os.path.isfile(best_params_path):
    raise FileNotFoundError(f"Missing: {best_params_path}")
# The context manager closes the file handle deterministically; the previous
# bare open() left it to the garbage collector.
with open(best_params_path, encoding="utf-8") as params_file:
    best_params = json.load(params_file)
for p in ("per_device_train_batch_size", "learning_rate", "weight_decay"):
    if p not in best_params:
        raise KeyError(f"Hyperparameter missing: {p}")

print("Loaded best hyperparameters:", best_params)

In [None]:
# 2) Dataset & Tokenization

class ReviewDataset(torch.utils.data.Dataset):
    """Map-style dataset holding tokenized review texts and their labels.

    All texts are tokenized once, at construction time, with
    ``padding="max_length"`` and ``truncation=True``; item access afterwards is
    plain tensor indexing.

    Args:
        texts: Review strings. Either an object exposing ``tolist()`` (such as
            a pandas Series) or any other iterable of ``str``.
        labels: Class labels aligned with ``texts``; same container rules as
            ``texts``. Stored as a ``torch.long`` tensor.
        tokenizer: Callable that accepts a list of strings plus the
            ``padding``, ``truncation``, ``max_length`` and ``return_tensors``
            keywords and returns an object with ``input_ids`` and
            ``attention_mask`` attributes.
        max_length: Value forwarded to the tokenizer as ``max_length``.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        text_list = self._as_list(texts)
        label_list = self._as_list(labels)
        # Catch misaligned inputs here rather than as an index error mid-training.
        if len(text_list) != len(label_list):
            raise ValueError(
                f"texts and labels differ in length: "
                f"{len(text_list)} != {len(label_list)}"
            )

        enc = tokenizer(
            text_list,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )
        self.input_ids      = enc.input_ids
        self.attention_mask = enc.attention_mask
        self.labels         = torch.tensor(label_list, dtype=torch.long)

    @staticmethod
    def _as_list(values):
        """Return ``values`` as a list, using its own ``tolist()`` when available."""
        to_list = getattr(values, "tolist", None)
        # Keep the original ``.tolist()`` path for Series/arrays; fall back to
        # ``list()`` so plain Python sequences are accepted as well.
        return to_list() if callable(to_list) else list(values)

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict of input ids, mask and label."""
        return {
            "input_ids":      self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels":         self.labels[idx],
        }

# Tokenizer paired with the "distilbert-base-uncased" checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Tokenize the train, val and test frames (in that order), all with max_length 256.
full_train_ds, val_ds, test_ds = [
    ReviewDataset(frame.clean_review, frame.label, tokenizer, 256)
    for frame in (train_df, val_df, test_df)
]

In [None]:
# 3) Metrics & Model Initialization

def compute_metrics(eval_pred):
    """Return accuracy and F1 for a ``(logits, labels)`` evaluation pair.

    The predicted class for each row is the argmax over the last axis of the
    logits.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    results = {}
    results["accuracy"] = accuracy_score(gold, predicted)
    results["f1"] = f1_score(gold, predicted)
    return results

def model_init():
    """Return a new two-label DistilBERT classifier from "distilbert-base-uncased"."""
    classifier = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", num_labels=2
    )
    return classifier

In [None]:
# 4) Compute steps & TrainingArguments

# Batch size from the search results, and the number of full batches per epoch.
batch_size = best_params["per_device_train_batch_size"]
steps_per_epoch = len(full_train_ds) // batch_size

# Values taken from the search results; warmup falls back to 0 when absent.
tuned_kwargs = dict(
    per_device_train_batch_size=batch_size,
    learning_rate=best_params["learning_rate"],
    weight_decay=best_params["weight_decay"],
    warmup_steps=best_params.get("warmup_steps", 0),
)

# Log, evaluate and checkpoint once per epoch; keep the best model by F1.
schedule_kwargs = dict(
    num_train_epochs=20,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

training_args = TrainingArguments(
    output_dir=FINAL_DIR,
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is available
    seed=SEED,
    report_to=[],
    **tuned_kwargs,
    **schedule_kwargs,
)

# Collator and early-stopping callback (patience of 3) built up front.
padding_collator = DataCollatorWithPadding(tokenizer)
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Train on the full training split and evaluate on the validation split.
trainer_kwargs = {
    "model_init": model_init,
    "args": training_args,
    "train_dataset": full_train_ds,
    "eval_dataset": val_ds,
    "tokenizer": tokenizer,
    "data_collator": padding_collator,
    "compute_metrics": compute_metrics,
    "callbacks": [early_stopping],
}
trainer = Trainer(**trainer_kwargs)

In [None]:
# 5) Train

# Run training and report the metrics attached to its result.
train_output = trainer.train()
final_train_metrics = train_output.metrics
print("Training completed. Metrics:", final_train_metrics)

In [None]:
# 6) Final Evaluation on Test Set

# Single inference pass over the test set. ``predict`` returns the raw
# predictions, the gold labels and the ``compute_metrics`` results together, so
# a separate ``trainer.evaluate`` call (a second full forward pass) is not
# needed. The "eval" prefix keeps the ``eval_accuracy`` / ``eval_f1`` keys.
pred_out = trainer.predict(test_ds, metric_key_prefix="eval")
metrics  = pred_out.metrics
print(f"Test → Acc: {metrics['eval_accuracy']:.4f}, F1: {metrics['eval_f1']:.4f}")

labels = pred_out.label_ids
preds  = np.argmax(pred_out.predictions, axis=-1)

# Pin the class order so the matrix is always 2x2 and matches the two display
# labels, even if a class is missing from both the gold labels and predictions.
cm   = confusion_matrix(labels, preds, labels=[0, 1])
# NOTE(review): assumes label 0 = genuine and label 1 = ai — confirm against
# the preprocessing notebook.
disp = ConfusionMatrixDisplay(cm, display_labels=["genuine","ai"])
disp.plot(xticks_rotation="vertical")
plt.title("Test Confusion Matrix")
plt.tight_layout()
plt.show()

In [None]:
# 7) Save Best Model
# Write the trainer's current model to FINAL_DIR and confirm the location.
trainer.save_model(output_dir=FINAL_DIR)
print("Saved best model to " + FINAL_DIR)