In [None]:
%pip install pandas scikit-learn torch transformers matplotlib

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
import matplotlib.pyplot as plt

# 1. CONFIGURATION
# Root folder of the experiment; every other path is derived from it.
BASE = "C:/Users/indur/OneDrive - University of Westminster/GitHub/FYP_Project/Models/ModelTesting/Ai_Genuine_ReviewsTest"

# The three CSV splits live side by side in one dataset folder.
_DATASET_DIR = f"{BASE}/DataPreparation/DataSet"
TRAIN_PATH = f"{_DATASET_DIR}/train.csv"
VAL_PATH = f"{_DATASET_DIR}/val.csv"
TEST_PATH = f"{_DATASET_DIR}/test.csv"

# Where checkpoints and the final model are written.
OUTPUT_DIR = f"{BASE}/Train_Distilber/distilbert_output"

# Hyperparameters
NUM_EPOCHS = 15    # upper bound on training epochs
BATCH_SIZE = 16    # per-device training batch size
MAX_LENGTH = 256   # tokenizer truncation length, in tokens
LR = 1e-5          # optimizer learning rate
LOG_STEPS = 100    # step interval handed to the logging configuration
EVAL_STEPS = 500   # NOTE(review): not referenced by the code visible in this file
PATIENCE = 3       # early-stopping patience

# 2. LOAD DATA
# One DataFrame per split, read in train / val / test order.
train_df, val_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, VAL_PATH, TEST_PATH)
)

# 3. DEVICE CHECK
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# 4. TOKENIZER & DATASET
# Tokenizer matching the pretrained checkpoint that is fine-tuned below.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

class ReviewDataset(torch.utils.data.Dataset):
    """Tokenized review texts paired with integer class labels.

    Texts are tokenized once, at construction time, with the module-level
    ``tokenizer`` and truncated to ``MAX_LENGTH`` tokens.  Sequences are kept
    *unpadded*: padding is left to the batch collator (the notebook uses
    ``DataCollatorWithPadding``), so each batch is padded only to its own
    longest member instead of every example being padded to the longest
    review of the whole split.

    Args:
        texts: Review strings in an object exposing ``.tolist()`` (e.g. a
            pandas Series).  Missing entries (``None`` / NaN, as produced by
            ``pd.read_csv`` for empty cells) are tokenized as the empty
            string; any other non-string value is converted with ``str``.
        labels: Class ids in an object exposing ``.values`` (e.g. a pandas
            Series), aligned one-to-one with ``texts``.
    """

    def __init__(self, texts, labels):
        cleaned = [self._as_text(value) for value in texts.tolist()]
        # No padding and no tensor conversion here: variable-length token
        # lists are what allows the collator to pad dynamically per batch.
        enc = tokenizer(
            cleaned,
            truncation=True,
            max_length=MAX_LENGTH,
        )
        self.input_ids      = enc["input_ids"]
        self.attention_mask = enc["attention_mask"]
        self.labels         = torch.tensor(labels.values, dtype=torch.long)

    @staticmethod
    def _as_text(value):
        """Return ``value`` as a string the tokenizer accepts ("" if missing)."""
        if isinstance(value, str):
            return value
        return "" if pd.isna(value) else str(value)

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one unpadded example as plain Python values.

        The result is meant to be batched by a padding collator; the token
        lists of different examples generally differ in length.
        """
        return {
            "input_ids":      self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels":         int(self.labels[idx]),
        }

# Wrap each split's cleaned review text and label column in a ReviewDataset.
train_dataset, val_dataset, test_dataset = [
    ReviewDataset(frame.clean_review, frame.label)
    for frame in (train_df, val_df, test_df)
]

# 5. MODEL INITIALIZATION
# Pretrained DistilBERT encoder with a two-class classification head,
# placed on the device selected above.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)
model = model.to(device)

# 6. TRAINING ARGUMENTS
# Evaluate, checkpoint and log once per epoch.  The best checkpoint (highest
# validation "f1", as reported by compute_metrics) is reloaded when training
# ends.
args = TrainingArguments(
    output_dir=OUTPUT_DIR,

    eval_strategy="epoch",
    save_strategy="epoch",   # must match eval_strategy for load_best_model_at_end
    logging_strategy="epoch",
    logging_steps=LOG_STEPS,  # only consulted when logging_strategy="steps"
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2,
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LR,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to=[],
    disable_tqdm=False,
    # Mixed precision needs a GPU; requesting it unconditionally makes
    # TrainingArguments raise on the CPU fallback allowed by the device check.
    fp16=torch.cuda.is_available(),
)

# 7. METRICS FUNCTION
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair as handed over by the Trainer.

    Returns:
        A dict with ``"accuracy"`` and ``"f1"`` (scikit-learn's default
        ``f1_score`` settings), computed on the arg-max class of each row of
        logits.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(gold, predicted)
    metrics["f1"] = f1_score(gold, predicted)
    return metrics

# 8. DATA COLLATOR & TRAINER
# The collator pads the examples of a batch to a common length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Stop training once the monitored metric has gone PATIENCE evaluations
# without improving.
early_stopping = EarlyStoppingCallback(early_stopping_patience=PATIENCE)

trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# 9. TRAINING & EVALUATION
trainer.train()

# Called without arguments, evaluate() scores the Trainer's eval_dataset,
# i.e. the validation split.
val_metrics = trainer.evaluate()
print(f"\nValidation -> Acc: {val_metrics['eval_accuracy']:.4f}, F1: {val_metrics['eval_f1']:.4f}")

# Held-out test split; metric keys keep the default "eval_" prefix.
test_metrics = trainer.evaluate(test_dataset)
print(f"\nTest -> Acc: {test_metrics['eval_accuracy']:.4f}, F1: {test_metrics['eval_f1']:.4f}")

# confusion matrix on validation
pred_output = trainer.predict(val_dataset)
labels = pred_output.label_ids
preds = pred_output.predictions.argmax(axis=1)  # predicted class per example

cm = confusion_matrix(labels, preds)
# NOTE(review): tick labels assume class 0 = genuine and class 1 = ai; confirm
# against the label encoding used when the CSVs were prepared.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["genuine", "ai"])
disp.plot(xticks_rotation="vertical")
plt.title("Validation Confusion Matrix")
plt.tight_layout()
plt.show()

# 10. SAVE BEST MODEL
# load_best_model_at_end=True means the Trainer holds the best checkpoint here.
trainer.save_model(OUTPUT_DIR)
print(f"Saved best model to {OUTPUT_DIR}")