# In [None]:
import glob
import json
import os
import random
import re

import joblib
import matplotlib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import (
    StratifiedKFold,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.svm import SVC

#  Configuration
# BASE is the project root; the data directory and model artifact path hang off it.
BASE = "C:/Users/indur/OneDrive - University of Westminster/GitHub/FYP_Project/Models/ModelTesting/Ai_Genuine_ReviewsTest"
SEED = 42
DATA_SPLIT = BASE + "/DataPreparation/DataSet/"
MODEL_OUT = BASE + "/Train_Svm/svm_model_with_cm.joblib"
CV_FOLDS = 5  # number of stratified folds used for cross-validation
DROP_TOP_N = 100  # how many top-weighted features the ablation check removes

# Seed Python's and NumPy's global RNGs so repeated runs are reproducible
random.seed(SEED)
np.random.seed(SEED)
matplotlib.use("Agg")  # Non-interactive backend: figures are only written to disk

# Fetch the NLTK resources quietly
for _nltk_resource in ("punkt", "stopwords"):
    nltk.download(_nltk_resource, quiet=True)

#  0. Load your splits
# Read the pre-made train/validation/test CSVs from DATA_SPLIT. A missing file is
# re-raised with the directory that was searched, explicitly chained to the
# original error so the traceback keeps the underlying cause.
try:
    train_df = pd.read_csv(os.path.join(DATA_SPLIT, "train.csv"))
    val_df = pd.read_csv(os.path.join(DATA_SPLIT, "val.csv"))
    test_df = pd.read_csv(os.path.join(DATA_SPLIT, "test.csv"))
except FileNotFoundError as err:
    raise FileNotFoundError(f"Data files not found in {DATA_SPLIT}") from err

# Check for missing values
# Report per-column NaN counts, then drop rows that lack the review text or the
# label: the TF-IDF step cannot vectorise a NaN document and the classifier
# cannot train or score on a NaN target. Splits without such rows keep the same
# contents as before.
print("Checking for missing values")
splits = {"Train": train_df, "Val": val_df, "Test": test_df}
for name, df in splits.items():
    missing = df.isna().sum()
    if missing.any():
        print(f"{name} missing values:\n{missing}")
    usable = df.dropna(subset=["clean_review", "label"]).reset_index(drop=True)
    if len(usable) != len(df):
        print(f"{name}: dropped {len(df) - len(usable)} rows with missing clean_review/label")
    # Replacing a value (not adding/removing a key) is safe while iterating.
    splits[name] = usable
train_df, val_df, test_df = splits["Train"], splits["Val"], splits["Test"]

# Verify label classes
print(f"Unique labels: {np.unique(train_df.label)}")

#  1. Sanity checks 
def check_overlap(a, b, name_a, name_b):
    """Count the distinct values shared by two collections and report it.

    Args:
        a: First iterable of hashable values.
        b: Second iterable of hashable values.
        name_a: Display name for ``a`` in the printed report.
        name_b: Display name for ``b`` in the printed report.

    Returns:
        The number of distinct values present in both ``a`` and ``b``.
    """
    shared = set(a).intersection(b)
    overlap = len(shared)
    print(f"{name_a} / {name_b} overlap: {overlap}")
    return overlap

print("Checking for leaks between splits")
# Compare the review text of every pair of splits, in a fixed order.
split_pairs = [
    ("Train", train_df, "Val", val_df),
    ("Train", train_df, "Test", test_df),
    ("Val", val_df, "Test", test_df),
]
for left_name, left_df, right_name, right_df in split_pairs:
    check_overlap(left_df.clean_review, right_df.clean_review, left_name, right_name)

#  2. Build TF-IDF (full)
# Targets come straight from each split's label column.
y_train, y_val, y_test = train_df["label"], val_df["label"], test_df["label"]

# Unigram + bigram TF-IDF, capped at 5000 terms, ignoring terms seen in fewer
# than 5 training documents and scikit-learn's built-in English stop words.
tfidf_settings = dict(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=5,
    stop_words="english",
)
tfidf = TfidfVectorizer(**tfidf_settings)

# The vocabulary and IDF weights are learned from the training text only;
# validation and test text are projected onto that fitted vocabulary.
X_train = tfidf.fit_transform(train_df["clean_review"])
X_val, X_test = (
    tfidf.transform(frame["clean_review"]) for frame in (val_df, test_df)
)

vocab_size = len(tfidf.get_feature_names_out())
print(f"TF-IDF vocabulary size: {vocab_size}")

#  3. k-Fold Cross-Validation
# NOTE: X_train was vectorised by a TF-IDF fitted on the whole training split, so
# every held-out fold already contributed to the vocabulary/IDF weights. The CV
# scores below are therefore slightly optimistic; the validation/test numbers
# are the leak-free estimates.
svc = SVC(kernel="linear", C=1.0, random_state=SEED, max_iter=1000)
cv = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=SEED)

# A single cross_validate pass scores both metrics on the same fitted fold
# models, instead of refitting every fold once per metric. The splitter is
# seeded, so the folds (and hence the scores) match separate per-metric runs.
cv_results = cross_validate(svc, X_train, y_train, cv=cv, scoring=["accuracy", "f1"])
cv_scores_acc = cv_results["test_accuracy"]
cv_scores_f1 = cv_results["test_f1"]
print(f"{CV_FOLDS}-fold CV accuracy: {cv_scores_acc.mean():.3f} ± {cv_scores_acc.std():.3f}")
print(f"{CV_FOLDS}-fold CV F1 score: {cv_scores_f1.mean():.3f} ± {cv_scores_f1.std():.3f}")

#  4. Train on full train split
# cross_validate only fits clones, so `svc` itself is still unfitted here.
svc.fit(X_train, y_train)

#  5. Evaluation helper 
def eval_and_plot(X, y, title):
    """Score the fitted module-level ``svc`` on one split and save its confusion matrix.

    Prints accuracy, binary F1 and a classification report, then writes a
    confusion-matrix heatmap to ``"<title lower-cased>_cm.png"`` (a relative
    path, so it lands in the current working directory).

    Args:
        X: Feature matrix accepted by ``svc.predict``.
        y: True labels aligned with the rows of ``X``.
        title: Split name used in the printed header, plot title and file name.

    Returns:
        None.
    """
    preds = svc.predict(X)
    acc = accuracy_score(y, preds)
    f1 = f1_score(y, preds, average="binary")
    print(f"\n{title}  Acc: {acc:.4f}, F1: {f1:.4f}")

    # Pin the class order to what the model was trained on. Without explicit
    # labels, a split in which one class never appears (in y or in preds) makes
    # the report raise on the two target names and shrinks the matrix so the
    # tick labels no longer line up.
    # NOTE(review): the display names assume the smaller label value means
    # "genuine" and the larger one "ai" -- confirm against the data preparation.
    class_labels = svc.classes_
    display_names = ["genuine", "ai"]
    print(classification_report(y, preds, labels=class_labels, target_names=display_names))

    # Confusion matrix plot
    cm = confusion_matrix(y, preds, labels=class_labels)
    fig = plt.figure(figsize=(5, 4))
    try:
        sns.heatmap(
            cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=display_names,
            yticklabels=display_names,
        )
        plt.title(f"{title} Confusion Matrix")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.savefig(f"{title.lower()}_cm.png")
    finally:
        # Always release the figure, even if drawing or saving fails.
        plt.close(fig)

#  6. Plot for Validation and Test
# Evaluate the held-out splits in order: validation first, then test.
for features, targets, split_title in (
    (X_val, y_val, "Validation"),
    (X_test, y_test, "Test"),
):
    eval_and_plot(features, targets, split_title)

#  7. Feature-Ablation Sanity Check
# Rank features by the absolute value of their linear-SVM weight, largest first,
# and pick the DROP_TOP_N strongest ones for removal.
abs_weights = np.abs(svc.coef_.toarray())
coefs = abs_weights.ravel()
descending = np.argsort(coefs)[::-1]
top_idx = descending[:DROP_TOP_N]

# Boolean column selector: True keeps a feature, False drops it.
mask = np.ones_like(coefs, dtype=bool)
mask[top_idx] = False

# Log top features for interpretability
feature_names = tfidf.get_feature_names_out()
top_features = feature_names[top_idx]
print(f"Top {DROP_TOP_N} features dropped: {top_features[:10]}")

# Retrain an identically configured SVM on the remaining columns and score it
# on the validation split with the same columns removed.
svc_ablate = SVC(kernel="linear", C=1.0, random_state=SEED, max_iter=1000)
svc_ablate.fit(X_train[:, mask], y_train)
val_preds_ablate = svc_ablate.predict(X_val[:, mask])
ablate_acc = accuracy_score(y_val, val_preds_ablate)
print(f"\nAfter dropping top {DROP_TOP_N} features, Val Acc: {ablate_acc:.4f}")

#  8. Save final model
# Make sure the destination directory exists before writing the artifact.
output_dir = os.path.dirname(MODEL_OUT)
os.makedirs(output_dir, exist_ok=True)

# The fitted vectorizer and classifier are stored together in one file so they
# can be loaded as a pair.
artifacts = {"tfidf": tfidf, "model": svc}
try:
    joblib.dump(artifacts, MODEL_OUT)
    print(f"Saved TF-IDF + SVM (with CM plots) to {MODEL_OUT}")
except Exception as save_err:
    # Best-effort save: report the failure instead of aborting the run.
    print(f"Error saving model: {save_err}")
