In [None]:
%pip install nltk transformers scikit-learn

In [None]:
import os
import re
import numpy as np
import pandas as pd
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

#  Configuration 
# NOTE(review): BASE is an absolute, machine-specific path; adjust it before
# running this notebook on another machine.
BASE = "C:/Users/indur/OneDrive - University of Westminster/GitHub/FYP_Project/Models"
INPUT_CSV   = f"{BASE}/Ai_Genuine_Reviews/FinalDataSet/filtered_reviews.csv"
OUT_SPLITS  = f"{BASE}/ModelTesting/Ai_Genuine_ReviewsTest/DataPreparation/DataSet"
SEED        = 42       # random_state shared by every sampling / splitting step
N_TOTAL     = 40000    # target size of the balanced subsample (half per class)
LEAK_TOKENS = ["ai", "quillbot", "genuine"]  # words stripped from the raw review text

np.random.seed(SEED)

# word_tokenize relies on the Punkt sentence models. NLTK >= 3.8.2 loads them
# from the "punkt_tab" resource while older releases use "punkt"; requesting
# both keeps the notebook runnable on either version.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")
STOP = set(stopwords.words("english"))

#  1. Load & label 
df = pd.read_csv(INPUT_CSV)
df["label"] = df["source"].map({"genuine": 0, "ai": 1})

# preserve original index for traceability
df = df.reset_index().rename(columns={"index": "orig_idx"})

# A source other than "genuine"/"ai" maps to NaN. Left in place, those rows
# turn the label column into floats and can win the de-duplication below
# against a properly labelled copy of the same review, so drop them up front.
unlabelled = df["label"].isna()
if unlabelled.any():
    print(f"Dropped {int(unlabelled.sum())} rows with unrecognised source")
    df = df.loc[~unlabelled].copy()
df["label"] = df["label"].astype(int)

#  2. Strip leakage tokens & drop raw duplicates 
# Tokens are escaped so that an entry containing regex metacharacters is
# matched literally; the group is non-capturing because only removal is needed.
pat = r"\b(?:" + "|".join(re.escape(tok) for tok in LEAK_TOKENS) + r")\b"
df["review"] = df["review"].str.replace(pat, "", case=False, regex=True)
before = len(df)
df = df.drop_duplicates(subset=["review"])
print(f"Dropped {before - len(df)} raw duplicates → {len(df)} remain")

#  3. Define cleaning function 
def preprocess(text: str) -> str:
    """Normalise one review into a space-joined string of content tokens.

    Steps: lowercase, remove http(s) URLs, remove ``<...>`` tags, replace
    punctuation with spaces, collapse whitespace, tokenize, then drop
    stopwords (``STOP``) and tokens shorter than two characters.

    Args:
        text: Raw review text. ``None`` and float NaN are treated as empty.

    Returns:
        The cleaned text, or the placeholder ``"no_content"`` when the input
        is missing or nothing survives the filtering.
    """
    # Missing values would otherwise be stringified into the literal tokens
    # "none" / "nan" and pass through as if they were review content.
    # (`text != text` is true only for NaN.)
    if text is None or (isinstance(text, float) and text != text):
        return "no_content"

    t = str(text).lower()
    t = re.sub(r"https?://\S+", "", t)          # URLs
    t = re.sub(r"<.*?>", "", t)                 # HTML-style tags
    t = re.sub(r"[^\w\s]", " ", t)              # punctuation -> space
    t = re.sub(r"\s+", " ", t).strip()          # collapse runs of whitespace
    toks = [w for w in word_tokenize(t) if w not in STOP and len(w) >= 2]
    return " ".join(toks) if toks else "no_content"

#  4. Clean & compute lengths 
# Run every raw review through preprocess(), then record how many
# whitespace-separated tokens the cleaned text contains.
df["clean_review"]  = df["review"].map(preprocess)
df["review_length"] = df["clean_review"].map(lambda cleaned: len(cleaned.split()))

#  5. Remove per-class length outliers 
def bounds(s: pd.Series):
    """Return the ``(lower, upper)`` outlier fences of *s*.

    The fences sit 1.5 interquartile ranges below the first quartile and
    above the third quartile.
    """
    first_quartile = s.quantile(0.25)
    third_quartile = s.quantile(0.75)
    spread = third_quartile - first_quartile
    return first_quartile - 1.5 * spread, third_quartile + 1.5 * spread

# Fences are computed per class and applied only to that class's rows, so
# one label's length distribution never prunes the other.
for cls in (0, 1):
    in_class = df["label"] == cls
    low, high = bounds(df.loc[in_class, "review_length"])
    lengths = df["review_length"]
    is_outlier = in_class & ((lengths < low) | (lengths > high))
    before = len(df)
    df = df.loc[~is_outlier]
    print(f"Dropped {before - len(df)} length outliers for label={cls}")

#  6. Drop cleaned-text duplicates 
# Reviews that differ only in casing, punctuation or stopwords collapse to
# the same cleaned text; keep the first occurrence of each.
rows_before = len(df)
repeated = df.duplicated(subset=["clean_review"])
df = df.loc[~repeated]
print(f"Dropped {rows_before - len(df)} cleaned-text duplicates -> {len(df)} remain")

#  7. Balance & subsample 
genuine_pool = df[df.label == 0]
ai_pool      = df[df.label == 1]

# Sampling more rows than a class holds raises ValueError, so cap the
# per-class size at the smaller pool; the classes stay balanced either way.
requested = N_TOTAL // 2
half = min(requested, len(genuine_pool), len(ai_pool))
if half < requested:
    print(f"Warning: only {half} rows available per class (requested {requested})")

genuine = genuine_pool.sample(n=half, random_state=SEED)
ai      = ai_pool.sample(n=half, random_state=SEED)
# frac=1 shuffles the concatenated frame so the classes are interleaved.
small   = pd.concat([genuine, ai]).sample(frac=1, random_state=SEED).reset_index(drop=True)
# Report the actual counts instead of a hard-coded figure tied to one N_TOTAL.
print(f"Balanced dataset size: {len(small)} ({len(genuine)} genuine / {len(ai)} AI)")

#  8. Stratified 80/10/10 split 
# First carve off 10% as the test set, stratified on the label.
tv, test_df = train_test_split(
    small, test_size=0.10, stratify=small["label"], random_state=SEED
)
# The validation set should match the test set in size. Passing the row
# count (an int test_size is an absolute number of samples) achieves that for
# any dataset size, where the former 0.1111 fraction only approximated 1/9.
train_df, val_df = train_test_split(
    tv, test_size=len(test_df), stratify=tv["label"], random_state=SEED
)
print(f"Train/Val/Test sizes: {len(train_df)}/{len(val_df)}/{len(test_df)}")

# verify zero overlap
# Map each split name to its set of cleaned texts explicitly rather than
# resolving variables through locals() with a constructed name.
split_texts = {
    "Train": set(train_df["clean_review"]),
    "Val":   set(val_df["clean_review"]),
    "Test":  set(test_df["clean_review"]),
}
for first, second in (("Train", "Val"), ("Train", "Test"), ("Val", "Test")):
    shared = split_texts[first] & split_texts[second]
    print(f"{first}-{second} overlap:", len(shared))

#  9. Save splits 
os.makedirs(OUT_SPLITS, exist_ok=True)

# Only the traceability index, the cleaned text and the label are exported.
export_columns = ["orig_idx", "clean_review", "label"]
named_splits = {"train": train_df, "val": val_df, "test": test_df}
for split_name, frame in named_splits.items():
    destination = os.path.join(OUT_SPLITS, f"{split_name}.csv")
    frame.to_csv(destination, index=False, columns=export_columns)

print("Preprocessing complete; splits saved to", OUT_SPLITS)