# **Preprocess AI vs Genuine Reviews**

In [None]:
import os
import re
import numpy as np
import pandas as pd
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

In [None]:
# 1. Configuration
# Paths, RNG seed and split fractions used by every later cell.
INPUT_CSV  = "/kaggle/input/verified-ai-genuine-reviews/filtered_reviews.csv"
OUTPUT_DIR = "/kaggle/working/splits/"
SEED       = 42
TEST_FRAC  = 0.10   # share of the full dataset held out for testing
VAL_FRAC   = 0.10   # share of the full dataset held out for validation

np.random.seed(SEED)

# word_tokenize needs the Punkt sentence models. NLTK >= 3.8.2 looks them up
# under "punkt_tab", older releases under "punkt"; fetch both so the notebook
# runs on either version.
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)

# A set gives O(1) membership tests during token filtering.
STOP = set(stopwords.words("english"))

In [None]:
# 2. Text preprocessing
def preprocess_text(text: str) -> str:
    """Normalise a raw review into a space-joined string of content tokens.

    The text is lower-cased, URLs and HTML-like tags are removed, punctuation
    is replaced by spaces, whitespace is collapsed, and the result is
    tokenised. English stop words and single-character tokens are discarded.

    Args:
        text: Raw review text. Non-string values are coerced with ``str``;
            missing values (``None`` or float NaN) are treated as empty.

    Returns:
        The cleaned text, or the placeholder ``"no_content"`` when no token
        survives the filtering (or the input was missing).
    """
    # Without this guard str() would turn a missing cell into the literal
    # token "none" / "nan", which would then be kept as real review content.
    # ``text != text`` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return "no_content"

    t = str(text).lower()
    t = re.sub(r"https?://\S+", "", t)   # strip URLs
    t = re.sub(r"<.*?>", "", t)          # strip HTML-like tags (non-greedy)
    t = re.sub(r"[^\w\s]", " ", t)       # punctuation -> space (underscores are kept by \w)
    t = re.sub(r"\s+", " ", t).strip()   # collapse runs of whitespace
    toks = [w for w in word_tokenize(t) if w not in STOP and len(w) >= 2]
    return " ".join(toks) if toks else "no_content"

def bounds(s: pd.Series):
    """Return the (lower, upper) Tukey fences of a numeric series.

    The fences lie 1.5 interquartile ranges below the first quartile and
    above the third quartile.
    """
    quartiles = s.quantile([0.25, 0.75])
    lower_q = quartiles.iloc[0]
    upper_q = quartiles.iloc[1]
    spread = upper_q - lower_q
    return lower_q - 1.5 * spread, upper_q + 1.5 * spread

## 3. Load, Label & Drop Raw Duplicates

In [None]:
# Read the raw reviews and derive the binary target: 0 = genuine, 1 = AI.
df = pd.read_csv(INPUT_CSV)
df["label"] = df["source"].map({"genuine": 0, "ai": 1})
print(f"Loaded {len(df)} rows")

# Any `source` value outside the mapping becomes NaN. Such rows would escape
# the per-class outlier filter and reach the stratified split, so drop them
# explicitly and report how many were lost.
unlabeled = df["label"].isna()
if unlabeled.any():
    print(f"Dropped {int(unlabeled.sum())} rows with unrecognised source")
    df = df.loc[~unlabeled].copy()
# NaN forces the mapped column to float; restore an integer label.
df["label"] = df["label"].astype(int)

# Exact duplicates of the raw review text: keep the first occurrence.
before = len(df)
df.drop_duplicates(subset="review", inplace=True)
print(f"Dropped {before - len(df)} raw duplicates → {len(df)} remain")

## 4. Clean Text & Compute Lengths

In [None]:
# Clean every review once, then reuse the result for the token count.
cleaned = df["review"].apply(preprocess_text)
df["clean_review"] = cleaned
# Length is the number of whitespace-separated tokens in the cleaned text.
df["review_length"] = cleaned.str.split().str.len()

## 5. Remove Length Outliers per Class

In [None]:
for lbl in (0, 1):
    # Fences are computed from the rows of this class only.
    in_class = df["label"] == lbl
    low, high = bounds(df.loc[in_class, "review_length"])

    # A row is removed only if it belongs to this class AND falls outside
    # the class-specific fences; rows of the other class are untouched.
    out_of_range = (df["review_length"] < low) | (df["review_length"] > high)
    before = len(df)
    df = df[~(in_class & out_of_range)]
    print(f"Dropped {before - len(df)} length outliers for label={lbl}")

In [None]:
# 6. Drop cleaned-text duplicates
# Reviews that differ in the raw text can collapse to the same cleaned string;
# keep only the first occurrence of each.
before = len(df)
is_repeat = df.duplicated(subset="clean_review")
df = df.loc[~is_repeat]
print(f"Dropped {before - len(df)} cleaned-text duplicates → {len(df)} remain")

## 7. Stratified 80/10/10 Split

In [None]:
# Carve off the test set first, then split the remainder. The second split's
# fraction is rescaled so validation ends up as VAL_FRAC of the full dataset.
val_share = VAL_FRAC / (1 - TEST_FRAC)

tv, test_df = train_test_split(
    df,
    test_size=TEST_FRAC,
    stratify=df["label"],
    random_state=SEED,
)
train_df, val_df = train_test_split(
    tv,
    test_size=val_share,
    stratify=tv["label"],
    random_state=SEED,
)
print(f"Train/Val/Test sizes → {len(train_df)}/{len(val_df)}/{len(test_df)}")

In [None]:
# 8. Save splits
# Each split is written as <name>.csv containing only the model inputs.
os.makedirs(OUTPUT_DIR, exist_ok=True)
splits = {"train": train_df, "val": val_df, "test": test_df}
for name, subset in splits.items():
    path = os.path.join(OUTPUT_DIR, f"{name}.csv")
    subset[["clean_review", "label"]].to_csv(path, index=False)
    print(f"path :- {path}")