# **Import Necessary Libraries**

In [None]:
import pandas as pd
import joblib
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import os

# **Check GPU Availability**

In [None]:
import subprocess

def is_gpu_available():
    """Report whether an NVIDIA GPU appears usable on this machine.

    The check runs ``nvidia-smi`` and treats exit status 0 as "GPU present".

    Returns:
        bool: True if ``nvidia-smi`` ran and exited with status 0; False if
        the command is missing, cannot be executed, times out, or exits with
        a non-zero status.
    """
    try:
        # The output is never inspected, so discard it instead of buffering it
        # in pipes. The timeout keeps a wedged driver from hanging the cell.
        result = subprocess.run(
            ["nvidia-smi"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=30,
            check=False,
        )
    except (OSError, subprocess.SubprocessError):
        # OSError: binary not found / not executable.
        # SubprocessError: includes TimeoutExpired.
        return False
    return result.returncode == 0


# Probe once; the training cell reads this flag to decide on the device.
gpu_available = is_gpu_available()
print("GPU Available:", gpu_available)

# **Load Dataset**

In [None]:
# The default below is a local Windows path. To read the CSV from somewhere
# else (for example a dataset mounted under /kaggle/input), set the
# REVIEWS_CSV_PATH environment variable instead of editing this cell.
file_path = os.environ.get(
    "REVIEWS_CSV_PATH",
    "C:/Users/indur/OneDrive - University of Westminster/GitHub/FYP_Project/Models/Ai_Genuine_Reviews/FinalDataSet/filtered_reviews.csv",
)
df = pd.read_csv(file_path)
print(f"Loaded dataset with {len(df)} reviews!")

# **Clean Data**

In [None]:
# Normalise the header row first: the de-duplication below looks the column
# up by its exact name, "Review".
df.columns = df.columns.str.strip()

# Keep the first occurrence of every review text, then renumber the rows.
df = df.drop_duplicates(subset=["Review"], keep="first")
df = df.reset_index(drop=True)
print(f"Dataset size after removing duplicates: {len(df)}")

# **Convert Labels and Balance Dataset**

In [None]:
# Recode the target column: -1 (fake) -> 0, 1 (genuine) -> 1.
# Values that are not keys of the mapping become NaN.
df["Label"] = df["Label"].map({-1: 0, 1: 1})

# Undersample: draw the same number of rows from each class, where that number
# is the size of the smaller class.
min_class_count = df["Label"].value_counts().min()
df_fake = df.loc[df["Label"].eq(0)].sample(n=min_class_count, random_state=42)
df_genuine = df.loc[df["Label"].eq(1)].sample(n=min_class_count, random_state=42)

# Stack both classes, shuffle all rows with a fixed seed, renumber the index.
df_balanced = pd.concat([df_fake, df_genuine])
df_balanced = df_balanced.sample(frac=1, random_state=42)
df_balanced = df_balanced.reset_index(drop=True)
print(f"Balanced dataset size: {len(df_balanced)}")

# **Feature Engineering**

In [None]:
# Keyword lists for the two binary flags. Matching is a plain substring test
# on the lower-cased review text.
_PROMO_WORDS = ["best", "amazing", "awesome", "perfect", "incredible", "must-buy"]
_SARCASM_WORDS = ["lmao", "lol", "smh", "yeah right"]


def _polarity(review):
    """Return TextBlob's sentiment polarity for the review text."""
    return TextBlob(str(review)).sentiment.polarity


def _mentions_any(review, keywords):
    """Return 1 if any keyword occurs in the lower-cased review, else 0."""
    lowered = str(review).lower()
    for keyword in keywords:
        if keyword in lowered:
            return 1
    return 0


reviews = df_balanced["Review"]

# Sentiment polarity of each review.
df_balanced["sentiment_score"] = reviews.apply(_polarity)

# 1 when the review contains at least one promotional keyword.
df_balanced["contains_promo_words"] = reviews.apply(
    lambda text: _mentions_any(text, _PROMO_WORDS)
)

# 1 when the review contains at least one sarcasm keyword.
df_balanced["contains_sarcasm_words"] = reviews.apply(
    lambda text: _mentions_any(text, _SARCASM_WORDS)
)

# Parse the dates (unparseable values become NaT) and express each one as the
# number of whole days between the review and the moment this cell runs.
# NOTE(review): the reference point is "today", so this feature's raw values
# change with the run date.
df_balanced["Date"] = pd.to_datetime(df_balanced["Date"], errors="coerce")
review_age = pd.Timestamp.today() - df_balanced["Date"]
df_balanced["days_since_review"] = review_age.dt.days

# **Prepare Features and Labels**

In [None]:
# Target vector.
y = df_balanced["Label"]

# Feature matrix: everything except identifiers, raw text/date, the rating and
# the target itself. errors="ignore" tolerates columns that are not present.
non_feature_columns = ["User_id", "Product_id", "Date", "Review", "Rating", "Label"]
X = df_balanced.drop(columns=non_feature_columns, errors="ignore")

# **Normalize Numeric Features**

In [None]:
# Standardise the two continuous features.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# performed in a later cell, so test rows contribute to its mean/variance.
numeric_features = ["sentiment_score", "days_since_review"]
scaler = StandardScaler()
X[numeric_features] = scaler.fit_transform(X[numeric_features])

# Save the scaler for future use. The target directory is created first
# because joblib.dump does not create missing parent directories.
scaler_path = "/kaggle/working/scaler.pkl"
os.makedirs(os.path.dirname(scaler_path), exist_ok=True)
joblib.dump(scaler, scaler_path)
print(f"Scaler saved at {scaler_path}!")

# **Train-Test Split**

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# **Convert to XGBoost DMatrix**

In [None]:
# Wrap each split, together with its labels, in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

# **Train XGBoost Model with Early Stopping**

In [None]:
# Define common parameters for XGBoost.
params = {
    "objective": "binary:logistic",
    "eval_metric": ["logloss", "auc"],
    "learning_rate": 0.01,
    "max_depth": 3,
    "colsample_bytree": 0.5,
    "subsample": 0.5,
    "gamma": 10.0,
    "reg_lambda": 20,
    "tree_method": "hist",  # Use histogram-based method
}

# Conditionally add GPU support if available.
if gpu_available:
    print("Using GPU for training.")
    params["device"] = "cuda"
else:
    print("GPU not available, using CPU.")

# Early stopping needs a monitoring set. Monitoring the test set would let the
# test data choose the number of boosting rounds and make the test metrics
# reported afterwards optimistic, so a validation split is carved out of the
# training data and the test set stays untouched until evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)
dfit = xgb.DMatrix(X_fit, label=y_fit)
dval = xgb.DMatrix(X_val, label=y_val)

# Early stopping watches the last entry of `evals` and the last metric in
# `eval_metric` (here: AUC on the validation split).
evals = [(dfit, "train"), (dval, "eval")]

# Train the model with early stopping.
xgb_model = xgb.train(
    params,
    dfit,
    num_boost_round=500,
    evals=evals,
    early_stopping_rounds=20,
    verbose_eval=10,
)


# **Evaluate Model**

In [None]:
# Predict with the trees up to (and including) the best early-stopping round.
# `best_ntree_limit` / `ntree_limit` no longer exist in XGBoost 2.x, and the
# native Booster.predict uses every tree unless `iteration_range` is given.
# `best_iteration` is only defined when early stopping was used, hence the
# getattr fallback to a plain full-model prediction.
best_iteration = getattr(xgb_model, "best_iteration", None)
if best_iteration is not None:
    y_pred_proba = xgb_model.predict(dtest, iteration_range=(0, best_iteration + 1))
else:
    y_pred_proba = xgb_model.predict(dtest)

# Threshold the predicted probabilities at 0.5 to obtain class labels.
y_pred = (y_pred_proba > 0.5).astype(int)

# Accuracy and the classification report use the hard labels; ROC-AUC uses the
# raw probabilities.
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"XGBoost Accuracy: {accuracy * 100:.2f}%")
print(f"XGBoost ROC-AUC: {roc_auc:.4f}\n")
print("Classification Report:\n", classification_report(y_test, y_pred))

# **Save Model**

In [None]:
# Persist the trained booster with joblib. The target directory is created
# first because joblib.dump does not create missing parent directories.
# NOTE(review): this pickles the Booster object; loading it later presumably
# needs a compatible xgboost version - confirm, or additionally export with
# Booster.save_model.
model_path = "/kaggle/working/xgboost_fakereview_model.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(xgb_model, model_path)
print(f"XGBoost Model saved at {model_path}")