# 📘 Task 2: Fraud Model Training & Evaluation
Project: Adey Innovations - E-Commerce Fraud Detection

In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    f1_score,
    confusion_matrix,
    precision_recall_curve,
    auc
)
from imblearn.over_sampling import SMOTE

# Notebook-wide plot styling, plus the folder that saved figures are written to
sns.set(style="whitegrid")
os.makedirs(os.path.join("..", "outputs", "plots"), exist_ok=True)


In [2]:
# Load cleaned dataset
# The path is relative, so it resolves against the kernel's working directory.
# Fail fast with the resolved location so a missing file is easy to diagnose.
FRAUD_DATA_PATH = "../data/fraud_data_cleaned.csv"
if not os.path.isfile(FRAUD_DATA_PATH):
    raise FileNotFoundError(
        f"Cleaned fraud dataset not found at {os.path.abspath(FRAUD_DATA_PATH)!r} "
        f"(working directory: {os.getcwd()!r}). Create the file or launch the "
        "notebook from the directory this relative path is written for."
    )

fraud_df = pd.read_csv(FRAUD_DATA_PATH)
fraud_df.head()


FileNotFoundError: [Errno 2] No such file or directory: '../data/fraud_data_cleaned.csv'

In [None]:
# Remove identifier/timestamp columns, then separate the target from the features
dropped_cols = ['user_id', 'device_id', 'ip_address', 'signup_time', 'purchase_time']
fraud_df_proc = fraud_df.drop(columns=dropped_cols)

target_col = 'class'
y_fraud = fraud_df_proc[target_col]
X_fraud = fraud_df_proc.drop(columns=[target_col])

# Categorical columns get one-hot encoded; every other feature column is
# treated as numeric and standardised below.
cat_cols = ['source', 'browser', 'sex', 'country']
num_cols = list(filter(lambda name: name not in cat_cols, X_fraud.columns))

# drop_first=True drops one dummy level per categorical column
X_fraud_encoded = pd.get_dummies(X_fraud, columns=cat_cols, drop_first=True)

# Standardise the numeric columns in place.
# NOTE(review): the scaler is fit on all rows here, i.e. before the
# train/test split performed in a later cell.
scaler_fraud = StandardScaler().fit(X_fraud_encoded[num_cols])
X_fraud_encoded[num_cols] = scaler_fraud.transform(X_fraud_encoded[num_cols])

print("✅ Preprocessing complete.")
print("Final shape:", X_fraud_encoded.shape)


In [None]:
# Split, standardise on the training fold only, impute, then apply SMOTE
Xf_train, Xf_test, yf_train, yf_test = train_test_split(
    X_fraud_encoded, y_fraud, test_size=0.2, stratify=y_fraud, random_state=42
)

# Work on copies so the column assignments below never write back into
# X_fraud_encoded (and do not trigger SettingWithCopyWarning).
Xf_train, Xf_test = Xf_train.copy(), Xf_test.copy()

# The numeric columns were scaled upstream with statistics computed over ALL
# rows, which lets test-set information leak into training. Standardisation is
# an affine map, so re-fitting on the training fold alone and applying that to
# both folds is equivalent to train-only scaling of the raw values.
train_scaler = StandardScaler().fit(Xf_train[num_cols])
Xf_train[num_cols] = train_scaler.transform(Xf_train[num_cols])
Xf_test[num_cols] = train_scaler.transform(Xf_test[num_cols])

print("Before SMOTE:", np.bincount(yf_train))

# SMOTE cannot handle NaN, and estimators such as LogisticRegression reject
# NaN at predict time, so both folds get the same imputation. After the
# train-only standardisation, 0 is the training mean of each numeric column.
Xf_train = Xf_train.fillna(0)
Xf_test = Xf_test.fillna(0)

# Oversample the training fold only; the test fold keeps its original class balance.
smote = SMOTE(random_state=42)
Xf_train_res, yf_train_res = smote.fit_resample(Xf_train, yf_train)

print("After SMOTE:", np.bincount(yf_train_res))


In [None]:
# Evaluation function
def evaluate_model(model, X_test, y_test, model_name="Model", dataset_name="Dataset",
                   output_dir="../outputs/plots"):
    """Print test-set metrics for a fitted binary classifier and save its PR curve.

    Prints the classification report, binary F1 score, confusion matrix and the
    area under the precision-recall curve, then writes the curve to a
    timestamped PNG in ``output_dir``.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict`` and either ``predict_proba`` or
        ``decision_function``.
    X_test, y_test : array-like
        Held-out features and the matching true binary labels.
    model_name, dataset_name : str
        Labels used in the printed header, the plot title and the filename.
    output_dir : str
        Folder the PNG is written to; it is created if it does not exist.

    Returns
    -------
    None
    """
    y_pred = model.predict(X_test)
    # precision_recall_curve accepts probabilities or raw decision scores, so
    # estimators without predict_proba can still be evaluated.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    else:
        y_score = model.decision_function(X_test)

    print(f"\n🔍 {model_name} on {dataset_name}")
    print(classification_report(y_test, y_pred, digits=4))
    print("F1 Score:", f1_score(y_test, y_pred, average='binary'))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    precision, recall, _ = precision_recall_curve(y_test, y_score)
    pr_auc = auc(recall, precision)
    print("PR AUC:", pr_auc)

    # Timestamped name so repeated runs never overwrite earlier plots.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{dataset_name.lower().replace(' ', '_')}_{model_name.lower().replace(' ', '_')}_pr_curve_{timestamp}.png"
    os.makedirs(output_dir, exist_ok=True)
    filepath = os.path.join(output_dir, filename)

    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        ax.plot(recall, precision, label=f'PR AUC = {pr_auc:.4f}')
        ax.set_xlabel('Recall')
        ax.set_ylabel('Precision')
        ax.set_title(f'Precision-Recall Curve: {model_name} ({dataset_name})')
        ax.legend()
        fig.tight_layout()
        fig.savefig(filepath)
    finally:
        # Always release the figure, even if drawing or saving fails.
        plt.close(fig)
    print(f"✅ Saved to {filepath}")


In [None]:
# Fit each candidate on the SMOTE-resampled training data, then score it on the test split
log_fraud = LogisticRegression(max_iter=1000, random_state=42)
rf_fraud = RandomForestClassifier(n_estimators=100, random_state=42)

candidates = (
    ("Logistic Regression", log_fraud),
    ("Random Forest", rf_fraud),
)
for label, clf in candidates:
    clf.fit(Xf_train_res, yf_train_res)
    evaluate_model(clf, Xf_test, yf_test, label, "E-commerce")
