In [3]:
# Step 1: Imports and Load Cleaned Data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For model training and evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, f1_score, confusion_matrix, precision_recall_curve, auc
from sklearn.preprocessing import OneHotEncoder
# For handling imbalance
from imblearn.over_sampling import SMOTE

# Visualization settings
%matplotlib inline
sns.set(style='whitegrid')

# Load cleaned datasets
fraud_df = pd.read_csv("../data/fraud_data_cleaned.csv")
credit_df = pd.read_csv("../data/creditcard_cleaned.csv")

# Show basic info
print("E-Commerce Dataset:")
print(fraud_df.head())
print("\nCredit Card Dataset:")
print(credit_df.head())

# Check class distribution
print("\nClass distribution (E-commerce):")
print(fraud_df['class'].value_counts(normalize=True))

print("\nClass distribution (Credit Card):")
print(credit_df['Class'].value_counts(normalize=True))

# Drop non-useful or ID-like columns
fraud_df_proc = fraud_df.drop(columns=['user_id', 'device_id', 'ip_address', 'signup_time', 'purchase_time'])

# Separate target
y_fraud = fraud_df_proc['class']
X_fraud = fraud_df_proc.drop(columns=['class'])

# Identify categorical columns
cat_cols = ['source', 'browser', 'sex', 'country']
num_cols = [col for col in X_fraud.columns if col not in cat_cols]

# One-hot encode categorical features
X_fraud_encoded = pd.get_dummies(X_fraud, columns=cat_cols, drop_first=True)

# Scale numeric features
scaler_fraud = StandardScaler()
X_fraud_encoded[num_cols] = scaler_fraud.fit_transform(X_fraud_encoded[num_cols])

print("✅ E-commerce data preprocessing complete.")
print("Final shape:", X_fraud_encoded.shape)

# -----------------------
# Credit Card Preprocessing
# -----------------------
# Drop Time (optional) and isolate target
credit_df_proc = credit_df.drop(columns=['Time'])

y_credit = credit_df_proc['Class']
X_credit = credit_df_proc.drop(columns=['Class'])

# Scale 'Amount'
scaler_credit = StandardScaler()
X_credit['Amount'] = scaler_credit.fit_transform(X_credit[['Amount']])

print("✅ Credit card data preprocessing complete.")
print("Final shape:", X_credit.shape)


E-Commerce Dataset:
   user_id          signup_time        purchase_time  purchase_value  \
0    22058  2015-02-24 22:55:49  2015-04-18 02:47:11              34   
1   333320  2015-06-07 20:39:50  2015-06-08 01:38:54              16   
2     1359  2015-01-01 18:52:44  2015-01-01 18:52:45              15   
3   150084  2015-04-28 21:13:25  2015-05-04 13:54:50              44   
4   221365  2015-07-21 07:09:52  2015-09-09 18:40:53              39   

       device_id source browser sex  age    ip_address  class  \
0  QVPSPJUOCKZAR    SEO  Chrome   M   39  7.327584e+08      0   
1  EOGFQPIZPYXFZ    Ads  Chrome   F   53  3.503114e+08      0   
2  YSSKYOSJHPPLJ    SEO   Opera   M   53  2.621474e+09      1   
3  ATGTXKYKUDUQN    SEO  Safari   M   41  3.840542e+09      0   
4  NAUITBZFJKHWW    Ads  Safari   M   45  4.155831e+08      0   

   time_since_signup  hour_of_day  day_of_week  transaction_count  ip_int  \
0          4506682.0            2            5                  1     NaN   
1 

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [4]:
# Step 3: Train-Test Split + SMOTE

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import numpy as np

# -----------------------------
# E-commerce dataset
# -----------------------------
Xf_train, Xf_test, yf_train, yf_test = train_test_split(
    X_fraud_encoded, y_fraud, test_size=0.2, stratify=y_fraud, random_state=42
)

print("Before SMOTE (E-commerce):", np.bincount(yf_train))

# Fill NaNs before SMOTE
Xf_train = Xf_train.fillna(0)

# Apply SMOTE
smote = SMOTE(random_state=42)
Xf_train_res, yf_train_res = smote.fit_resample(Xf_train, yf_train)

print("After SMOTE (E-commerce):", np.bincount(yf_train_res))


# -----------------------------
# Credit card dataset
# -----------------------------
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    X_credit, y_credit, test_size=0.2, stratify=y_credit, random_state=42
)

print("\nBefore SMOTE (Credit Card):", np.bincount(yc_train))

# Fill NaNs before SMOTE
Xc_train = Xc_train.fillna(0)

# Apply SMOTE
Xc_train_res, yc_train_res = smote.fit_resample(Xc_train, yc_train)

print("After SMOTE (Credit Card):", np.bincount(yc_train_res))


Before SMOTE (E-commerce): [109568  11321]
After SMOTE (E-commerce): [109568 109568]

Before SMOTE (Credit Card): [226602    378]
After SMOTE (Credit Card): [226602 226602]


In [7]:

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, confusion_matrix, precision_recall_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Set plot style
sns.set(style='whitegrid')
os.makedirs("../outputs/plots", exist_ok=True)

def evaluate_model(model, X_test, y_test, model_name="Model", dataset_name="Dataset"):
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    print(f"\n🔍 {model_name} on {dataset_name}")
    print(classification_report(y_test, y_pred, digits=4))
    print("F1 Score:", f1_score(y_test, y_pred, average='binary'))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    # PR AUC
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    pr_auc = auc(recall, precision)
    print("PR AUC:", pr_auc)

    # Plot and save PR Curve
    plt.figure(figsize=(6,4))
    plt.plot(recall, precision, label=f'PR AUC = {pr_auc:.4f}')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Precision-Recall Curve: {model_name} ({dataset_name})')
    plt.legend()
    plt.tight_layout()
    fig_path = f"../outputs/plots/{dataset_name.lower().replace(' ', '_')}_{model_name.lower().replace(' ', '_')}_pr_curve.png"
    plt.savefig(fig_path)
    plt.show()


In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load and prepare dataset
fraud_df = pd.read_csv("../data/fraud_data_cleaned.csv")
fraud_df_proc = fraud_df.drop(columns=['user_id', 'device_id', 'ip_address', 'signup_time', 'purchase_time'])
y_fraud = fraud_df_proc['class']
X_fraud = fraud_df_proc.drop(columns=['class'])

# Define columns
cat_cols = ['source', 'browser', 'sex', 'country']
num_cols = [col for col in X_fraud.columns if col not in cat_cols]

# Handle NaNs and Infs
X_fraud[num_cols] = X_fraud[num_cols].replace([np.inf, -np.inf], np.nan)
X_fraud[num_cols] = X_fraud[num_cols].fillna(X_fraud[num_cols].median())

# Encode categoricals
X_fraud_encoded = pd.get_dummies(X_fraud, columns=cat_cols, drop_first=True)

# Scale numeric
scaler = StandardScaler()
X_fraud_encoded[num_cols] = scaler.fit_transform(X_fraud_encoded[num_cols])


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [9]:
# 3. Split Data and Apply SMOTE
Xf_train, Xf_test, yf_train, yf_test = train_test_split(
    X_fraud_encoded, y_fraud, test_size=0.2, stratify=y_fraud, random_state=42
)

print("Before SMOTE (E-commerce):", np.bincount(yf_train))

# Fill NaNs before SMOTE
Xf_train = Xf_train.fillna(0)

# Apply SMOTE
smote = SMOTE(random_state=42)
Xf_train_res, yf_train_res = smote.fit_resample(Xf_train, yf_train)

print("After SMOTE (E-commerce):", np.bincount(yf_train_res))

Before SMOTE (E-commerce): [109568  11321]
After SMOTE (E-commerce): [109568 109568]


In [10]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load data
fraud_df = pd.read_csv("../data/fraud_data_cleaned.csv")
fraud_df_proc = fraud_df.drop(columns=['user_id', 'device_id', 'ip_address', 'signup_time', 'purchase_time'])

y_fraud = fraud_df_proc['class']
X_fraud = fraud_df_proc.drop(columns=['class'])

# Encode and scale
X_fraud = pd.get_dummies(X_fraud, columns=['source', 'browser', 'sex', 'country'], drop_first=True)
scaler = StandardScaler()
num_cols = ['purchase_value', 'age', 'time_since_signup', 'hour_of_day', 'day_of_week', 'transaction_count', 'ip_int']
X_fraud[num_cols] = scaler.fit_transform(X_fraud[num_cols])

# Split
X_train, X_test, y_train, y_test = train_test_split(X_fraud.fillna(0), y_fraud, test_size=0.2, stratify=y_fraud)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# View sample
print(X_train_res.head())
print(y_train_res.value_counts(normalize=True))


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


   purchase_value       age  time_since_signup  hour_of_day  day_of_week  \
0        2.132034 -0.480488           0.936929    -1.232789     1.489476   
1        0.167258 -0.828608           1.607740     1.226541     0.492565   
2       -0.705975 -1.060689          -0.099615    -1.522122    -0.005891   
3        3.005268  0.099713           1.491283    -0.075457     1.489476   
4       -0.596821 -0.016327          -0.485322     0.647875    -0.504347   

   transaction_count  ip_int  source_Direct  source_SEO  browser_FireFox  \
0                0.0     0.0           True       False            False   
1                0.0     0.0          False        True            False   
2                0.0     0.0          False       False            False   
3                0.0     0.0          False        True            False   
4                0.0     0.0          False        True            False   

   browser_IE  browser_Opera  browser_Safari  sex_M  
0        True          False    

In [1]:

# Fraud Classification Evaluation - Self-Contained Notebook

# 1. Imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    f1_score,
    confusion_matrix,
    precision_recall_curve,
    auc,
)
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
from datetime import datetime

# 2. Setup
sns.set(style='whitegrid')
os.makedirs("../outputs/plots", exist_ok=True)

# 3. Load Preprocessed E-commerce Dataset
fraud_df = pd.read_csv("../data/fraud_data_cleaned.csv")
fraud_df_proc = fraud_df.drop(columns=['user_id', 'device_id', 'ip_address', 'signup_time', 'purchase_time'])
y_fraud = fraud_df_proc['class']
X_fraud = fraud_df_proc.drop(columns=['class'])

cat_cols = ['source', 'browser', 'sex', 'country']
num_cols = [col for col in X_fraud.columns if col not in cat_cols]

X_fraud_encoded = pd.get_dummies(X_fraud, columns=cat_cols, drop_first=True)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_fraud_encoded[num_cols] = scaler.fit_transform(X_fraud_encoded[num_cols])
X_fraud_encoded = X_fraud_encoded.fillna(0)

# 4. Train-Test Split and SMOTE
Xf_train, Xf_test, yf_train, yf_test = train_test_split(X_fraud_encoded, y_fraud, test_size=0.2, stratify=y_fraud, random_state=42)
print("Before SMOTE:", np.bincount(yf_train))
smote = SMOTE(random_state=42)
Xf_train_res, yf_train_res = smote.fit_resample(Xf_train, yf_train)
print("After SMOTE:", np.bincount(yf_train_res))

# 5. Define Evaluation Function
def evaluate_model(model, X_test, y_test, model_name="Model", dataset_name="Dataset"):
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    print(f"\n🔍 {model_name} on {dataset_name}")
    print(classification_report(y_test, y_pred, digits=4))
    print("F1 Score:", f1_score(y_test, y_pred, average='binary'))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    pr_auc = auc(recall, precision)
    print("PR AUC:", pr_auc)
    plt.figure(figsize=(6, 4))
    plt.plot(recall, precision, label=f'PR AUC = {pr_auc:.4f}')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Precision-Recall Curve: {model_name} ({dataset_name})')
    plt.legend()
    plt.tight_layout()
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{dataset_name.lower().replace(' ', '_')}_{model_name.lower().replace(' ', '_')}_pr_curve_{timestamp}.png"
    filepath = os.path.join("../outputs/plots", filename)
    plt.savefig(filepath)
    plt.close()
    print(f"✅ Saved to {filepath}")

# 6. Train & Evaluate Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(Xf_train_res, yf_train_res)
evaluate_model(rf_model, Xf_test, yf_test, "Random Forest", "E-commerce")

# 7. Train & Evaluate Logistic Regression
log_model = LogisticRegression(max_iter=1000, random_state=42)
log_model.fit(Xf_train_res, yf_train_res)
evaluate_model(log_model, Xf_test, yf_test, "Logistic Regression", "E-commerce")


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


Before SMOTE: [109568  11321]
After SMOTE: [109568 109568]

🔍 Random Forest on E-commerce
              precision    recall  f1-score   support

           0     0.9535    0.9937    0.9732     27393
           1     0.8968    0.5314    0.6674      2830

    accuracy                         0.9504     30223
   macro avg     0.9252    0.7626    0.8203     30223
weighted avg     0.9482    0.9504    0.9446     30223

F1 Score: 0.6674062569336587
Confusion Matrix:
 [[27220   173]
 [ 1326  1504]]
PR AUC: 0.6224860000146311
✅ Saved to ../outputs/plots/e-commerce_random_forest_pr_curve_20250802_152354.png

🔍 Logistic Regression on E-commerce
              precision    recall  f1-score   support

           0     0.9530    0.6430    0.7679     27393
           1     0.1671    0.6933    0.2693      2830

    accuracy                         0.6478     30223
   macro avg     0.5601    0.6682    0.5186     30223
weighted avg     0.8794    0.6478    0.7213     30223

F1 Score: 0.26932052161976666
C

In [2]:
# 1. Train Random Forest (if not done already)
from sklearn.ensemble import RandomForestClassifier

rf_fraud = RandomForestClassifier(n_estimators=100, random_state=42)
rf_fraud.fit(Xf_train_res, yf_train_res)

# 2. SHAP Interpretability
import shap
import matplotlib.pyplot as plt
import os

# Ensure plot directory exists
os.makedirs("../outputs/plots", exist_ok=True)

# Initialize SHAP TreeExplainer
explainer = shap.TreeExplainer(rf_fraud)

# Sample a subset of the test set to speed up computation
Xf_test_sample = Xf_test.sample(n=300, random_state=42)

# Compute SHAP values for the fraud class (class 1)
shap_values = explainer.shap_values(Xf_test_sample)

# 3. SHAP Summary Bar Plot
plt.figure()
# Check if shap_values is a list (as in classifiers)
if isinstance(shap_values, list):
    shap_values_class1 = shap_values[1]  # For binary classifiers: class 1 (fraud)
else:
    shap_values_class1 = shap_values  # For regression or single-output

# Align dimensions by converting DataFrame (some dtypes like bool can cause issues)
X_sample_fixed = Xf_test_sample.astype(float)

# SHAP Summary Bar Plot
plt.figure()
shap.summary_plot(shap_values_class1, X_sample_fixed, plot_type='bar', show=False)
bar_path = "../outputs/plots/shap_summary_bar_ecommerce.png"
plt.savefig(bar_path)
plt.close()
print(f"✅ SHAP summary bar saved at: {bar_path}")

# SHAP Beeswarm Plot
plt.figure()
shap.summary_plot(shap_values_class1, X_sample_fixed, show=False)
bee_path = "../outputs/plots/shap_beeswarm_ecommerce.png"
plt.savefig(bee_path)
plt.close()
print(f"✅ SHAP beeswarm plot saved at: {bee_path}")


✅ SHAP summary bar saved at: ../outputs/plots/shap_summary_bar_ecommerce.png
✅ SHAP beeswarm plot saved at: ../outputs/plots/shap_beeswarm_ecommerce.png


<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

In [1]:
import shap
import os

# Ensure output directory exists
os.makedirs("../outputs/plots", exist_ok=True)

# Initialize explainer for Random Forest
explainer = shap.TreeExplainer(rf_fraud)
shap_values = explainer.shap_values(Xf_test)

# Get indices for fraud and non-fraud cases
fraud_idx = yf_test[yf_test == 1].index[0]
nonfraud_idx = yf_test[yf_test == 0].index[0]

# Select samples
fraud_sample = Xf_test.loc[[fraud_idx]]
nonfraud_sample = Xf_test.loc[[nonfraud_idx]]

# Force Plot - Fraud
fraud_force_plot = shap.force_plot(
    explainer.expected_value[1],
    shap_values[1][Xf_test.index.get_loc(fraud_idx)],
    fraud_sample,
    matplotlib=False
)
shap.save_html("../outputs/plots/shap_force_fraud.html", fraud_force_plot)
print("✅ Fraud force plot saved: shap_force_fraud.html")

# Force Plot - Non-Fraud
nonfraud_force_plot = shap.force_plot(
    explainer.expected_value[1],
    shap_values[1][Xf_test.index.get_loc(nonfraud_idx)],
    nonfraud_sample,
    matplotlib=False
)
shap.save_html("../outputs/plots/shap_force_nonfraud.html", nonfraud_force_plot)
print("✅ Non-fraud force plot saved: shap_force_nonfraud.html")


NameError: name 'rf_fraud' is not defined