# Fraud Detection – Modeling (Task 2)

This notebook builds and evaluates classification models for fraud detection
using metrics suitable for imbalanced data.


In [16]:
import sys
from pathlib import Path

# Add project root to PYTHONPATH
project_root = Path("..").resolve()
sys.path.append(str(project_root))


In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from src.models import (
    train_logistic_regression,
    train_random_forest
)
from src.tuning import tune_random_forest
from src.evaluation import (
    evaluate_model,
    cross_validate_model
)
from src.imbalance import apply_smote
from src.utils import build_preprocessor


In [18]:
df = pd.read_csv("../data/processed/fraud_processed.csv")
df.head()


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,ip_int,lower_bound_ip_address,upper_bound_ip_address,country,time_since_signup,hour_of_day,day_of_week,tx_count_24h
0,2,2015-01-11 03:47:13,2015-02-21 10:03:37,54,FGBQNDNBETFJJ,SEO,Chrome,F,25,880217500.0,0,880217484,872415232,889192447,United States,990.273333,10,5,1.0
1,4,2015-06-02 16:40:57,2015-09-26 21:32:16,41,MKFUIVOHLJBYN,Direct,Safari,F,38,2785906000.0,0,-1509061190,-2147483648,-1044550657,Germany,2788.855278,21,5,1.0
2,8,2015-05-28 07:53:06,2015-08-13 11:53:07,47,SCQGQALXBUQZJ,SEO,Chrome,M,25,356056700.0,0,356056736,352321536,369098751,United States,1852.000278,11,3,1.0
3,12,2015-01-10 06:25:12,2015-03-04 20:56:37,35,MSNWCFEHKTIOY,Ads,Safari,M,19,2985180000.0,0,-1309786944,-2147483648,-1044550657,Germany,1286.523611,20,2,1.0
4,16,2015-02-03 13:48:23,2015-03-12 12:46:23,9,FROZWSSWOHZBE,Direct,IE,M,32,578312500.0,0,578312545,570425344,587202559,United States,886.966667,12,3,1.0


In [19]:
target = 'class'

numeric_features = [
    'purchase_value',
    'age',
    'time_since_signup',
    'hour_of_day',
    'day_of_week',
    'tx_count_24h'
]

categorical_features = [
    'source',
    'browser',
    'sex',
    'country'
]

X = df[numeric_features + categorical_features]
y = df[target]


In [20]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


In [21]:
preprocessor = build_preprocessor(
    numeric_features,
    categorical_features
)

X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)


In [31]:
import pandas as pd
import os

os.makedirs("../data/processed", exist_ok=True)

# Save processed train data for SHAP
pd.DataFrame(
    X_train_processed.toarray() if hasattr(X_train_processed, "toarray") else X_train_processed
).to_csv("../data/processed/X_train_processed.csv", index=False)

pd.DataFrame(y_train).to_csv("../data/processed/y_train.csv", index=False)


In [22]:
X_train_resampled, y_train_resampled = apply_smote(
    X_train_processed,
    y_train
)

print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", pd.Series(y_train_resampled).value_counts())


Before SMOTE: class
0    80568
1     8492
Name: count, dtype: int64
After SMOTE: class
0    80568
1    80568
Name: count, dtype: int64


In [23]:
lr_model = train_logistic_regression()
lr_model.fit(X_train_resampled, y_train_resampled)


In [24]:
lr_results = evaluate_model(
    lr_model,
    X_test_processed,
    y_test
)

lr_results


{'F1': 0.2838574022746822,
 'AUC_PR': 0.4199729948285208,
 'Confusion_Matrix': array([[13287,  6855],
        [  638,  1485]], dtype=int64)}

In [25]:
lr_cv = cross_validate_model(
    lr_model,
    X_train_resampled,
    y_train_resampled
)

lr_cv


{'f1_mean': 0.691208619243766,
 'f1_std': 0.0005521797836155273,
 'pr_auc_mean': 0.8041158673806129,
 'pr_auc_std': 0.0010683490889131168}

In [26]:
from sklearn.ensemble import RandomForestClassifier

best_rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features="sqrt",
    class_weight="balanced",
    random_state=42,
    n_jobs=1
)

best_rf.fit(X_train_resampled, y_train_resampled)


In [27]:
import numpy as np

idx = np.random.choice(
    X_train_resampled.shape[0],
    size=30000,
    replace=False
)

X_tune = X_train_resampled[idx]
y_tune = y_train_resampled.iloc[idx]


In [28]:
rf_results = evaluate_model(
    best_rf,
    X_test_processed,
    y_test
)

rf_results


{'F1': 0.703849651409518,
 'AUC_PR': 0.6390071651160484,
 'Confusion_Matrix': array([[20127,    15],
        [  962,  1161]], dtype=int64)}

In [29]:
import joblib

joblib.dump(best_rf, "../models/random_forest_fraud.pkl")
joblib.dump(preprocessor, "../models/preprocessor.pkl")


['../models/preprocessor.pkl']