In [8]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    f1_score,
    confusion_matrix,
    average_precision_score
)

from imblearn.over_sampling import SMOTE
import joblib
Xf_train.dtypes[Xf_train.dtypes == "object"]


signup_time      object
purchase_time    object
device_id        object
dtype: object

In [2]:
fraud_df = pd.read_csv("../data/processed/fraud_processed.csv")
credit_df = pd.read_csv("../data/processed/creditcard_processed.csv")


In [10]:
ID_COLS = ["signup_time", "purchase_time", "device_id"]
X_fraud = fraud_df.drop(columns=["class"] + ID_COLS)
y_fraud = fraud_df["class"]


In [11]:
X_credit = credit_df.drop("Class", axis=1)
y_credit = credit_df["Class"]


In [12]:
Xf_train, Xf_test, yf_train, yf_test = train_test_split(
    X_fraud, y_fraud,
    test_size=0.2,
    stratify=y_fraud,
    random_state=42
)

Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    X_credit, y_credit,
    test_size=0.2,
    stratify=y_credit,
    random_state=42
)


In [13]:
smote = SMOTE(random_state=42)

Xf_train_res, yf_train_res = smote.fit_resample(Xf_train, yf_train)
Xc_train_res, yc_train_res = smote.fit_resample(Xc_train, yc_train)


In [14]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    print("F1 Score:", f1_score(y_test, y_pred))
    print("AUC-PR:", average_precision_score(y_test, y_prob))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


In [15]:
lr_fraud = LogisticRegression(max_iter=1000, class_weight="balanced")
lr_fraud.fit(Xf_train_res, yf_train_res)

evaluate_model(lr_fraud, Xf_test, yf_test)


F1 Score: 0.1534314103057845
AUC-PR: 0.09714840867794983
Confusion Matrix:
 [[18236  9157]
 [ 1834   996]]


In [16]:
lr_credit = LogisticRegression(max_iter=1000, class_weight="balanced")
lr_credit.fit(Xc_train_res, yc_train_res)

evaluate_model(lr_credit, Xc_test, yc_test)


F1 Score: 0.10018105009052504
AUC-PR: 0.6768371367155248
Confusion Matrix:
 [[55172  1479]
 [   12    83]]


In [17]:
rf_fraud = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    class_weight="balanced",
    random_state=42
)

rf_fraud.fit(Xf_train_res, yf_train_res)
evaluate_model(rf_fraud, Xf_test, yf_test)


F1 Score: 0.6176354290233294
AUC-PR: 0.6350486692478058
Confusion Matrix:
 [[26727   666]
 [ 1268  1562]]


In [18]:
rf_credit = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    class_weight="balanced",
    random_state=42
)

rf_credit.fit(Xc_train_res, yc_train_res)
evaluate_model(rf_credit, Xc_test, yc_test)


F1 Score: 0.7437185929648241
AUC-PR: 0.7946574219786234
Confusion Matrix:
 [[56621    30]
 [   21    74]]


In [19]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = cross_val_score(
    rf_fraud,
    Xf_train_res,
    yf_train_res,
    scoring="average_precision",
    cv=cv
)

print("Mean AUC-PR:", cv_scores.mean())
print("Std AUC-PR:", cv_scores.std())


Mean AUC-PR: 0.9477156928308752
Std AUC-PR: 0.0002371930104741627


In [20]:
joblib.dump(rf_fraud, "../models/rf_fraud.pkl")
joblib.dump(rf_credit, "../models/rf_credit.pkl")


FileNotFoundError: [Errno 2] No such file or directory: '../models/rf_fraud.pkl'