### üìö Import Libraries

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [3]:
X_train = pd.read_csv("X_train.csv")
X_test  = pd.read_csv("X_test.csv")

y_train = pd.read_csv("y_train.csv").squeeze()
y_test  = pd.read_csv("y_test.csv").squeeze()

In [4]:
print(f"shape of X_train - {X_train.shape}")
print(f"shape of X_test - {X_test.shape}")
print(f"shape of y_train - {y_train.shape}")
print(f"shape of y_test - {y_test.shape}")

shape of X_train - (800000, 13)
shape of X_test - (200000, 13)
shape of y_train - (800000,)
shape of y_test - (200000,)


## Model trainng

In [5]:
result_df = pd.DataFrame({
    "method": [],

    # F1 score
    "train_f1": [],
    "valid_f1": [],
    "train_f1_std": [],
    "valid_f1_std": [],

    # Recall
    "train_recall": [],
    "valid_recall": [],

    # Precision
    "train_precision": [],
    "valid_precision": [],

    # ROC‚ÄìAUC
    "train_rocauc": [],
    "valid_rocauc": [],

    # PR‚ÄìAUC 
    "train_prauc": [],
    "valid_prauc": []
})


In [6]:
def imb_classification_result(method, model,
                              X_train=X_train, y_train=y_train,
                              result_df=result_df,
                              splits=5):

    from sklearn.model_selection import StratifiedKFold
    from sklearn.metrics import (
        f1_score, precision_score, recall_score,
        roc_auc_score, average_precision_score)
    

    skf = StratifiedKFold(n_splits=splits,shuffle=True,random_state=42)

    train_f1, valid_f1 = [], []
    train_recall, valid_recall = [], []
    train_precision, valid_precision = [], []
    train_aucroc, valid_aucroc = [], []
    train_prauc, valid_prauc = [], []  

    for train_idx, valid_idx in skf.split(X_train, y_train):

        X_train_fold = X_train.iloc[train_idx]
        y_train_fold = y_train.iloc[train_idx]

        X_valid_fold = X_train.iloc[valid_idx]
        y_valid_fold = y_train.iloc[valid_idx]

        model.fit(X_train_fold, y_train_fold)

        y_pred_train = model.predict(X_train_fold)
        y_pred_valid = model.predict(X_valid_fold)

        y_prob_train = model.predict_proba(X_train_fold)[:, 1]
        y_prob_valid = model.predict_proba(X_valid_fold)[:, 1]

        # ---- Metrics ----
        train_f1.append(f1_score(y_train_fold, y_pred_train))
        valid_f1.append(f1_score(y_valid_fold, y_pred_valid))

        train_recall.append(recall_score(y_train_fold, y_pred_train))
        valid_recall.append(recall_score(y_valid_fold, y_pred_valid))

        train_precision.append(precision_score(y_train_fold, y_pred_train))
        valid_precision.append(precision_score(y_valid_fold, y_pred_valid))

        train_aucroc.append(roc_auc_score(y_train_fold, y_prob_train))
        valid_aucroc.append(roc_auc_score(y_valid_fold, y_prob_valid))

        #  PR‚ÄìAUC (IMPORTANT for imbalanced data)
        train_prauc.append(average_precision_score(y_train_fold, y_prob_train))
        valid_prauc.append(average_precision_score(y_valid_fold, y_prob_valid))

    # ---- Mean & Std ----
    row_num = len(result_df)
    result_df.loc[row_num] = [
        method,
        np.round(np.mean(train_f1), 3),
        np.round(np.mean(valid_f1), 3),
        np.round(np.std(train_f1), 3),
        np.round(np.std(valid_f1), 3),
        np.round(np.mean(train_recall), 3),
        np.round(np.mean(valid_recall), 3),
        np.round(np.mean(train_precision), 3),
        np.round(np.mean(valid_precision), 3),
        np.round(np.mean(train_aucroc), 3),
        np.round(np.mean(valid_aucroc), 3),
        np.round(np.mean(train_prauc), 3),
        np.round(np.mean(valid_prauc), 3)
    ]

    return result_df

## Logistic Regression 

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

pipe_lr = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(random_state=42,max_iter=500,class_weight="balanced"))
])

In [7]:
result_df = imb_classification_result(
    method="Baseline Logistic Regression",model=pipe_lr,
    X_train=X_train,y_train=y_train,
    result_df=result_df,splits=5
)
result_df

Unnamed: 0,method,train_f1,valid_f1,train_f1_std,valid_f1_std,train_recall,valid_recall,train_precision,valid_precision,train_rocauc,valid_rocauc,train_prauc,valid_prauc
0,Baseline Logistic Regression,0.769,0.769,0.001,0.002,0.966,0.966,0.639,0.638,0.988,0.988,0.875,0.875


#### Trying hyperparameter tuning

In [8]:
pipe_lr_fast = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(C=1.0,penalty="l2",solver="liblinear",max_iter=1000,class_weight="balanced",
                                 random_state=42))
])

In [9]:
result_df = imb_classification_result(
    method="Lr with max_itr=1000",model=pipe_lr_fast,
    X_train=X_train,y_train=y_train,
    result_df=result_df,splits=5)
result_df

Unnamed: 0,method,train_f1,valid_f1,train_f1_std,valid_f1_std,train_recall,valid_recall,train_precision,valid_precision,train_rocauc,valid_rocauc,train_prauc,valid_prauc
0,Baseline Logistic Regression,0.769,0.769,0.001,0.002,0.966,0.966,0.639,0.638,0.988,0.988,0.875,0.875
1,Lr with max_itr=1000,0.769,0.768,0.001,0.001,0.966,0.966,0.638,0.638,0.988,0.988,0.876,0.876


‚ÄúAfter Logistic Regression saturated, I moved to tree-based ensemble models like Random Forest and Gradient Boosting to capture non-linear patterns, which typically improves precision and PR-AUC on imbalanced data.‚Äù

## Decision tree

In [10]:
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier(
    max_depth=8,
    min_samples_leaf=10,
    min_samples_split=20,
    class_weight="balanced",
    random_state=42
)

result_df = imb_classification_result(
    method="Decision Tree",
    model=dt_model,
    X_train=X_train,
    y_train=y_train,
    result_df=result_df,
    splits=5
)

result_df


Unnamed: 0,method,train_f1,valid_f1,train_f1_std,valid_f1_std,train_recall,valid_recall,train_precision,valid_precision,train_rocauc,valid_rocauc,train_prauc,valid_prauc
0,Baseline Logistic Regression,0.769,0.769,0.001,0.002,0.966,0.966,0.639,0.638,0.988,0.988,0.875,0.875
1,Lr with max_itr=1000,0.769,0.768,0.001,0.001,0.966,0.966,0.638,0.638,0.988,0.988,0.876,0.876
2,Decision Tree,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.999,1.0,1.0,1.0,1.0


## Random forest 

In [11]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(class_weight = "balanced", random_state = 42)

result_df = imb_classification_result("baseline random forest with cw = balanced", model, X_train, y_train,
                          result_df)
result_df

Unnamed: 0,method,train_f1,valid_f1,train_f1_std,valid_f1_std,train_recall,valid_recall,train_precision,valid_precision,train_rocauc,valid_rocauc,train_prauc,valid_prauc
0,Baseline Logistic Regression,0.769,0.769,0.001,0.002,0.966,0.966,0.639,0.638,0.988,0.988,0.875,0.875
1,Lr with max_itr=1000,0.769,0.768,0.001,0.001,0.966,0.966,0.638,0.638,0.988,0.988,0.876,0.876
2,Decision Tree,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.999,1.0,1.0,1.0,1.0
3,baseline random forest with cw = balanced,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
rf_fixed = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_leaf=10,
    min_samples_split=20,
    max_features="sqrt",
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

result_df = imb_classification_result(
    method="Random Forest (Regularized)",
    model=rf_fixed,
    X_train=X_train,
    y_train=y_train,
    result_df=result_df,
    splits=5
)

result_df


Unnamed: 0,method,train_f1,valid_f1,train_f1_std,valid_f1_std,train_recall,valid_recall,train_precision,valid_precision,train_rocauc,valid_rocauc,train_prauc,valid_prauc
0,Baseline Logistic Regression,0.769,0.769,0.001,0.002,0.966,0.966,0.639,0.638,0.988,0.988,0.875,0.875
1,Lr with max_itr=1000,0.769,0.768,0.001,0.001,0.966,0.966,0.638,0.638,0.988,0.988,0.876,0.876
2,Decision Tree,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.999,1.0,1.0,1.0,1.0
3,baseline random forest with cw = balanced,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,Random Forest (Regularized),1.0,0.999,0.0,0.0,1.0,1.0,0.999,0.999,1.0,1.0,1.0,1.0


In [13]:
rf_safe = RandomForestClassifier(
    n_estimators=200,
    max_depth=6,
    min_samples_leaf=20,
    min_samples_split=40,
    max_features=0.6,
    max_samples=0.8,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)
result_df = imb_classification_result(
    method=rf_safe,
    model=rf_fixed,
    X_train=X_train,
    y_train=y_train,
    result_df=result_df,
    splits=5
)

result_df

Unnamed: 0,method,train_f1,valid_f1,train_f1_std,valid_f1_std,train_recall,valid_recall,train_precision,valid_precision,train_rocauc,valid_rocauc,train_prauc,valid_prauc
0,Baseline Logistic Regression,0.769,0.769,0.001,0.002,0.966,0.966,0.639,0.638,0.988,0.988,0.875,0.875
1,Lr with max_itr=1000,0.769,0.768,0.001,0.001,0.966,0.966,0.638,0.638,0.988,0.988,0.876,0.876
2,Decision Tree,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.999,1.0,1.0,1.0,1.0
3,baseline random forest with cw = balanced,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,Random Forest (Regularized),1.0,0.999,0.0,0.0,1.0,1.0,0.999,0.999,1.0,1.0,1.0,1.0
5,RandomForestClassifier(class_weight='balanced'...,1.0,0.999,0.0,0.0,1.0,1.0,0.999,0.999,1.0,1.0,1.0,1.0


In [14]:
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:,1]

from sklearn.metrics import classification_report, roc_auc_score

print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    182519
         1.0       1.00      1.00      1.00     17481

    accuracy                           1.00    200000
   macro avg       1.00      1.00      1.00    200000
weighted avg       1.00      1.00      1.00    200000

ROC-AUC: 0.9999999971792268


Since tree-based models showed unrealistically perfect validation scores, this indicates possible overfitting or data contamination. Feature-level data leakage was checked using correlation analysis

In [15]:
pd.concat([X_train, y_train], axis=1).corr()["fraud"].sort_values(ascending=False)

fraud                                        1.000000
ratio_to_median_purchase_price_log           0.472419
ratio_to_median_purchase_price_log_capped    0.464518
ratio_to_median_purchase_price               0.458927
online_order                                 0.191809
distance_from_home                           0.187716
distance_from_home_log                       0.136651
distance_from_home_log_capped                0.134932
distance_from_last_transaction               0.089970
distance_from_last_transaction_log           0.075980
distance_from_last_transaction_log_capped    0.053140
repeat_retailer                             -0.000932
used_chip                                   -0.061265
used_pin_number                             -0.100398
Name: fraud, dtype: float64

Correlation analysis was conducted to assess feature leakage, and no direct leakage was observed.

## MODEL SELECTION 

‚ÄúTree-based models showed unrealistically perfect cross-validation scores, suggesting memorization or fold contamination.
##### Logistic Regression  
showed stable, consistent performance and was selected for better generalization.‚Äù

In [7]:
## logistic Regression (we taking hyperparmeter tune model)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

final_model = Pipeline([("scaler", StandardScaler()),
    ("model", LogisticRegression(C=1.0,
        penalty="l2",
        solver="liblinear",
        max_iter=1000,
        class_weight="balanced",
        random_state=42))
])
final_model.fit(X_train, y_train)

0,1,2
,steps,"[('scaler', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [8]:
y_test_pred = final_model.predict(X_test)
y_test_prob = final_model.predict_proba(X_test)[:, 1]

In [10]:
from sklearn.metrics import (classification_report,roc_auc_score,average_precision_score)

print("Classification Report (Test):")
print(classification_report(y_test, y_test_pred))

print("ROC-AUC:", roc_auc_score(y_test, y_test_prob))
print("PR-AUC:", average_precision_score(y_test, y_test_prob))

Classification Report (Test):
              precision    recall  f1-score   support

         0.0       1.00      0.95      0.97    182519
         1.0       0.64      0.97      0.77     17481

    accuracy                           0.95    200000
   macro avg       0.82      0.96      0.87    200000
weighted avg       0.97      0.95      0.95    200000

ROC-AUC: 0.9886754528239347
PR-AUC: 0.8802821052749488


##### Save the model

In [11]:
import joblib
joblib.dump(final_model, "final_model.pkl")


['final_model.pkl']

### ‚ÄúThe final Logistic Regression model demonstrated strong and stable performance on the test set, achieving high recall for fraudulent transactions while maintaining reasonable precision, making it suitable for fraud detection.‚Äù