### Model BASELINE training with raw dataset features (no risk scores)

##### Import processed dataset

In [42]:
# Read the processed CSV into a DataFrame
import os
from pathlib import Path

import pandas as pd

# Build the location from path components so it resolves on every OS.
# A backslash-separated string is only interpreted as a path on Windows;
# elsewhere it would be treated as a single (non-existent) file name.
DATASET_PATH = Path("..") / ".." / "data" / "processed" / "dataset.csv"

df = pd.read_csv(DATASET_PATH)
print(df.head())


                            customer_id          transaction_timestamp  \
0  ab47a4be-d92d-499f-b6f3-9c07dcd948e5  2020-12-09 07:48:01.897480452   
1  ab47a4be-d92d-499f-b6f3-9c07dcd948e5  2021-01-09 17:07:54.659347898   
2  ab47a4be-d92d-499f-b6f3-9c07dcd948e5  2022-02-24 02:34:58.503224288   
3  ab47a4be-d92d-499f-b6f3-9c07dcd948e5  2022-05-01 02:52:46.265885528   
4  ab47a4be-d92d-499f-b6f3-9c07dcd948e5  2020-11-20 06:59:33.669327084   

   transaction_amount merchant_category transaction_status  \
0               12.92         groceries          completed   
1               17.18            travel          completed   
2               27.02       restaurants          completed   
3                4.59          clothing          completed   
4               14.56          clothing          completed   

                         transaction_id   channel   entry_mode  \
0  9274be52-c787-4d7c-a09b-1559b5d20fd1  in-store  contactless   
1  15483ab8-6985-4175-9ad0-91b8c32b0a6f    mobile 

In [43]:
# Dataframe info
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()
# Summary statistics of the numeric columns.
print(df.describe())
# Class balance of the target column.
print(df['fraud_label'].value_counts())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3399996 entries, 0 to 3399995
Data columns (total 27 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   customer_id             object 
 1   transaction_timestamp   object 
 2   transaction_amount      float64
 3   merchant_category       object 
 4   transaction_status      object 
 5   transaction_id          object 
 6   channel                 object 
 7   entry_mode              object 
 8   transaction_country     object 
 9   is_international        bool   
 10  customer_name           object 
 11  age                     int64  
 12  income                  float64
 13  signup_date             object 
 14  region                  object 
 15  hour_of_day             int64  
 16  day_of_week             int64  
 17  is_weekend              bool   
 18  is_night                bool   
 19  monthly_income          float64
 20  amount_vs_income_ratio  float64
 21  category_risk_score     float64

##### Temporal train and test split (based on transaction_timestamp)

In [44]:
# Chronological train/test split driven by transaction_timestamp.
# The cut-off is the 0.8 quantile of the timestamps: rows at or before it go
# to the training set, rows strictly after it go to the test set.
df['transaction_timestamp'] = pd.to_datetime(df['transaction_timestamp'])
split_date = df['transaction_timestamp'].quantile(0.8)
train_df = df.loc[df['transaction_timestamp'].le(split_date)]
test_df = df.loc[df['transaction_timestamp'].gt(split_date)]
print(f"Train set size: {train_df.shape}, Test set size: {test_df.shape}")

Train set size: (2719997, 27), Test set size: (679999, 27)


##### Remove columns

In [45]:
# TRAIN SET: discard identifiers, the raw timestamp, status/free-text columns,
# the per-feature risk scores and the heuristic fraud_score.
train_df = train_df.drop(
    [
        'customer_id',
        'transaction_id',
        'transaction_timestamp',
        'transaction_status',
        'signup_date',
        'customer_name',
        'category_risk_score',
        'channel_risk_score',
        'entry_mode_risk_score',
        'country_risk_score',
        'fraud_score',
    ],
    axis="columns",
)
print(train_df.columns)

Index(['transaction_amount', 'merchant_category', 'channel', 'entry_mode',
       'transaction_country', 'is_international', 'age', 'income', 'region',
       'hour_of_day', 'day_of_week', 'is_weekend', 'is_night',
       'monthly_income', 'amount_vs_income_ratio', 'fraud_label'],
      dtype='object')


In [46]:
# TEST SET: discard identifiers, the raw timestamp, status/free-text columns,
# the per-feature risk scores and the heuristic fraud_score.
test_df = test_df.drop(
    [
        'customer_id',
        'transaction_id',
        'transaction_timestamp',
        'transaction_status',
        'signup_date',
        'customer_name',
        'category_risk_score',
        'channel_risk_score',
        'entry_mode_risk_score',
        'country_risk_score',
        'fraud_score',
    ],
    axis="columns",
)
print(test_df.columns)

Index(['transaction_amount', 'merchant_category', 'channel', 'entry_mode',
       'transaction_country', 'is_international', 'age', 'income', 'region',
       'hour_of_day', 'day_of_week', 'is_weekend', 'is_night',
       'monthly_income', 'amount_vs_income_ratio', 'fraud_label'],
      dtype='object')


##### Generate X and y

In [47]:
# Separate the feature matrix from the fraud_label target for each split.
x_train, y_train = train_df.drop('fraud_label', axis="columns"), train_df['fraud_label']
x_test, y_test = test_df.drop('fraud_label', axis="columns"), test_df['fraud_label']

In [48]:
# Report the shapes of the train and test feature/target objects
print(
    f"x_train shape: {x_train.shape}, y_train shape: {y_train.shape}",
    f"x_test shape: {x_test.shape}, y_test shape: {y_test.shape}",
    sep="\n",
)

x_train shape: (2719997, 15), y_train shape: (2719997,)
x_test shape: (679999, 15), y_test shape: (679999,)


#### Explore models

In [49]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Column groups consumed by build_preprocessor(). Each list is handed to the
# ColumnTransformer as the column selection for one transformer, so the names
# must match the DataFrame columns exactly.

# Columns routed to the "num" transformer (StandardScaler).
NUMERIC_FEATURES = [
    "transaction_amount",
    "monthly_income",
    "amount_vs_income_ratio",
    "age",
    "income",
    "hour_of_day",
    "day_of_week",
]

# Columns routed to the "bool" transformer, which passes them through unchanged.
BOOLEAN_FEATURES = [
    "is_international",
    "is_weekend",
    "is_night",
]

# Columns routed to the "cat" transformer (OneHotEncoder).
CATEGORICAL_FEATURES = [
    "merchant_category",
    "channel",
    "entry_mode",
    "transaction_country",
    "region",
]

def build_preprocessor():
    """Create the (unfitted) feature-preprocessing ColumnTransformer.

    Three column groups are handled:

    * ``NUMERIC_FEATURES``     -> ``StandardScaler``
    * ``CATEGORICAL_FEATURES`` -> ``OneHotEncoder`` (unknown categories are
      ignored at transform time; output is sparse)
    * ``BOOLEAN_FEATURES``     -> passed through untouched

    Any column not listed in one of the groups is dropped.

    Returns:
        ColumnTransformer: a new, unfitted transformer instance.
    """
    return ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), NUMERIC_FEATURES),
            (
                "cat",
                OneHotEncoder(handle_unknown="ignore", sparse_output=True),
                CATEGORICAL_FEATURES,
            ),
            ("bool", "passthrough", BOOLEAN_FEATURES),
        ],
        remainder="drop",
    )



In [50]:
import numpy as np

def precision_at_k(y_true, y_scores, k):
    """Fraction of positives among the ``k`` highest-scored samples.

    Args:
        y_true: Binary ground-truth labels (0/1 or bool). Any array-like is
            accepted (list, NumPy array, pandas Series); values are taken
            positionally, so a Series index is ignored.
        y_scores: Scores aligned position-by-position with ``y_true``;
            higher means "more likely positive".
        k: Number of top-ranked samples to inspect. Values larger than the
            number of samples are clamped to that number.

    Returns:
        The mean of ``y_true`` over the top-``k`` samples, or ``nan`` when
        there is nothing to rank (``k == 0`` or empty input).

    Raises:
        ValueError: If ``k`` is negative or the inputs differ in length.
    """
    # Convert up front so positional indexing and negation work for every
    # array-like, not only for NumPy arrays.
    y_true = np.asarray(y_true)
    y_scores = np.asarray(y_scores, dtype=float)

    if y_true.shape[0] != y_scores.shape[0]:
        raise ValueError(
            f"y_true and y_scores must have the same length, "
            f"got {y_true.shape[0]} and {y_scores.shape[0]}"
        )
    if k < 0:
        # A negative k would silently slice "all but the last |k|" rows.
        raise ValueError(f"k must be non-negative, got {k}")

    k = min(k, len(y_scores))
    if k == 0:
        # Precision over zero samples is undefined; return nan explicitly
        # instead of relying on the mean of an empty slice (which warns).
        return float("nan")

    # Stable sort keeps tied scores in their original order, which makes the
    # result deterministic across runs.
    idx = np.argsort(-y_scores, kind="stable")
    top_k_idx = idx[:k]
    return y_true[top_k_idx].mean()


In [None]:
# Define all these models if your PC can handle it (depending on dataset)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Registry of candidate models: name -> (estimator, hyper-parameter grid).
# Grid keys carry the "model__" prefix because the estimator is placed in a
# Pipeline under the step name "model" before being grid-searched.
models = {
    "logreg": (
        LogisticRegression(max_iter=1000, class_weight="balanced"),
        {"model__C": [0.1, 1.0, 10.0]}
    ),
    "random_forest": (
        RandomForestClassifier(
            n_estimators=200,
            n_jobs=-1,
            class_weight="balanced",
            random_state=42
        ),
        {
            "model__max_depth": [5, 10, None],
            "model__min_samples_split": [2, 10]
        }
    ),
    "gradient_boosting": (
        GradientBoostingClassifier(random_state=42),
        {
            "model__n_estimators": [100, 200],
            "model__learning_rate": [0.05, 0.1],
            "model__max_depth": [2, 3]
        }
    )
}

# Optional: XGBoost is registered only when the package is installed.
try:
    from xgboost import XGBClassifier
    models["xgboost"] = (
        XGBClassifier(
            n_estimators=300,
            eval_metric="logloss",
            tree_method="hist",
            n_jobs=-1,
            random_state=42
        ),
        {
            "model__max_depth": [3, 5],
            "model__learning_rate": [0.05, 0.1],
            "model__subsample": [0.8, 1.0],
            "model__colsample_bytree": [0.8, 1.0],
        }
    )
except ImportError:
    # Only a missing package is tolerated; any other failure (bad argument,
    # keyboard interrupt, ...) must surface instead of being swallowed.
    print("xgboost is not installed; skipping the XGBClassifier candidate")


In [51]:
# I am only trying logreg because of compute power limitations
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Registry restricted to logistic regression: name -> (estimator, param grid).
# The "model__" prefix targets the pipeline step named "model".
models = dict(
    logreg=(
        LogisticRegression(class_weight="balanced", max_iter=1000),
        {"model__C": [0.1, 1.0, 10.0]},
    ),
)

In [52]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    roc_auc_score, precision_score, recall_score, classification_report, confusion_matrix
)

def train_and_evaluate_all(X_train, y_train, X_test, y_test, k_precision=100, model_specs=None):
    """Grid-search every candidate model and print its test-set metrics.

    For each ``name -> (estimator, param_grid)`` entry a pipeline of
    ``build_preprocessor()`` followed by the estimator is tuned with
    ``GridSearchCV`` (ROC AUC scoring, 3 folds). The best estimator found is
    then scored on the test split and the following are printed: best
    parameters, best CV AUC, test AUC, precision, recall, precision@k,
    the confusion matrix and the classification report.

    Args:
        X_train: Training features (DataFrame containing the columns expected
            by ``build_preprocessor``).
        y_train: Training labels.
        X_test: Test features, same columns as ``X_train``.
        y_test: Test labels; must expose ``.to_numpy()`` (e.g. a pandas Series).
        k_precision: ``k`` used for precision@k; clamped to ``len(y_test)``.
        model_specs: Optional mapping ``name -> (estimator, param_grid)``.
            When omitted, the notebook-level ``models`` dict is used, which
            keeps existing calls working while letting callers pass the
            registry explicitly instead of relying on cell execution order.

    Returns:
        None. All results are written to stdout.
    """
    if model_specs is None:
        # Backward-compatible default: the registry defined at notebook level.
        model_specs = models

    for name, (estimator, params) in model_specs.items():
        print("="*80)
        print(f"TRAINING MODEL → {name}")
        print("="*80)

        # A fresh preprocessor per candidate keeps the pipelines independent.
        pipe = Pipeline([
            ("preprocess", build_preprocessor()),
            ("model", estimator),
        ])

        grid = GridSearchCV(
            estimator=pipe,
            param_grid=params,
            scoring="roc_auc",
            cv=3,
            n_jobs=-1,
            verbose=1
        )

        grid.fit(X_train, y_train)

        print("\nBest Params:", grid.best_params_)
        print("Best CV AUC:", grid.best_score_)

        best_model = grid.best_estimator_

        # ---- Test evaluation ----
        # Column 1 of predict_proba is the score of the positive class.
        y_proba = best_model.predict_proba(X_test)[:, 1]
        y_pred = best_model.predict(X_test)

        auc = roc_auc_score(y_test, y_proba)
        prec = precision_score(y_test, y_pred, zero_division=0)
        rec = recall_score(y_test, y_pred, zero_division=0)

        print(f"\n[Test] AUC: {auc:.4f}")
        print(f"[Test] Precision: {prec:.4f}")
        print(f"[Test] Recall: {rec:.4f}")

        # Clamp here as well so the printed k matches the k actually used.
        k = min(k_precision, len(y_test))
        p_at_k = precision_at_k(y_test.to_numpy(), y_proba, k)
        print(f"[Test] Precision@{k}: {p_at_k:.4f}")

        print("\nConfusion Matrix:")
        print(confusion_matrix(y_test, y_pred))

        print("\nClassification Report:")
        print(classification_report(y_test, y_pred, zero_division=0))


In [53]:
## Run the grid search and test-set evaluation for every registered model
# NOTE(review): the recorded run reports a test AUC of 1.0000 with zero false
# negatives; results this close to perfect often indicate that a remaining
# feature leaks the label — worth confirming before trusting this baseline.
train_and_evaluate_all(
    X_train=x_train,
    y_train=y_train,
    X_test=x_test,
    y_test=y_test,
    k_precision=100,
)


TRAINING MODEL → logreg
Fitting 3 folds for each of 3 candidates, totalling 9 fits

Best Params: {'model__C': 1.0}
Best CV AUC: 0.9999902587738866

[Test] AUC: 1.0000
[Test] Precision: 0.9960
[Test] Recall: 1.0000
[Test] Precision@100: 1.0000

Confusion Matrix:
[[647964    127]
 [     0  31908]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    648091
           1       1.00      1.00      1.00     31908

    accuracy                           1.00    679999
   macro avg       1.00      1.00      1.00    679999
weighted avg       1.00      1.00      1.00    679999

