# Modeling

This notebook's objective is to train models, doing hyperparameter tunning with optuna, and saving the best models in a folder for later evaluation 

In [10]:
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import pandas as pd
import optuna
import pickle
import joblib

In [2]:
data = pd.read_csv("../data/train_preprocessed.csv")

X = data.drop(columns=["fraud_label","user_id","transaction_id"])
y = data.fraud_label

For this project we are using optuna for the hyperparameter optimization. Optuna to make the hyperparameter optimization requieres an objective function, this function returns a metric which will be the one that the model will be trying to minimize or maximize, in this case, maximize, as we will try to maximize the recall metric in order to get the best results.

## Random Forest

In [3]:
def objective(trial:optuna.Trial, X, y):

    params = {
        "n_estimators": trial.suggest_int('n_estimators', 100, 1000),
        "min_samples_split": trial.suggest_int('min_samples_split', 2, 10),
        "min_samples_leaf": trial.suggest_int('min_samples_leaf', 1, 5),
        "max_depth": trial.suggest_int('max_depth', 4, 30),
        "random_state": 42,
        "class_weight": "balanced"
    }

    rf = RandomForestClassifier(**params)
    score = cross_val_score(rf,X,y,cv=5,scoring='recall')
    recall = score.mean()
    return recall

In [4]:
study = optuna.create_study(direction='maximize')

# 2. Run the optimization
study.optimize(lambda trial: objective(trial, X, y), n_trials=50)
# 3. Results
print(f"Best trial: {study.best_trial.number}")
print(f"Best recall: {study.best_value:.4f}")
print(f"Best parameters: {study.best_params}")

[32m[I 2026-02-26 17:20:25,834][0m A new study created in memory with name: no-name-5bf15a61-0d45-468c-a7a8-5ad8ec2e8ba9[0m
[32m[I 2026-02-26 17:21:23,928][0m Trial 0 finished with value: 0.27252747252747256 and parameters: {'n_estimators': 829, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_depth': 18}. Best is trial 0 with value: 0.27252747252747256.[0m
[32m[I 2026-02-26 17:22:15,812][0m Trial 1 finished with value: 0.27252747252747256 and parameters: {'n_estimators': 776, 'min_samples_split': 7, 'min_samples_leaf': 3, 'max_depth': 30}. Best is trial 0 with value: 0.27252747252747256.[0m
[32m[I 2026-02-26 17:23:12,205][0m Trial 2 finished with value: 0.27252747252747256 and parameters: {'n_estimators': 851, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 19}. Best is trial 0 with value: 0.27252747252747256.[0m
[32m[I 2026-02-26 17:24:00,900][0m Trial 3 finished with value: 0.27912087912087913 and parameters: {'n_estimators': 832, 'min_samples_split': 7

Best trial: 46
Best recall: 0.4462
Best parameters: {'n_estimators': 175, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 5}


In [9]:
rf_best = RandomForestClassifier(**study.best_params)
rf_best.fit(X,y)

joblib.dump(rf_best,"../models/rf_20260226.joblib")

TypeError: RandomForestClassifier.__init__() got an unexpected keyword argument 'learning_rate'

## XGBoost

In [6]:
def objective(trial:optuna.Trial, X, y):

    params = {
        # 1. Complexity & Learning Rate
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        
        # 2. Regularization (Prevents Overfitting)
        "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        
        # 3. Sampling (Adds randomness/robustness)
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        
        # 4. Handling Imbalance for Recall
        # scale_pos_weight is (count of negative samples / count of positive samples)
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 1, 10),
        
        "eval_metric": "logloss",
        "random_state": 42
    }

    xgc = xgb.XGBClassifier(**params)
    score = cross_val_score(xgc,X,y,cv=5,scoring='recall')
    recall = score.mean()
    return recall

In [7]:
study = optuna.create_study(direction='maximize')

# 2. Run the optimization
study.optimize(lambda trial: objective(trial, X, y), n_trials=50)
# 3. Results
print(f"Best trial: {study.best_trial.number}")
print(f"Best recall: {study.best_value:.4f}")
print(f"Best parameters: {study.best_params}")

[32m[I 2026-02-26 17:32:53,183][0m A new study created in memory with name: no-name-8105e593-5402-4e31-b612-e4786ed129d1[0m
[32m[I 2026-02-26 17:32:56,224][0m Trial 0 finished with value: 0.27252747252747256 and parameters: {'n_estimators': 539, 'learning_rate': 0.05193693055856013, 'max_depth': 10, 'gamma': 3.7645277245101996e-06, 'lambda': 3.024431833378546e-08, 'alpha': 0.20932813688576962, 'subsample': 0.938010538076838, 'colsample_bytree': 0.5237318150803223, 'scale_pos_weight': 9.689223740534553}. Best is trial 0 with value: 0.27252747252747256.[0m
[32m[I 2026-02-26 17:32:56,708][0m Trial 1 finished with value: 0.28351648351648356 and parameters: {'n_estimators': 157, 'learning_rate': 0.07479152374968022, 'max_depth': 4, 'gamma': 3.1384751763677593e-07, 'lambda': 0.10415401920246103, 'alpha': 0.6445345200881499, 'subsample': 0.680581849554009, 'colsample_bytree': 0.7051848825962785, 'scale_pos_weight': 5.3837435270718155}. Best is trial 1 with value: 0.28351648351648356.

Best trial: 45
Best recall: 0.3407
Best parameters: {'n_estimators': 482, 'learning_rate': 0.025109074247529097, 'max_depth': 3, 'gamma': 0.0008657735871104861, 'lambda': 1.1638133011578406e-06, 'alpha': 0.08735859470886748, 'subsample': 0.5943480161407777, 'colsample_bytree': 0.8819443045289452, 'scale_pos_weight': 9.644668546009207}


In [8]:
xgb_best = xgb.XGBClassifier(**study.best_params)
xgb_best.fit(X,y)

joblib.dump(xgb_best,"../models/xgboost_20260226.joblib")

['../models/xgboost_20260226.joblib']