# Modeling

This notebook's objective is to train models, tune their hyperparameters with Optuna, and save the best models in a folder for later evaluation.

In [2]:
import xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import pandas as pd
import optuna

In [6]:
data = pd.read_csv("../data/train_preprocessed.csv")

# Target vector: the fraud label column.
y = data["fraud_label"]
# Feature matrix: everything except the label and the two id columns.
X = data.drop(columns=["fraud_label", "user_id", "transaction_id"])

For this project we use Optuna for hyperparameter optimization. To run the optimization, Optuna requires an objective function; this function returns the metric that the study will try to minimize or maximize. In this case we maximize it, since we want the highest possible recall in order to get the best results.

## Random Forest

In [None]:
def objective(trial:optuna.Trial, X, y, cv=5, scoring='recall', n_jobs=None) -> float:
    """Optuna objective for a random forest: mean cross-validated score.

    Samples one set of random forest hyperparameters from ``trial``, scores a
    ``RandomForestClassifier`` built from them with ``cross_val_score`` and
    returns the mean of the per-fold scores.

    Parameters
    ----------
    trial : optuna.Trial
        Trial used to sample ``n_estimators``, ``min_samples_split``,
        ``min_samples_leaf`` and ``max_depth``.
    X, y
        Features and target passed unchanged to ``cross_val_score``.
    cv : default 5
        Cross-validation setting forwarded to ``cross_val_score``.
    scoring : default 'recall'
        Scoring metric forwarded to ``cross_val_score``.
    n_jobs : default None
        Parallelism setting forwarded to ``cross_val_score``.

    Returns
    -------
    float
        Mean of the fold scores; the value the study optimizes.
    """
    params = {
        "n_estimators": trial.suggest_int('n_estimators', 100, 1000),
        "min_samples_split": trial.suggest_int('min_samples_split', 2, 10),
        "min_samples_leaf": trial.suggest_int('min_samples_leaf', 1, 5),
        "max_depth": trial.suggest_int('max_depth', 4, 30),
        # Fixed (not tuned): same forest seed for every trial, and class
        # weights set to "balanced".
        "random_state": 42,
        "class_weight": "balanced"
    }

    rf = RandomForestClassifier(**params)
    fold_scores = cross_val_score(rf, X, y, cv=cv, scoring=scoring, n_jobs=n_jobs)
    return fold_scores.mean()

In [8]:
# 1. Create the study. The sampler is seeded so that re-running the notebook
#    explores the same sequence of hyperparameters and gives the same best trial.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# 2. Run the optimization
study.optimize(lambda trial: objective(trial, X, y), n_trials=50)
# 3. Results
print(f"Best trial: {study.best_trial.number}")
print(f"Best recall: {study.best_value:.4f}")
print(f"Best parameters: {study.best_params}")

[32m[I 2026-02-26 12:55:59,289][0m A new study created in memory with name: no-name-e70f0c82-e3a8-4d1b-a4ba-daded672b7f3[0m
[32m[I 2026-02-26 12:56:03,658][0m Trial 0 finished with value: 0.4065934065934066 and parameters: {'n_estimators': 220, 'min_samples_split': 7, 'min_samples_leaf': 3, 'max_depth': 7}. Best is trial 0 with value: 0.4065934065934066.[0m


[0.26373626 0.40659341 0.36263736 0.34065934 0.65934066]


[32m[I 2026-02-26 12:56:30,826][0m Trial 1 finished with value: 0.27252747252747256 and parameters: {'n_estimators': 956, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_depth': 29}. Best is trial 0 with value: 0.4065934065934066.[0m


[0.06593407 0.23076923 0.26373626 0.2967033  0.50549451]


[32m[I 2026-02-26 12:56:35,808][0m Trial 2 finished with value: 0.27252747252747256 and parameters: {'n_estimators': 180, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': 19}. Best is trial 0 with value: 0.4065934065934066.[0m


[0.06593407 0.23076923 0.26373626 0.2967033  0.50549451]


[32m[I 2026-02-26 12:56:54,328][0m Trial 3 finished with value: 0.27472527472527475 and parameters: {'n_estimators': 753, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_depth': 11}. Best is trial 0 with value: 0.4065934065934066.[0m


[0.07692308 0.23076923 0.26373626 0.2967033  0.50549451]


[32m[I 2026-02-26 12:57:09,737][0m Trial 4 finished with value: 0.27252747252747256 and parameters: {'n_estimators': 554, 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_depth': 22}. Best is trial 0 with value: 0.4065934065934066.[0m


[0.06593407 0.23076923 0.26373626 0.2967033  0.50549451]


[32m[I 2026-02-26 12:57:14,798][0m Trial 5 finished with value: 0.43516483516483523 and parameters: {'n_estimators': 276, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 6}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.27472527 0.48351648 0.37362637 0.36263736 0.68131868]


[32m[I 2026-02-26 12:57:37,748][0m Trial 6 finished with value: 0.2813186813186813 and parameters: {'n_estimators': 979, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_depth': 10}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.07692308 0.24175824 0.27472527 0.2967033  0.51648352]


[32m[I 2026-02-26 12:57:50,431][0m Trial 7 finished with value: 0.4175824175824176 and parameters: {'n_estimators': 769, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_depth': 5}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.24175824 0.45054945 0.36263736 0.35164835 0.68131868]


[32m[I 2026-02-26 12:58:04,298][0m Trial 8 finished with value: 0.27252747252747256 and parameters: {'n_estimators': 506, 'min_samples_split': 6, 'min_samples_leaf': 1, 'max_depth': 16}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.06593407 0.23076923 0.26373626 0.2967033  0.50549451]


[32m[I 2026-02-26 12:58:26,197][0m Trial 9 finished with value: 0.27252747252747256 and parameters: {'n_estimators': 812, 'min_samples_split': 7, 'min_samples_leaf': 5, 'max_depth': 30}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.06593407 0.23076923 0.26373626 0.2967033  0.50549451]


[32m[I 2026-02-26 12:58:35,810][0m Trial 10 finished with value: 0.27252747252747256 and parameters: {'n_estimators': 365, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 14}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.06593407 0.23076923 0.26373626 0.2967033  0.50549451]


[32m[I 2026-02-26 12:58:46,691][0m Trial 11 finished with value: 0.4197802197802198 and parameters: {'n_estimators': 644, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 5}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.25274725 0.43956044 0.37362637 0.35164835 0.68131868]


[32m[I 2026-02-26 12:58:54,997][0m Trial 12 finished with value: 0.3692307692307692 and parameters: {'n_estimators': 553, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_depth': 4}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.14285714 0.36263736 0.34065934 0.31868132 0.68131868]


[32m[I 2026-02-26 12:59:02,276][0m Trial 13 finished with value: 0.3142857142857142 and parameters: {'n_estimators': 313, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 9}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.10989011 0.2967033  0.31868132 0.31868132 0.52747253]


[32m[I 2026-02-26 12:59:04,831][0m Trial 14 finished with value: 0.27692307692307694 and parameters: {'n_estimators': 101, 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_depth': 12}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.07692308 0.23076923 0.26373626 0.30769231 0.50549451]


[32m[I 2026-02-26 12:59:14,278][0m Trial 15 finished with value: 0.36043956043956044 and parameters: {'n_estimators': 637, 'min_samples_split': 6, 'min_samples_leaf': 1, 'max_depth': 4}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.14285714 0.32967033 0.34065934 0.30769231 0.68131868]


[32m[I 2026-02-26 12:59:25,029][0m Trial 16 finished with value: 0.27252747252747256 and parameters: {'n_estimators': 388, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_depth': 25}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.06593407 0.23076923 0.26373626 0.2967033  0.50549451]


[32m[I 2026-02-26 12:59:39,084][0m Trial 17 finished with value: 0.34285714285714286 and parameters: {'n_estimators': 669, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 8}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.17582418 0.30769231 0.31868132 0.31868132 0.59340659]


[32m[I 2026-02-26 12:59:51,480][0m Trial 18 finished with value: 0.27252747252747256 and parameters: {'n_estimators': 468, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_depth': 14}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.06593407 0.23076923 0.26373626 0.2967033  0.50549451]


[32m[I 2026-02-26 12:59:57,564][0m Trial 19 finished with value: 0.39560439560439564 and parameters: {'n_estimators': 305, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_depth': 7}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.24175824 0.40659341 0.35164835 0.32967033 0.64835165]


[32m[I 2026-02-26 13:00:15,020][0m Trial 20 finished with value: 0.27252747252747256 and parameters: {'n_estimators': 649, 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_depth': 17}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.06593407 0.23076923 0.26373626 0.2967033  0.50549451]


[32m[I 2026-02-26 13:00:27,258][0m Trial 21 finished with value: 0.35384615384615387 and parameters: {'n_estimators': 804, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_depth': 4}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.12087912 0.31868132 0.34065934 0.30769231 0.68131868]


[32m[I 2026-02-26 13:00:43,216][0m Trial 22 finished with value: 0.41538461538461535 and parameters: {'n_estimators': 853, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_depth': 6}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.28571429 0.42857143 0.35164835 0.34065934 0.67032967]


[32m[I 2026-02-26 13:00:59,915][0m Trial 23 finished with value: 0.41538461538461535 and parameters: {'n_estimators': 894, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_depth': 6}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.28571429 0.42857143 0.35164835 0.34065934 0.67032967]


[32m[I 2026-02-26 13:01:17,458][0m Trial 24 finished with value: 0.27252747252747256 and parameters: {'n_estimators': 698, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_depth': 12}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.06593407 0.23076923 0.26373626 0.2967033  0.50549451]


[32m[I 2026-02-26 13:01:33,867][0m Trial 25 finished with value: 0.3032967032967033 and parameters: {'n_estimators': 737, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_depth': 9}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.12087912 0.25274725 0.30769231 0.31868132 0.51648352]


[32m[I 2026-02-26 13:01:44,750][0m Trial 26 finished with value: 0.4241758241758243 and parameters: {'n_estimators': 598, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 6}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.28571429 0.45054945 0.36263736 0.34065934 0.68131868]


[32m[I 2026-02-26 13:01:57,951][0m Trial 27 finished with value: 0.34285714285714286 and parameters: {'n_estimators': 598, 'min_samples_split': 7, 'min_samples_leaf': 4, 'max_depth': 8}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.17582418 0.30769231 0.31868132 0.31868132 0.59340659]


[32m[I 2026-02-26 13:02:09,742][0m Trial 28 finished with value: 0.27252747252747256 and parameters: {'n_estimators': 434, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 14}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.06593407 0.23076923 0.26373626 0.2967033  0.50549451]


[32m[I 2026-02-26 13:02:13,651][0m Trial 29 finished with value: 0.4087912087912088 and parameters: {'n_estimators': 194, 'min_samples_split': 7, 'min_samples_leaf': 3, 'max_depth': 7}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.27472527 0.40659341 0.36263736 0.34065934 0.65934066]


[32m[I 2026-02-26 13:02:27,602][0m Trial 30 finished with value: 0.27252747252747256 and parameters: {'n_estimators': 507, 'min_samples_split': 7, 'min_samples_leaf': 5, 'max_depth': 21}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.06593407 0.23076923 0.26373626 0.2967033  0.50549451]


[32m[I 2026-02-26 13:02:37,779][0m Trial 31 finished with value: 0.4219780219780219 and parameters: {'n_estimators': 613, 'min_samples_split': 4, 'min_samples_leaf': 3, 'max_depth': 5}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.25274725 0.45054945 0.37362637 0.34065934 0.69230769]


[32m[I 2026-02-26 13:02:49,296][0m Trial 32 finished with value: 0.421978021978022 and parameters: {'n_estimators': 615, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_depth': 6}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.26373626 0.45054945 0.37362637 0.35164835 0.67032967]


[32m[I 2026-02-26 13:02:59,912][0m Trial 33 finished with value: 0.43076923076923085 and parameters: {'n_estimators': 582, 'min_samples_split': 4, 'min_samples_leaf': 3, 'max_depth': 6}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.27472527 0.46153846 0.37362637 0.36263736 0.68131868]


[32m[I 2026-02-26 13:03:13,294][0m Trial 34 finished with value: 0.2813186813186813 and parameters: {'n_estimators': 575, 'min_samples_split': 6, 'min_samples_leaf': 4, 'max_depth': 10}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.07692308 0.23076923 0.28571429 0.30769231 0.50549451]


[32m[I 2026-02-26 13:03:18,443][0m Trial 35 finished with value: 0.4065934065934066 and parameters: {'n_estimators': 260, 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_depth': 7}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.24175824 0.43956044 0.35164835 0.34065934 0.65934066]


[32m[I 2026-02-26 13:03:32,015][0m Trial 36 finished with value: 0.27252747252747256 and parameters: {'n_estimators': 498, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_depth': 27}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.06593407 0.23076923 0.26373626 0.2967033  0.50549451]


[32m[I 2026-02-26 13:03:42,521][0m Trial 37 finished with value: 0.27252747252747256 and parameters: {'n_estimators': 416, 'min_samples_split': 6, 'min_samples_leaf': 3, 'max_depth': 12}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.06593407 0.23076923 0.26373626 0.2967033  0.50549451]


[32m[I 2026-02-26 13:03:59,100][0m Trial 38 finished with value: 0.28571428571428575 and parameters: {'n_estimators': 695, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_depth': 10}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.08791209 0.23076923 0.28571429 0.30769231 0.51648352]


[32m[I 2026-02-26 13:04:08,957][0m Trial 39 finished with value: 0.42637362637362647 and parameters: {'n_estimators': 529, 'min_samples_split': 9, 'min_samples_leaf': 3, 'max_depth': 6}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.26373626 0.46153846 0.36263736 0.36263736 0.68131868]


[32m[I 2026-02-26 13:04:20,326][0m Trial 40 finished with value: 0.34945054945054943 and parameters: {'n_estimators': 525, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 8}. Best is trial 5 with value: 0.43516483516483523.[0m


[0.1978022  0.31868132 0.32967033 0.31868132 0.58241758]


[32m[I 2026-02-26 13:04:22,473][0m Trial 41 finished with value: 0.443956043956044 and parameters: {'n_estimators': 113, 'min_samples_split': 9, 'min_samples_leaf': 3, 'max_depth': 6}. Best is trial 41 with value: 0.443956043956044.[0m


[0.26373626 0.51648352 0.38461538 0.36263736 0.69230769]


[32m[I 2026-02-26 13:04:24,447][0m Trial 42 finished with value: 0.4461538461538462 and parameters: {'n_estimators': 107, 'min_samples_split': 9, 'min_samples_leaf': 3, 'max_depth': 6}. Best is trial 42 with value: 0.4461538461538462.[0m


[0.27472527 0.48351648 0.41758242 0.36263736 0.69230769]


[32m[I 2026-02-26 13:04:27,031][0m Trial 43 finished with value: 0.3186813186813187 and parameters: {'n_estimators': 114, 'min_samples_split': 9, 'min_samples_leaf': 3, 'max_depth': 9}. Best is trial 42 with value: 0.4461538461538462.[0m


[0.14285714 0.28571429 0.32967033 0.31868132 0.51648352]


[32m[I 2026-02-26 13:04:29,537][0m Trial 44 finished with value: 0.4197802197802198 and parameters: {'n_estimators': 165, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_depth': 4}. Best is trial 42 with value: 0.4461538461538462.[0m


[0.14285714 0.48351648 0.42857143 0.36263736 0.68131868]


[32m[I 2026-02-26 13:04:35,470][0m Trial 45 finished with value: 0.27692307692307694 and parameters: {'n_estimators': 241, 'min_samples_split': 9, 'min_samples_leaf': 3, 'max_depth': 11}. Best is trial 42 with value: 0.4461538461538462.[0m


[0.07692308 0.24175824 0.26373626 0.2967033  0.50549451]


[32m[I 2026-02-26 13:04:37,947][0m Trial 46 finished with value: 0.45274725274725275 and parameters: {'n_estimators': 146, 'min_samples_split': 9, 'min_samples_leaf': 3, 'max_depth': 5}. Best is trial 46 with value: 0.45274725274725275.[0m


[0.20879121 0.53846154 0.42857143 0.37362637 0.71428571]


[32m[I 2026-02-26 13:04:42,364][0m Trial 47 finished with value: 0.27252747252747256 and parameters: {'n_estimators': 160, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_depth': 19}. Best is trial 46 with value: 0.45274725274725275.[0m


[0.06593407 0.23076923 0.26373626 0.2967033  0.50549451]


[32m[I 2026-02-26 13:04:44,584][0m Trial 48 finished with value: 0.44835164835164837 and parameters: {'n_estimators': 132, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_depth': 5}. Best is trial 46 with value: 0.45274725274725275.[0m


[0.23076923 0.51648352 0.42857143 0.37362637 0.69230769]


[32m[I 2026-02-26 13:04:46,626][0m Trial 49 finished with value: 0.43296703296703304 and parameters: {'n_estimators': 135, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_depth': 4}. Best is trial 46 with value: 0.45274725274725275.[0m


[0.15384615 0.51648352 0.45054945 0.36263736 0.68131868]
Best trial: 46
Best recall: 0.4527
Best parameters: {'n_estimators': 146, 'min_samples_split': 9, 'min_samples_leaf': 3, 'max_depth': 5}


## XGBoost