# CatBoost

In [None]:
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np

def preprocess_for_catboost(input_train_df, input_test_df, target_column='isFraud'):
    """Impute missing values and align train/test features for CatBoost.

    Both inputs are copied first, so the caller's frames are never modified.
    Only columns that are ``int64``/``float64`` (numeric) or ``object``
    (categorical) in the training frame *and* are also present in the test
    frame are kept; every other column is dropped from both outputs.

    Imputation statistics come from the training frame only:

    * numeric columns are cast to ``float64`` and their missing values are
      replaced with the training median. A column with no observed value in
      train has no median and keeps its NaNs instead of raising.
    * categorical columns have their missing values replaced with the
      literal string ``'missing'``.

    Either column group may be empty; that group is then simply skipped.

    Parameters
    ----------
    input_train_df : pandas.DataFrame
        Training data, optionally containing ``target_column``.
    input_test_df : pandas.DataFrame
        Test data to transform with the training statistics.
    target_column : str, default 'isFraud'
        Name of the label column to split off the training frame.

    Returns
    -------
    tuple
        ``(train_features, test_features, train_labels,
        categorical_feature_indices)`` where both feature frames have the
        numeric columns followed by the categorical ones, ``train_labels``
        is the target Series (``None`` when ``target_column`` is absent) and
        ``categorical_feature_indices`` holds the positions of the
        categorical columns inside the feature frames.
    """
    # Work on local copies to avoid modifying the caller's frames
    train_df = input_train_df.copy()
    test_df = input_test_df.copy()

    # Split the target off the training features
    if target_column in train_df.columns:
        train_df_labels = train_df[target_column]
        train_df = train_df.drop(columns=[target_column])
    else:
        train_df_labels = None

    # Identify numeric and categorical columns from the training dtypes
    numeric_columns = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_columns = train_df.select_dtypes(include=['object']).columns.tolist()

    # Keep only the columns that exist in both train and test
    numeric_columns = [col for col in numeric_columns if col in test_df.columns]
    categorical_columns = [col for col in categorical_columns if col in test_df.columns]

    # Median-impute numeric columns using statistics fitted on train only.
    # The guard keeps a frame without numeric columns from failing.
    if numeric_columns:
        train_numeric = train_df[numeric_columns].astype('float64')
        test_numeric = test_df[numeric_columns].astype('float64')
        # Columns that are entirely NaN in train have no median; dropping
        # them here leaves those columns untouched rather than erroring.
        train_medians = train_numeric.median().dropna()
        train_df[numeric_columns] = train_numeric.fillna(train_medians)
        test_df[numeric_columns] = test_numeric.fillna(train_medians)

    # Fill missing categories with a sentinel string. Casting to object first
    # lets a test column of another dtype (e.g. all-NaN floats) hold it.
    if categorical_columns:
        train_df[categorical_columns] = train_df[categorical_columns].astype(object).fillna('missing')
        test_df[categorical_columns] = test_df[categorical_columns].astype(object).fillna('missing')

    # Numeric columns first, categorical columns last
    all_columns = numeric_columns + categorical_columns
    train_df = train_df[all_columns]
    test_df = test_df[all_columns]

    # Positions of the categorical columns inside the returned frames
    categorical_feature_indices = [all_columns.index(col) for col in categorical_columns]
    return train_df, test_df, train_df_labels, categorical_feature_indices


In [None]:
# Impute and align the train/test features; also keeps the target series and
# the positional indices of the categorical columns for later cells.
filled_train, filled_test, train_y, cat_features = preprocess_for_catboost(train_processed, test_processed)

# Quick sanity check of the resulting shapes and categorical positions.
print("Preprocessed Train Shape:", filled_train.shape)
print("Preprocessed Test Shape:", filled_test.shape)
print("Categorical Features:", cat_features)


Preprocessed Train Shape: (590540, 27)
Preprocessed Test Shape: (506691, 27)
Categorical Features: [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]


In [None]:
# Take the target straight from train_processed; this rebinds the train_y
# name that the preprocessing call above already assigned.
train_y = train_processed['isFraud']

In [None]:
# Class balance of the target: number of rows per isFraud value.
train_y.value_counts()

Unnamed: 0_level_0,count
isFraud,Unnamed: 1_level_1
0,569877
1,20663


In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
import optuna
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
# Bind the target series to the name y_train (same object as train_y).
y_train = train_y

In [None]:
import optuna
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Target variable: objective() below reads y_train as a notebook-level name.
y_train = train_y

# Define the objective function for Optuna
def objective(trial):
    """Optuna objective: mean 5-fold validation AUC of a CatBoost classifier.

    Reads the notebook-level ``filled_train``, ``y_train`` and
    ``cat_features``. For every stratified fold a fresh classifier is fitted
    with the trial's hyperparameters, using the held-out fold as the
    early-stopping evaluation set, and scored with ROC AUC on that same fold.
    The running mean AUC is printed after each fold.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean ROC AUC over the five folds (the study maximizes this value).
    """
    # Hyperparameters sampled by Optuna for this trial
    search_space = {
        "iterations": trial.suggest_int("iterations", 100, 1000),
        "depth": trial.suggest_int("depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.5, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10.0, log=True),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
        "random_strength": trial.suggest_float("random_strength", 0.0, 10.0),
        "border_count": trial.suggest_int("border_count", 32, 255),
    }
    # Settings shared by every trial
    fixed_settings = {
        "loss_function": "Logloss",
        "eval_metric": "AUC",
        "task_type": "GPU",  # training runs on GPU; use "CPU" when none is available
        "verbose": False,
        "random_state": 42,
    }
    params = {**search_space, **fixed_settings}

    # Same shuffled stratified split for every trial
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_aucs = []

    for fit_rows, holdout_rows in splitter.split(filled_train, y_train):
        holdout_features = filled_train.iloc[holdout_rows]
        holdout_labels = y_train.iloc[holdout_rows]

        # Wrap both partitions in CatBoost Pools with the categorical indices
        fit_pool = Pool(
            filled_train.iloc[fit_rows],
            y_train.iloc[fit_rows],
            cat_features=cat_features,
        )
        holdout_pool = Pool(holdout_features, holdout_labels, cat_features=cat_features)

        # Fit with early stopping monitored on the held-out fold
        classifier = CatBoostClassifier(**params)
        classifier.fit(
            fit_pool,
            eval_set=holdout_pool,
            early_stopping_rounds=50,
            verbose=False,
        )

        # Column 1 of predict_proba holds the probability of class 1
        positive_proba = classifier.predict_proba(holdout_features)[:, 1]

        fold_aucs.append(roc_auc_score(holdout_labels, positive_proba))
        # Running mean AUC over the folds finished so far
        print(np.mean(fold_aucs))

    # Mean AUC across folds (maximized by the study)
    return np.mean(fold_aucs)

# Run Optuna optimization. The sampler is seeded explicitly so the search no
# longer draws from an unseeded random generator, matching the fixed seed
# already used for the CV split and for CatBoost.
study = optuna.create_study(
    direction="maximize",
    study_name="CatBoost AUC Tuning",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Print best parameters
print("Best parameters:", study.best_trial.params)



[I 2024-12-08 03:55:40,504] A new study created in memory with name: CatBoost AUC Tuning
Default metric period is 5 because AUC is/are not implemented for GPU


0.9043881077084825


Default metric period is 5 because AUC is/are not implemented for GPU


0.9058401088677082


Default metric period is 5 because AUC is/are not implemented for GPU


0.9065322923166326


Default metric period is 5 because AUC is/are not implemented for GPU


0.9062892562641781


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 03:56:23,648] Trial 0 finished with value: 0.9061443094706437 and parameters: {'iterations': 167, 'depth': 3, 'learning_rate': 0.388207897446378, 'l2_leaf_reg': 6.8755110140312885, 'bagging_temperature': 0.6811714714751179, 'random_strength': 4.14857837887677, 'border_count': 62}. Best is trial 0 with value: 0.9061443094706437.


0.9061443094706437


Default metric period is 5 because AUC is/are not implemented for GPU


0.9459310571132278


Default metric period is 5 because AUC is/are not implemented for GPU


0.9479764592557689


Default metric period is 5 because AUC is/are not implemented for GPU


0.9483294380517303


Default metric period is 5 because AUC is/are not implemented for GPU


0.9492153043123704


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 03:58:51,984] Trial 1 finished with value: 0.9493309115500008 and parameters: {'iterations': 888, 'depth': 3, 'learning_rate': 0.08952970265043257, 'l2_leaf_reg': 0.06897218309640818, 'bagging_temperature': 0.19799868635800066, 'random_strength': 7.270068093909751, 'border_count': 41}. Best is trial 1 with value: 0.9493309115500008.


0.9493309115500008


Default metric period is 5 because AUC is/are not implemented for GPU


0.8825335509060143


Default metric period is 5 because AUC is/are not implemented for GPU


0.8364953371409996


Default metric period is 5 because AUC is/are not implemented for GPU


0.8186578783167758


Default metric period is 5 because AUC is/are not implemented for GPU


0.8347115305299467


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 04:01:06,385] Trial 2 finished with value: 0.8250540333705286 and parameters: {'iterations': 739, 'depth': 7, 'learning_rate': 0.00227705252315925, 'l2_leaf_reg': 0.018520695567049904, 'bagging_temperature': 0.7454132967256856, 'random_strength': 1.953128581895891, 'border_count': 87}. Best is trial 1 with value: 0.9493309115500008.


0.8250540333705286


Default metric period is 5 because AUC is/are not implemented for GPU


0.9631974307561294


Default metric period is 5 because AUC is/are not implemented for GPU


0.9634544289516361


Default metric period is 5 because AUC is/are not implemented for GPU


0.9640038129359566


Default metric period is 5 because AUC is/are not implemented for GPU


0.9638596813070467


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 04:06:20,095] Trial 3 finished with value: 0.9643826247121673 and parameters: {'iterations': 747, 'depth': 8, 'learning_rate': 0.09213627980656504, 'l2_leaf_reg': 0.05071652563682082, 'bagging_temperature': 0.5691809747083104, 'random_strength': 9.884245278126327, 'border_count': 123}. Best is trial 3 with value: 0.9643826247121673.


0.9643826247121673


Default metric period is 5 because AUC is/are not implemented for GPU


0.9151714012532045


Default metric period is 5 because AUC is/are not implemented for GPU


0.9158369294989568


Default metric period is 5 because AUC is/are not implemented for GPU


0.916591289504571


Default metric period is 5 because AUC is/are not implemented for GPU


0.916445708728161


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 04:07:28,780] Trial 4 finished with value: 0.9167660330432058 and parameters: {'iterations': 128, 'depth': 10, 'learning_rate': 0.07297676858631792, 'l2_leaf_reg': 1.0947833676506324, 'bagging_temperature': 0.4198034946806124, 'random_strength': 3.730965745235487, 'border_count': 229}. Best is trial 3 with value: 0.9643826247121673.


0.9167660330432058


Default metric period is 5 because AUC is/are not implemented for GPU


0.8602889740259512


Default metric period is 5 because AUC is/are not implemented for GPU


0.8604934723566742


Default metric period is 5 because AUC is/are not implemented for GPU


0.8604244845357946


Default metric period is 5 because AUC is/are not implemented for GPU


0.8422183384951822


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 04:09:32,453] Trial 5 finished with value: 0.8455806452854173 and parameters: {'iterations': 264, 'depth': 9, 'learning_rate': 0.004766553518675966, 'l2_leaf_reg': 1.200754441712152, 'bagging_temperature': 0.8650978294747176, 'random_strength': 7.993374729135444, 'border_count': 192}. Best is trial 3 with value: 0.9643826247121673.


0.8455806452854173


Default metric period is 5 because AUC is/are not implemented for GPU


0.9001922591030016


Default metric period is 5 because AUC is/are not implemented for GPU


0.9031048032198962


Default metric period is 5 because AUC is/are not implemented for GPU


0.9038233449946903


Default metric period is 5 because AUC is/are not implemented for GPU


0.9032150814236677


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 04:10:15,637] Trial 6 finished with value: 0.9031948741764706 and parameters: {'iterations': 179, 'depth': 3, 'learning_rate': 0.21311404990528435, 'l2_leaf_reg': 0.006058147112179533, 'bagging_temperature': 0.22477234367166088, 'random_strength': 6.615301796832668, 'border_count': 51}. Best is trial 3 with value: 0.9643826247121673.


0.9031948741764706


Default metric period is 5 because AUC is/are not implemented for GPU


0.9007457311201061


Default metric period is 5 because AUC is/are not implemented for GPU


0.9020534193617026


Default metric period is 5 because AUC is/are not implemented for GPU


0.9027892399373857


Default metric period is 5 because AUC is/are not implemented for GPU


0.9027323451306177


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 04:11:07,626] Trial 7 finished with value: 0.9028581040387319 and parameters: {'iterations': 149, 'depth': 8, 'learning_rate': 0.05758573900350037, 'l2_leaf_reg': 0.0021469889053081833, 'bagging_temperature': 0.5867001202345027, 'random_strength': 7.705867962924344, 'border_count': 63}. Best is trial 3 with value: 0.9643826247121673.


0.9028581040387319


Default metric period is 5 because AUC is/are not implemented for GPU


0.9204600203786045


Default metric period is 5 because AUC is/are not implemented for GPU


0.9230862446432397


Default metric period is 5 because AUC is/are not implemented for GPU


0.9235628106730305


Default metric period is 5 because AUC is/are not implemented for GPU


0.9232161033998263


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 04:15:07,254] Trial 8 finished with value: 0.9238886994809047 and parameters: {'iterations': 807, 'depth': 6, 'learning_rate': 0.00835769476128691, 'l2_leaf_reg': 0.005984861709875464, 'bagging_temperature': 0.9854912663903324, 'random_strength': 2.694375478876866, 'border_count': 220}. Best is trial 3 with value: 0.9643826247121673.


0.9238886994809047


Default metric period is 5 because AUC is/are not implemented for GPU


0.8734425378084386


Default metric period is 5 because AUC is/are not implemented for GPU


0.8355022722510967


Default metric period is 5 because AUC is/are not implemented for GPU


0.8488982543078324


Default metric period is 5 because AUC is/are not implemented for GPU


0.8341560606288494


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 04:17:38,270] Trial 9 finished with value: 0.8422304723088612 and parameters: {'iterations': 323, 'depth': 10, 'learning_rate': 0.004377635459175478, 'l2_leaf_reg': 0.028674776945044188, 'bagging_temperature': 0.7161535402848802, 'random_strength': 6.910998019919891, 'border_count': 148}. Best is trial 3 with value: 0.9643826247121673.


0.8422304723088612


Default metric period is 5 because AUC is/are not implemented for GPU


0.9263249059294832


Default metric period is 5 because AUC is/are not implemented for GPU


0.9272100525137303


Default metric period is 5 because AUC is/are not implemented for GPU


0.9283765057276545


Default metric period is 5 because AUC is/are not implemented for GPU


0.928386033633494


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 04:20:21,001] Trial 10 finished with value: 0.9284708914733801 and parameters: {'iterations': 577, 'depth': 5, 'learning_rate': 0.0197223080431357, 'l2_leaf_reg': 0.29549669750538266, 'bagging_temperature': 0.41926186612586414, 'random_strength': 9.65679398572635, 'border_count': 122}. Best is trial 3 with value: 0.9643826247121673.


0.9284708914733801


Default metric period is 5 because AUC is/are not implemented for GPU


0.9589531384589992


Default metric period is 5 because AUC is/are not implemented for GPU


0.9592818169469417


Default metric period is 5 because AUC is/are not implemented for GPU


0.9600879680117966


Default metric period is 5 because AUC is/are not implemented for GPU


0.961029730683358


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 04:24:29,840] Trial 11 finished with value: 0.9616705196163675 and parameters: {'iterations': 960, 'depth': 5, 'learning_rate': 0.09166193095927914, 'l2_leaf_reg': 0.1274341136867327, 'bagging_temperature': 0.09855504626086153, 'random_strength': 9.782044461925395, 'border_count': 126}. Best is trial 3 with value: 0.9643826247121673.


0.9616705196163675


Default metric period is 5 because AUC is/are not implemented for GPU


0.9474469436628734


Default metric period is 5 because AUC is/are not implemented for GPU


0.9487594785031763


Default metric period is 5 because AUC is/are not implemented for GPU


0.9498254413669341


Default metric period is 5 because AUC is/are not implemented for GPU


0.950802703705008


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 04:28:42,506] Trial 12 finished with value: 0.9512035510968117 and parameters: {'iterations': 983, 'depth': 5, 'learning_rate': 0.032147382331346515, 'l2_leaf_reg': 0.16272189137426593, 'bagging_temperature': 0.01800425739916489, 'random_strength': 9.558731321808336, 'border_count': 142}. Best is trial 3 with value: 0.9643826247121673.


0.9512035510968117


Default metric period is 5 because AUC is/are not implemented for GPU


0.9645505830663149


Default metric period is 5 because AUC is/are not implemented for GPU


0.9656516044825862


Default metric period is 5 because AUC is/are not implemented for GPU


0.9655821628012564


Default metric period is 5 because AUC is/are not implemented for GPU


0.9661759523048707


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 04:32:53,957] Trial 13 finished with value: 0.9666334327197699 and parameters: {'iterations': 688, 'depth': 7, 'learning_rate': 0.1750868232685585, 'l2_leaf_reg': 0.43647094795731994, 'bagging_temperature': 0.2848859876419809, 'random_strength': 9.76236909704992, 'border_count': 103}. Best is trial 13 with value: 0.9666334327197699.


0.9666334327197699


Default metric period is 5 because AUC is/are not implemented for GPU


0.9659320484311127


Default metric period is 5 because AUC is/are not implemented for GPU


0.9670886931937438


Default metric period is 5 because AUC is/are not implemented for GPU


0.9668473464561168


Default metric period is 5 because AUC is/are not implemented for GPU


0.9675495291784262


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 04:37:10,180] Trial 14 finished with value: 0.9677243397955267 and parameters: {'iterations': 585, 'depth': 8, 'learning_rate': 0.2081565236546131, 'l2_leaf_reg': 0.670749557238687, 'bagging_temperature': 0.322699760617324, 'random_strength': 0.7678906273398205, 'border_count': 96}. Best is trial 14 with value: 0.9677243397955267.


0.9677243397955267


Default metric period is 5 because AUC is/are not implemented for GPU


0.962068706224119


Default metric period is 5 because AUC is/are not implemented for GPU


0.9632329494767704


Default metric period is 5 because AUC is/are not implemented for GPU


0.9634455430233745


Default metric period is 5 because AUC is/are not implemented for GPU


0.9638576594485491


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 04:40:06,689] Trial 15 finished with value: 0.963893531647088 and parameters: {'iterations': 539, 'depth': 7, 'learning_rate': 0.4839107435498328, 'l2_leaf_reg': 0.8157588692206368, 'bagging_temperature': 0.3208508253197022, 'random_strength': 0.605813801146486, 'border_count': 86}. Best is trial 14 with value: 0.9677243397955267.


0.963893531647088


Default metric period is 5 because AUC is/are not implemented for GPU


0.9645164286127799


Default metric period is 5 because AUC is/are not implemented for GPU


0.9649603935104356


Default metric period is 5 because AUC is/are not implemented for GPU


0.9656391724371413


Default metric period is 5 because AUC is/are not implemented for GPU


0.9665528982610334


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 04:44:09,973] Trial 16 finished with value: 0.9667685588378173 and parameters: {'iterations': 557, 'depth': 8, 'learning_rate': 0.18442253456397656, 'l2_leaf_reg': 5.605865998182563, 'bagging_temperature': 0.3264423854517234, 'random_strength': 0.2652858221830474, 'border_count': 93}. Best is trial 14 with value: 0.9677243397955267.


0.9667685588378173


Default metric period is 5 because AUC is/are not implemented for GPU


0.9460625703388515


Default metric period is 5 because AUC is/are not implemented for GPU


0.947792783250814


Default metric period is 5 because AUC is/are not implemented for GPU


0.9487742090058323


Default metric period is 5 because AUC is/are not implemented for GPU


0.949713260935796


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 04:48:39,725] Trial 17 finished with value: 0.9501757571094009 and parameters: {'iterations': 518, 'depth': 9, 'learning_rate': 0.022281112932095525, 'l2_leaf_reg': 4.585888630803957, 'bagging_temperature': 0.45312829914798763, 'random_strength': 0.6204471698594464, 'border_count': 168}. Best is trial 14 with value: 0.9677243397955267.


0.9501757571094009


Default metric period is 5 because AUC is/are not implemented for GPU


0.9624928690766983


Default metric period is 5 because AUC is/are not implemented for GPU


0.9637435920002452


Default metric period is 5 because AUC is/are not implemented for GPU


0.9639426592131382


Default metric period is 5 because AUC is/are not implemented for GPU


0.964363316562821


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 04:52:01,038] Trial 18 finished with value: 0.9648417186987384 and parameters: {'iterations': 443, 'depth': 8, 'learning_rate': 0.15671533398494092, 'l2_leaf_reg': 2.2330399015326625, 'bagging_temperature': 0.3542177985662058, 'random_strength': 1.7805336232633975, 'border_count': 89}. Best is trial 14 with value: 0.9677243397955267.


0.9648417186987384


Default metric period is 5 because AUC is/are not implemented for GPU


0.9675364414111127


Default metric period is 5 because AUC is/are not implemented for GPU


0.9675287075560683


Default metric period is 5 because AUC is/are not implemented for GPU


0.9679446062012325


Default metric period is 5 because AUC is/are not implemented for GPU


0.9683391745262138


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 04:57:22,599] Trial 19 finished with value: 0.9685769958537109 and parameters: {'iterations': 636, 'depth': 9, 'learning_rate': 0.2674450435488523, 'l2_leaf_reg': 9.36926182619007, 'bagging_temperature': 0.16164736755497494, 'random_strength': 0.30009043880645453, 'border_count': 252}. Best is trial 19 with value: 0.9685769958537109.


0.9685769958537109


Default metric period is 5 because AUC is/are not implemented for GPU


0.9660260713843325


Default metric period is 5 because AUC is/are not implemented for GPU


0.9677027673358791


Default metric period is 5 because AUC is/are not implemented for GPU


0.9670512951492686


Default metric period is 5 because AUC is/are not implemented for GPU


0.967431932577266


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 05:02:06,574] Trial 20 finished with value: 0.967427705158023 and parameters: {'iterations': 635, 'depth': 9, 'learning_rate': 0.3168923453127148, 'l2_leaf_reg': 2.3081729432812828, 'bagging_temperature': 0.12911460746834041, 'random_strength': 5.255904559243601, 'border_count': 249}. Best is trial 19 with value: 0.9685769958537109.


0.967427705158023


Default metric period is 5 because AUC is/are not implemented for GPU


0.9647566457920422


Default metric period is 5 because AUC is/are not implemented for GPU


0.9667969943070164


Default metric period is 5 because AUC is/are not implemented for GPU


0.9665608152137249


Default metric period is 5 because AUC is/are not implemented for GPU


0.9667464919654072


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 05:06:13,618] Trial 21 finished with value: 0.9670198370784664 and parameters: {'iterations': 646, 'depth': 9, 'learning_rate': 0.3519924533202363, 'l2_leaf_reg': 2.4796666841970416, 'bagging_temperature': 0.11122743841766525, 'random_strength': 5.324597531019798, 'border_count': 253}. Best is trial 19 with value: 0.9685769958537109.


0.9670198370784664


Default metric period is 5 because AUC is/are not implemented for GPU


0.964393167876038


Default metric period is 5 because AUC is/are not implemented for GPU


0.965638157161838


Default metric period is 5 because AUC is/are not implemented for GPU


0.9661871883116572


Default metric period is 5 because AUC is/are not implemented for GPU


0.9670499498731204


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 05:11:04,719] Trial 22 finished with value: 0.9675337730206091 and parameters: {'iterations': 437, 'depth': 10, 'learning_rate': 0.29315375841185765, 'l2_leaf_reg': 9.812570772973977, 'bagging_temperature': 0.15009625565709847, 'random_strength': 5.5628335976061045, 'border_count': 252}. Best is trial 19 with value: 0.9685769958537109.


0.9675337730206091


Default metric period is 5 because AUC is/are not implemented for GPU


0.9551671114453468


Default metric period is 5 because AUC is/are not implemented for GPU


0.9568185259879781


Default metric period is 5 because AUC is/are not implemented for GPU


0.9571396688779092


Default metric period is 5 because AUC is/are not implemented for GPU


0.9576672646643754


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 05:15:57,209] Trial 23 finished with value: 0.9582463783878727 and parameters: {'iterations': 438, 'depth': 10, 'learning_rate': 0.04339126505704614, 'l2_leaf_reg': 9.848644483324998, 'bagging_temperature': 0.01743222146111012, 'random_strength': 1.4992841960381087, 'border_count': 203}. Best is trial 19 with value: 0.9685769958537109.


0.9582463783878727


Default metric period is 5 because AUC is/are not implemented for GPU


0.7958189213642641


Default metric period is 5 because AUC is/are not implemented for GPU


0.7954365650704067


Default metric period is 5 because AUC is/are not implemented for GPU


0.7945150932469535


Default metric period is 5 because AUC is/are not implemented for GPU


0.7943208140563562


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 05:17:50,946] Trial 24 finished with value: 0.7941264927462525 and parameters: {'iterations': 441, 'depth': 10, 'learning_rate': 0.0011088646584286252, 'l2_leaf_reg': 0.490463111955364, 'bagging_temperature': 0.20900216334661165, 'random_strength': 2.988729682646596, 'border_count': 230}. Best is trial 19 with value: 0.9685769958537109.


0.7941264927462525


Default metric period is 5 because AUC is/are not implemented for GPU


0.9642777169049228


Default metric period is 5 because AUC is/are not implemented for GPU


0.9645121749659633


Default metric period is 5 because AUC is/are not implemented for GPU


0.9653513817018441


Default metric period is 5 because AUC is/are not implemented for GPU


0.9660108554001632


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 05:21:10,913] Trial 25 finished with value: 0.9663707752847011 and parameters: {'iterations': 357, 'depth': 9, 'learning_rate': 0.26223471606144744, 'l2_leaf_reg': 3.487930981106902, 'bagging_temperature': 0.2565059146210759, 'random_strength': 6.016205494050851, 'border_count': 180}. Best is trial 19 with value: 0.9685769958537109.


0.9663707752847011


Default metric period is 5 because AUC is/are not implemented for GPU


0.9651794879066609


Default metric period is 5 because AUC is/are not implemented for GPU


0.9656028088418744


Default metric period is 5 because AUC is/are not implemented for GPU


0.96593725216833


Default metric period is 5 because AUC is/are not implemented for GPU


0.966438488484828


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 05:26:28,612] Trial 26 finished with value: 0.966899650842292 and parameters: {'iterations': 478, 'depth': 10, 'learning_rate': 0.14970539507715985, 'l2_leaf_reg': 9.03987317921792, 'bagging_temperature': 0.09900589336386587, 'random_strength': 4.263842269088166, 'border_count': 206}. Best is trial 19 with value: 0.9685769958537109.


0.966899650842292


Default metric period is 5 because AUC is/are not implemented for GPU


0.9610424492994604


Default metric period is 5 because AUC is/are not implemented for GPU


0.9616936686658988


Default metric period is 5 because AUC is/are not implemented for GPU


0.9617984561656391


Default metric period is 5 because AUC is/are not implemented for GPU


0.9622095690283672


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 05:29:25,720] Trial 27 finished with value: 0.9626240937544754 and parameters: {'iterations': 369, 'depth': 8, 'learning_rate': 0.12401831886888785, 'l2_leaf_reg': 1.065321505898603, 'bagging_temperature': 0.1694822633519732, 'random_strength': 1.0723440486286075, 'border_count': 241}. Best is trial 19 with value: 0.9685769958537109.


0.9626240937544754


Default metric period is 5 because AUC is/are not implemented for GPU


0.9382283339010383


Default metric period is 5 because AUC is/are not implemented for GPU


0.9397844397881425


Default metric period is 5 because AUC is/are not implemented for GPU


0.940604311472408


Default metric period is 5 because AUC is/are not implemented for GPU


0.9411539276314353


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 05:34:24,034] Trial 28 finished with value: 0.9415070398541919 and parameters: {'iterations': 611, 'depth': 9, 'learning_rate': 0.013064003374173083, 'l2_leaf_reg': 0.25109529470846026, 'bagging_temperature': 0.5227651515150585, 'random_strength': 2.6610600811210454, 'border_count': 163}. Best is trial 19 with value: 0.9685769958537109.


0.9415070398541919


Default metric period is 5 because AUC is/are not implemented for GPU


0.9643035307495996


Default metric period is 5 because AUC is/are not implemented for GPU


0.9654733032653535


Default metric period is 5 because AUC is/are not implemented for GPU


0.9660531142736456


Default metric period is 5 because AUC is/are not implemented for GPU


0.9667824432742382


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 05:38:58,285] Trial 29 finished with value: 0.9670412444768589 and parameters: {'iterations': 798, 'depth': 7, 'learning_rate': 0.24882567368203606, 'l2_leaf_reg': 4.832213378188166, 'bagging_temperature': 0.37250964332596237, 'random_strength': 3.7557893496331825, 'border_count': 65}. Best is trial 19 with value: 0.9685769958537109.


0.9670412444768589


Default metric period is 5 because AUC is/are not implemented for GPU


0.9604648971716742


Default metric period is 5 because AUC is/are not implemented for GPU


0.959221684618171


Default metric period is 5 because AUC is/are not implemented for GPU


0.9603098146071414


Default metric period is 5 because AUC is/are not implemented for GPU


0.9603628781356203


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 05:40:39,369] Trial 30 finished with value: 0.9609694026227229 and parameters: {'iterations': 239, 'depth': 6, 'learning_rate': 0.47531392422735114, 'l2_leaf_reg': 1.778246631468421, 'bagging_temperature': 0.055482805006033586, 'random_strength': 4.427528709790191, 'border_count': 214}. Best is trial 19 with value: 0.9685769958537109.


0.9609694026227229


Default metric period is 5 because AUC is/are not implemented for GPU


0.9667043648172802


Default metric period is 5 because AUC is/are not implemented for GPU


0.9675539188936771


Default metric period is 5 because AUC is/are not implemented for GPU


0.9677044779685008


Default metric period is 5 because AUC is/are not implemented for GPU


0.9679781417623963


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 05:46:00,891] Trial 31 finished with value: 0.9684211734558232 and parameters: {'iterations': 640, 'depth': 9, 'learning_rate': 0.2823245662700379, 'l2_leaf_reg': 3.2391904743917945, 'bagging_temperature': 0.15656642882409638, 'random_strength': 5.075198987349091, 'border_count': 255}. Best is trial 19 with value: 0.9685769958537109.


0.9684211734558232


Default metric period is 5 because AUC is/are not implemented for GPU


0.9662883132492832


Default metric period is 5 because AUC is/are not implemented for GPU


0.9673694954614518


Default metric period is 5 because AUC is/are not implemented for GPU


0.9671750011007864


Default metric period is 5 because AUC is/are not implemented for GPU


0.967750347679607


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 05:51:17,711] Trial 32 finished with value: 0.9681660742305531 and parameters: {'iterations': 700, 'depth': 9, 'learning_rate': 0.3112295039427468, 'l2_leaf_reg': 6.519651464825368, 'bagging_temperature': 0.17113808001770323, 'random_strength': 5.811368904669851, 'border_count': 236}. Best is trial 19 with value: 0.9685769958537109.


0.9681660742305531


Default metric period is 5 because AUC is/are not implemented for GPU


0.9642696470261126


Default metric period is 5 because AUC is/are not implemented for GPU


0.9653765119646799


Default metric period is 5 because AUC is/are not implemented for GPU


0.9658927770193437


Default metric period is 5 because AUC is/are not implemented for GPU


0.9664213647081266


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 05:56:16,042] Trial 33 finished with value: 0.9668395542039037 and parameters: {'iterations': 701, 'depth': 8, 'learning_rate': 0.1238352728018746, 'l2_leaf_reg': 3.6177403065820184, 'bagging_temperature': 0.24945437108538615, 'random_strength': 0.14493801997813927, 'border_count': 235}. Best is trial 19 with value: 0.9685769958537109.


0.9668395542039037


Default metric period is 5 because AUC is/are not implemented for GPU


0.9627714481729515


Default metric period is 5 because AUC is/are not implemented for GPU


0.9631243718638207


Default metric period is 5 because AUC is/are not implemented for GPU


0.9639261131257569


Default metric period is 5 because AUC is/are not implemented for GPU


0.9647508773406824


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 05:59:53,922] Trial 34 finished with value: 0.964974694050909 and parameters: {'iterations': 790, 'depth': 9, 'learning_rate': 0.37099351546443227, 'l2_leaf_reg': 0.5995308951537399, 'bagging_temperature': 0.16695029558427538, 'random_strength': 6.235209391967109, 'border_count': 32}. Best is trial 19 with value: 0.9685769958537109.


0.964974694050909


Default metric period is 5 because AUC is/are not implemented for GPU


0.960853089025179


Default metric period is 5 because AUC is/are not implemented for GPU


0.9623816351242167


Default metric period is 5 because AUC is/are not implemented for GPU


0.9625586174035851


Default metric period is 5 because AUC is/are not implemented for GPU


0.9628467529930821


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 06:05:43,354] Trial 35 finished with value: 0.9632210697624426 and parameters: {'iterations': 845, 'depth': 8, 'learning_rate': 0.06025843974333924, 'l2_leaf_reg': 6.00478863504632, 'bagging_temperature': 0.21260186354464777, 'random_strength': 4.743176541439125, 'border_count': 240}. Best is trial 19 with value: 0.9685769958537109.


0.9632210697624426


Default metric period is 5 because AUC is/are not implemented for GPU


0.9656657137648449


Default metric period is 5 because AUC is/are not implemented for GPU


0.9664365193711746


Default metric period is 5 because AUC is/are not implemented for GPU


0.9669864538447676


Default metric period is 5 because AUC is/are not implemented for GPU


0.9675808958794755


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 06:11:48,211] Trial 36 finished with value: 0.9680483029309224 and parameters: {'iterations': 714, 'depth': 9, 'learning_rate': 0.1130717933813104, 'l2_leaf_reg': 1.4282593389746716, 'bagging_temperature': 0.28725658787751346, 'random_strength': 3.461547563052325, 'border_count': 225}. Best is trial 19 with value: 0.9685769958537109.


0.9680483029309224


Default metric period is 5 because AUC is/are not implemented for GPU


0.9658758300094902


Default metric period is 5 because AUC is/are not implemented for GPU


0.9670229769250176


Default metric period is 5 because AUC is/are not implemented for GPU


0.9674972510304997


Default metric period is 5 because AUC is/are not implemented for GPU


0.9678955070775995


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 06:17:45,171] Trial 37 finished with value: 0.9680965076844019 and parameters: {'iterations': 696, 'depth': 9, 'learning_rate': 0.10864729883624667, 'l2_leaf_reg': 1.4749376605620472, 'bagging_temperature': 0.06824153619125956, 'random_strength': 3.336104468269953, 'border_count': 221}. Best is trial 19 with value: 0.9685769958537109.


0.9680965076844019


Default metric period is 5 because AUC is/are not implemented for GPU


0.9601165546578955


Default metric period is 5 because AUC is/are not implemented for GPU


0.9616116369304426


Default metric period is 5 because AUC is/are not implemented for GPU


0.9629602427723544


Default metric period is 5 because AUC is/are not implemented for GPU


0.9638107694938802


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 06:21:18,768] Trial 38 finished with value: 0.9642495615282197 and parameters: {'iterations': 881, 'depth': 10, 'learning_rate': 0.4989766133545469, 'l2_leaf_reg': 3.0576827065624963, 'bagging_temperature': 0.06721088099174959, 'random_strength': 7.942614481446176, 'border_count': 193}. Best is trial 19 with value: 0.9685769958537109.


0.9642495615282197


Default metric period is 5 because AUC is/are not implemented for GPU


0.9507905022259403


Default metric period is 5 because AUC is/are not implemented for GPU


0.9513861380592616


Default metric period is 5 because AUC is/are not implemented for GPU


0.9518659225167125


Default metric period is 5 because AUC is/are not implemented for GPU


0.9525966362975513


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 06:23:53,540] Trial 39 finished with value: 0.9527288861478352 and parameters: {'iterations': 656, 'depth': 4, 'learning_rate': 0.07974391576904563, 'l2_leaf_reg': 5.2773533306928115, 'bagging_temperature': 0.0034233981974987338, 'random_strength': 5.966604631947284, 'border_count': 242}. Best is trial 19 with value: 0.9685769958537109.


0.9527288861478352


Default metric period is 5 because AUC is/are not implemented for GPU


0.9560820611611582


Default metric period is 5 because AUC is/are not implemented for GPU


0.9570553542640489


Default metric period is 5 because AUC is/are not implemented for GPU


0.9573692475164636


Default metric period is 5 because AUC is/are not implemented for GPU


0.9580128365444904


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 06:30:06,909] Trial 40 finished with value: 0.9581157507236469 and parameters: {'iterations': 751, 'depth': 9, 'learning_rate': 0.035624908212499286, 'l2_leaf_reg': 0.07181090622007405, 'bagging_temperature': 0.6562571906265824, 'random_strength': 8.63004908813083, 'border_count': 207}. Best is trial 19 with value: 0.9685769958537109.


0.9581157507236469


Default metric period is 5 because AUC is/are not implemented for GPU


0.9671071782167622


Default metric period is 5 because AUC is/are not implemented for GPU


0.9682054307547364


Default metric period is 5 because AUC is/are not implemented for GPU


0.9685253977318649


Default metric period is 5 because AUC is/are not implemented for GPU


0.9688246441264778


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 06:36:29,025] Trial 41 finished with value: 0.9690379022011475 and parameters: {'iterations': 747, 'depth': 9, 'learning_rate': 0.11833365442458572, 'l2_leaf_reg': 1.2685058754197363, 'bagging_temperature': 0.17965606614534962, 'random_strength': 3.5115033147518337, 'border_count': 226}. Best is trial 41 with value: 0.9690379022011475.


0.9690379022011475


Default metric period is 5 because AUC is/are not implemented for GPU


0.9660367009892065


Default metric period is 5 because AUC is/are not implemented for GPU


0.9672425803999021


Default metric period is 5 because AUC is/are not implemented for GPU


0.967562362341604


Default metric period is 5 because AUC is/are not implemented for GPU


0.9681928281829847


Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-12-08 06:42:13,390] Trial 42 finished with value: 0.9684065390170419 and parameters: {'iterations': 749, 'depth': 9, 'learning_rate': 0.23122004584043554, 'l2_leaf_reg': 1.4636245026812997, 'bagging_temperature': 0.17560018444882333, 'random_strength': 4.758805450016254, 'border_count': 222}. Best is trial 41 with value: 0.9690379022011475.


0.9684065390170419


Default metric period is 5 because AUC is/are not implemented for GPU


0.967955032533131


Default metric period is 5 because AUC is/are not implemented for GPU


0.967695418873021


Default metric period is 5 because AUC is/are not implemented for GPU


In [None]:
# Pinned CatBoost hyperparameters for the first final model.
# NOTE(review): presumably copied from the best trial of an earlier Optuna
# run — confirm the source before relying on these values.
best_params = dict(
    iterations=709,
    depth=5,
    learning_rate=0.22378524791213256,
    l2_leaf_reg=0.030725954622609027,
    bagging_temperature=0.035983637250438405,
    random_strength=5.650565084985939,
    border_count=89,
)

In [None]:
# Pinned CatBoost hyperparameters for the second final model.
# NOTE(review): these values are identical to Trial 42 in the tuning log
# (value 0.96841), while that log reports Trial 41 as the best trial
# (value 0.96904) — confirm Trial 42 is the intended choice.
best_params_2 = dict(
    iterations=749,
    depth=9,
    learning_rate=0.23122004584043554,
    l2_leaf_reg=1.4636245026812997,
    bagging_temperature=0.17560018444882333,
    random_strength=4.758805450016254,
    border_count=222,
)

In [None]:
# Display the first pinned parameter set as this cell's output.
best_params

{'iterations': 709,
 'depth': 5,
 'learning_rate': 0.22378524791213256,
 'l2_leaf_reg': 0.030725954622609027,
 'bagging_temperature': 0.035983637250438405,
 'random_strength': 5.650565084985939,
 'border_count': 89}

In [None]:
# Display the second pinned parameter set as this cell's output.
best_params_2

{'iterations': 749,
 'depth': 9,
 'learning_rate': 0.23122004584043554,
 'l2_leaf_reg': 1.4636245026812997,
 'bagging_temperature': 0.17560018444882333,
 'random_strength': 4.758805450016254,
 'border_count': 222}

In [None]:
# Train the final model on the full training set with the pinned `best_params`
# dict defined in an earlier cell. The parameters are deliberately NOT re-read
# from `study.best_trial.params` here: that would overwrite the pinned values
# and make this cell depend on whichever `study` object happens to be in memory.
final_pool = Pool(filled_train, y_train, cat_features=cat_features)
final_model = CatBoostClassifier(**best_params, verbose=100)
final_model.fit(final_pool)

# Make predictions on the test set (probability of the positive class)
test_pool = Pool(filled_test, cat_features=cat_features)
test_pred_proba = final_model.predict_proba(test_pool)[:, 1]

# Create submission; the path is defined once so the file written and the
# message printed below cannot drift apart.
submission_path = "catboost_auc_submission.csv"
submission = pd.DataFrame({
    "TransactionID": test_processed["TransactionID"],
    "isFraud": test_pred_proba
})
submission.to_csv(submission_path, index=False)

print(f"Submission file created: {submission_path}")

0:	learn: 0.4076108	total: 300ms	remaining: 3m 32s
100:	learn: 0.0733359	total: 57s	remaining: 5m 43s
200:	learn: 0.0659926	total: 1m 51s	remaining: 4m 42s
300:	learn: 0.0616673	total: 2m 46s	remaining: 3m 45s
400:	learn: 0.0590472	total: 3m 40s	remaining: 2m 49s
500:	learn: 0.0571702	total: 4m 34s	remaining: 1m 54s
600:	learn: 0.0555282	total: 5m 29s	remaining: 59.2s
700:	learn: 0.0543717	total: 6m 21s	remaining: 4.36s
708:	learn: 0.0542038	total: 6m 26s	remaining: 0us
Submission file created: catboost_auc_submission.csv


In [None]:
# Train the final model on the full training set with the pinned
# `best_params_2` dict defined in an earlier cell (no dependency on `study`).
final_pool = Pool(filled_train, y_train, cat_features=cat_features)
final_model = CatBoostClassifier(**best_params_2, verbose=100)
final_model.fit(final_pool)

# Make predictions on the test set (probability of the positive class)
test_pool = Pool(filled_test, cat_features=cat_features)
test_pred_proba = final_model.predict_proba(test_pool)[:, 1]

# Create submission; the path is defined once so the confirmation message
# always names the file that was actually written.
submission_path = "catboost_auc_submission_fine_tuned_with_more_trials.csv"
submission = pd.DataFrame({
    "TransactionID": test_processed["TransactionID"],
    "isFraud": test_pred_proba
})
submission.to_csv(submission_path, index=False)

print(f"Submission file created: {submission_path}")

0:	learn: 0.4075424	total: 503ms	remaining: 6m 16s
100:	learn: 0.0678686	total: 58.7s	remaining: 6m 16s
200:	learn: 0.0574476	total: 2m 5s	remaining: 5m 41s
300:	learn: 0.0524879	total: 3m 10s	remaining: 4m 43s
400:	learn: 0.0481044	total: 4m 15s	remaining: 3m 41s
500:	learn: 0.0444347	total: 5m 19s	remaining: 2m 38s
600:	learn: 0.0412870	total: 6m 24s	remaining: 1m 34s
700:	learn: 0.0387877	total: 7m 27s	remaining: 30.7s
748:	learn: 0.0375502	total: 7m 58s	remaining: 0us
Submission file created: catboost_auc_submission.csv


## CatBoost with log-loss evaluation (50 trials)

In [None]:
import optuna
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

# Target variable: alias the labels returned by preprocess_for_catboost
# (`train_y`) under the name used by the tuning and training code below.
y_train = train_y

# Define the objective function for Optuna
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated log loss of a CatBoost model.

    Reads the notebook-level names ``filled_train`` (feature frame),
    ``y_train`` (labels) and ``cat_features`` (categorical column indices).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean validation log loss across the folds (lower is better).
    """
    # Suggest hyperparameters for CatBoost
    params = {
        "iterations": trial.suggest_int("iterations", 100, 1000),
        "depth": trial.suggest_int("depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.5, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10.0, log=True),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
        "random_strength": trial.suggest_float("random_strength", 0.0, 10.0),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "loss_function": "Logloss",
        "eval_metric": "Logloss",
        "task_type": "GPU",  # Trains on GPU; change to "CPU" if no GPU is available
        "verbose": False,
        "random_state": 42,
    }

    # Cross-validation setup (fixed seed so every trial sees the same folds)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    log_losses = []

    for train_idx, val_idx in cv.split(filled_train, y_train):
        X_train, X_val = filled_train.iloc[train_idx], filled_train.iloc[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Create CatBoost Pools
        train_pool = Pool(X_train, y_train_fold, cat_features=cat_features)
        val_pool = Pool(X_val, y_val_fold, cat_features=cat_features)

        # Train CatBoost model, stopping early on the validation fold
        model = CatBoostClassifier(**params)
        model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50, verbose=False)

        # Predict class-1 probabilities, reusing the validation Pool built above
        y_val_pred_proba = model.predict_proba(val_pool)[:, 1]

        # Clip instead of adding an epsilon: `p + 1e-15` turns a prediction of
        # exactly 1.0 into a value above 1, which is not a valid probability.
        y_val_pred_proba = np.clip(y_val_pred_proba, 1e-15, 1 - 1e-15)

        # Calculate Log Loss
        log_losses.append(log_loss(y_val_fold, y_val_pred_proba))

    # Return the mean log loss (minimize this)
    mean_log_loss = np.mean(log_losses)
    print(mean_log_loss)
    return mean_log_loss

# Run Optuna optimization.
# The sampler is seeded so the sequence of suggested hyperparameters is
# repeatable, in line with the fixed seeds used inside `objective`.
# TPESampler is what Optuna uses by default, so only the seed is new.
# NOTE(review): GPU training may still not be bit-for-bit deterministic.
study = optuna.create_study(
    direction="minimize",
    study_name="CatBoost Log Loss Tuning",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Print best parameters
print("Best parameters:", study.best_trial.params)

# # Train the final model with the best parameters on the full dataset
# best_params = study.best_trial.params
# final_pool = Pool(filled_train, y_train, cat_features=cat_features)
# final_model = CatBoostClassifier(**best_params, verbose=100)
# final_model.fit(final_pool)

# # Make predictions on the test set
# test_pool = Pool(filled_test, cat_features=cat_features)
# test_pred_proba = final_model.predict_proba(test_pool)[:, 1]

# # Create submission
# submission = pd.DataFrame({
#     "TransactionID": test_processed["TransactionID"],
#     "isFraud": test_pred_proba
# })
# submission.to_csv("catboost_logloss_submission.csv", index=False)

# print("Submission file created: catboost_logloss_submission.csv")


[I 2024-12-07 20:03:38,297] A new study created in memory with name: CatBoost Log Loss Tuning
[I 2024-12-07 20:08:07,831] Trial 0 finished with value: 0.051666531659739265 and parameters: {'iterations': 628, 'depth': 8, 'learning_rate': 0.1568043883162735, 'l2_leaf_reg': 0.04351111154361797, 'bagging_temperature': 0.881323716639531, 'random_strength': 3.738382614647681, 'border_count': 242}. Best is trial 0 with value: 0.051666531659739265.


0.051666531659739265


[I 2024-12-07 20:13:54,880] Trial 1 finished with value: 0.052524361473352335 and parameters: {'iterations': 998, 'depth': 7, 'learning_rate': 0.08744799447668071, 'l2_leaf_reg': 0.0019929958328108823, 'bagging_temperature': 0.7662886573259491, 'random_strength': 7.963089144666048, 'border_count': 158}. Best is trial 0 with value: 0.051666531659739265.


0.052524361473352335


[I 2024-12-07 20:16:07,358] Trial 2 finished with value: 0.06122054973712495 and parameters: {'iterations': 309, 'depth': 7, 'learning_rate': 0.11192074071754711, 'l2_leaf_reg': 2.5630813399738623, 'bagging_temperature': 0.9716364945840094, 'random_strength': 7.537579028624313, 'border_count': 186}. Best is trial 0 with value: 0.051666531659739265.


0.06122054973712495


[I 2024-12-07 20:19:09,712] Trial 3 finished with value: 0.05424326958490505 and parameters: {'iterations': 389, 'depth': 8, 'learning_rate': 0.21706062765375478, 'l2_leaf_reg': 0.008403514579605902, 'bagging_temperature': 0.8429160244082903, 'random_strength': 7.228929837141084, 'border_count': 253}. Best is trial 0 with value: 0.051666531659739265.


0.05424326958490505


[I 2024-12-07 20:21:35,796] Trial 4 finished with value: 0.054535356037065816 and parameters: {'iterations': 347, 'depth': 7, 'learning_rate': 0.1506376838874219, 'l2_leaf_reg': 0.003956211312464788, 'bagging_temperature': 0.2012463379252145, 'random_strength': 4.815704603731802, 'border_count': 141}. Best is trial 0 with value: 0.051666531659739265.


0.054535356037065816


[I 2024-12-07 20:23:10,220] Trial 5 finished with value: 0.06549651234456313 and parameters: {'iterations': 457, 'depth': 3, 'learning_rate': 0.1834491185921306, 'l2_leaf_reg': 4.743167572729501, 'bagging_temperature': 0.7459450944285722, 'random_strength': 8.237338751530814, 'border_count': 39}. Best is trial 0 with value: 0.051666531659739265.


0.06549651234456313


[I 2024-12-07 20:26:30,262] Trial 6 finished with value: 0.10626762507614455 and parameters: {'iterations': 669, 'depth': 6, 'learning_rate': 0.0029932799808211215, 'l2_leaf_reg': 0.039115236503052375, 'bagging_temperature': 0.6736040443983473, 'random_strength': 6.864953058931178, 'border_count': 253}. Best is trial 0 with value: 0.051666531659739265.


0.10626762507614455


[I 2024-12-07 20:29:36,098] Trial 7 finished with value: 0.05372497657600629 and parameters: {'iterations': 832, 'depth': 4, 'learning_rate': 0.48381429819634536, 'l2_leaf_reg': 1.1411907442468459, 'bagging_temperature': 0.7450593803516102, 'random_strength': 2.105037759392828, 'border_count': 207}. Best is trial 0 with value: 0.051666531659739265.


0.05372497657600629


[I 2024-12-07 20:33:13,705] Trial 8 finished with value: 0.08942203400044299 and parameters: {'iterations': 998, 'depth': 4, 'learning_rate': 0.00447194556220727, 'l2_leaf_reg': 0.19555837349189295, 'bagging_temperature': 0.005107489669016085, 'random_strength': 6.253431085322334, 'border_count': 89}. Best is trial 0 with value: 0.051666531659739265.


0.08942203400044299


[I 2024-12-07 20:37:31,168] Trial 9 finished with value: 0.053696954932158626 and parameters: {'iterations': 904, 'depth': 10, 'learning_rate': 0.2927281975869129, 'l2_leaf_reg': 0.025368964086679643, 'bagging_temperature': 0.9350961272475211, 'random_strength': 6.0364819572242014, 'border_count': 92}. Best is trial 0 with value: 0.051666531659739265.


0.053696954932158626


[I 2024-12-07 20:38:43,934] Trial 10 finished with value: 0.09154476514519132 and parameters: {'iterations': 145, 'depth': 10, 'learning_rate': 0.026524218798048053, 'l2_leaf_reg': 0.3098386344732078, 'bagging_temperature': 0.4744112966842312, 'random_strength': 0.13194579590254385, 'border_count': 212}. Best is trial 0 with value: 0.051666531659739265.


0.09154476514519132


[I 2024-12-07 20:43:14,736] Trial 11 finished with value: 0.05986034924469039 and parameters: {'iterations': 646, 'depth': 8, 'learning_rate': 0.03539554651519598, 'l2_leaf_reg': 0.0011766206323644945, 'bagging_temperature': 0.5458417280098087, 'random_strength': 4.000879694286144, 'border_count': 157}. Best is trial 0 with value: 0.051666531659739265.


0.05986034924469039


[I 2024-12-07 20:47:10,875] Trial 12 finished with value: 0.05784004191131499 and parameters: {'iterations': 763, 'depth': 6, 'learning_rate': 0.05867067433775448, 'l2_leaf_reg': 0.0014491313335468843, 'bagging_temperature': 0.5288498290624598, 'random_strength': 9.126814701878484, 'border_count': 130}. Best is trial 0 with value: 0.051666531659739265.


0.05784004191131499


[I 2024-12-07 20:51:21,578] Trial 13 finished with value: 0.07814381552593072 and parameters: {'iterations': 511, 'depth': 9, 'learning_rate': 0.009728070683981597, 'l2_leaf_reg': 0.011069202518854062, 'bagging_temperature': 0.9953572271094743, 'random_strength': 3.2367141003736952, 'border_count': 182}. Best is trial 0 with value: 0.051666531659739265.


0.07814381552593072


[I 2024-12-07 20:54:13,756] Trial 14 finished with value: 0.05879581166583355 and parameters: {'iterations': 620, 'depth': 5, 'learning_rate': 0.0790613776385738, 'l2_leaf_reg': 0.07133817626817071, 'bagging_temperature': 0.33367810879570736, 'random_strength': 2.5711884095687347, 'border_count': 221}. Best is trial 0 with value: 0.051666531659739265.


0.05879581166583355


[I 2024-12-07 21:00:05,753] Trial 15 finished with value: 0.12958534309849354 and parameters: {'iterations': 992, 'depth': 8, 'learning_rate': 0.0012929141837871706, 'l2_leaf_reg': 0.00403775244758367, 'bagging_temperature': 0.8381971146786673, 'random_strength': 5.103389535114037, 'border_count': 106}. Best is trial 0 with value: 0.051666531659739265.


0.12958534309849354


[I 2024-12-07 21:06:17,809] Trial 16 finished with value: 0.06270692384629446 and parameters: {'iterations': 764, 'depth': 9, 'learning_rate': 0.017389841303385774, 'l2_leaf_reg': 0.33606591097733135, 'bagging_temperature': 0.6368612121763924, 'random_strength': 0.5297596257463439, 'border_count': 49}. Best is trial 0 with value: 0.051666531659739265.


0.06270692384629446


[I 2024-12-07 21:07:06,240] Trial 17 finished with value: 0.09294464471812577 and parameters: {'iterations': 179, 'depth': 7, 'learning_rate': 0.051229531440218204, 'l2_leaf_reg': 0.01483144042007536, 'bagging_temperature': 0.8356499065081585, 'random_strength': 9.901547690743872, 'border_count': 173}. Best is trial 0 with value: 0.051666531659739265.


0.09294464471812577


[I 2024-12-07 21:10:44,311] Trial 18 finished with value: 0.05295586571657436 and parameters: {'iterations': 878, 'depth': 9, 'learning_rate': 0.4307227722089576, 'l2_leaf_reg': 0.07383225251655019, 'bagging_temperature': 0.39923395718047594, 'random_strength': 1.5624680093321075, 'border_count': 233}. Best is trial 0 with value: 0.051666531659739265.


0.05295586571657436


[I 2024-12-07 21:12:13,600] Trial 19 finished with value: 0.0891818387929964 and parameters: {'iterations': 249, 'depth': 5, 'learning_rate': 0.014987580863774998, 'l2_leaf_reg': 0.8243522809660517, 'bagging_temperature': 0.6424239739267705, 'random_strength': 4.793091300126125, 'border_count': 109}. Best is trial 0 with value: 0.051666531659739265.


0.0891818387929964


[I 2024-12-07 21:15:14,209] Trial 20 finished with value: 0.05767298967764437 and parameters: {'iterations': 554, 'depth': 6, 'learning_rate': 0.10991869734597715, 'l2_leaf_reg': 0.003590565167727811, 'bagging_temperature': 0.8884249627084365, 'random_strength': 3.667759178892986, 'border_count': 61}. Best is trial 0 with value: 0.051666531659739265.


0.05767298967764437


[I 2024-12-07 21:20:33,044] Trial 21 finished with value: 0.04884686690539884 and parameters: {'iterations': 878, 'depth': 9, 'learning_rate': 0.2915424937128539, 'l2_leaf_reg': 0.0879648589626479, 'bagging_temperature': 0.3799426182799066, 'random_strength': 1.2941142607832146, 'border_count': 230}. Best is trial 21 with value: 0.04884686690539884.


0.04884686690539884


[I 2024-12-07 21:26:30,260] Trial 22 finished with value: 0.04709716992652767 and parameters: {'iterations': 922, 'depth': 8, 'learning_rate': 0.289797854805548, 'l2_leaf_reg': 0.14439378735693914, 'bagging_temperature': 0.25537238507055116, 'random_strength': 1.1868593706497697, 'border_count': 234}. Best is trial 22 with value: 0.04709716992652767.


0.04709716992652767


[I 2024-12-07 21:31:19,851] Trial 23 finished with value: 0.04861462195866687 and parameters: {'iterations': 769, 'depth': 9, 'learning_rate': 0.304975276463343, 'l2_leaf_reg': 0.14038120556481112, 'bagging_temperature': 0.23288867499321783, 'random_strength': 1.1709568047228591, 'border_count': 234}. Best is trial 22 with value: 0.04709716992652767.


0.04861462195866687


[I 2024-12-07 21:36:48,654] Trial 24 finished with value: 0.04855972563375308 and parameters: {'iterations': 761, 'depth': 9, 'learning_rate': 0.29902442248276256, 'l2_leaf_reg': 0.15017269196435165, 'bagging_temperature': 0.21112613787802698, 'random_strength': 1.158680913423015, 'border_count': 198}. Best is trial 22 with value: 0.04709716992652767.


0.04855972563375308


[I 2024-12-07 21:41:45,248] Trial 25 finished with value: 0.05011911569534076 and parameters: {'iterations': 764, 'depth': 10, 'learning_rate': 0.29982659809793993, 'l2_leaf_reg': 0.17014180094825426, 'bagging_temperature': 0.1895156122510263, 'random_strength': 1.6643967798764008, 'border_count': 205}. Best is trial 22 with value: 0.04709716992652767.


0.05011911569534076


[I 2024-12-07 21:46:50,279] Trial 26 finished with value: 0.04827379011404065 and parameters: {'iterations': 726, 'depth': 9, 'learning_rate': 0.41765134509149177, 'l2_leaf_reg': 0.581285742121047, 'bagging_temperature': 0.20728859362573115, 'random_strength': 0.8940669235748013, 'border_count': 210}. Best is trial 22 with value: 0.04709716992652767.


0.04827379011404065


[I 2024-12-07 21:52:06,089] Trial 27 finished with value: 0.04747557524847139 and parameters: {'iterations': 827, 'depth': 8, 'learning_rate': 0.44056617474858856, 'l2_leaf_reg': 0.7921751064925575, 'bagging_temperature': 0.04988080086833868, 'random_strength': 2.6684303388170214, 'border_count': 194}. Best is trial 22 with value: 0.04709716992652767.


0.04747557524847139


[I 2024-12-07 21:56:44,521] Trial 28 finished with value: 0.04774048350284791 and parameters: {'iterations': 700, 'depth': 8, 'learning_rate': 0.47235630938877116, 'l2_leaf_reg': 0.790216496603614, 'bagging_temperature': 0.033163249159584685, 'random_strength': 2.6672375876592143, 'border_count': 191}. Best is trial 22 with value: 0.04709716992652767.


0.04774048350284791


[I 2024-12-07 22:03:01,399] Trial 29 finished with value: 0.046574187931322016 and parameters: {'iterations': 924, 'depth': 8, 'learning_rate': 0.1594460863661075, 'l2_leaf_reg': 1.922656398678431, 'bagging_temperature': 0.03174556334672353, 'random_strength': 2.9372431655008557, 'border_count': 170}. Best is trial 29 with value: 0.046574187931322016.


0.046574187931322016


[I 2024-12-07 22:09:12,407] Trial 30 finished with value: 0.04869130631036471 and parameters: {'iterations': 927, 'depth': 8, 'learning_rate': 0.13329109381523235, 'l2_leaf_reg': 8.988315281187564, 'bagging_temperature': 0.08524549146434732, 'random_strength': 2.579487611136575, 'border_count': 171}. Best is trial 29 with value: 0.046574187931322016.


0.04869130631036471


[I 2024-12-07 22:14:55,206] Trial 31 finished with value: 0.046258235354992405 and parameters: {'iterations': 831, 'depth': 8, 'learning_rate': 0.19787700294145588, 'l2_leaf_reg': 1.719467447745354, 'bagging_temperature': 0.003077341657130972, 'random_strength': 3.0129368520970843, 'border_count': 184}. Best is trial 31 with value: 0.046258235354992405.


0.046258235354992405


[I 2024-12-07 22:20:20,466] Trial 32 finished with value: 0.04674536089628659 and parameters: {'iterations': 940, 'depth': 7, 'learning_rate': 0.2023558125285519, 'l2_leaf_reg': 1.9108836845616561, 'bagging_temperature': 0.11096545607931614, 'random_strength': 3.3129452606268233, 'border_count': 155}. Best is trial 31 with value: 0.046258235354992405.


0.04674536089628659


[I 2024-12-07 22:25:47,953] Trial 33 finished with value: 0.04719316044338614 and parameters: {'iterations': 942, 'depth': 7, 'learning_rate': 0.1891385893706226, 'l2_leaf_reg': 2.426436445006586, 'bagging_temperature': 0.12339952919135971, 'random_strength': 4.073597435087278, 'border_count': 151}. Best is trial 31 with value: 0.046258235354992405.


0.04719316044338614


[I 2024-12-07 22:30:42,879] Trial 34 finished with value: 0.0508831922911402 and parameters: {'iterations': 843, 'depth': 7, 'learning_rate': 0.09823305033296223, 'l2_leaf_reg': 1.6830548911725058, 'bagging_temperature': 0.1118393098331157, 'random_strength': 3.3895878100847887, 'border_count': 133}. Best is trial 31 with value: 0.046258235354992405.


0.0508831922911402


[I 2024-12-07 22:37:11,098] Trial 35 finished with value: 0.04624755672634312 and parameters: {'iterations': 962, 'depth': 8, 'learning_rate': 0.20859009680764215, 'l2_leaf_reg': 4.8043393324413755, 'bagging_temperature': 0.2787101750510259, 'random_strength': 1.9216373313135575, 'border_count': 170}. Best is trial 35 with value: 0.04624755672634312.


0.04624755672634312


[I 2024-12-07 22:42:40,856] Trial 36 finished with value: 0.0532853754934825 and parameters: {'iterations': 961, 'depth': 7, 'learning_rate': 0.06444682419124828, 'l2_leaf_reg': 5.300861619098948, 'bagging_temperature': 0.15721170124857142, 'random_strength': 2.035452334879489, 'border_count': 171}. Best is trial 35 with value: 0.04624755672634312.


0.0532853754934825


[I 2024-12-07 22:47:27,649] Trial 37 finished with value: 0.04885434306553837 and parameters: {'iterations': 824, 'depth': 7, 'learning_rate': 0.167925461837444, 'l2_leaf_reg': 4.62288704455052, 'bagging_temperature': 0.07158917448856453, 'random_strength': 4.405623527997893, 'border_count': 142}. Best is trial 35 with value: 0.04624755672634312.


0.04885434306553837


[I 2024-12-07 22:51:48,931] Trial 38 finished with value: 0.058936801942094816 and parameters: {'iterations': 870, 'depth': 6, 'learning_rate': 0.041398169235036496, 'l2_leaf_reg': 2.640912734468717, 'bagging_temperature': 0.29040087670918896, 'random_strength': 3.1697382261662472, 'border_count': 166}. Best is trial 35 with value: 0.04624755672634312.


0.058936801942094816


[I 2024-12-07 22:58:22,480] Trial 39 finished with value: 0.04821703329468168 and parameters: {'iterations': 983, 'depth': 8, 'learning_rate': 0.13277928810379253, 'l2_leaf_reg': 7.461107443538928, 'bagging_temperature': 0.0019091458390829395, 'random_strength': 5.643458173073895, 'border_count': 181}. Best is trial 35 with value: 0.04624755672634312.


0.04821703329468168


[I 2024-12-07 23:03:50,365] Trial 40 finished with value: 0.04688786639659902 and parameters: {'iterations': 948, 'depth': 7, 'learning_rate': 0.22253667431270774, 'l2_leaf_reg': 1.7075615773095227, 'bagging_temperature': 0.141349011932075, 'random_strength': 2.046331246003281, 'border_count': 120}. Best is trial 35 with value: 0.04624755672634312.


0.04688786639659902


[I 2024-12-07 23:09:15,697] Trial 41 finished with value: 0.04678311930468018 and parameters: {'iterations': 941, 'depth': 7, 'learning_rate': 0.19703848430822593, 'l2_leaf_reg': 1.600951869195546, 'bagging_temperature': 0.15473508206257644, 'random_strength': 1.771206393945758, 'border_count': 124}. Best is trial 35 with value: 0.04624755672634312.


0.04678311930468018


[I 2024-12-07 23:15:16,162] Trial 42 finished with value: 0.046376775514470434 and parameters: {'iterations': 887, 'depth': 8, 'learning_rate': 0.19734160927447478, 'l2_leaf_reg': 3.2521877180745684, 'bagging_temperature': 0.08218706396663447, 'random_strength': 2.9898977099933695, 'border_count': 151}. Best is trial 35 with value: 0.04624755672634312.


0.046376775514470434


[I 2024-12-07 23:21:20,879] Trial 43 finished with value: 0.049552784920904695 and parameters: {'iterations': 894, 'depth': 8, 'learning_rate': 0.09089044149082824, 'l2_leaf_reg': 3.019080032920735, 'bagging_temperature': 0.07147320533721858, 'random_strength': 3.083325594703355, 'border_count': 151}. Best is trial 35 with value: 0.04624755672634312.


0.049552784920904695


[I 2024-12-07 23:26:49,955] Trial 44 finished with value: 0.04642237280593196 and parameters: {'iterations': 794, 'depth': 8, 'learning_rate': 0.22571118001336865, 'l2_leaf_reg': 4.0048768111040305, 'bagging_temperature': 0.0016389852987072496, 'random_strength': 3.7826787724691098, 'border_count': 158}. Best is trial 35 with value: 0.04624755672634312.


0.04642237280593196


[I 2024-12-07 23:32:25,375] Trial 45 finished with value: 0.04850287137141259 and parameters: {'iterations': 804, 'depth': 8, 'learning_rate': 0.13203050247767334, 'l2_leaf_reg': 4.23364548291039, 'bagging_temperature': 0.02512882568898596, 'random_strength': 4.276439638207186, 'border_count': 143}. Best is trial 35 with value: 0.04624755672634312.


0.04850287137141259


[I 2024-12-07 23:39:01,717] Trial 46 finished with value: 0.05161466483449604 and parameters: {'iterations': 608, 'depth': 10, 'learning_rate': 0.08144187356918607, 'l2_leaf_reg': 6.504824130873559, 'bagging_temperature': 0.2884662333373824, 'random_strength': 3.816073931594247, 'border_count': 164}. Best is trial 35 with value: 0.04624755672634312.


0.05161466483449604


[I 2024-12-07 23:46:01,475] Trial 47 finished with value: 0.04605015597593375 and parameters: {'iterations': 851, 'depth': 9, 'learning_rate': 0.2297453294159207, 'l2_leaf_reg': 0.4045230071405023, 'bagging_temperature': 0.004527543389180377, 'random_strength': 2.3179713243727047, 'border_count': 184}. Best is trial 47 with value: 0.04605015597593375.


0.04605015597593375


[I 2024-12-07 23:50:55,220] Trial 48 finished with value: 0.04777611948487367 and parameters: {'iterations': 424, 'depth': 10, 'learning_rate': 0.23244405902922669, 'l2_leaf_reg': 0.44578318384961274, 'bagging_temperature': 0.08700153887428552, 'random_strength': 2.1805523087076693, 'border_count': 182}. Best is trial 47 with value: 0.04605015597593375.


0.04777611948487367


[I 2024-12-07 23:56:37,520] Trial 49 finished with value: 0.08101055993373722 and parameters: {'iterations': 698, 'depth': 9, 'learning_rate': 0.0050862600483667095, 'l2_leaf_reg': 1.1610784836429426, 'bagging_temperature': 0.0017270943499896898, 'random_strength': 0.0478708931412557, 'border_count': 200}. Best is trial 47 with value: 0.04605015597593375.


0.08101055993373722
Best parameters: {'iterations': 851, 'depth': 9, 'learning_rate': 0.2297453294159207, 'l2_leaf_reg': 0.4045230071405023, 'bagging_temperature': 0.004527543389180377, 'random_strength': 2.3179713243727047, 'border_count': 184}


In [None]:
# Best hyperparameters reported by the Optuna study above (trial 47, objective
# value 0.04605015597593375). Hard-coded copy kept for reference only; the
# final-training cell reads the values from `study.best_trial.params` instead.
{'iterations': 851, 'depth': 9, 'learning_rate': 0.2297453294159207, 'l2_leaf_reg': 0.4045230071405023, 'bagging_temperature': 0.004527543389180377, 'random_strength': 2.3179713243727047, 'border_count': 184}

In [None]:
# Train the final model with the best parameters on the full dataset.
# NOTE(review): `best_trial.params` presumably holds only the hyperparameters
# sampled by the study; any fixed CatBoost settings passed inside the tuning
# objective are not carried over here — confirm against the objective cell.
best_params = study.best_trial.params

# Use the labels returned by `preprocess_for_catboost` (`train_y`) so that the
# features and targets come from the same preprocessing call, instead of
# depending on whatever `y_train` happens to hold in the kernel.
assert train_y is not None, "training labels are missing from preprocessing output"
assert len(filled_train) == len(train_y), "features and labels have different lengths"

final_pool = Pool(filled_train, train_y, cat_features=cat_features)
final_model = CatBoostClassifier(**best_params, verbose=100)  # log every 100 iterations
final_model.fit(final_pool)

# Make predictions on the test set
test_pool = Pool(filled_test, cat_features=cat_features)
# Keep only the second column of predict_proba (probability of class 1).
test_pred_proba = final_model.predict_proba(test_pool)[:, 1]

# Create submission
submission_path = "catboost_logloss_submission.csv"
submission = pd.DataFrame({
    "TransactionID": test_processed["TransactionID"],
    "isFraud": test_pred_proba
})
submission.to_csv(submission_path, index=False)

print(f"Submission file created: {submission_path}")

0:	learn: 0.4018979	total: 235ms	remaining: 3m 19s
100:	learn: 0.0602193	total: 1m 48s	remaining: 13m 23s
200:	learn: 0.0501391	total: 3m 35s	remaining: 11m 37s
300:	learn: 0.0444150	total: 5m 20s	remaining: 9m 46s
400:	learn: 0.0404429	total: 7m 6s	remaining: 7m 58s
500:	learn: 0.0373053	total: 8m 51s	remaining: 6m 11s
600:	learn: 0.0343420	total: 10m 36s	remaining: 4m 24s
700:	learn: 0.0318427	total: 12m 19s	remaining: 2m 38s
800:	learn: 0.0295467	total: 14m 5s	remaining: 52.8s
850:	learn: 0.0284451	total: 15m	remaining: 0us
Submission file created: catboost_logloss_submission.csv
