In [1]:
!pip install -qU polars catboost optuna

In [2]:
import numpy as np
import pandas as pd
import polars as pl
import polars.selectors as cs

import matplotlib.pyplot as plt
import seaborn as sns 

from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, classification_report

from catboost import CatBoostClassifier, Pool, EShapCalcType, EFeaturesSelectionAlgorithm

import optuna

## 0. Dataset load and train test split

In [3]:
DATA_PATH = '/kaggle/input/sber-salaraclient-churn-data/'

In [4]:
data = pl.read_parquet(DATA_PATH + 'data_cleaned.parquet')
data.head()

target,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature12,feature14,feature16,feature17,feature18,feature19,feature22,feature24,feature25,feature31,feature33,feature35,feature36,feature37,feature38,feature41,feature43,feature44,feature45,feature46,feature47,feature48,feature49,feature52,feature53,feature54,feature55,…,feature992,feature993,feature994,feature996,feature997,feature998,feature999,feature1000,feature1001,feature1002,feature1003,feature1004,feature1035,feature1036,feature1037,feature1038,feature1042,feature1043,feature1045,feature1049,feature1050,feature1051,feature1052,feature1053,feature1054,feature1055,feature1056,feature1057,feature1059,feature1063,feature1064,feature1065,feature1066,feature1067,feature1068,feature1069,feature1076
i8,i16,i16,i16,i32,i16,i16,i8,i16,i16,i16,i16,i16,i16,i8,i16,i32,i16,i16,i16,i32,i32,i16,i32,i16,i16,i32,i16,i16,i32,i32,i16,i16,i16,i16,i32,i32,…,i16,i16,i16,i8,i8,i8,i8,i8,i8,i16,i16,i16,i32,i32,i16,i32,i16,i16,i16,i16,i16,i16,i16,i16,i16,i16,i16,i16,i32,i32,i32,i32,i32,i32,i32,i32,i32
0,1761,1759,85,105469,191,46,0,6,95,20,1,50,60,7,248,91601,173,1722,633,50524,16130,3652,85495,14172,7705,19131,6511,12018,31287,27091,0,0,0,0,30270,27157,…,5,8,16,1,0,0,2,1,2,3,6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12045,12107,12510,21126,28913,48985,84264,84264
0,1761,1759,120,105610,144,71,0,135,135,187,1,43,101,11,18,36890,184,1722,633,50524,18999,3652,85495,14172,7705,59900,6511,12018,116284,141739,0,0,0,0,112320,138924,…,4,10,22,0,0,0,3,0,2,3,9,22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,890,1759,141,105227,191,11,0,0,0,187,1,0,0,28,248,140184,248,1722,633,50524,65748,3652,85495,14172,7705,11110,6511,12018,261217,284839,0,0,0,0,252909,278364,…,9,24,40,1,1,2,3,3,5,8,23,40,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,33735,31242,12691,48682,55949,70006,113317,113317
0,1599,966,30,102441,191,8,0,0,0,187,1,0,0,28,248,140184,248,1722,633,50524,65748,3652,85495,81,7705,38606,6511,12018,303119,151593,0,0,0,0,293748,148572,…,5,5,5,1,1,1,2,2,3,3,3,3,12436,0,0,12329,101,0,114,0,0,0,0,0,0,0,5220,0,5810,0,0,0,0,0,6872,8530,8530
0,1761,1759,85,104006,191,2,0,0,46,2,1,1,21,3,248,140184,248,1722,633,16988,65748,3652,0,14172,7705,17927,6511,12018,245560,449478,0,0,0,0,237725,438447,…,6,12,14,1,1,0,3,1,3,4,10,12,27032,0,0,26719,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7642,7642


In [5]:
print('Data shape and size:', data.shape, round(data.estimated_size('gb'), 3))
print('Target info:', pl.Series(data.select(pl.col('target'))).value_counts())

Data shape and size: (519615, 641) 0.659
Target info: shape: (2, 2)
┌────────┬────────┐
│ target ┆ count  │
│ ---    ┆ ---    │
│ i8     ┆ u32    │
╞════════╪════════╡
│ 0      ┆ 501078 │
│ 1      ┆ 18537  │
└────────┴────────┘


In [6]:
data.n_unique()

519615

In [7]:
n_unique_vals_by_col = data.drop(
    'target'
).select(
    pl.all().n_unique()
).transpose(
    include_header=True
).rename(
    {'column_0': 'n_unique'}
).sort(
    'n_unique'
)
n_unique_vals_by_col

column,n_unique
str,u32
"""feature242""",2
"""feature384""",2
"""feature385""",2
"""feature386""",2
"""feature387""",2
…,…
"""feature136""",501444
"""feature142""",501904
"""feature206""",502242
"""feature226""",504898


In [8]:
X_train, X_valid, y_train, y_valid = train_test_split(
    data.drop('target').to_pandas(), data['target'].to_pandas().values, test_size=0.25, random_state=42, stratify=data['target'].to_pandas()
)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((389711, 640), (129904, 640), (389711,), (129904,))

## 1. Parameter tuning

In [9]:
default_params = {
    'random_state': 42, 'allow_writing_files': False, 'verbose': 1000, 'task_type': 'GPU',
}

def objective(trial):
    params = {
        'undersampling': trial.suggest_categorical('undersampling', [True, False]),
        'n_unique_to_cat_cols': trial.suggest_int('n_unique_to_cat_cols', 0, 25, step=5),
        # boosting params
        'iterations': trial.suggest_int('iterations', 500, 2500, step=500),
        'depth': trial.suggest_int('depth', 4, 8, step=1), # 6-10 catboost recoms
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 5, 25, step=2),
        'border_count': trial.suggest_int('border_count', 64, 128, step=64),
        'max_ctr_complexity': trial.suggest_int('max_ctr_complexity', 0, 7, step=1),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli']),
    }
    if params['bootstrap_type'] == 'Bayesian':
        params['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 1)
    elif params['bootstrap_type'] == 'Bernoulli':
        params['subsample'] = trial.suggest_float('subsample', 0.5, 1.0)

    sampled_train = np.random.choice(len(X_train), size=int(len(X_train) * 0.9), replace=False)

    X_train_sampled, y_train_sampled = X_train.iloc[sampled_train].reset_index(drop=True), y_train[sampled_train]

    if params.pop('undersampling'):
        X_train_sampled['y'] = y_train_sampled
        Xy_train_sampled = X_train_sampled.groupby('y').head(100_000)

        X_train_sampled, y_train_sampled = Xy_train_sampled.drop(columns=['y']), Xy_train_sampled['y'].values

    n_unique_to_cat_cols = params.pop('n_unique_to_cat_cols')

    cat_cols = n_unique_vals_by_col.filter(
        pl.col('n_unique') <= n_unique_to_cat_cols
    )['column'].to_list()

    train_pool = Pool(X_train_sampled, label=y_train_sampled, cat_features=cat_cols)

    model = CatBoostClassifier(**params, **default_params)
    model.fit(train_pool)

    roc_auc_score_score = roc_auc_score(y_valid, model.predict_proba(X_valid)[:, 1])

    return roc_auc_score_score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, n_jobs=1)

print(f'Number of finished trials: {len(study.trials)}')

print(f'Best trial:')
trial = study.best_trial

print(f'  Value: {trial.value}')

print(f'  Params:')
for key, value in trial.params.items():
    print(f'    {key}: {value}')

[I 2024-03-13 18:51:52,472] A new study created in memory with name: no-name-56a8fa76-3f46-422b-aec4-0e8ba0712dab


0:	learn: 0.6668199	total: 8.5s	remaining: 5h 54m 2s
1000:	learn: 0.2752140	total: 32.5s	remaining: 48.7s
2000:	learn: 0.2493313	total: 56.7s	remaining: 14.1s
2499:	learn: 0.2374208	total: 1m 8s	remaining: 0us


[I 2024-03-13 18:53:39,763] Trial 0 finished with value: 0.7621774084938122 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 25, 'iterations': 2500, 'depth': 7, 'l2_leaf_reg': 17, 'border_count': 128, 'max_ctr_complexity': 1, 'bootstrap_type': 'Bernoulli', 'subsample': 0.8267422253593207}. Best is trial 0 with value: 0.7621774084938122.


0:	learn: 0.6483510	total: 13.6ms	remaining: 13.6s
999:	learn: 0.1273823	total: 26.8s	remaining: 0us


[I 2024-03-13 18:54:29,378] Trial 1 finished with value: 0.760414914229804 and parameters: {'undersampling': False, 'n_unique_to_cat_cols': 20, 'iterations': 1000, 'depth': 7, 'l2_leaf_reg': 5, 'border_count': 64, 'max_ctr_complexity': 0, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.12783311144653642}. Best is trial 0 with value: 0.7621774084938122.


0:	learn: 0.6470769	total: 98.4ms	remaining: 4m 6s
1000:	learn: 0.1339732	total: 1m 33s	remaining: 2m 19s
2000:	learn: 0.1306627	total: 3m 6s	remaining: 46.4s
2499:	learn: 0.1291139	total: 3m 52s	remaining: 0us


[I 2024-03-13 18:58:46,322] Trial 2 finished with value: 0.7605865504011551 and parameters: {'undersampling': False, 'n_unique_to_cat_cols': 25, 'iterations': 2500, 'depth': 5, 'l2_leaf_reg': 15, 'border_count': 128, 'max_ctr_complexity': 6, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.32052664270179687}. Best is trial 0 with value: 0.7621774084938122.


0:	learn: 0.6468603	total: 113ms	remaining: 56.5s
499:	learn: 0.1318975	total: 55.2s	remaining: 0us


[I 2024-03-13 19:00:01,579] Trial 3 finished with value: 0.7573033391594485 and parameters: {'undersampling': False, 'n_unique_to_cat_cols': 10, 'iterations': 500, 'depth': 8, 'l2_leaf_reg': 21, 'border_count': 64, 'max_ctr_complexity': 2, 'bootstrap_type': 'Bernoulli', 'subsample': 0.7343568041801848}. Best is trial 0 with value: 0.7621774084938122.


0:	learn: 0.6466840	total: 177ms	remaining: 5m 53s
1000:	learn: 0.1254288	total: 2m 52s	remaining: 2m 52s
1999:	learn: 0.1157506	total: 5m 45s	remaining: 0us


[I 2024-03-13 19:06:11,765] Trial 4 finished with value: 0.7624312115954699 and parameters: {'undersampling': False, 'n_unique_to_cat_cols': 25, 'iterations': 2000, 'depth': 8, 'l2_leaf_reg': 7, 'border_count': 64, 'max_ctr_complexity': 3, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.44009413914400275}. Best is trial 4 with value: 0.7624312115954699.


0:	learn: 0.6666829	total: 7.44ms	remaining: 3.71s
499:	learn: 0.3008341	total: 3.02s	remaining: 0us


[I 2024-03-13 19:06:17,524] Trial 5 finished with value: 0.7562236721034744 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 0, 'iterations': 500, 'depth': 4, 'l2_leaf_reg': 5, 'border_count': 64, 'max_ctr_complexity': 3, 'bootstrap_type': 'Bernoulli', 'subsample': 0.8875223650297475}. Best is trial 4 with value: 0.7624312115954699.


0:	learn: 0.6467009	total: 139ms	remaining: 5m 46s
1000:	learn: 0.1272510	total: 2m 21s	remaining: 3m 31s
2000:	learn: 0.1190528	total: 4m 42s	remaining: 1m 10s
2499:	learn: 0.1152121	total: 5m 52s	remaining: 0us


[I 2024-03-13 19:12:33,560] Trial 6 finished with value: 0.7623263556501296 and parameters: {'undersampling': False, 'n_unique_to_cat_cols': 15, 'iterations': 2500, 'depth': 8, 'l2_leaf_reg': 23, 'border_count': 64, 'max_ctr_complexity': 6, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.3646853531489995}. Best is trial 4 with value: 0.7624312115954699.


0:	learn: 0.6467008	total: 161ms	remaining: 4m 1s
1000:	learn: 0.1265838	total: 2m 37s	remaining: 1m 18s
1499:	learn: 0.1229771	total: 3m 55s	remaining: 0us


[I 2024-03-13 19:16:52,613] Trial 7 finished with value: 0.7608764774603903 and parameters: {'undersampling': False, 'n_unique_to_cat_cols': 20, 'iterations': 1500, 'depth': 8, 'l2_leaf_reg': 23, 'border_count': 64, 'max_ctr_complexity': 6, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.016090506308135022}. Best is trial 4 with value: 0.7624312115954699.


0:	learn: 0.6473724	total: 12.6ms	remaining: 6.29s
499:	learn: 0.1367535	total: 5.17s	remaining: 0us


[I 2024-03-13 19:17:02,444] Trial 8 finished with value: 0.753610669663066 and parameters: {'undersampling': False, 'n_unique_to_cat_cols': 0, 'iterations': 500, 'depth': 4, 'l2_leaf_reg': 23, 'border_count': 128, 'max_ctr_complexity': 4, 'bootstrap_type': 'Bernoulli', 'subsample': 0.864767154576433}. Best is trial 4 with value: 0.7624312115954699.


0:	learn: 0.6465704	total: 113ms	remaining: 2m 48s
1000:	learn: 0.1283722	total: 1m 51s	remaining: 55.8s
1499:	learn: 0.1244606	total: 2m 47s	remaining: 0us


[I 2024-03-13 19:20:11,688] Trial 9 finished with value: 0.7613164128279635 and parameters: {'undersampling': False, 'n_unique_to_cat_cols': 15, 'iterations': 1500, 'depth': 7, 'l2_leaf_reg': 9, 'border_count': 64, 'max_ctr_complexity': 2, 'bootstrap_type': 'Bernoulli', 'subsample': 0.9996477812489453}. Best is trial 4 with value: 0.7624312115954699.


0:	learn: 0.6662069	total: 50.5ms	remaining: 1m 40s
1000:	learn: 0.2857113	total: 47.2s	remaining: 47.1s
1999:	learn: 0.2692796	total: 1m 34s	remaining: 0us


[I 2024-03-13 19:21:56,321] Trial 10 finished with value: 0.7600134197832293 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 10, 'iterations': 2000, 'depth': 6, 'l2_leaf_reg': 11, 'border_count': 128, 'max_ctr_complexity': 4, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.8820036874449995}. Best is trial 4 with value: 0.7624312115954699.


0:	learn: 0.6467616	total: 143ms	remaining: 4m 46s
1000:	learn: 0.1267849	total: 2m 21s	remaining: 2m 21s
1999:	learn: 0.1195460	total: 4m 42s	remaining: 0us


[I 2024-03-13 19:27:00,902] Trial 11 finished with value: 0.7612559133816059 and parameters: {'undersampling': False, 'n_unique_to_cat_cols': 15, 'iterations': 2000, 'depth': 8, 'l2_leaf_reg': 19, 'border_count': 64, 'max_ctr_complexity': 7, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.5445425607641393}. Best is trial 4 with value: 0.7624312115954699.


0:	learn: 0.6465575	total: 59.5ms	remaining: 1m 58s
1000:	learn: 0.1319415	total: 56.5s	remaining: 56.4s
1999:	learn: 0.1270384	total: 1m 51s	remaining: 0us


[I 2024-03-13 19:29:11,515] Trial 12 finished with value: 0.7603186973711233 and parameters: {'undersampling': False, 'n_unique_to_cat_cols': 5, 'iterations': 2000, 'depth': 6, 'l2_leaf_reg': 11, 'border_count': 64, 'max_ctr_complexity': 5, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.5295612083361738}. Best is trial 4 with value: 0.7624312115954699.


0:	learn: 0.6467326	total: 160ms	remaining: 6m 40s
1000:	learn: 0.1268415	total: 2m 35s	remaining: 3m 53s
2000:	learn: 0.1197524	total: 5m 10s	remaining: 1m 17s
2499:	learn: 0.1164513	total: 6m 28s	remaining: 0us


[I 2024-03-13 19:36:03,191] Trial 13 finished with value: 0.7615598972598127 and parameters: {'undersampling': False, 'n_unique_to_cat_cols': 20, 'iterations': 2500, 'depth': 8, 'l2_leaf_reg': 25, 'border_count': 64, 'max_ctr_complexity': 3, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.45469348824890204}. Best is trial 4 with value: 0.7624312115954699.


0:	learn: 0.6465328	total: 148ms	remaining: 4m 54s
1000:	learn: 0.1306449	total: 2m 21s	remaining: 2m 21s
1999:	learn: 0.1252569	total: 4m 41s	remaining: 0us


[I 2024-03-13 19:41:09,943] Trial 14 finished with value: 0.7606681626383601 and parameters: {'undersampling': False, 'n_unique_to_cat_cols': 25, 'iterations': 2000, 'depth': 7, 'l2_leaf_reg': 9, 'border_count': 64, 'max_ctr_complexity': 6, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.7559813162335914}. Best is trial 4 with value: 0.7624312115954699.


0:	learn: 0.6660759	total: 105ms	remaining: 4m 23s
1000:	learn: 0.2588459	total: 1m 39s	remaining: 2m 29s
2000:	learn: 0.2217683	total: 3m 19s	remaining: 49.6s
2499:	learn: 0.2051845	total: 4m 9s	remaining: 0us


[I 2024-03-13 19:45:31,079] Trial 15 finished with value: 0.7605044110332385 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 15, 'iterations': 2500, 'depth': 8, 'l2_leaf_reg': 15, 'border_count': 64, 'max_ctr_complexity': 5, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.25938198554245695}. Best is trial 4 with value: 0.7624312115954699.


0:	learn: 0.6468481	total: 46.9ms	remaining: 1m 33s
1000:	learn: 0.1340094	total: 45.2s	remaining: 45.1s
1999:	learn: 0.1306521	total: 1m 30s	remaining: 0us


[I 2024-03-13 19:47:19,804] Trial 16 finished with value: 0.7597756872776726 and parameters: {'undersampling': False, 'n_unique_to_cat_cols': 5, 'iterations': 2000, 'depth': 5, 'l2_leaf_reg': 13, 'border_count': 64, 'max_ctr_complexity': 7, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.6634277046487813}. Best is trial 4 with value: 0.7624312115954699.


0:	learn: 0.6465362	total: 108ms	remaining: 2m 41s
1000:	learn: 0.1318572	total: 1m 44s	remaining: 52.2s
1499:	learn: 0.1296458	total: 2m 36s	remaining: 0us


[I 2024-03-13 19:50:19,448] Trial 17 finished with value: 0.7605750672203629 and parameters: {'undersampling': False, 'n_unique_to_cat_cols': 20, 'iterations': 1500, 'depth': 6, 'l2_leaf_reg': 19, 'border_count': 128, 'max_ctr_complexity': 2, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.3208994514060165}. Best is trial 4 with value: 0.7624312115954699.


0:	learn: 0.6662075	total: 62.3ms	remaining: 1m 2s
999:	learn: 0.2701271	total: 58s	remaining: 0us


[I 2024-03-13 19:51:27,883] Trial 18 finished with value: 0.760483374038964 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 10, 'iterations': 1000, 'depth': 7, 'l2_leaf_reg': 7, 'border_count': 64, 'max_ctr_complexity': 5, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.42162267927762787}. Best is trial 4 with value: 0.7624312115954699.


0:	learn: 0.6466092	total: 178ms	remaining: 7m 24s
1000:	learn: 0.1277453	total: 2m 50s	remaining: 4m 14s
2000:	learn: 0.1207318	total: 5m 39s	remaining: 1m 24s
2499:	learn: 0.1173137	total: 7m 4s	remaining: 0us


[I 2024-03-13 19:58:56,640] Trial 19 finished with value: 0.7616220883478653 and parameters: {'undersampling': False, 'n_unique_to_cat_cols': 25, 'iterations': 2500, 'depth': 8, 'l2_leaf_reg': 25, 'border_count': 64, 'max_ctr_complexity': 4, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.1773106889186135}. Best is trial 4 with value: 0.7624312115954699.


0:	learn: 0.6481219	total: 13.9ms	remaining: 27.7s
1000:	learn: 0.1344277	total: 18.4s	remaining: 18.3s
1999:	learn: 0.1313728	total: 36.6s	remaining: 0us


[I 2024-03-13 19:59:52,206] Trial 20 finished with value: 0.7590899694639724 and parameters: {'undersampling': False, 'n_unique_to_cat_cols': 5, 'iterations': 2000, 'depth': 5, 'l2_leaf_reg': 17, 'border_count': 128, 'max_ctr_complexity': 0, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.6326764733634739}. Best is trial 4 with value: 0.7624312115954699.


0:	learn: 0.6665671	total: 26.3ms	remaining: 1m 5s
1000:	learn: 0.2778392	total: 22.9s	remaining: 34.3s
2000:	learn: 0.2545292	total: 45.6s	remaining: 11.4s
2499:	learn: 0.2441804	total: 57s	remaining: 0us


[I 2024-03-13 20:01:01,517] Trial 21 finished with value: 0.7610437467155537 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 25, 'iterations': 2500, 'depth': 7, 'l2_leaf_reg': 19, 'border_count': 128, 'max_ctr_complexity': 0, 'bootstrap_type': 'Bernoulli', 'subsample': 0.5036628203658072}. Best is trial 4 with value: 0.7624312115954699.


0:	learn: 0.6667377	total: 26.3ms	remaining: 1m 5s
1000:	learn: 0.2731533	total: 23.3s	remaining: 34.9s
2000:	learn: 0.2474492	total: 46.7s	remaining: 11.6s
2499:	learn: 0.2354052	total: 58.3s	remaining: 0us


[I 2024-03-13 20:02:12,634] Trial 22 finished with value: 0.761576579396445 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 20, 'iterations': 2500, 'depth': 7, 'l2_leaf_reg': 17, 'border_count': 128, 'max_ctr_complexity': 1, 'bootstrap_type': 'Bernoulli', 'subsample': 0.6891009275296313}. Best is trial 4 with value: 0.7624312115954699.


0:	learn: 0.6663924	total: 39.1ms	remaining: 1m 37s
1000:	learn: 0.2617009	total: 35s	remaining: 52.4s
2000:	learn: 0.2265131	total: 1m 9s	remaining: 17.4s
2499:	learn: 0.2112231	total: 1m 27s	remaining: 0us


[I 2024-03-13 20:03:52,378] Trial 23 finished with value: 0.7602598999023569 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 25, 'iterations': 2500, 'depth': 8, 'l2_leaf_reg': 13, 'border_count': 128, 'max_ctr_complexity': 1, 'bootstrap_type': 'Bernoulli', 'subsample': 0.5985727033200412}. Best is trial 4 with value: 0.7624312115954699.


0:	learn: 0.6662798	total: 27.1ms	remaining: 54.1s
1000:	learn: 0.2733479	total: 23.8s	remaining: 23.7s
1999:	learn: 0.2479479	total: 47.3s	remaining: 0us


[I 2024-03-13 20:04:51,743] Trial 24 finished with value: 0.762388005826276 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 15, 'iterations': 2000, 'depth': 7, 'l2_leaf_reg': 21, 'border_count': 128, 'max_ctr_complexity': 1, 'bootstrap_type': 'Bernoulli', 'subsample': 0.8382249191261969}. Best is trial 4 with value: 0.7624312115954699.


0:	learn: 0.6661873	total: 102ms	remaining: 3m 24s
1000:	learn: 0.2621895	total: 1m 35s	remaining: 1m 35s
1999:	learn: 0.2306130	total: 3m 9s	remaining: 0us


[I 2024-03-13 20:08:13,388] Trial 25 finished with value: 0.7609312594334433 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 15, 'iterations': 2000, 'depth': 8, 'l2_leaf_reg': 23, 'border_count': 64, 'max_ctr_complexity': 3, 'bootstrap_type': 'Bernoulli', 'subsample': 0.9839766798866445}. Best is trial 4 with value: 0.7624312115954699.


0:	learn: 0.6662092	total: 87.8ms	remaining: 2m 11s
1000:	learn: 0.2740546	total: 1m 17s	remaining: 38.7s
1499:	learn: 0.2604799	total: 1m 56s	remaining: 0us


[I 2024-03-13 20:10:21,179] Trial 26 finished with value: 0.7622016237761997 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 15, 'iterations': 1500, 'depth': 7, 'l2_leaf_reg': 21, 'border_count': 128, 'max_ctr_complexity': 2, 'bootstrap_type': 'Bernoulli', 'subsample': 0.7894401551890882}. Best is trial 4 with value: 0.7624312115954699.


0:	learn: 0.6482848	total: 12.6ms	remaining: 25.1s
1000:	learn: 0.1270372	total: 37.5s	remaining: 37.4s
1999:	learn: 0.1191759	total: 1m 14s	remaining: 0us


[I 2024-03-13 20:11:55,416] Trial 27 finished with value: 0.7613958114607107 and parameters: {'undersampling': False, 'n_unique_to_cat_cols': 10, 'iterations': 2000, 'depth': 8, 'l2_leaf_reg': 25, 'border_count': 64, 'max_ctr_complexity': 1, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.39770580640874614}. Best is trial 4 with value: 0.7624312115954699.


0:	learn: 0.6663922	total: 92.9ms	remaining: 1m 32s
999:	learn: 0.2772025	total: 1m 26s	remaining: 0us


[I 2024-03-13 20:13:33,807] Trial 28 finished with value: 0.7608671958255107 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 20, 'iterations': 1000, 'depth': 7, 'l2_leaf_reg': 23, 'border_count': 128, 'max_ctr_complexity': 3, 'bootstrap_type': 'Bernoulli', 'subsample': 0.6881202950351428}. Best is trial 4 with value: 0.7624312115954699.


0:	learn: 0.6662583	total: 79.4ms	remaining: 1m 59s
1000:	learn: 0.2734187	total: 1m 14s	remaining: 37s
1499:	learn: 0.2588552	total: 1m 51s	remaining: 0us


[I 2024-03-13 20:15:36,494] Trial 29 finished with value: 0.7629756480426104 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 15, 'iterations': 1500, 'depth': 7, 'l2_leaf_reg': 21, 'border_count': 64, 'max_ctr_complexity': 5, 'bootstrap_type': 'Bernoulli', 'subsample': 0.9172004622572392}. Best is trial 29 with value: 0.7629756480426104.


0:	learn: 0.6659114	total: 52.7ms	remaining: 1m 18s
1000:	learn: 0.2808800	total: 48.6s	remaining: 24.2s
1499:	learn: 0.2701986	total: 1m 12s	remaining: 0us


[I 2024-03-13 20:16:59,666] Trial 30 finished with value: 0.7618701584723739 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 10, 'iterations': 1500, 'depth': 6, 'l2_leaf_reg': 17, 'border_count': 128, 'max_ctr_complexity': 5, 'bootstrap_type': 'Bernoulli', 'subsample': 0.9114198962510881}. Best is trial 29 with value: 0.7629756480426104.


0:	learn: 0.6661856	total: 83ms	remaining: 2m 4s
1000:	learn: 0.2738768	total: 1m 18s	remaining: 38.9s
1499:	learn: 0.2610910	total: 1m 56s	remaining: 0us


[I 2024-03-13 20:19:07,849] Trial 31 finished with value: 0.7629860683487327 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 15, 'iterations': 1500, 'depth': 7, 'l2_leaf_reg': 21, 'border_count': 64, 'max_ctr_complexity': 6, 'bootstrap_type': 'Bernoulli', 'subsample': 0.9275798959802446}. Best is trial 31 with value: 0.7629860683487327.


0:	learn: 0.6662037	total: 84ms	remaining: 1m 23s
999:	learn: 0.2718947	total: 1m 17s	remaining: 0us


[I 2024-03-13 20:20:36,816] Trial 32 finished with value: 0.7610024565324741 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 15, 'iterations': 1000, 'depth': 7, 'l2_leaf_reg': 21, 'border_count': 64, 'max_ctr_complexity': 7, 'bootstrap_type': 'Bernoulli', 'subsample': 0.93582728733298}. Best is trial 31 with value: 0.7629860683487327.


0:	learn: 0.6663670	total: 91.5ms	remaining: 2m 17s
1000:	learn: 0.2736587	total: 1m 25s	remaining: 42.6s
1499:	learn: 0.2596297	total: 2m 8s	remaining: 0us


[I 2024-03-13 20:22:56,592] Trial 33 finished with value: 0.7617534317501302 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 20, 'iterations': 1500, 'depth': 7, 'l2_leaf_reg': 21, 'border_count': 64, 'max_ctr_complexity': 4, 'bootstrap_type': 'Bernoulli', 'subsample': 0.8403141305945747}. Best is trial 31 with value: 0.7629860683487327.


0:	learn: 0.6662963	total: 72.1ms	remaining: 1m 48s
1000:	learn: 0.2841363	total: 1m 7s	remaining: 33.8s
1499:	learn: 0.2745532	total: 1m 41s	remaining: 0us


[I 2024-03-13 20:24:49,370] Trial 34 finished with value: 0.7609807804352784 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 20, 'iterations': 1500, 'depth': 6, 'l2_leaf_reg': 19, 'border_count': 64, 'max_ctr_complexity': 6, 'bootstrap_type': 'Bernoulli', 'subsample': 0.9365403170705552}. Best is trial 31 with value: 0.7629860683487327.


0:	learn: 0.6664235	total: 63.8ms	remaining: 1m 3s
999:	learn: 0.2796182	total: 1m	remaining: 0us


[I 2024-03-13 20:26:00,849] Trial 35 finished with value: 0.7616944826882179 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 15, 'iterations': 1000, 'depth': 6, 'l2_leaf_reg': 5, 'border_count': 64, 'max_ctr_complexity': 5, 'bootstrap_type': 'Bernoulli', 'subsample': 0.8155164946376867}. Best is trial 31 with value: 0.7629860683487327.


0:	learn: 0.6663270	total: 63.4ms	remaining: 1m 35s
1000:	learn: 0.2713274	total: 59.2s	remaining: 29.5s
1499:	learn: 0.2561392	total: 1m 28s	remaining: 0us


[I 2024-03-13 20:27:39,853] Trial 36 finished with value: 0.7623839248698857 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 10, 'iterations': 1500, 'depth': 7, 'l2_leaf_reg': 15, 'border_count': 64, 'max_ctr_complexity': 6, 'bootstrap_type': 'Bernoulli', 'subsample': 0.9412698510127167}. Best is trial 31 with value: 0.7629860683487327.


0:	learn: 0.6670407	total: 24ms	remaining: 48s
1000:	learn: 0.2727162	total: 20.7s	remaining: 20.6s
1999:	learn: 0.2453486	total: 41.3s	remaining: 0us


[I 2024-03-13 20:28:30,846] Trial 37 finished with value: 0.76271793797215 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 5, 'iterations': 2000, 'depth': 7, 'l2_leaf_reg': 21, 'border_count': 64, 'max_ctr_complexity': 0, 'bootstrap_type': 'Bernoulli', 'subsample': 0.7649865336426298}. Best is trial 31 with value: 0.7629860683487327.


0:	learn: 0.6666593	total: 18ms	remaining: 27s
1000:	learn: 0.2806627	total: 14.9s	remaining: 7.45s
1499:	learn: 0.2698263	total: 22.5s	remaining: 0us


[I 2024-03-13 20:29:03,077] Trial 38 finished with value: 0.762444822248251 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 5, 'iterations': 1500, 'depth': 6, 'l2_leaf_reg': 17, 'border_count': 64, 'max_ctr_complexity': 0, 'bootstrap_type': 'Bernoulli', 'subsample': 0.7655375861856879}. Best is trial 31 with value: 0.7629860683487327.


0:	learn: 0.6666224	total: 9.23ms	remaining: 9.22s
999:	learn: 0.2816704	total: 8.52s	remaining: 0us


[I 2024-03-13 20:29:14,508] Trial 39 finished with value: 0.7605625091063553 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 0, 'iterations': 1000, 'depth': 6, 'l2_leaf_reg': 17, 'border_count': 64, 'max_ctr_complexity': 0, 'bootstrap_type': 'Bernoulli', 'subsample': 0.7472868736746163}. Best is trial 31 with value: 0.7629860683487327.


0:	learn: 0.6668808	total: 13.8ms	remaining: 20.6s
1000:	learn: 0.2905427	total: 11.7s	remaining: 5.81s
1499:	learn: 0.2827655	total: 17.4s	remaining: 0us


[I 2024-03-13 20:29:41,650] Trial 40 finished with value: 0.7608191356303531 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 5, 'iterations': 1500, 'depth': 5, 'l2_leaf_reg': 19, 'border_count': 64, 'max_ctr_complexity': 0, 'bootstrap_type': 'Bernoulli', 'subsample': 0.7799122237875196}. Best is trial 31 with value: 0.7629860683487327.


0:	learn: 0.6666446	total: 15.2ms	remaining: 22.7s
1000:	learn: 0.2836289	total: 14.7s	remaining: 7.32s
1499:	learn: 0.2740054	total: 21.9s	remaining: 0us


[I 2024-03-13 20:30:13,258] Trial 41 finished with value: 0.760929812407961 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 5, 'iterations': 1500, 'depth': 6, 'l2_leaf_reg': 23, 'border_count': 64, 'max_ctr_complexity': 1, 'bootstrap_type': 'Bernoulli', 'subsample': 0.6617427599896003}. Best is trial 31 with value: 0.7629860683487327.


0:	learn: 0.6663301	total: 12.2ms	remaining: 18.2s
1000:	learn: 0.2735564	total: 10.5s	remaining: 5.25s
1499:	learn: 0.2588545	total: 15.9s	remaining: 0us


[I 2024-03-13 20:30:32,239] Trial 42 finished with value: 0.7612164888278091 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 0, 'iterations': 1500, 'depth': 7, 'l2_leaf_reg': 13, 'border_count': 64, 'max_ctr_complexity': 0, 'bootstrap_type': 'Bernoulli', 'subsample': 0.626298423396377}. Best is trial 31 with value: 0.7629860683487327.


0:	learn: 0.6666822	total: 35.4ms	remaining: 1m 10s
1000:	learn: 0.2807038	total: 35.7s	remaining: 35.7s
1999:	learn: 0.2603432	total: 1m 11s	remaining: 0us


[I 2024-03-13 20:31:53,058] Trial 43 finished with value: 0.7620500926458066 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 5, 'iterations': 2000, 'depth': 6, 'l2_leaf_reg': 21, 'border_count': 64, 'max_ctr_complexity': 2, 'bootstrap_type': 'Bernoulli', 'subsample': 0.8736236863804726}. Best is trial 31 with value: 0.7629860683487327.


0:	learn: 0.6656799	total: 18.1ms	remaining: 27.2s
1000:	learn: 0.2578415	total: 15.4s	remaining: 7.67s
1499:	learn: 0.2389182	total: 22.9s	remaining: 0us


[I 2024-03-13 20:32:18,986] Trial 44 finished with value: 0.7625228444841403 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 0, 'iterations': 1500, 'depth': 8, 'l2_leaf_reg': 19, 'border_count': 64, 'max_ctr_complexity': 6, 'bootstrap_type': 'Bernoulli', 'subsample': 0.7219250806657669}. Best is trial 31 with value: 0.7629860683487327.


0:	learn: 0.6668403	total: 9.07ms	remaining: 13.6s
1000:	learn: 0.2830655	total: 8.41s	remaining: 4.19s
1499:	learn: 0.2728096	total: 12.5s	remaining: 0us


[I 2024-03-13 20:32:34,425] Trial 45 finished with value: 0.7617338159415973 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 0, 'iterations': 1500, 'depth': 6, 'l2_leaf_reg': 19, 'border_count': 64, 'max_ctr_complexity': 6, 'bootstrap_type': 'Bernoulli', 'subsample': 0.729223309678627}. Best is trial 31 with value: 0.7629860683487327.


0:	learn: 0.6662410	total: 13.4ms	remaining: 13.4s
999:	learn: 0.2723409	total: 11s	remaining: 0us


[I 2024-03-13 20:32:48,539] Trial 46 finished with value: 0.7610892212139863 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 0, 'iterations': 1000, 'depth': 7, 'l2_leaf_reg': 17, 'border_count': 64, 'max_ctr_complexity': 7, 'bootstrap_type': 'Bernoulli', 'subsample': 0.7784336557477124}. Best is trial 31 with value: 0.7629860683487327.


0:	learn: 0.6662319	total: 74ms	remaining: 1m 50s
1000:	learn: 0.2634332	total: 1m 14s	remaining: 37s
1499:	learn: 0.2447689	total: 1m 51s	remaining: 0us


[I 2024-03-13 20:34:50,176] Trial 47 finished with value: 0.7609998501639565 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 10, 'iterations': 1500, 'depth': 8, 'l2_leaf_reg': 23, 'border_count': 64, 'max_ctr_complexity': 6, 'bootstrap_type': 'Bernoulli', 'subsample': 0.6947091669310717}. Best is trial 31 with value: 0.7629860683487327.


0:	learn: 0.6669582	total: 8.11ms	remaining: 12.2s
1000:	learn: 0.2870467	total: 7s	remaining: 3.49s
1499:	learn: 0.2792449	total: 10.6s	remaining: 0us


[I 2024-03-13 20:35:03,680] Trial 48 finished with value: 0.7616369083005139 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 0, 'iterations': 1500, 'depth': 5, 'l2_leaf_reg': 19, 'border_count': 64, 'max_ctr_complexity': 5, 'bootstrap_type': 'Bernoulli', 'subsample': 0.9646764399070219}. Best is trial 31 with value: 0.7629860683487327.


0:	learn: 0.6666321	total: 21.8ms	remaining: 10.9s
499:	learn: 0.3008943	total: 10.9s	remaining: 0us


[I 2024-03-13 20:35:24,149] Trial 49 finished with value: 0.7538583366876188 and parameters: {'undersampling': True, 'n_unique_to_cat_cols': 5, 'iterations': 500, 'depth': 4, 'l2_leaf_reg': 21, 'border_count': 64, 'max_ctr_complexity': 6, 'bootstrap_type': 'Bernoulli', 'subsample': 0.8051440267922123}. Best is trial 31 with value: 0.7629860683487327.


Number of finished trials: 50
Best trial:
  Value: 0.7629860683487327
  Params:
    undersampling: True
    n_unique_to_cat_cols: 15
    iterations: 1500
    depth: 7
    l2_leaf_reg: 21
    border_count: 64
    max_ctr_complexity: 6
    bootstrap_type: Bernoulli
    subsample: 0.9275798959802446


In [10]:
final_params = trial.params
final_params = {
    k: (round(v, 2) if isinstance(v, float) else v)
    for k, v in final_params.items()
}
final_params

{'undersampling': True,
 'n_unique_to_cat_cols': 15,
 'iterations': 1500,
 'depth': 7,
 'l2_leaf_reg': 21,
 'border_count': 64,
 'max_ctr_complexity': 6,
 'bootstrap_type': 'Bernoulli',
 'subsample': 0.93}

In [11]:
final_params = {**final_params, **default_params}
final_params

{'undersampling': True,
 'n_unique_to_cat_cols': 15,
 'iterations': 1500,
 'depth': 7,
 'l2_leaf_reg': 21,
 'border_count': 64,
 'max_ctr_complexity': 6,
 'bootstrap_type': 'Bernoulli',
 'subsample': 0.93,
 'random_state': 42,
 'allow_writing_files': False,
 'verbose': 1000,
 'task_type': 'GPU'}

In [9]:
final_params = {
    'undersampling': True,
    'n_unique_to_cat_cols': 15,
    'iterations': 1500,
    'depth': 7,
    'l2_leaf_reg': 21,
    'border_count': 64,
    'max_ctr_complexity': 6,
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.93,
    'random_state': 42,
    'allow_writing_files': False,
    'verbose': 1000,
    'task_type': 'GPU',
}
final_params

{'undersampling': True,
 'n_unique_to_cat_cols': 15,
 'iterations': 1500,
 'depth': 7,
 'l2_leaf_reg': 21,
 'border_count': 64,
 'max_ctr_complexity': 6,
 'bootstrap_type': 'Bernoulli',
 'subsample': 0.93,
 'random_state': 42,
 'allow_writing_files': False,
 'verbose': 1000,
 'task_type': 'GPU'}

In [10]:
if final_params['undersampling']:
    Xy_train = data.group_by(
        'target'
    ).head(
        100_000
    ).to_pandas()
else:
    Xy_train = data.to_pandas()

X_train_sampled, y_train_sampled = Xy_train.drop(columns=['target']), Xy_train['target'].values

cat_cols = n_unique_vals_by_col.filter(
    pl.col('n_unique') <= final_params['n_unique_to_cat_cols']
)['column'].to_list()

train_pool = Pool(X_train_sampled, label=y_train_sampled, cat_features=cat_cols)

model = CatBoostClassifier(**{k: v for k, v in final_params.items() if k not in ['undersampling', 'n_unique_to_cat_cols']})
model.fit(train_pool)

0:	learn: 0.6724481	total: 5.88s	remaining: 2h 26m 59s
1000:	learn: 0.3394862	total: 1m 24s	remaining: 42s
1499:	learn: 0.3239409	total: 2m 3s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x78715803dae0>

In [11]:
data_test = pl.read_parquet(DATA_PATH + 'data_test_cleaned.parquet')

In [12]:
X_test = data_test.drop('target').to_pandas()

y_pred = model.predict_proba(X_test)[:, 1]
y_pred_binary = (y_pred > round(y_train.mean(), 3)).astype(int)

In [13]:
submission = pl.read_parquet(DATA_PATH + 'test_sber.parquet').select('id').to_pandas()

submission['target_prob'] = y_pred
submission['target_bin'] = y_pred_binary

submission.to_csv('submission_2_top_n.csv', index=False)
submission.head(3)

Unnamed: 0,id,target_prob,target_bin
0,3,0.061472,1
1,4,0.084514,1
2,12,0.150282,1


## 2. Feature selection

In [14]:
def select_features(
    pool_train: Pool, pool_valid: Pool, final_params, features,
    algorithm: EFeaturesSelectionAlgorithm,
    num_features_to_select: int = 250,
    steps: int = 25,
):
    """
    taken and updated from:
    https://github.com/catboost/catboost/blob/master/catboost/
    tutorials/feature_selection/select_features_tutorial.ipynb
    """
    print('Algorithm:', algorithm)
    model = CatBoostClassifier(**final_params)

    summary = model.select_features(
        pool_train,
        eval_set=pool_valid,
        features_for_select=features,
        num_features_to_select=num_features_to_select,
        steps=steps,                                     # more steps - more accurate selection
        algorithm=algorithm,
        shap_calc_type=EShapCalcType.Regular,            # can be Approximate, Regular and Exact
        train_final_model=True,                          # to train model with selected features
        logging_level='Verbose',
        plot=True,
    )
    print('Selected features:', summary['selected_features_names'])
    print()
    print('Eliminated features:', summary['eliminated_features_names'])

    return summary

In [15]:
if final_params['undersampling']:
    X_train['y'] = y_train
    Xy_train = X_train.groupby('y').head(100_000)

    X_train_sampled, y_train_sampled = Xy_train.drop(columns=['y']), Xy_train['y'].values
else:
    X_train_sampled, y_train_sampled = X_train.copy(), y_train.copy()

cat_cols = n_unique_vals_by_col.filter(
    pl.col('n_unique') <= final_params['n_unique_to_cat_cols']
)['column'].to_list()

train_pool = Pool(X_train_sampled, label=y_train_sampled, cat_features=cat_cols)
valid_pool = Pool(X_valid, label=y_valid, cat_features=cat_cols)

summary = select_features(
    pool_train=train_pool, pool_valid=valid_pool, final_params={k: v for k, v in final_params.items() if k not in ['undersampling', 'n_unique_to_cat_cols']},
    features = X_valid.columns.to_list(), algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues, num_features_to_select=250, steps=25,
)

Algorithm: EFeaturesSelectionAlgorithm.RecursiveByShapValues


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Step #1 out of 25
0:	learn: 0.6691895	test: 0.6641792	best: 0.6641792 (0)	total: 86ms	remaining: 2m 8s
1000:	learn: 0.3000189	test: 0.1787809	best: 0.1787775 (999)	total: 1m 17s	remaining: 38.9s
1499:	learn: 0.2895844	test: 0.1781511	best: 0.1781442 (1490)	total: 1m 56s	remaining: 0us
bestTest = 0.1781442179
bestIteration = 1490
Shrink model to first 1491 iterations.
Feature #605 eliminated
Feature #594 eliminated
Feature #249 eliminated
Feature #263 eliminated
Feature #595 eliminated
Feature #582 eliminated
Feature #318 eliminated
Feature #20 eliminated
Feature #608 eliminated
Feature #490 eliminated
Feature #587 eliminated
Feature #399 eliminated
Feature #604 eliminated
Feature #160 eliminated
Feature #467 eliminated
Feature #616 eliminated
Feature #161 eliminated
Feature #512 eliminated
Feature #242 eliminated
Feature #601 eliminated
Feature #65 eliminated
Feature #353 eliminated
Feature #589 eliminated
Feature #199 eliminated
Step #2 out of 25
0:	learn: 0.6696105	test: 0.6645759	be

Selected features: ['feature36', 'feature38', 'feature41', 'feature44', 'feature45', 'feature48', 'feature49', 'feature52', 'feature53', 'feature56', 'feature57', 'feature58', 'feature59', 'feature60', 'feature61', 'feature64', 'feature65', 'feature70', 'feature71', 'feature72', 'feature74', 'feature75', 'feature76', 'feature78', 'feature79', 'feature80', 'feature81', 'feature82', 'feature83', 'feature84', 'feature85', 'feature86', 'feature87', 'feature92', 'feature95', 'feature97', 'feature100', 'feature102', 'feature104', 'feature106', 'feature111', 'feature112', 'feature113', 'feature114', 'feature115', 'feature117', 'feature123', 'feature125', 'feature127', 'feature128', 'feature129', 'feature133', 'feature140', 'feature146', 'feature150', 'feature153', 'feature161', 'feature163', 'feature164', 'feature166', 'feature167', 'feature168', 'feature169', 'feature171', 'feature186', 'feature196', 'feature197', 'feature202', 'feature203', 'feature205', 'feature214', 'feature215', 'feature

In [16]:
if final_params['undersampling']:
    Xy_train = data.group_by(
        'target'
    ).head(
        100_000
    ).to_pandas()
else:
    Xy_train = data.to_pandas()

X_train_sampled, y_train_sampled = Xy_train.drop(columns=['target']).loc[:, summary['selected_features_names']], Xy_train['target'].values

cat_cols = n_unique_vals_by_col.filter(
    pl.col('n_unique') <= final_params['n_unique_to_cat_cols']
)['column'].to_list()

cat_cols = [col for col in cat_cols if col in summary['selected_features_names']]

train_pool = Pool(X_train_sampled, label=y_train_sampled, cat_features=cat_cols)

model = CatBoostClassifier(**{k: v for k, v in final_params.items() if k not in ['undersampling', 'n_unique_to_cat_cols']})
model.fit(train_pool)

0:	learn: 0.6757779	total: 26.8ms	remaining: 40.2s
1000:	learn: 0.3916732	total: 25.8s	remaining: 12.8s
1499:	learn: 0.3799258	total: 38.7s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x78712e4876d0>

In [19]:
data_test = pl.read_parquet(DATA_PATH + 'data_test_cleaned.parquet')

In [20]:
X_test = data_test.drop('target').to_pandas()

X_test = X_test.loc[:, summary['selected_features_names']]

y_pred = model.predict_proba(X_test)[:, 1]
y_pred_binary = (y_pred > round(y_train.mean(), 3)).astype(int)

In [21]:
submission = pl.read_parquet(DATA_PATH + 'test_sber.parquet').select('id').to_pandas()

submission['target_prob'] = y_pred
submission['target_bin'] = y_pred_binary

submission.to_csv('submission_2_top_250.csv', index=False)
submission.head(3)

Unnamed: 0,id,target_prob,target_bin
0,3,0.141666,1
1,4,0.107053,1
2,12,0.102647,1
