In [1]:
from lightgbm.sklearn import LGBMClassifier
from sklearn.model_selection import StratifiedGroupKFold
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score
import joblib
from datetime import datetime
import numpy as np

import sys
sys.path.append("..")
from src.modeling import nestedcv, random_seed

random_seed(42)

# Setup and data loading

In [2]:
### Setup
ROOT = '..'
EXTENSION = 'parquet'
DATA_FOLDER = 'data'

In [3]:
train = pd.read_parquet(f'{ROOT}/{DATA_FOLDER}/train/train_base.{EXTENSION}')
train['date_decision'] = pd.to_datetime(train['date_decision'])
train.head(2)

Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target
0,0,2019-01-03,201901,0,0
1,1,2019-01-03,201901,0,0


In [4]:
# Balancing taking the number of positive target as reference
train_balanced = train.groupby('target', group_keys=False).apply(
    lambda x: x.sample(
        min(len(x), train.target.value_counts()[1])
        )
    ).reset_index(drop=True)

train_balanced.shape

  train_balanced = train.groupby('target', group_keys=False).apply(


(95988, 5)

In [5]:
# Import static features
train_static_0 = pd.read_parquet(f'{ROOT}/{DATA_FOLDER}/train/train_static_0_0.{EXTENSION}')
train_static_1 = pd.read_parquet(f'{ROOT}/{DATA_FOLDER}/train/train_static_0_1.{EXTENSION}')
train_static_concat = pd.concat([train_static_0, train_static_1])

# Getting rid of high missing columns
filter_high_na = (train_static_concat.isna().sum()/train_static_concat.shape[0]) < 0.5
columns_to_keep = train_static_concat.columns[filter_high_na.values]

train_merged = train_balanced.merge(train_static_concat[columns_to_keep], on='case_id', validate='1:1')

# Exclude object columns
train_merged = train_merged.loc[:, ~train_merged.columns.isin(train_merged.select_dtypes(include=['O', '<M8[ns]']).columns)]

train_merged.head()

Unnamed: 0,case_id,MONTH,WEEK_NUM,target,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,...,pmtnum_254L,posfpd10lastmonth_333P,posfpd30lastmonth_3976960P,posfstqpd30lastmonth_3976962P,price_1097A,sellerplacecnt_915L,sellerplacescnt_216L,sumoutstandtotal_3546847A,totaldebt_9A,totalsettled_863A
0,1653686,201911,47,0,0.0,0.0,1769.6,0.0,0.0,0.0,...,24.0,0.0,0.0,0.0,29496.0,0.0,0.0,0.0,0.0,291421.2
1,1726009,201912,52,0,0.0,15385.469,1685.6,0.0,0.0,0.0,...,24.0,0.0,0.0,0.0,28096.0,0.0,2.0,0.0,0.0,34062.0
2,736389,201907,26,0,0.0,,2297.0,0.0,0.0,0.0,...,12.0,0.0,0.0,0.0,22198.0,0.0,1.0,,0.0,0.0
3,1027343,202010,91,0,,,2143.4001,0.0,0.0,0.0,...,12.0,0.0,0.0,0.0,17516.0,0.0,0.0,,0.0,0.0
4,117009,201903,10,0,0.0,18596.201,1236.2001,0.0,0.0,0.0,...,,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,31960.0


In [6]:
# Save columns for inference later
pd.Series([i for i in train_merged.columns if i != 'target'], name='cols').to_csv('{}/{}/output/columns_to_keep.csv'.format(ROOT, DATA_FOLDER), index=False)

In [7]:
x = train_merged.drop(['case_id', 'target'], axis=1)
y = train_merged['target']
weeks = train_merged["WEEK_NUM"]

# Nested cross validation

In [8]:
cv_inner = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)          
cv_outer = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

In [9]:
random_seed(42)

def create_params(trial):
    return {
        'verbose': -1,
        'subsample': trial.suggest_float('subsample', 0.1, 0.9),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 0.95, step=0.05),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 0.95, step=0.05),
        'learning_rate': trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
        'max_depth': trial.suggest_int('max_depth', 2, 30),
    }

classifier = LGBMClassifier(n_estimators=1000, objective='binary', metric='auc', n_jobs=-1) 

lgb_clfs, lgb_oof_preds, lgb_oof_targets = nestedcv(x=x,
                                                    y=y,
                                                    groups=weeks,
                                                    classifier=classifier,
                                                    cv_outer=cv_outer,
                                                    cv_inner=cv_inner,
                                                    p_grid=create_params,
                                                    n_iter=15)

[I 2024-02-20 07:43:46,152] A new study created in memory with name: no-name-e7a6b289-8c89-4f78-bd9b-bb56d7ea0a60



Nested CV: 1 of 5 outer fold


[I 2024-02-20 07:44:01,278] Trial 0 finished with value: 0.7687175753868335 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'feature_fraction': 0.8, 'learning_rate': 0.006251373574521752, 'max_depth': 6}. Best is trial 0 with value: 0.7687175753868335.
[I 2024-02-20 07:44:22,323] Trial 1 finished with value: 0.7706064455573061 and parameters: {'subsample': 0.22479561626896213, 'bagging_fraction': 0.4, 'feature_fraction': 0.9, 'learning_rate': 0.006358358856676255, 'max_depth': 22}. Best is trial 1 with value: 0.7706064455573061.
[I 2024-02-20 07:44:44,713] Trial 2 finished with value: 0.7376648080581714 and parameters: {'subsample': 0.11646759543664197, 'bagging_fraction': 0.95, 'feature_fraction': 0.8500000000000001, 'learning_rate': 0.0004335281794951569, 'max_depth': 7}. Best is trial 1 with value: 0.7706064455573061.
[I 2024-02-20 07:45:06,671] Trial 3 finished with value: 0.7566188962656676 and parameters: {'subsample': 0.24672360788274705, 'bagging_fract

Found new best score with score 0.776358


[I 2024-02-20 07:46:48,439] Trial 0 finished with value: 0.779439548942004 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'feature_fraction': 0.8, 'learning_rate': 0.006251373574521752, 'max_depth': 6}. Best is trial 0 with value: 0.779439548942004.
[I 2024-02-20 07:46:57,154] Trial 1 finished with value: 0.7820899645859425 and parameters: {'subsample': 0.22479561626896213, 'bagging_fraction': 0.4, 'feature_fraction': 0.9, 'learning_rate': 0.006358358856676255, 'max_depth': 22}. Best is trial 1 with value: 0.7820899645859425.
[I 2024-02-20 07:47:05,725] Trial 2 finished with value: 0.7474278052253305 and parameters: {'subsample': 0.11646759543664197, 'bagging_fraction': 0.95, 'feature_fraction': 0.8500000000000001, 'learning_rate': 0.0004335281794951569, 'max_depth': 7}. Best is trial 1 with value: 0.7820899645859425.
[I 2024-02-20 07:47:14,165] Trial 3 finished with value: 0.7664056290207658 and parameters: {'subsample': 0.24672360788274705, 'bagging_fractio

Found new best score with score 0.786421


[I 2024-02-20 07:48:30,622] Trial 0 finished with value: 0.7778962754010024 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'feature_fraction': 0.8, 'learning_rate': 0.006251373574521752, 'max_depth': 6}. Best is trial 0 with value: 0.7778962754010024.
[I 2024-02-20 07:48:39,160] Trial 1 finished with value: 0.7792713855488749 and parameters: {'subsample': 0.22479561626896213, 'bagging_fraction': 0.4, 'feature_fraction': 0.9, 'learning_rate': 0.006358358856676255, 'max_depth': 22}. Best is trial 1 with value: 0.7792713855488749.
[I 2024-02-20 07:48:47,822] Trial 2 finished with value: 0.7437930890391607 and parameters: {'subsample': 0.11646759543664197, 'bagging_fraction': 0.95, 'feature_fraction': 0.8500000000000001, 'learning_rate': 0.0004335281794951569, 'max_depth': 7}. Best is trial 1 with value: 0.7792713855488749.
[I 2024-02-20 07:48:56,314] Trial 3 finished with value: 0.7637644278196647 and parameters: {'subsample': 0.24672360788274705, 'bagging_fract

Found new best score with score 0.787804


[I 2024-02-20 07:54:41,044] A new study created in memory with name: no-name-284b2d34-c449-4fa2-8409-3e06fed87749



        Test AUC                           : 0.786
        

Nested CV: 2 of 5 outer fold


[I 2024-02-20 07:54:48,801] Trial 0 finished with value: 0.7730563087141924 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'feature_fraction': 0.8, 'learning_rate': 0.006251373574521752, 'max_depth': 6}. Best is trial 0 with value: 0.7730563087141924.
[I 2024-02-20 07:54:58,898] Trial 1 finished with value: 0.7756234936577133 and parameters: {'subsample': 0.22479561626896213, 'bagging_fraction': 0.4, 'feature_fraction': 0.9, 'learning_rate': 0.006358358856676255, 'max_depth': 22}. Best is trial 1 with value: 0.7756234936577133.
[I 2024-02-20 07:55:08,566] Trial 2 finished with value: 0.7407127577283183 and parameters: {'subsample': 0.11646759543664197, 'bagging_fraction': 0.95, 'feature_fraction': 0.8500000000000001, 'learning_rate': 0.0004335281794951569, 'max_depth': 7}. Best is trial 1 with value: 0.7756234936577133.
[I 2024-02-20 07:55:18,440] Trial 3 finished with value: 0.7592779030073923 and parameters: {'subsample': 0.24672360788274705, 'bagging_fract

Found new best score with score 0.779541


[I 2024-02-20 07:56:49,810] Trial 0 finished with value: 0.7867967975816239 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'feature_fraction': 0.8, 'learning_rate': 0.006251373574521752, 'max_depth': 6}. Best is trial 0 with value: 0.7867967975816239.
[I 2024-02-20 07:56:59,699] Trial 1 finished with value: 0.7892354938260151 and parameters: {'subsample': 0.22479561626896213, 'bagging_fraction': 0.4, 'feature_fraction': 0.9, 'learning_rate': 0.006358358856676255, 'max_depth': 22}. Best is trial 1 with value: 0.7892354938260151.
[I 2024-02-20 07:57:09,151] Trial 2 finished with value: 0.7509979257674336 and parameters: {'subsample': 0.11646759543664197, 'bagging_fraction': 0.95, 'feature_fraction': 0.8500000000000001, 'learning_rate': 0.0004335281794951569, 'max_depth': 7}. Best is trial 1 with value: 0.7892354938260151.
[I 2024-02-20 07:57:18,643] Trial 3 finished with value: 0.77185310248778 and parameters: {'subsample': 0.24672360788274705, 'bagging_fractio

Found new best score with score 0.794949


[I 2024-02-20 07:58:48,143] Trial 0 finished with value: 0.7810732477988553 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'feature_fraction': 0.8, 'learning_rate': 0.006251373574521752, 'max_depth': 6}. Best is trial 0 with value: 0.7810732477988553.
[I 2024-02-20 07:58:58,289] Trial 1 finished with value: 0.7833872327984618 and parameters: {'subsample': 0.22479561626896213, 'bagging_fraction': 0.4, 'feature_fraction': 0.9, 'learning_rate': 0.006358358856676255, 'max_depth': 22}. Best is trial 1 with value: 0.7833872327984618.
[I 2024-02-20 07:59:08,025] Trial 2 finished with value: 0.7425970134148947 and parameters: {'subsample': 0.11646759543664197, 'bagging_fraction': 0.95, 'feature_fraction': 0.8500000000000001, 'learning_rate': 0.0004335281794951569, 'max_depth': 7}. Best is trial 1 with value: 0.7833872327984618.
[I 2024-02-20 07:59:17,821] Trial 3 finished with value: 0.7656436450731465 and parameters: {'subsample': 0.24672360788274705, 'bagging_fract


        Test AUC                           : 0.783
        

Nested CV: 3 of 5 outer fold


[I 2024-02-20 08:05:20,160] A new study created in memory with name: no-name-c00cbc7d-12c9-45f4-be23-cd9a8f93d4ea
[I 2024-02-20 08:05:29,036] Trial 0 finished with value: 0.77585387381743 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'feature_fraction': 0.8, 'learning_rate': 0.006251373574521752, 'max_depth': 6}. Best is trial 0 with value: 0.77585387381743.
[I 2024-02-20 08:05:40,747] Trial 1 finished with value: 0.7782940617755779 and parameters: {'subsample': 0.22479561626896213, 'bagging_fraction': 0.4, 'feature_fraction': 0.9, 'learning_rate': 0.006358358856676255, 'max_depth': 22}. Best is trial 1 with value: 0.7782940617755779.
[I 2024-02-20 08:05:51,555] Trial 2 finished with value: 0.7420362333187144 and parameters: {'subsample': 0.11646759543664197, 'bagging_fraction': 0.95, 'feature_fraction': 0.8500000000000001, 'learning_rate': 0.0004335281794951569, 'max_depth': 7}. Best is trial 1 with value: 0.7782940617755779.
[I 2024-02-20 08:06:01,924] Tri

Found new best score with score 0.783883


[I 2024-02-20 08:07:42,535] Trial 0 finished with value: 0.7775633979440177 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'feature_fraction': 0.8, 'learning_rate': 0.006251373574521752, 'max_depth': 6}. Best is trial 0 with value: 0.7775633979440177.
[I 2024-02-20 08:07:53,110] Trial 1 finished with value: 0.7798929506634826 and parameters: {'subsample': 0.22479561626896213, 'bagging_fraction': 0.4, 'feature_fraction': 0.9, 'learning_rate': 0.006358358856676255, 'max_depth': 22}. Best is trial 1 with value: 0.7798929506634826.
[I 2024-02-20 08:08:03,701] Trial 2 finished with value: 0.7453024531177226 and parameters: {'subsample': 0.11646759543664197, 'bagging_fraction': 0.95, 'feature_fraction': 0.8500000000000001, 'learning_rate': 0.0004335281794951569, 'max_depth': 7}. Best is trial 1 with value: 0.7798929506634826.
[I 2024-02-20 08:08:13,984] Trial 3 finished with value: 0.7644320438953434 and parameters: {'subsample': 0.24672360788274705, 'bagging_fract

Found new best score with score 0.784905


[I 2024-02-20 08:09:52,068] Trial 0 finished with value: 0.7815571638986796 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'feature_fraction': 0.8, 'learning_rate': 0.006251373574521752, 'max_depth': 6}. Best is trial 0 with value: 0.7815571638986796.
[I 2024-02-20 08:10:03,257] Trial 1 finished with value: 0.7839350452534609 and parameters: {'subsample': 0.22479561626896213, 'bagging_fraction': 0.4, 'feature_fraction': 0.9, 'learning_rate': 0.006358358856676255, 'max_depth': 22}. Best is trial 1 with value: 0.7839350452534609.
[I 2024-02-20 08:10:14,687] Trial 2 finished with value: 0.7476217919849311 and parameters: {'subsample': 0.11646759543664197, 'bagging_fraction': 0.95, 'feature_fraction': 0.8500000000000001, 'learning_rate': 0.0004335281794951569, 'max_depth': 7}. Best is trial 1 with value: 0.7839350452534609.
[I 2024-02-20 08:10:24,894] Trial 3 finished with value: 0.767078970675082 and parameters: {'subsample': 0.24672360788274705, 'bagging_fracti

Found new best score with score 0.790820


[I 2024-02-20 08:12:04,196] Trial 0 finished with value: 0.7766476071176971 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'feature_fraction': 0.8, 'learning_rate': 0.006251373574521752, 'max_depth': 6}. Best is trial 0 with value: 0.7766476071176971.
[I 2024-02-20 08:12:15,140] Trial 1 finished with value: 0.77940813953652 and parameters: {'subsample': 0.22479561626896213, 'bagging_fraction': 0.4, 'feature_fraction': 0.9, 'learning_rate': 0.006358358856676255, 'max_depth': 22}. Best is trial 1 with value: 0.77940813953652.
[I 2024-02-20 08:12:24,990] Trial 2 finished with value: 0.7428892045225588 and parameters: {'subsample': 0.11646759543664197, 'bagging_fraction': 0.95, 'feature_fraction': 0.8500000000000001, 'learning_rate': 0.0004335281794951569, 'max_depth': 7}. Best is trial 1 with value: 0.77940813953652.
[I 2024-02-20 08:12:35,571] Trial 3 finished with value: 0.7641875245216276 and parameters: {'subsample': 0.24672360788274705, 'bagging_fraction': 


        Test AUC                           : 0.789
        

Nested CV: 4 of 5 outer fold


[I 2024-02-20 08:16:32,027] Trial 0 finished with value: 0.7756765345550146 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'feature_fraction': 0.8, 'learning_rate': 0.006251373574521752, 'max_depth': 6}. Best is trial 0 with value: 0.7756765345550146.
[I 2024-02-20 08:16:42,863] Trial 1 finished with value: 0.7779733579874469 and parameters: {'subsample': 0.22479561626896213, 'bagging_fraction': 0.4, 'feature_fraction': 0.9, 'learning_rate': 0.006358358856676255, 'max_depth': 22}. Best is trial 1 with value: 0.7779733579874469.
[I 2024-02-20 08:16:53,188] Trial 2 finished with value: 0.7401698123417924 and parameters: {'subsample': 0.11646759543664197, 'bagging_fraction': 0.95, 'feature_fraction': 0.8500000000000001, 'learning_rate': 0.0004335281794951569, 'max_depth': 7}. Best is trial 1 with value: 0.7779733579874469.
[I 2024-02-20 08:17:03,856] Trial 3 finished with value: 0.7617162204394672 and parameters: {'subsample': 0.24672360788274705, 'bagging_fract

Found new best score with score 0.783303


[I 2024-02-20 08:18:39,429] Trial 0 finished with value: 0.7765059638307317 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'feature_fraction': 0.8, 'learning_rate': 0.006251373574521752, 'max_depth': 6}. Best is trial 0 with value: 0.7765059638307317.
[I 2024-02-20 08:18:49,837] Trial 1 finished with value: 0.7787200253506998 and parameters: {'subsample': 0.22479561626896213, 'bagging_fraction': 0.4, 'feature_fraction': 0.9, 'learning_rate': 0.006358358856676255, 'max_depth': 22}. Best is trial 1 with value: 0.7787200253506998.
[I 2024-02-20 08:19:00,714] Trial 2 finished with value: 0.74238273009754 and parameters: {'subsample': 0.11646759543664197, 'bagging_fraction': 0.95, 'feature_fraction': 0.8500000000000001, 'learning_rate': 0.0004335281794951569, 'max_depth': 7}. Best is trial 1 with value: 0.7787200253506998.
[I 2024-02-20 08:19:11,765] Trial 3 finished with value: 0.7615021337275611 and parameters: {'subsample': 0.24672360788274705, 'bagging_fractio

Found new best score with score 0.783985


[I 2024-02-20 08:20:49,443] Trial 0 finished with value: 0.7761857416996466 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'feature_fraction': 0.8, 'learning_rate': 0.006251373574521752, 'max_depth': 6}. Best is trial 0 with value: 0.7761857416996466.
[I 2024-02-20 08:21:00,352] Trial 1 finished with value: 0.7781271879158611 and parameters: {'subsample': 0.22479561626896213, 'bagging_fraction': 0.4, 'feature_fraction': 0.9, 'learning_rate': 0.006358358856676255, 'max_depth': 22}. Best is trial 1 with value: 0.7781271879158611.
[I 2024-02-20 08:21:10,659] Trial 2 finished with value: 0.7434948936000351 and parameters: {'subsample': 0.11646759543664197, 'bagging_fraction': 0.95, 'feature_fraction': 0.8500000000000001, 'learning_rate': 0.0004335281794951569, 'max_depth': 7}. Best is trial 1 with value: 0.7781271879158611.
[I 2024-02-20 08:21:20,235] Trial 3 finished with value: 0.7627904101589469 and parameters: {'subsample': 0.24672360788274705, 'bagging_fract

Found new best score with score 0.785267


[I 2024-02-20 08:24:52,334] Trial 0 finished with value: 0.7757191614736407 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'feature_fraction': 0.8, 'learning_rate': 0.006251373574521752, 'max_depth': 6}. Best is trial 0 with value: 0.7757191614736407.
[I 2024-02-20 08:25:03,181] Trial 1 finished with value: 0.7784653510074138 and parameters: {'subsample': 0.22479561626896213, 'bagging_fraction': 0.4, 'feature_fraction': 0.9, 'learning_rate': 0.006358358856676255, 'max_depth': 22}. Best is trial 1 with value: 0.7784653510074138.
[I 2024-02-20 08:25:14,147] Trial 2 finished with value: 0.7423129819117008 and parameters: {'subsample': 0.11646759543664197, 'bagging_fraction': 0.95, 'feature_fraction': 0.8500000000000001, 'learning_rate': 0.0004335281794951569, 'max_depth': 7}. Best is trial 1 with value: 0.7784653510074138.
[I 2024-02-20 08:25:24,840] Trial 3 finished with value: 0.7623007718371192 and parameters: {'subsample': 0.24672360788274705, 'bagging_fract


        Test AUC                           : 0.787
        

Nested CV: 5 of 5 outer fold


[I 2024-02-20 08:27:05,541] Trial 0 finished with value: 0.7778909261580833 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'feature_fraction': 0.8, 'learning_rate': 0.006251373574521752, 'max_depth': 6}. Best is trial 0 with value: 0.7778909261580833.
[I 2024-02-20 08:27:15,789] Trial 1 finished with value: 0.7800116382371691 and parameters: {'subsample': 0.22479561626896213, 'bagging_fraction': 0.4, 'feature_fraction': 0.9, 'learning_rate': 0.006358358856676255, 'max_depth': 22}. Best is trial 1 with value: 0.7800116382371691.
[I 2024-02-20 08:27:26,118] Trial 2 finished with value: 0.7420929828792775 and parameters: {'subsample': 0.11646759543664197, 'bagging_fraction': 0.95, 'feature_fraction': 0.8500000000000001, 'learning_rate': 0.0004335281794951569, 'max_depth': 7}. Best is trial 1 with value: 0.7800116382371691.
[I 2024-02-20 08:27:36,162] Trial 3 finished with value: 0.7639653336935547 and parameters: {'subsample': 0.24672360788274705, 'bagging_fract

Found new best score with score 0.785076


[I 2024-02-20 08:29:12,075] Trial 0 finished with value: 0.7768534865350847 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'feature_fraction': 0.8, 'learning_rate': 0.006251373574521752, 'max_depth': 6}. Best is trial 0 with value: 0.7768534865350847.
[I 2024-02-20 08:29:22,411] Trial 1 finished with value: 0.778182607714787 and parameters: {'subsample': 0.22479561626896213, 'bagging_fraction': 0.4, 'feature_fraction': 0.9, 'learning_rate': 0.006358358856676255, 'max_depth': 22}. Best is trial 1 with value: 0.778182607714787.
[I 2024-02-20 08:29:32,679] Trial 2 finished with value: 0.74270176056653 and parameters: {'subsample': 0.11646759543664197, 'bagging_fraction': 0.95, 'feature_fraction': 0.8500000000000001, 'learning_rate': 0.0004335281794951569, 'max_depth': 7}. Best is trial 1 with value: 0.778182607714787.
[I 2024-02-20 08:29:42,597] Trial 3 finished with value: 0.7601838016256702 and parameters: {'subsample': 0.24672360788274705, 'bagging_fraction':

Found new best score with score 0.785118


[I 2024-02-20 08:33:21,566] Trial 0 finished with value: 0.7777782050986928 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'feature_fraction': 0.8, 'learning_rate': 0.006251373574521752, 'max_depth': 6}. Best is trial 0 with value: 0.7777782050986928.
[I 2024-02-20 08:33:31,615] Trial 1 finished with value: 0.7799822241931006 and parameters: {'subsample': 0.22479561626896213, 'bagging_fraction': 0.4, 'feature_fraction': 0.9, 'learning_rate': 0.006358358856676255, 'max_depth': 22}. Best is trial 1 with value: 0.7799822241931006.
[I 2024-02-20 08:33:42,396] Trial 2 finished with value: 0.7456089106592174 and parameters: {'subsample': 0.11646759543664197, 'bagging_fraction': 0.95, 'feature_fraction': 0.8500000000000001, 'learning_rate': 0.0004335281794951569, 'max_depth': 7}. Best is trial 1 with value: 0.7799822241931006.
[I 2024-02-20 08:33:53,071] Trial 3 finished with value: 0.7640345240370523 and parameters: {'subsample': 0.24672360788274705, 'bagging_fract


        Test AUC                           : 0.787
        


In [13]:
random_seed(42)

def create_params(trial):
    return {
        'subsample': trial.suggest_float('subsample', 0.1, 0.9),
        'tree_method': 'hist',
        'colsample_bytree': trial.suggest_float('bagging_fraction', 0.4, 0.95, step=0.05),
        'learning_rate': trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
        'max_depth': trial.suggest_int('max_depth', 2, 30)
    }

classifier = xgb.XGBClassifier(n_estimators=250, objective='binary:logistic', eval_metric='auc', n_jobs=-1) 

xb_clfs, xb_oof_preds, xb_oof_targets = nestedcv(x=x,
                                                 y=y,
                                                 groups=weeks,
                                                 classifier=classifier,
                                                 cv_outer=cv_outer,
                                                 cv_inner=cv_inner,
                                                 p_grid=create_params,
                                                 n_iter=10)

[I 2024-02-20 10:00:15,117] A new study created in memory with name: no-name-d17b40c5-742e-4008-b48e-01eddc341a6b



Nested CV: 1 of 5 outer fold


[I 2024-02-20 10:00:48,748] Trial 0 finished with value: 0.7720008546229552 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'learning_rate': 0.015702970884055395, 'max_depth': 19}. Best is trial 0 with value: 0.7720008546229552.
[I 2024-02-20 10:01:13,152] Trial 1 finished with value: 0.7591483842044509 and parameters: {'subsample': 0.22481491235394924, 'bagging_fraction': 0.45, 'learning_rate': 0.00014936568554617635, 'max_depth': 27}. Best is trial 0 with value: 0.7720008546229552.
[I 2024-02-20 10:02:13,525] Trial 2 finished with value: 0.7617930867667851 and parameters: {'subsample': 0.5808920093945671, 'bagging_fraction': 0.8, 'learning_rate': 0.00011527987128232407, 'max_depth': 30}. Best is trial 0 with value: 0.7720008546229552.
[I 2024-02-20 10:02:18,056] Trial 3 finished with value: 0.7466946817082256 and parameters: {'subsample': 0.7659541126403374, 'bagging_fraction': 0.5, 'learning_rate': 0.0003511356313970409, 'max_depth': 7}. Best is trial 0 wit

Found new best score with score 0.772001


[I 2024-02-20 10:04:24,774] Trial 0 finished with value: 0.7826464575503282 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'learning_rate': 0.015702970884055395, 'max_depth': 19}. Best is trial 0 with value: 0.7826464575503282.
[I 2024-02-20 10:04:46,992] Trial 1 finished with value: 0.7709839979509662 and parameters: {'subsample': 0.22481491235394924, 'bagging_fraction': 0.45, 'learning_rate': 0.00014936568554617635, 'max_depth': 27}. Best is trial 0 with value: 0.7826464575503282.
[I 2024-02-20 10:05:44,212] Trial 2 finished with value: 0.7747147437810732 and parameters: {'subsample': 0.5808920093945671, 'bagging_fraction': 0.8, 'learning_rate': 0.00011527987128232407, 'max_depth': 30}. Best is trial 0 with value: 0.7826464575503282.
[I 2024-02-20 10:05:48,736] Trial 3 finished with value: 0.7573604661728557 and parameters: {'subsample': 0.7659541126403374, 'bagging_fraction': 0.5, 'learning_rate': 0.0003511356313970409, 'max_depth': 7}. Best is trial 0 wit

Found new best score with score 0.782646


[I 2024-02-20 10:07:56,780] Trial 0 finished with value: 0.7796677125198235 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'learning_rate': 0.015702970884055395, 'max_depth': 19}. Best is trial 0 with value: 0.7796677125198235.
[I 2024-02-20 10:08:18,427] Trial 1 finished with value: 0.7677963943051562 and parameters: {'subsample': 0.22481491235394924, 'bagging_fraction': 0.45, 'learning_rate': 0.00014936568554617635, 'max_depth': 27}. Best is trial 0 with value: 0.7796677125198235.
[I 2024-02-20 10:09:16,341] Trial 2 finished with value: 0.7715192669789519 and parameters: {'subsample': 0.5808920093945671, 'bagging_fraction': 0.8, 'learning_rate': 0.00011527987128232407, 'max_depth': 30}. Best is trial 0 with value: 0.7796677125198235.
[I 2024-02-20 10:09:20,791] Trial 3 finished with value: 0.754743771938162 and parameters: {'subsample': 0.7659541126403374, 'bagging_fraction': 0.5, 'learning_rate': 0.0003511356313970409, 'max_depth': 7}. Best is trial 0 with


        Test AUC                           : 0.780
        

Nested CV: 2 of 5 outer fold


[I 2024-02-20 10:18:50,132] A new study created in memory with name: no-name-11ee7d2c-0608-4751-b80e-4040f8037ec7
[I 2024-02-20 10:19:21,221] Trial 0 finished with value: 0.775626870457818 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'learning_rate': 0.015702970884055395, 'max_depth': 19}. Best is trial 0 with value: 0.775626870457818.
[I 2024-02-20 10:19:43,275] Trial 1 finished with value: 0.7632570985226519 and parameters: {'subsample': 0.22481491235394924, 'bagging_fraction': 0.45, 'learning_rate': 0.00014936568554617635, 'max_depth': 27}. Best is trial 0 with value: 0.775626870457818.
[I 2024-02-20 10:20:37,542] Trial 2 finished with value: 0.7658179736739775 and parameters: {'subsample': 0.5808920093945671, 'bagging_fraction': 0.8, 'learning_rate': 0.00011527987128232407, 'max_depth': 30}. Best is trial 0 with value: 0.775626870457818.
[I 2024-02-20 10:20:41,932] Trial 3 finished with value: 0.7492233488399425 and parameters: {'subsample': 0.765954112

Found new best score with score 0.775627


[I 2024-02-20 10:22:47,307] Trial 0 finished with value: 0.7896256476467598 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'learning_rate': 0.015702970884055395, 'max_depth': 19}. Best is trial 0 with value: 0.7896256476467598.
[I 2024-02-20 10:23:09,702] Trial 1 finished with value: 0.776130447470095 and parameters: {'subsample': 0.22481491235394924, 'bagging_fraction': 0.45, 'learning_rate': 0.00014936568554617635, 'max_depth': 27}. Best is trial 0 with value: 0.7896256476467598.
[I 2024-02-20 10:24:08,811] Trial 2 finished with value: 0.7794468767281405 and parameters: {'subsample': 0.5808920093945671, 'bagging_fraction': 0.8, 'learning_rate': 0.00011527987128232407, 'max_depth': 30}. Best is trial 0 with value: 0.7896256476467598.
[I 2024-02-20 10:24:13,385] Trial 3 finished with value: 0.7621295400223429 and parameters: {'subsample': 0.7659541126403374, 'bagging_fraction': 0.5, 'learning_rate': 0.0003511356313970409, 'max_depth': 7}. Best is trial 0 with

Found new best score with score 0.789626


[I 2024-02-20 10:26:25,066] Trial 0 finished with value: 0.7826067119452159 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'learning_rate': 0.015702970884055395, 'max_depth': 19}. Best is trial 0 with value: 0.7826067119452159.
[I 2024-02-20 10:26:48,639] Trial 1 finished with value: 0.7702096709808728 and parameters: {'subsample': 0.22481491235394924, 'bagging_fraction': 0.45, 'learning_rate': 0.00014936568554617635, 'max_depth': 27}. Best is trial 0 with value: 0.7826067119452159.
[I 2024-02-20 10:27:50,374] Trial 2 finished with value: 0.7727428005982167 and parameters: {'subsample': 0.5808920093945671, 'bagging_fraction': 0.8, 'learning_rate': 0.00011527987128232407, 'max_depth': 30}. Best is trial 0 with value: 0.7826067119452159.
[I 2024-02-20 10:27:54,943] Trial 3 finished with value: 0.7565348556746242 and parameters: {'subsample': 0.7659541126403374, 'bagging_fraction': 0.5, 'learning_rate': 0.0003511356313970409, 'max_depth': 7}. Best is trial 0 wit


        Test AUC                           : 0.777
        

Nested CV: 3 of 5 outer fold


[I 2024-02-20 10:37:13,452] A new study created in memory with name: no-name-fbd2f15e-3f40-40a0-9eb2-c5b98e09391c
[I 2024-02-20 10:37:49,210] Trial 0 finished with value: 0.7756764814234697 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'learning_rate': 0.015702970884055395, 'max_depth': 19}. Best is trial 0 with value: 0.7756764814234697.
[I 2024-02-20 10:38:14,140] Trial 1 finished with value: 0.7634494881273103 and parameters: {'subsample': 0.22481491235394924, 'bagging_fraction': 0.45, 'learning_rate': 0.00014936568554617635, 'max_depth': 27}. Best is trial 0 with value: 0.7756764814234697.
[I 2024-02-20 10:39:18,861] Trial 2 finished with value: 0.7674939565190151 and parameters: {'subsample': 0.5808920093945671, 'bagging_fraction': 0.8, 'learning_rate': 0.00011527987128232407, 'max_depth': 30}. Best is trial 0 with value: 0.7756764814234697.
[I 2024-02-20 10:39:23,696] Trial 3 finished with value: 0.7518194398847189 and parameters: {'subsample': 0.76595

Found new best score with score 0.775676


[I 2024-02-20 10:41:43,803] Trial 0 finished with value: 0.7809595848534552 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'learning_rate': 0.015702970884055395, 'max_depth': 19}. Best is trial 0 with value: 0.7809595848534552.
[I 2024-02-20 10:42:06,391] Trial 1 finished with value: 0.7670414463141508 and parameters: {'subsample': 0.22481491235394924, 'bagging_fraction': 0.45, 'learning_rate': 0.00014936568554617635, 'max_depth': 27}. Best is trial 0 with value: 0.7809595848534552.
[I 2024-02-20 10:43:05,414] Trial 2 finished with value: 0.7709892723196357 and parameters: {'subsample': 0.5808920093945671, 'bagging_fraction': 0.8, 'learning_rate': 0.00011527987128232407, 'max_depth': 30}. Best is trial 0 with value: 0.7809595848534552.
[I 2024-02-20 10:43:10,079] Trial 3 finished with value: 0.7526525096939074 and parameters: {'subsample': 0.7659541126403374, 'bagging_fraction': 0.5, 'learning_rate': 0.0003511356313970409, 'max_depth': 7}. Best is trial 0 wit

Found new best score with score 0.780960


[I 2024-02-20 10:45:23,779] Trial 0 finished with value: 0.7848941492206216 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'learning_rate': 0.015702970884055395, 'max_depth': 19}. Best is trial 0 with value: 0.7848941492206216.
[I 2024-02-20 10:45:48,475] Trial 1 finished with value: 0.7716448887024355 and parameters: {'subsample': 0.22481491235394924, 'bagging_fraction': 0.45, 'learning_rate': 0.00014936568554617635, 'max_depth': 27}. Best is trial 0 with value: 0.7848941492206216.
[I 2024-02-20 10:46:52,751] Trial 2 finished with value: 0.774162683429845 and parameters: {'subsample': 0.5808920093945671, 'bagging_fraction': 0.8, 'learning_rate': 0.00011527987128232407, 'max_depth': 30}. Best is trial 0 with value: 0.7848941492206216.
[I 2024-02-20 10:46:57,463] Trial 3 finished with value: 0.7589878229119524 and parameters: {'subsample': 0.7659541126403374, 'bagging_fraction': 0.5, 'learning_rate': 0.0003511356313970409, 'max_depth': 7}. Best is trial 0 with

Found new best score with score 0.784894


[I 2024-02-20 10:49:14,046] Trial 0 finished with value: 0.779691531836892 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'learning_rate': 0.015702970884055395, 'max_depth': 19}. Best is trial 0 with value: 0.779691531836892.
[I 2024-02-20 10:49:38,583] Trial 1 finished with value: 0.7689886976964158 and parameters: {'subsample': 0.22481491235394924, 'bagging_fraction': 0.45, 'learning_rate': 0.00014936568554617635, 'max_depth': 27}. Best is trial 0 with value: 0.779691531836892.
[I 2024-02-20 10:50:41,483] Trial 2 finished with value: 0.7710745791362733 and parameters: {'subsample': 0.5808920093945671, 'bagging_fraction': 0.8, 'learning_rate': 0.00011527987128232407, 'max_depth': 30}. Best is trial 0 with value: 0.779691531836892.
[I 2024-02-20 10:50:46,127] Trial 3 finished with value: 0.7565322545672111 and parameters: {'subsample': 0.7659541126403374, 'bagging_fraction': 0.5, 'learning_rate': 0.0003511356313970409, 'max_depth': 7}. Best is trial 0 with va


        Test AUC                           : 0.782
        

Nested CV: 4 of 5 outer fold


[I 2024-02-20 10:57:27,799] Trial 0 finished with value: 0.7786826269369955 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'learning_rate': 0.015702970884055395, 'max_depth': 19}. Best is trial 0 with value: 0.7786826269369955.
[I 2024-02-20 10:57:50,591] Trial 1 finished with value: 0.7664252349535927 and parameters: {'subsample': 0.22481491235394924, 'bagging_fraction': 0.45, 'learning_rate': 0.00014936568554617635, 'max_depth': 27}. Best is trial 0 with value: 0.7786826269369955.
[I 2024-02-20 10:58:48,753] Trial 2 finished with value: 0.7695884248108371 and parameters: {'subsample': 0.5808920093945671, 'bagging_fraction': 0.8, 'learning_rate': 0.00011527987128232407, 'max_depth': 30}. Best is trial 0 with value: 0.7786826269369955.
[I 2024-02-20 10:58:53,316] Trial 3 finished with value: 0.7531547532688435 and parameters: {'subsample': 0.7659541126403374, 'bagging_fraction': 0.5, 'learning_rate': 0.0003511356313970409, 'max_depth': 7}. Best is trial 0 wit

Found new best score with score 0.778683


[I 2024-02-20 11:01:04,949] Trial 0 finished with value: 0.7810891503980034 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'learning_rate': 0.015702970884055395, 'max_depth': 19}. Best is trial 0 with value: 0.7810891503980034.
[I 2024-02-20 11:01:29,052] Trial 1 finished with value: 0.7646600898011882 and parameters: {'subsample': 0.22481491235394924, 'bagging_fraction': 0.45, 'learning_rate': 0.00014936568554617635, 'max_depth': 27}. Best is trial 0 with value: 0.7810891503980034.
[I 2024-02-20 11:02:31,902] Trial 2 finished with value: 0.7679000428185893 and parameters: {'subsample': 0.5808920093945671, 'bagging_fraction': 0.8, 'learning_rate': 0.00011527987128232407, 'max_depth': 30}. Best is trial 0 with value: 0.7810891503980034.
[I 2024-02-20 11:02:36,600] Trial 3 finished with value: 0.7524311268883719 and parameters: {'subsample': 0.7659541126403374, 'bagging_fraction': 0.5, 'learning_rate': 0.0003511356313970409, 'max_depth': 7}. Best is trial 0 wit

Found new best score with score 0.781089


[I 2024-02-20 11:04:49,906] Trial 0 finished with value: 0.7792818747025153 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'learning_rate': 0.015702970884055395, 'max_depth': 19}. Best is trial 0 with value: 0.7792818747025153.
[I 2024-02-20 11:05:12,141] Trial 1 finished with value: 0.766933319681447 and parameters: {'subsample': 0.22481491235394924, 'bagging_fraction': 0.45, 'learning_rate': 0.00014936568554617635, 'max_depth': 27}. Best is trial 0 with value: 0.7792818747025153.
[I 2024-02-20 11:06:09,080] Trial 2 finished with value: 0.769994973297385 and parameters: {'subsample': 0.5808920093945671, 'bagging_fraction': 0.8, 'learning_rate': 0.00011527987128232407, 'max_depth': 30}. Best is trial 0 with value: 0.7792818747025153.
[I 2024-02-20 11:06:13,543] Trial 3 finished with value: 0.7545159370130889 and parameters: {'subsample': 0.7659541126403374, 'bagging_fraction': 0.5, 'learning_rate': 0.0003511356313970409, 'max_depth': 7}. Best is trial 0 with 


        Test AUC                           : 0.779
        

Nested CV: 5 of 5 outer fold


[I 2024-02-20 11:15:30,358] A new study created in memory with name: no-name-1ed2b610-8d68-4975-b94c-e65341be417f
[I 2024-02-20 11:16:01,906] Trial 0 finished with value: 0.7794595099197855 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'learning_rate': 0.015702970884055395, 'max_depth': 19}. Best is trial 0 with value: 0.7794595099197855.
[I 2024-02-20 11:16:23,919] Trial 1 finished with value: 0.7683792131625045 and parameters: {'subsample': 0.22481491235394924, 'bagging_fraction': 0.45, 'learning_rate': 0.00014936568554617635, 'max_depth': 27}. Best is trial 0 with value: 0.7794595099197855.
[I 2024-02-20 11:17:19,058] Trial 2 finished with value: 0.7704624565139642 and parameters: {'subsample': 0.5808920093945671, 'bagging_fraction': 0.8, 'learning_rate': 0.00011527987128232407, 'max_depth': 30}. Best is trial 0 with value: 0.7794595099197855.
[I 2024-02-20 11:17:23,493] Trial 3 finished with value: 0.7551586745868344 and parameters: {'subsample': 0.76595

Found new best score with score 0.779460


[I 2024-02-20 11:19:28,649] Trial 0 finished with value: 0.775857680544366 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'learning_rate': 0.015702970884055395, 'max_depth': 19}. Best is trial 0 with value: 0.775857680544366.
[I 2024-02-20 11:19:50,533] Trial 1 finished with value: 0.7617738350613923 and parameters: {'subsample': 0.22481491235394924, 'bagging_fraction': 0.45, 'learning_rate': 0.00014936568554617635, 'max_depth': 27}. Best is trial 0 with value: 0.775857680544366.
[I 2024-02-20 11:20:47,551] Trial 2 finished with value: 0.766679074293054 and parameters: {'subsample': 0.5808920093945671, 'bagging_fraction': 0.8, 'learning_rate': 0.00011527987128232407, 'max_depth': 30}. Best is trial 0 with value: 0.775857680544366.
[I 2024-02-20 11:20:52,004] Trial 3 finished with value: 0.7516561477021101 and parameters: {'subsample': 0.7659541126403374, 'bagging_fraction': 0.5, 'learning_rate': 0.0003511356313970409, 'max_depth': 7}. Best is trial 0 with val

Found new best score with score 0.780471


[I 2024-02-20 11:30:27,413] Trial 0 finished with value: 0.7769158105426885 and parameters: {'subsample': 0.39963209507789, 'bagging_fraction': 0.95, 'learning_rate': 0.015702970884055395, 'max_depth': 19}. Best is trial 0 with value: 0.7769158105426885.
[I 2024-02-20 11:30:50,886] Trial 1 finished with value: 0.7629499525662949 and parameters: {'subsample': 0.22481491235394924, 'bagging_fraction': 0.45, 'learning_rate': 0.00014936568554617635, 'max_depth': 27}. Best is trial 0 with value: 0.7769158105426885.
[I 2024-02-20 11:31:51,520] Trial 2 finished with value: 0.7672015357386192 and parameters: {'subsample': 0.5808920093945671, 'bagging_fraction': 0.8, 'learning_rate': 0.00011527987128232407, 'max_depth': 30}. Best is trial 0 with value: 0.7769158105426885.
[I 2024-02-20 11:31:56,074] Trial 3 finished with value: 0.7488057766396702 and parameters: {'subsample': 0.7659541126403374, 'bagging_fraction': 0.5, 'learning_rate': 0.0003511356313970409, 'max_depth': 7}. Best is trial 0 wit


        Test AUC                           : 0.783
        


# Output and model savings

In [14]:
# Counter for current lgb version
n = 1
timestamp =datetime.today().strftime('%Y%m%d%H%M')

oof_lgb = np.concatenate(lgb_oof_preds)
true_lgb = np.concatenate(lgb_oof_targets)
auc_cv_lgb = roc_auc_score(true_lgb, oof_lgb)

print('Overall OOF AUC LGBM = {:.5f}'.format(auc_cv_lgb))

results = pd.DataFrame({'oof': oof_lgb, 'truth': true_lgb})
results.to_csv('{}/{}/output/oof_lgb_{}_{}.csv'.format(ROOT,
                                                       DATA_FOLDER,
                                                       n,
                                                       timestamp), index=False)

for model in lgb_clfs:
    joblib.dump(model, '{}/{}/models/lgb_{}_{}.pkl'.format(ROOT,
                                                           DATA_FOLDER,
                                                           n,
                                                           datetime.today().strftime('%Y%m%d%H%M')))

Overall OOF AUC LGBM = 0.78637


In [15]:
# Counter for current xgb version
n = 1
timestamp =datetime.today().strftime('%Y%m%d%H%M')

oof_xb = np.concatenate(xb_oof_preds)
true_xb = np.concatenate(xb_oof_targets)
auc_cv_xb = roc_auc_score(true_xb, oof_xb)

print('Overall OOF AUC XGB = {:.5f}'.format(auc_cv_xb))

results = pd.DataFrame({'oof': oof_xb, 'truth': true_xb})
results.to_csv('{}/{}/output/oof_xb_{}_{}.csv'.format(ROOT,
                                                      DATA_FOLDER,
                                                      n,
                                                      timestamp), index=False)

for model in xb_clfs:
    joblib.dump(model, '{}/{}/models/xb_{}_{}.pkl'.format(ROOT,
                                                          DATA_FOLDER,
                                                          n,
                                                          datetime.today().strftime('%Y%m%d%H%M')))

Overall OOF AUC XGB = 0.78023
