In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import optuna

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, train_test_split
from sklearn.metrics import mean_squared_error, log_loss

from sklearn.ensemble import RandomForestClassifier

import catboost
import xgboost as xgb
from xgboost.callback import EarlyStopping
import lightgbm as lgbm

import os
import sys
import datetime

import wandb

import warnings; warnings.filterwarnings('ignore')

sys.path.append('..')
from config import CFG
CFG = CFG()
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', CFG.NCOLS)
pd.set_option('display.max_rows', CFG.NROWS)

In [3]:
from colorama import Style, Fore
red = Style.BRIGHT + Fore.RED
blu = Style.BRIGHT + Fore.BLUE
mgt = Style.BRIGHT + Fore.MAGENTA
grn = Style.BRIGHT + Fore.GREEN
gld = Style.BRIGHT + Fore.YELLOW
res = Style.RESET_ALL

rc = {
    "axes.facecolor": "#FFFEF8",
    "figure.facecolor": "#FFFEF8",
    "axes.edgecolor": "#000000",
    "grid.color": "#EBEBE7" + "30",
    "font.family": "serif",
    "axes.labelcolor": "#000000",
    "xtick.color": "#000000",
    "ytick.color": "#000000",
    "grid.alpha": 0.4
}
sns.set(rc=rc)
palette = ['#302c36', '#037d97', '#E4591E', '#C09741',
           '#EC5B6D', '#90A6B1', '#6ca957', '#D8E3E2']

In [4]:
orig = pd.read_csv(os.path.join(CFG.RAW_DATA, 'Pulsar.csv'))
train = pd.read_csv(os.path.join(CFG.RAW_DATA, 'train.csv')).drop(columns='id')
test = pd.read_csv(os.path.join(CFG.RAW_DATA, 'test.csv')).drop(columns='id')

In [5]:
def f_eng(df):
    df['Skewness_Power3'] = df['Skewness'].apply(lambda x: x**3)
    df['EK_Power3'] = df['EK'].apply(lambda x: x**3)
    df['cos(EK)'] = df['EK'].apply(lambda x: np.cos(x))
    df['SD_x_EK'] = df['SD'] * df['EK']
    df['cos(EK)_x_SD'] = df['cos(EK)'] * df['SD']
    df['SD_DMSNR_Curve_x_Skewness_Power3'] = df['SD_DMSNR_Curve'] * df['Skewness_Power3']
    df['EK_divide_EK_Power3'] = df['EK'] / df['EK_Power3']
    df['EK_multiply_SD'] = df['EK'] * df['SD']
    df['EK_divide_SD'] = df['EK'] / df['SD']
    df['EK_multiply_SD_DMSNR_Curve'] = df['EK'] * df['SD_DMSNR_Curve']
    df['SD_DMSNR_Curve_divide_SD'] = df['SD_DMSNR_Curve'] / df['SD']
    df['SD_multiply_EK_Power3'] = df['SD'] * df['EK_Power3']
    df['SD_DMSNR_Curve_multiply_SD'] = df['SD_DMSNR_Curve'] * df['SD']
    df['EK_Power3_multiply_SD_DMSNR_Curve'] = df['EK_Power3'] * df['SD_DMSNR_Curve']
    df['cos(EK)_multiply_SD_DMSNR_Curve'] = df['cos(EK)'] * df['SD_DMSNR_Curve']
    return df

In [6]:
train = f_eng(train)
test = f_eng(test)
orig = f_eng(orig)

In [7]:
y = train['Class']
X = train.drop(['Class'], axis=1)

In [8]:
wandb.init(project='S3E10', tags='catboost')

<wandb.sdk.wandb_run.Run at 0x7f5438be6470>

In [9]:
model = catboost.CatBoostClassifier()
model.get_params()

{}

In [10]:
model = catboost.CatBoostClassifier()
model.get_all_params()

In [11]:
PATIENCE = 100

modelsCB = []
predsCB = []

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED) 

# gpu_params = {'task_type' : "GPU", 'devices' : '0:1'}
cbr_params = {
            'iterations':9999,
            # 'depth': 3,
            # 'learning_rate': 0.15687380686250746,
            # 'l2_leaf_reg': 4.0368544113430485,
            # 'random_strength': 0.1279482215776108,
            # 'max_bin': 238,
            # 'od_wait': 49,
            # 'one_hot_max_size': 39,
            # 'grow_policy': 'SymmetricTree',
            'bootstrap_type': 'Bayesian',
            # 'od_type': 'Iter',
            
            
            'random_state': CFG.SEED
        }

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = catboost.CatBoostClassifier(**cbr_params)
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
          verbose=150,
         )
    print(model.get_all_params())
    break
    modelsCB.append(model)
    predsCB.append(model.predict_proba(test)[:, 1])

In [12]:
PATIENCE = 100

modelsCB = []
predsCB = []

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED) 

# gpu_params = {'task_type' : "GPU", 'devices' : '0:1'}
cbr_params = {
            'iterations':9999,
            # 'depth': 3,
            # 'learning_rate': 0.15687380686250746,
            # 'l2_leaf_reg': 4.0368544113430485,
            # 'random_strength': 0.1279482215776108,
            # 'max_bin': 238,
            # 'od_wait': 49,
            # 'one_hot_max_size': 39,
            # 'grow_policy': 'SymmetricTree',
            'bootstrap_type': 'Bayesian',
            # 'od_type': 'Iter',
            
            
            'random_state': CFG.SEED
        }

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = catboost.CatBoostClassifier(**cbr_params)
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
          verbose=150,
         )
    print(model.get_params())
    break
    modelsCB.append(model)
    predsCB.append(model.predict_proba(test)[:, 1])

In [13]:
PATIENCE = 100

modelsCB = []
predsCB = []

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED) 

# gpu_params = {'task_type' : "GPU", 'devices' : '0:1'}
cbr_params = {
            'iterations':9999,
            # 'depth': 3,
            # 'learning_rate': 0.15687380686250746,
            # 'l2_leaf_reg': 4.0368544113430485,
            # 'random_strength': 0.1279482215776108,
            # 'max_bin': 238,
            # 'od_wait': 49,
            # 'one_hot_max_size': 39,
            # 'grow_policy': 'SymmetricTree',
            'bootstrap_type': 'Bayesian',
            # 'od_type': 'Iter',
            
            
            'random_state': CFG.SEED
        }

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = catboost.CatBoostClassifier(**cbr_params)
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
          verbose=150,
         )
    
    model.get_params()
    break
    # del params['loss_function']; del params['eval_metric']; del params['verbose']; del params['use_best_model']; del params['best_model_min_trees']; del params['ignored_features']; del params[]
    modelsCB.append(model)
    predsCB.append(model.predict_proba(test)[:, 1])

In [14]:
PATIENCE = 100

modelsCB = []
predsCB = []

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED) 

# gpu_params = {'task_type' : "GPU", 'devices' : '0:1'}
cbr_params = {
            'iterations':9999,
            # 'depth': 3,
            # 'learning_rate': 0.15687380686250746,
            # 'l2_leaf_reg': 4.0368544113430485,
            # 'random_strength': 0.1279482215776108,
            # 'max_bin': 238,
            # 'od_wait': 49,
            # 'one_hot_max_size': 39,
            # 'grow_policy': 'SymmetricTree',
            'bootstrap_type': 'Bayesian',
            # 'od_type': 'Iter',
            
            
            'random_state': CFG.SEED
        }

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = catboost.CatBoostClassifier(**cbr_params)
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
          verbose=150,
         )
    
    model.get_all_params()
    break
    # del params['loss_function']; del params['eval_metric']; del params['verbose']; del params['use_best_model']; del params['best_model_min_trees']; del params['ignored_features']; del params[]
    modelsCB.append(model)
    predsCB.append(model.predict_proba(test)[:, 1])

In [15]:
PATIENCE = 100

modelsCB = []
predsCB = []

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED) 

# gpu_params = {'task_type' : "GPU", 'devices' : '0:1'}
cbr_params = {
            'iterations':9999,
            # 'depth': 3,
            # 'learning_rate': 0.15687380686250746,
            # 'l2_leaf_reg': 4.0368544113430485,
            # 'random_strength': 0.1279482215776108,
            # 'max_bin': 238,
            # 'od_wait': 49,
            # 'one_hot_max_size': 39,
            # 'grow_policy': 'SymmetricTree',
            'bootstrap_type': 'Bayesian',
            # 'od_type': 'Iter',
            
            
            'random_state': CFG.SEED
        }

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = catboost.CatBoostClassifier(**cbr_params)
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
          verbose=150,
         )
    
    model.get_all_params()
    break
    # del params['loss_function']; del params['eval_metric']; del params['verbose']; del params['use_best_model']; del params['best_model_min_trees']; del params['ignored_features']; del params[]
    modelsCB.append(model)
    predsCB.append(model.predict_proba(test)[:, 1])

In [16]:
PATIENCE = 100

modelsCB = []
predsCB = []

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED) 

# gpu_params = {'task_type' : "GPU", 'devices' : '0:1'}
cbr_params = {
            'iterations':9999,
            # 'depth': 3,
            # 'learning_rate': 0.15687380686250746,
            # 'l2_leaf_reg': 4.0368544113430485,
            # 'random_strength': 0.1279482215776108,
            # 'max_bin': 238,
            # 'od_wait': 49,
            # 'one_hot_max_size': 39,
            # 'grow_policy': 'SymmetricTree',
            'bootstrap_type': 'Bayesian',
            # 'od_type': 'Iter',
            
            
            'random_state': CFG.SEED
        }

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = catboost.CatBoostClassifier(**cbr_params)
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
          verbose=150,
         )
    
    print(model.get_all_params())
    break
    # del params['loss_function']; del params['eval_metric']; del params['verbose']; del params['use_best_model']; del params['best_model_min_trees']; del params['ignored_features']; del params[]
    modelsCB.append(model)
    predsCB.append(model.predict_proba(test)[:, 1])

In [17]:
PATIENCE = 100

modelsCB = []
predsCB = []

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED) 

# gpu_params = {'task_type' : "GPU", 'devices' : '0:1'}
cbr_params = {
            'iterations':9999,
            # 'depth': 3,
            # 'learning_rate': 0.15687380686250746,
            # 'l2_leaf_reg': 4.0368544113430485,
            # 'random_strength': 0.1279482215776108,
            # 'max_bin': 238,
            # 'od_wait': 49,
            # 'one_hot_max_size': 39,
            # 'grow_policy': 'SymmetricTree',
            'bootstrap_type': 'Bayesian',
            # 'od_type': 'Iter',
            
            
            'random_state': CFG.SEED
        }

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = catboost.CatBoostClassifier(**cbr_params)
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
          verbose=150,
         )
    
    print(model.get_all_params())
    del params['nan_mode']; del params['od_pval']; del params['penalties_coefficient']; del params['boosting_type']; del params['']; del params['']; del params['score_function']; del params['task_type']; del params['model_shrink_rate']; del params['bagging_temperature']; del params['best_model_min_trees']; del params['leaf_estimation_backtracking']; del params['sparse_features_conflict_fraction']; del params['classes_count']; del params['auto_class_weights']; del params['class_names']; del params['pool_metainfo_options']; del params['boost_from_average']; del params['rsm']; del params['od_type']; del params['force_unit_auto_pair_weights']; del params['eval_fraction']; del params['feature_border_type']; del params['model_shrink_mode'] del params['loss_function']; del params['eval_metric']; del params['verbose']; del params['use_best_model']; del params['best_model_min_trees']; del params['ignored_features']; del params[]
    print(params.keys())
    break
    modelsCB.append(model)
    predsCB.append(model.predict_proba(test)[:, 1])

In [18]:
PATIENCE = 100

modelsCB = []
predsCB = []

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED) 

# gpu_params = {'task_type' : "GPU", 'devices' : '0:1'}
cbr_params = {
            'iterations':9999,
            # 'depth': 3,
            # 'learning_rate': 0.15687380686250746,
            # 'l2_leaf_reg': 4.0368544113430485,
            # 'random_strength': 0.1279482215776108,
            # 'max_bin': 238,
            # 'od_wait': 49,
            # 'one_hot_max_size': 39,
            # 'grow_policy': 'SymmetricTree',
            'bootstrap_type': 'Bayesian',
            # 'od_type': 'Iter',
            
            
            'random_state': CFG.SEED
        }

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = catboost.CatBoostClassifier(**cbr_params)
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
          verbose=150,
         )
    
    params = model.get_all_params()
    del params['nan_mode']; del params['od_pval']; del params['penalties_coefficient']; del params['boosting_type']; del params['']; del params['']; del params['score_function']; del params['task_type']; del params['model_shrink_rate']; del params['bagging_temperature']; del params['best_model_min_trees']; del params['leaf_estimation_backtracking']; del params['sparse_features_conflict_fraction']; del params['classes_count']; del params['auto_class_weights']; del params['class_names']; del params['pool_metainfo_options']; del params['boost_from_average']; del params['rsm']; del params['od_type']; del params['force_unit_auto_pair_weights']; del params['eval_fraction']; del params['feature_border_type']; del params['model_shrink_mode'] del params['loss_function']; del params['eval_metric']; del params['verbose']; del params['use_best_model']; del params['best_model_min_trees']; del params['ignored_features']; del params[]
    print(params.keys())
    break
    modelsCB.append(model)
    predsCB.append(model.predict_proba(test)[:, 1])

In [19]:
PATIENCE = 100

modelsCB = []
predsCB = []

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED) 

# gpu_params = {'task_type' : "GPU", 'devices' : '0:1'}
cbr_params = {
            'iterations':9999,
            # 'depth': 3,
            # 'learning_rate': 0.15687380686250746,
            # 'l2_leaf_reg': 4.0368544113430485,
            # 'random_strength': 0.1279482215776108,
            # 'max_bin': 238,
            # 'od_wait': 49,
            # 'one_hot_max_size': 39,
            # 'grow_policy': 'SymmetricTree',
            'bootstrap_type': 'Bayesian',
            # 'od_type': 'Iter',
            
            
            'random_state': CFG.SEED
        }

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = catboost.CatBoostClassifier(**cbr_params)
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
          verbose=150,
         )
    
    params = model.get_all_params()
    del params['nan_mode']; del params['od_pval']; del params['penalties_coefficient']; del params['boosting_type']; del params['']; del params['']; del params['score_function']; del params['task_type']; del params['model_shrink_rate']; del params['bagging_temperature']; del params['best_model_min_trees']; del params['leaf_estimation_backtracking']; del params['sparse_features_conflict_fraction']; del params['classes_count']; del params['auto_class_weights']; del params['class_names']; del params['pool_metainfo_options']; del params['boost_from_average']; del params['rsm']; del params['od_type']; del params['force_unit_auto_pair_weights']; del params['eval_fraction']; del params['feature_border_type']; del params['model_shrink_mode'] del params['loss_function']; del params['eval_metric']; del params['verbose']; del params['use_best_model']; del params['best_model_min_trees']; del params['ignored_features']; del params[]
    print(params.keys())
    break
    modelsCB.append(model)
    predsCB.append(model.predict_proba(test)[:, 1])

In [20]:
PATIENCE = 100

modelsCB = []
predsCB = []

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED) 

# gpu_params = {'task_type' : "GPU", 'devices' : '0:1'}
cbr_params = {
            'iterations':9999,
            # 'depth': 3,
            # 'learning_rate': 0.15687380686250746,
            # 'l2_leaf_reg': 4.0368544113430485,
            # 'random_strength': 0.1279482215776108,
            # 'max_bin': 238,
            # 'od_wait': 49,
            # 'one_hot_max_size': 39,
            # 'grow_policy': 'SymmetricTree',
            'bootstrap_type': 'Bayesian',
            # 'od_type': 'Iter',
            
            
            'random_state': CFG.SEED
        }

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = catboost.CatBoostClassifier(**cbr_params)
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
          verbose=150,
         )
    
    params = model.get_all_params()
    del params['nan_mode']; del params['od_pval']; del params['penalties_coefficient']; del params['boosting_type']; del params['']; del params['']; del params['score_function']; del params['task_type']; del params['model_shrink_rate']; del params['bagging_temperature']; del params['best_model_min_trees']; del params['leaf_estimation_backtracking']; del params['sparse_features_conflict_fraction']; del params['classes_count']; del params['auto_class_weights']; del params['class_names']; del params['pool_metainfo_options']; del params['boost_from_average']; del params['rsm']; del params['od_type']; del params['force_unit_auto_pair_weights']; del params['eval_fraction']; del params['feature_border_type']; del params['model_shrink_mode']; del params['loss_function']; del params['eval_metric']; del params['verbose']; del params['use_best_model']; del params['best_model_min_trees']; del params['ignored_features']; del params[]
    print(params.keys())
    break
    modelsCB.append(model)
    predsCB.append(model.predict_proba(test)[:, 1])

In [21]:
PATIENCE = 100

modelsCB = []
predsCB = []

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED) 

# gpu_params = {'task_type' : "GPU", 'devices' : '0:1'}
cbr_params = {
            'iterations':9999,
            # 'depth': 3,
            # 'learning_rate': 0.15687380686250746,
            # 'l2_leaf_reg': 4.0368544113430485,
            # 'random_strength': 0.1279482215776108,
            # 'max_bin': 238,
            # 'od_wait': 49,
            # 'one_hot_max_size': 39,
            # 'grow_policy': 'SymmetricTree',
            'bootstrap_type': 'Bayesian',
            # 'od_type': 'Iter',
            
            
            'random_state': CFG.SEED
        }

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = catboost.CatBoostClassifier(**cbr_params)
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
          verbose=150,
         )
    
    params = model.get_all_params()
    del params['nan_mode']; del params['od_pval']; del params['penalties_coefficient']; del params['boosting_type']; del params['']; del params['']; del params['score_function']; del params['task_type']; del params['model_shrink_rate']; del params['bagging_temperature']; del params['best_model_min_trees']; del params['leaf_estimation_backtracking']; del params['sparse_features_conflict_fraction']; del params['classes_count']; del params['auto_class_weights']; del params['class_names']; del params['pool_metainfo_options']; del params['boost_from_average']; del params['rsm']; del params['od_type']; del params['force_unit_auto_pair_weights']; del params['eval_fraction']; del params['feature_border_type']; del params['model_shrink_mode']; del params['loss_function']; del params['eval_metric']; del params['verbose']; del params['use_best_model']; del params['best_model_min_trees']; del params['ignored_features']; del params[]
    print(params.keys())
    break
    modelsCB.append(model)
    predsCB.append(model.predict_proba(test)[:, 1])

In [22]:
PATIENCE = 100

modelsCB = []
predsCB = []

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED) 

# gpu_params = {'task_type' : "GPU", 'devices' : '0:1'}
cbr_params = {
            'iterations':9999,
            # 'depth': 3,
            # 'learning_rate': 0.15687380686250746,
            # 'l2_leaf_reg': 4.0368544113430485,
            # 'random_strength': 0.1279482215776108,
            # 'max_bin': 238,
            # 'od_wait': 49,
            # 'one_hot_max_size': 39,
            # 'grow_policy': 'SymmetricTree',
            'bootstrap_type': 'Bayesian',
            # 'od_type': 'Iter',
            
            
            'random_state': CFG.SEED
        }

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = catboost.CatBoostClassifier(**cbr_params)
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
          verbose=150,
         )
    
    params = model.get_all_params()
    del params['nan_mode']; del params['od_pval']; del params['penalties_coefficient']; del params['boosting_type']; del params['']; del params['']; del params['score_function']; del params['task_type']; del params['model_shrink_rate']; del params['bagging_temperature']; del params['best_model_min_trees']; del params['leaf_estimation_backtracking']; del params['sparse_features_conflict_fraction']; del params['classes_count']; del params['auto_class_weights']; del params['class_names']; del params['pool_metainfo_options']; del params['boost_from_average']; del params['rsm']; del params['od_type']; del params['force_unit_auto_pair_weights']; del params['eval_fraction']; del params['feature_border_type']; del params['model_shrink_mode']; del params['loss_function']; del params['eval_metric']; del params['verbose']; del params['use_best_model']; del params['best_model_min_trees']; del params['ignored_features']
    print(params.keys())
    break
    modelsCB.append(model)
    predsCB.append(model.predict_proba(test)[:, 1])

In [23]:
PATIENCE = 100

modelsCB = []
predsCB = []

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED) 

# gpu_params = {'task_type' : "GPU", 'devices' : '0:1'}
cbr_params = {
            'iterations':9999,
            # 'depth': 3,
            # 'learning_rate': 0.15687380686250746,
            # 'l2_leaf_reg': 4.0368544113430485,
            # 'random_strength': 0.1279482215776108,
            # 'max_bin': 238,
            # 'od_wait': 49,
            # 'one_hot_max_size': 39,
            # 'grow_policy': 'SymmetricTree',
            'bootstrap_type': 'Bayesian',
            # 'od_type': 'Iter',
            
            
            'random_state': CFG.SEED
        }

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = catboost.CatBoostClassifier(**cbr_params)
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
          verbose=150,
         )
    
    params = model.get_all_params()
    del params['nan_mode']; del params['od_pval']; del params['penalties_coefficient']; del params['boosting_type']; del params['score_function']; del params['task_type']; del params['model_shrink_rate']; del params['bagging_temperature']; del params['best_model_min_trees']; del params['leaf_estimation_backtracking']; del params['sparse_features_conflict_fraction']; del params['classes_count']; del params['auto_class_weights']; del params['class_names']; del params['pool_metainfo_options']; del params['boost_from_average']; del params['rsm']; del params['od_type']; del params['force_unit_auto_pair_weights']; del params['eval_fraction']; del params['feature_border_type']; del params['model_shrink_mode']; del params['loss_function']; del params['eval_metric']; del params['verbose']; del params['use_best_model']; del params['best_model_min_trees']; del params['ignored_features']
    print(params.keys())
    break
    modelsCB.append(model)
    predsCB.append(model.predict_proba(test)[:, 1])

In [24]:
PATIENCE = 100

modelsCB = []
predsCB = []

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED) 

# gpu_params = {'task_type' : "GPU", 'devices' : '0:1'}
cbr_params = {
            'iterations':9999,
            # 'depth': 3,
            # 'learning_rate': 0.15687380686250746,
            # 'l2_leaf_reg': 4.0368544113430485,
            # 'random_strength': 0.1279482215776108,
            # 'max_bin': 238,
            # 'od_wait': 49,
            # 'one_hot_max_size': 39,
            # 'grow_policy': 'SymmetricTree',
            'bootstrap_type': 'Bayesian',
            # 'od_type': 'Iter',
            
            
            'random_state': CFG.SEED
        }

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = catboost.CatBoostClassifier(**cbr_params)
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
          verbose=150,
         )
    
    params = model.get_all_params()
    del params['nan_mode']; del params['od_pval']; del params['penalties_coefficient']; del params['boosting_type']; del params['score_function']; del params['task_type']; del params['model_shrink_rate']; del params['bagging_temperature']; del params['best_model_min_trees']; del params['leaf_estimation_backtracking']; del params['sparse_features_conflict_fraction']; del params['classes_count']; del params['auto_class_weights']; del params['class_names']; del params['pool_metainfo_options']; del params['boost_from_average']; del params['rsm']; del params['od_type']; del params['force_unit_auto_pair_weights']; del params['eval_fraction']; del params['feature_border_type']; del params['model_shrink_mode']; del params['loss_function']; del params['eval_metric']; del params['use_best_model']; del params['best_model_min_trees']; del params['ignored_features']
    print(params.keys())
    break
    modelsCB.append(model)
    predsCB.append(model.predict_proba(test)[:, 1])

In [25]:
PATIENCE = 100

modelsCB = []
predsCB = []

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED) 

# gpu_params = {'task_type' : "GPU", 'devices' : '0:1'}
cbr_params = {
            'iterations':9999,
            # 'depth': 3,
            # 'learning_rate': 0.15687380686250746,
            # 'l2_leaf_reg': 4.0368544113430485,
            # 'random_strength': 0.1279482215776108,
            # 'max_bin': 238,
            # 'od_wait': 49,
            # 'one_hot_max_size': 39,
            # 'grow_policy': 'SymmetricTree',
            'bootstrap_type': 'Bayesian',
            # 'od_type': 'Iter',
            
            
            'random_state': CFG.SEED
        }

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = catboost.CatBoostClassifier(**cbr_params)
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
          verbose=150,
         )
    
    params = model.get_all_params()
    del params['nan_mode']; del params['od_pval']; del params['penalties_coefficient']; del params['boosting_type']; del params['score_function']; del params['task_type']; del params['model_shrink_rate']; del params['bagging_temperature']; del params['best_model_min_trees']; del params['leaf_estimation_backtracking']; del params['sparse_features_conflict_fraction']; del params['classes_count']; del params['auto_class_weights']; del params['class_names']; del params['pool_metainfo_options']; del params['boost_from_average']; del params['rsm']; del params['od_type']; del params['force_unit_auto_pair_weights']; del params['eval_fraction']; del params['feature_border_type']; del params['model_shrink_mode']; del params['loss_function']; del params['eval_metric']; del params['use_best_model']; del params['ignored_features']
    print(params.keys())
    break
    modelsCB.append(model)
    predsCB.append(model.predict_proba(test)[:, 1])

In [26]:
PATIENCE = 100

modelsCB = []
predsCB = []

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED) 

# gpu_params = {'task_type' : "GPU", 'devices' : '0:1'}
cbr_params = {
            'iterations':9999,
            # 'depth': 3,
            # 'learning_rate': 0.15687380686250746,
            # 'l2_leaf_reg': 4.0368544113430485,
            # 'random_strength': 0.1279482215776108,
            # 'max_bin': 238,
            # 'od_wait': 49,
            # 'one_hot_max_size': 39,
            # 'grow_policy': 'SymmetricTree',
            'bootstrap_type': 'Bayesian',
            # 'od_type': 'Iter',
            
            
            'random_state': CFG.SEED
        }

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = catboost.CatBoostClassifier(**cbr_params)
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
          verbose=150,
         )
    
    params = model.get_all_params()
    del params['nan_mode']; del params['od_pval']; del params['penalties_coefficient']; del params['boosting_type']; del params['score_function']; del params['task_type']; del params['model_shrink_rate']; del params['bagging_temperature']; del params['best_model_min_trees']; del params['leaf_estimation_backtracking']; del params['sparse_features_conflict_fraction']; del params['classes_count']; del params['auto_class_weights']; del params['class_names']; del params['pool_metainfo_options']; del params['boost_from_average']; del params['rsm']; del params['od_type']; del params['force_unit_auto_pair_weights']; del params['eval_fraction']; del params['feature_border_type']; del params['model_shrink_mode']; del params['loss_function']; del params['eval_metric']; del params['use_best_model']
    print(params.keys())
    break
    modelsCB.append(model)
    predsCB.append(model.predict_proba(test)[:, 1])

In [27]:
param_list = [
    'iterations', 'sampling_frequency', 'leaf_estimation_method', 'grow_policy', 'bayesian_matrix_reg', 'l2_leaf_reg', 'random_strength', 'model_size_reg',
    'od_wait', 'random_seed', 'depth', 'posterior_sampling', 'border_count', 'min_data_in_leaf', 'learning_rate', 'leaf_estimation_iterations', 'bootstrap_type', 'max_leaves'
]
param_dict = {param: [] for param in param_list}
param_dict['iterations'] = [9999]
param_dict['random_state'] = [CFG.SEED]
param_dict['iterations', 'random_state']

In [28]:
param_list = [
    'iterations', 'sampling_frequency', 'leaf_estimation_method', 'grow_policy', 'bayesian_matrix_reg', 'l2_leaf_reg', 'random_strength', 'model_size_reg',
    'od_wait', 'random_seed', 'depth', 'posterior_sampling', 'border_count', 'min_data_in_leaf', 'learning_rate', 'leaf_estimation_iterations', 'bootstrap_type', 'max_leaves'
]
param_dict = {param: [] for param in param_list}
param_dict['iterations'] = [9999]
param_dict['random_state'] = [CFG.SEED]
param_dict[['iterations', 'random_state']]

In [29]:
PATIENCE = 100

modelsCB = []
predsCB = []

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED) 

param_list = [
    'iterations', 'sampling_frequency', 'leaf_estimation_method', 'grow_policy', 'bayesian_matrix_reg', 'l2_leaf_reg', 'random_strength', 'model_size_reg',
    'od_wait', 'random_seed', 'depth', 'posterior_sampling', 'border_count', 'min_data_in_leaf', 'learning_rate', 'leaf_estimation_iterations', 'bootstrap_type', 'max_leaves'
]
param_dict = {param: [] for param in param_list}
param_dict['iterations'] = [9999]
param_dict['random_state'] = [CFG.SEED]

wandb.init(project='S3E10', tags='catboost', config=param_dict, group='catboost')

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = catboost.CatBoostClassifier(param_dict['iterations'], param_dict['random_state'])
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
          verbose=150,
         )
    
    params = model.get_all_params()
    del params['nan_mode']; del params['od_pval']; del params['penalties_coefficient']; del params['boosting_type']; del params['score_function']; del params['task_type']; del params['model_shrink_rate']; del params['bagging_temperature']; del params['best_model_min_trees']; del params['leaf_estimation_backtracking']; del params['sparse_features_conflict_fraction']; del params['classes_count']; del params['auto_class_weights']; del params['class_names']; del params['pool_metainfo_options']; del params['boost_from_average']; del params['rsm']; del params['od_type']; del params['force_unit_auto_pair_weights']; del params['eval_fraction']; del params['feature_border_type']; del params['model_shrink_mode']; del params['loss_function']; del params['eval_metric']; del params['use_best_model']
    wandb.config.update(params)
    print(model.best_score_['validation']['Logloss'])
    break
    modelsCB.append(model)
    predsCB.append(model.predict_proba(test)[:, 1])

In [30]:
PATIENCE = 100

modelsCB = []
predsCB = []

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED) 

param_list = [
    'iterations', 'sampling_frequency', 'leaf_estimation_method', 'grow_policy', 'bayesian_matrix_reg', 'l2_leaf_reg', 'random_strength', 'model_size_reg',
    'od_wait', 'random_seed', 'depth', 'posterior_sampling', 'border_count', 'min_data_in_leaf', 'learning_rate', 'leaf_estimation_iterations', 'bootstrap_type', 'max_leaves'
]
param_dict = {param: [] for param in param_list}
param_dict['iterations'] = 9999
param_dict['random_state'] = CFG.SEED
param_dict['learning_rate'] = 0.15

# wandb.init(project='S3E10', tags='catboost', config=param_dict, group='catboost')

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = catboost.CatBoostClassifier(param_dict['iterations'], param_dict['random_state'], param_dict['learning_rate'])
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
          verbose=150,
         )
    
    params = model.get_all_params()
    del params['nan_mode']; del params['od_pval']; del params['penalties_coefficient']; del params['boosting_type']; del params['score_function']; del params['task_type']; del params['model_shrink_rate']; del params['bagging_temperature']; del params['best_model_min_trees']; del params['leaf_estimation_backtracking']; del params['sparse_features_conflict_fraction']; del params['classes_count']; del params['auto_class_weights']; del params['class_names']; del params['pool_metainfo_options']; del params['boost_from_average']; del params['rsm']; del params['od_type']; del params['force_unit_auto_pair_weights']; del params['eval_fraction']; del params['feature_border_type']; del params['model_shrink_mode']; del params['loss_function']; del params['eval_metric']; del params['use_best_model']
    wandb.config.update(params)
    print(model.best_score_['validation']['Logloss'])
    break
    modelsCB.append(model)
    predsCB.append(model.predict_proba(test)[:, 1])

In [31]:
PATIENCE = 100

modelsCB = []
predsCB = []

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED) 

param_list = [
    'iterations', 'sampling_frequency', 'leaf_estimation_method', 'grow_policy', 'bayesian_matrix_reg', 'l2_leaf_reg', 'random_strength', 'model_size_reg',
    'od_wait', 'random_seed', 'depth', 'posterior_sampling', 'border_count', 'min_data_in_leaf', 'learning_rate', 'leaf_estimation_iterations', 'bootstrap_type', 'max_leaves'
]
param_dict = {param: [] for param in param_list}
param_dict['iterations'] = 9999
param_dict['random_state'] = CFG.SEED
param_dict['learning_rate'] = 0.15

# wandb.init(project='S3E10', tags='catboost', config=param_dict, group='catboost')

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = catboost.CatBoostClassifier(iterations=param_dict['iterations'], random_state=param_dict['random_state'], learning_rate=param_dict['learning_rate'])
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
          verbose=150,
         )
    
    params = model.get_all_params()
    del params['nan_mode']; del params['od_pval']; del params['penalties_coefficient']; del params['boosting_type']; del params['score_function']; del params['task_type']; del params['model_shrink_rate']; del params['bagging_temperature']; del params['best_model_min_trees']; del params['leaf_estimation_backtracking']; del params['sparse_features_conflict_fraction']; del params['classes_count']; del params['auto_class_weights']; del params['class_names']; del params['pool_metainfo_options']; del params['boost_from_average']; del params['rsm']; del params['od_type']; del params['force_unit_auto_pair_weights']; del params['eval_fraction']; del params['feature_border_type']; del params['model_shrink_mode']; del params['loss_function']; del params['eval_metric']; del params['use_best_model']
    wandb.config.update(params)
    print(model.best_score_['validation']['Logloss'])
    break
    modelsCB.append(model)
    predsCB.append(model.predict_proba(test)[:, 1])

In [32]:
PATIENCE = 100

modelsCB = []
predsCB = []

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED) 

param_list = [
    'iterations', 'sampling_frequency', 'leaf_estimation_method', 'grow_policy', 'bayesian_matrix_reg', 'l2_leaf_reg', 'random_strength', 'model_size_reg',
    'od_wait', 'random_seed', 'depth', 'posterior_sampling', 'border_count', 'min_data_in_leaf', 'learning_rate', 'leaf_estimation_iterations', 'bootstrap_type', 'max_leaves'
]
param_dict = {param: [] for param in param_list}
param_dict['iterations'] = 9999
param_dict['random_state'] = CFG.SEED
param_dict['learning_rate'] = 0.15

# wandb.init(project='S3E10', tags='catboost', config=param_dict, group='catboost')

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = catboost.CatBoostClassifier() #iterations=param_dict['iterations'], random_state=param_dict['random_state'], learning_rate=param_dict['learning_rate']
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
          verbose=150,
         )
    
    params = model.get_all_params()
    del params['nan_mode']; del params['od_pval']; del params['penalties_coefficient']; del params['boosting_type']; del params['score_function']; del params['task_type']; del params['model_shrink_rate']; del params['bagging_temperature']; del params['best_model_min_trees']; del params['leaf_estimation_backtracking']; del params['sparse_features_conflict_fraction']; del params['classes_count']; del params['auto_class_weights']; del params['class_names']; del params['pool_metainfo_options']; del params['boost_from_average']; del params['rsm']; del params['od_type']; del params['force_unit_auto_pair_weights']; del params['eval_fraction']; del params['feature_border_type']; del params['model_shrink_mode']; del params['loss_function']; del params['eval_metric']; del params['use_best_model']
    wandb.config.update(params)
    print(model.best_score_['validation']['Logloss'])
    break
    modelsCB.append(model)
    predsCB.append(model.predict_proba(test)[:, 1])

In [33]:
PATIENCE = 100

modelsCB = []
predsCB = []

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED) 

param_list = [
    'iterations', 'sampling_frequency', 'leaf_estimation_method', 'grow_policy', 'bayesian_matrix_reg', 'l2_leaf_reg', 'random_strength', 'model_size_reg',
    'od_wait', 'random_seed', 'depth', 'posterior_sampling', 'border_count', 'min_data_in_leaf', 'learning_rate', 'leaf_estimation_iterations', 'bootstrap_type', 'max_leaves'
]
param_dict = {param: [] for param in param_list}
param_dict['iterations'] = 9999
param_dict['random_state'] = CFG.SEED
param_dict['learning_rate'] = 0.15

# wandb.init(project='S3E10', tags='catboost', config=param_dict, group='catboost')

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = catboost.CatBoostClassifier() #iterations=param_dict['iterations'], random_state=param_dict['random_state'], learning_rate=param_dict['learning_rate']
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
          verbose=150,
         )
    
    params = model.get_all_params()
    del params['nan_mode']; del params['od_pval']; del params['penalties_coefficient']; del params['boosting_type']; del params['score_function']; del params['task_type']; del params['model_shrink_rate']; del params['best_model_min_trees']; del params['leaf_estimation_backtracking']; del params['sparse_features_conflict_fraction']; del params['classes_count']; del params['auto_class_weights']; del params['class_names']; del params['pool_metainfo_options']; del params['boost_from_average']; del params['rsm']; del params['od_type']; del params['force_unit_auto_pair_weights']; del params['eval_fraction']; del params['feature_border_type']; del params['model_shrink_mode']; del params['loss_function']; del params['eval_metric']; del params['use_best_model']
    wandb.config.update(params)
    print(model.best_score_['validation']['Logloss'])
    break
    modelsCB.append(model)
    predsCB.append(model.predict_proba(test)[:, 1])

In [34]:
PATIENCE = 100

modelsCB = []
predsCB = []

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED) 

param_list = [
    'iterations', 'sampling_frequency', 'leaf_estimation_method', 'grow_policy', 'bayesian_matrix_reg', 'l2_leaf_reg', 'random_strength', 'model_size_reg',
    'od_wait', 'random_seed', 'depth', 'posterior_sampling', 'border_count', 'min_data_in_leaf', 'learning_rate', 'leaf_estimation_iterations', 'bootstrap_type', 'max_leaves'
]
param_dict = {param: [] for param in param_list}
param_dict['iterations'] = 9999
param_dict['random_state'] = CFG.SEED
param_dict['learning_rate'] = 0.15

# wandb.init(project='S3E10', tags='catboost', config=param_dict, group='catboost')

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = catboost.CatBoostClassifier(iterations=param_dict['iterations'], random_state=param_dict['random_state'], learning_rate=param_dict['learning_rate'])
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
          verbose=150,
         )
    
    params = model.get_all_params()
    del params['nan_mode']; del params['od_pval']; del params['penalties_coefficient']; del params['boosting_type']; del params['score_function']; del params['task_type']; del params['model_shrink_rate']; del params['best_model_min_trees']; del params['leaf_estimation_backtracking']; del params['sparse_features_conflict_fraction']; del params['classes_count']; del params['auto_class_weights']; del params['class_names']; del params['pool_metainfo_options']; del params['boost_from_average']; del params['rsm']; del params['od_type']; del params['force_unit_auto_pair_weights']; del params['eval_fraction']; del params['feature_border_type']; del params['model_shrink_mode']; del params['loss_function']; del params['eval_metric']; del params['use_best_model']
    wandb.config.update(params)
    print(model.best_score_['validation']['Logloss'])
    break
    modelsCB.append(model)
    predsCB.append(model.predict_proba(test)[:, 1])

In [35]:
PATIENCE = 100

modelsCB = []
predsCB = []

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED) 

param_list = [
    'iterations', 'sampling_frequency', 'leaf_estimation_method', 'grow_policy', 'bayesian_matrix_reg', 'l2_leaf_reg', 'random_strength', 'model_size_reg',
    'od_wait', 'random_seed', 'depth', 'posterior_sampling', 'border_count', 'min_data_in_leaf', 'learning_rate', 'leaf_estimation_iterations', 'bootstrap_type', 'max_leaves'
]
param_dict = {param: [] for param in param_list}
param_dict['iterations'] = 9999
param_dict['random_state'] = CFG.SEED
param_dict['learning_rate'] = 0.15

# wandb.init(project='S3E10', tags='catboost', config=param_dict, group='catboost')

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = catboost.CatBoostClassifier(iterations=param_dict['iterations'], random_state=param_dict['random_state'], learning_rate=param_dict['learning_rate'])
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
          verbose=150,
         )
    
    params = model.get_all_params()
    del params['nan_mode']; del params['od_pval']; del params['penalties_coefficient']; del params['boosting_type']; del params['score_function']; del params['task_type']; del params['model_shrink_rate']; del params['best_model_min_trees']; del params['leaf_estimation_backtracking']; del params['sparse_features_conflict_fraction']; del params['classes_count']; del params['auto_class_weights']; del params['class_names']; del params['pool_metainfo_options']; del params['boost_from_average']; del params['rsm']; del params['od_type']; del params['force_unit_auto_pair_weights']; del params['eval_fraction']; del params['feature_border_type']; del params['model_shrink_mode']; del params['loss_function']; del params['eval_metric']; del params['use_best_model']
    wandb.log({model.best_score_['validation']['Logloss']})
    modelsCB.append(model)
    predsCB.append(model.predict_proba(test)[:, 1])
wandb.config.update(params)
wandb.finish()

In [36]:
PATIENCE = 100

modelsCB = []
predsCB = []

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED) 

param_list = [
    'iterations', 'sampling_frequency', 'leaf_estimation_method', 'grow_policy', 'bayesian_matrix_reg', 'l2_leaf_reg', 'random_strength', 'model_size_reg',
    'od_wait', 'random_seed', 'depth', 'posterior_sampling', 'border_count', 'min_data_in_leaf', 'learning_rate', 'leaf_estimation_iterations', 'bootstrap_type', 'max_leaves'
]
param_dict = {param: [] for param in param_list}
param_dict['iterations'] = 9999
param_dict['random_state'] = CFG.SEED
param_dict['learning_rate'] = 0.15

# wandb.init(project='S3E10', tags='catboost', config=param_dict, group='catboost')

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = catboost.CatBoostClassifier(iterations=param_dict['iterations'], random_state=param_dict['random_state'], learning_rate=param_dict['learning_rate'])
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
          verbose=150,
         )
    
    params = model.get_all_params()
    del params['nan_mode']; del params['od_pval']; del params['penalties_coefficient']; del params['boosting_type']; del params['score_function']; del params['task_type']; del params['model_shrink_rate']; del params['best_model_min_trees']; del params['leaf_estimation_backtracking']; del params['sparse_features_conflict_fraction']; del params['classes_count']; del params['auto_class_weights']; del params['class_names']; del params['pool_metainfo_options']; del params['boost_from_average']; del params['rsm']; del params['od_type']; del params['force_unit_auto_pair_weights']; del params['eval_fraction']; del params['feature_border_type']; del params['model_shrink_mode']; del params['loss_function']; del params['eval_metric']; del params['use_best_model']
    wandb.log({'logloss': model.best_score_['validation']['Logloss']})
    modelsCB.append(model)
    predsCB.append(model.predict_proba(test)[:, 1])
wandb.config.update(params)
wandb.finish()

In [37]:
PATIENCE = 100

modelsCB = []
predsCB = []

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED) 

param_list = [
    'iterations', 'sampling_frequency', 'leaf_estimation_method', 'grow_policy', 'bayesian_matrix_reg', 'l2_leaf_reg', 'random_strength', 'model_size_reg',
    'od_wait', 'random_seed', 'depth', 'posterior_sampling', 'border_count', 'min_data_in_leaf', 'learning_rate', 'leaf_estimation_iterations', 'bootstrap_type', 'max_leaves'
]
param_dict = {param: [] for param in param_list}
param_dict['iterations'] = 9999
param_dict['random_state'] = CFG.SEED
param_dict['learning_rate'] = 0.15

wandb.init(project='S3E10', tags='catboost', config=param_dict, group='catboost')

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = catboost.CatBoostClassifier(iterations=param_dict['iterations'], random_state=param_dict['random_state'], learning_rate=param_dict['learning_rate'])
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
          verbose=150,
         )
    
    params = model.get_all_params()
    del params['nan_mode']; del params['od_pval']; del params['penalties_coefficient']; del params['boosting_type']; del params['score_function']; del params['task_type']; del params['model_shrink_rate']; del params['best_model_min_trees']; del params['leaf_estimation_backtracking']; del params['sparse_features_conflict_fraction']; del params['classes_count']; del params['auto_class_weights']; del params['class_names']; del params['pool_metainfo_options']; del params['boost_from_average']; del params['rsm']; del params['od_type']; del params['force_unit_auto_pair_weights']; del params['eval_fraction']; del params['feature_border_type']; del params['model_shrink_mode']; del params['loss_function']; del params['eval_metric']; del params['use_best_model']
    wandb.log({'logloss': model.best_score_['validation']['Logloss']})
    modelsCB.append(model)
    predsCB.append(model.predict_proba(test)[:, 1])
wandb.config.update(params)
wandb.finish()

In [38]:
PATIENCE = 100

modelsCB = []
predsCB = []

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED) 

param_list = [
    'iterations', 'sampling_frequency', 'leaf_estimation_method', 'grow_policy', 'bayesian_matrix_reg', 'l2_leaf_reg', 'random_strength', 'model_size_reg',
    'od_wait', 'random_seed', 'depth', 'posterior_sampling', 'border_count', 'min_data_in_leaf', 'learning_rate', 'leaf_estimation_iterations', 'bootstrap_type', 'max_leaves'
]
param_dict = {param: [] for param in param_list}
param_dict['iterations'] = 9999
param_dict['random_state'] = CFG.SEED
param_dict['learning_rate'] = 0.15

wandb.init(project='S3E10', tags='catboost', config=param_dict, group='catboost')

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = catboost.CatBoostClassifier(iterations=param_dict['iterations'], random_state=param_dict['random_state'], learning_rate=param_dict['learning_rate'])
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
          verbose=150,
         )
    
    params = model.get_all_params()
    del params['nan_mode']; del params['od_pval']; del params['penalties_coefficient']; del params['boosting_type']; del params['score_function']; del params['task_type']; del params['model_shrink_rate']; del params['best_model_min_trees']; del params['leaf_estimation_backtracking']; del params['sparse_features_conflict_fraction']; del params['classes_count']; del params['auto_class_weights']; del params['class_names']; del params['pool_metainfo_options']; del params['boost_from_average']; del params['rsm']; del params['od_type']; del params['force_unit_auto_pair_weights']; del params['eval_fraction']; del params['feature_border_type']; del params['model_shrink_mode']; del params['loss_function']; del params['eval_metric']; del params['use_best_model']
    wandb.log({
        'val_logloss': model.best_score_['validation']['Logloss'],
        'train_logloss': model.best_score_['train']['Logloss']})
    modelsCB.append(model)
    predsCB.append(model.predict_proba(test)[:, 1])
wandb.config.update(params)
wandb.finish()

In [39]:
model.best_score_

{'learn': {'Logloss': 0.021024448367654058},
 'validation': {'Logloss': 0.029755163598653437}}

In [40]:
PATIENCE = 100

modelsCB = []
predsCB = []

k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED) 

param_list = [
    'iterations', 'sampling_frequency', 'leaf_estimation_method', 'grow_policy', 'bayesian_matrix_reg', 'l2_leaf_reg', 'random_strength', 'model_size_reg',
    'od_wait', 'random_seed', 'depth', 'posterior_sampling', 'border_count', 'min_data_in_leaf', 'learning_rate', 'leaf_estimation_iterations', 'bootstrap_type', 'max_leaves'
]
param_dict = {param: [] for param in param_list}
param_dict['iterations'] = 9999
param_dict['random_state'] = CFG.SEED
param_dict['learning_rate'] = 0.15

wandb.init(project='S3E10', tags='catboost', config=param_dict, group='catboost')

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = catboost.CatBoostClassifier(iterations=param_dict['iterations'], random_state=param_dict['random_state'], learning_rate=param_dict['learning_rate'])
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
          verbose=150,
         )
    
    params = model.get_all_params()
    del params['nan_mode']; del params['od_pval']; del params['penalties_coefficient']; del params['boosting_type']; del params['score_function']; del params['task_type']; del params['model_shrink_rate']; del params['best_model_min_trees']; del params['leaf_estimation_backtracking']; del params['sparse_features_conflict_fraction']; del params['classes_count']; del params['auto_class_weights']; del params['class_names']; del params['pool_metainfo_options']; del params['boost_from_average']; del params['rsm']; del params['od_type']; del params['force_unit_auto_pair_weights']; del params['eval_fraction']; del params['feature_border_type']; del params['model_shrink_mode']; del params['loss_function']; del params['eval_metric']; del params['use_best_model']
    wandb.log({
        'val_logloss': model.best_score_['validation']['Logloss'],
        'train_logloss': model.best_score_['learn']['Logloss']})
    modelsCB.append(model)
    predsCB.append(model.predict_proba(test)[:, 1])
wandb.config.update(params)
wandb.finish()

In [41]:
from xgboost import XGBClassifier, XGBRegressor
from wandb.xgboost import WandbCallback
modelsXB = []
predsXB = []

PATIENCE = 50

wandb.init(project='S3E10', tags='xgboost', group='xgboost', reinit=True)

xgbr_params = {
            'n_estimators':9999,
            # 'max_depth': 4,
            # 'learning_rate': 0.05333221939055333,
            # 'min_child_weight': 4,
            # 'gamma': 5.301218558776368e-08,
            # 'subsample': 0.41010429946197946,
            # 'colsample_bytree': 0.8298539920447499,
            # 'reg_alpha': 0.000517878113716743,
            # 'reg_lambda': 0.00030121415155097723,
            'n_jobs': -1,
            'objective': 'binary:logistic',
            'verbosity': 0,
            'eval_metric': 'logloss',
            'random_state': CFG.SEED}

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = XGBClassifier(**xgbr_params)
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = PATIENCE,
          verbose = 100,
          callbacks=[WandbCallback()]
         )
    modelsXB.append(model)
    predsXB.append(model.predict_proba(test)[:, 1])
wandb.finish()

In [42]:
get_fi(modelsXB)

In [43]:
from wandb.lightgbm import wandb_callback, log_summary

wandb.init(project='S3E10', tags='lightgbm', group='lightgbm', reinit=True)

modelsLB = []
predsLB = []

PATIENCE = 50

# gpu_params = {'device' : "gpu"}
lgbr_params = {
            'n_estimators': 9999,
            # 'max_depth': 5,
            'learning_rate': 0.00693702575527996,
            # 'subsample': 0.20851841295589477,
            # 'colsample_bytree': 0.5784778854092203,
            # 'reg_alpha': 0.2622912287429849,
            # 'reg_lambda': 2.8702494234117617e-08,
            'objective': 'binary',
            'metric': 'logloss',
            'boosting_type': 'gbdt',
           
            'random_state': CFG.SEED
        }

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = lgbm.LGBMClassifier(**lgbr_params)
    
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          eval_metric = 'logloss',
          early_stopping_rounds = PATIENCE,
          verbose = 150,
          callbacks=[wandb_callback()]
         )
    modelsLB.append(model)
    predsLB.append(model.predict_proba(test)[:, 1])

In [44]:
wandb.init(project='S3E10', tags='randomforest', group='randomforest', reinit=True)
modelsRF = []
predsRF = []

params = {
        'criterion': 'logloss',
        'n_jobs': -1,
        'random_state': CFG.SEED,
        'verbose': 100
    }

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = RandomForestClassifier(**params)
    model.fit(X=X_train, y=y_train)
    yprobas = model.predict_proba(X_valid)[:, 1]
    ypred = model.predict(X_valid)
    score = log_loss(y_valid, ypred)
    print(f'Score: {score}')
    
    modelsRF.append(model)
    predsRF.append(model.predict_proba(test)[:, 1])
    wandb.sklearn.plot_classifier(model, 
                              X_train, X_valid, 
                              y_train, y_valid,
                              ypred, yprobas, 
                              ['0', '1'], 
                              is_binary=True, 
                              model_name='RandomForest')

wandb.finish()

In [45]:
wandb.init(project='S3E10', tags='randomforest', group='randomforest', reinit=True)
modelsRF = []
predsRF = []

params = {
        'criterion': 'log_loss',
        'n_jobs': -1,
        'random_state': CFG.SEED,
        'verbose': 100
    }

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = RandomForestClassifier(**params)
    model.fit(X=X_train, y=y_train)
    yprobas = model.predict_proba(X_valid)[:, 1]
    ypred = model.predict(X_valid)
    score = log_loss(y_valid, ypred)
    print(f'Score: {score}')
    
    modelsRF.append(model)
    predsRF.append(model.predict_proba(test)[:, 1])
    wandb.sklearn.plot_classifier(model, 
                              X_train, X_valid, 
                              y_train, y_valid,
                              ypred, yprobas, 
                              ['0', '1'], 
                              is_binary=True, 
                              model_name='RandomForest')

wandb.finish()

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666842848332332, max=1.0)…

In [46]:
ypred

array([0, 0, 0, ..., 0, 0, 0])

In [47]:
yprobas

array([0.23, 0.  , 0.  , ..., 0.  , 0.  , 0.  ])

In [48]:
wandb.init(project='S3E10', tags='randomforest', group='randomforest', reinit=True)
modelsRF = []
predsRF = []

params = {
        'criterion': 'log_loss',
        'n_jobs': -1,
        'random_state': CFG.SEED,
        'verbose': 1
    }

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = RandomForestClassifier(**params)
    model.fit(X=X_train, y=y_train)
    yprobas = model.predict_proba(X_valid)[:, 1]
    ypred = model.predict(X_valid)
    score = log_loss(y_valid, ypred)
    print(f'Score: {score}')
    
    modelsRF.append(model)
    predsRF.append(model.predict_proba(test)[:, 1])
    wandb.sklearn.plot_classifier(model, 
                              X_train, X_valid, 
                              y_train, y_valid,
                              ypred, yprobas, 
                              is_binary=True, 
                              model_name='RandomForest')

wandb.finish()