In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import matthews_corrcoef
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score
import scipy.stats as stats
from tensorflow import keras
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from datetime import datetime
from category_encoders import OrdinalEncoder, TargetEncoder
from catboost import CatBoostClassifier, CatBoostRegressor
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

import warnings
warnings.filterwarnings("ignore")

In [2]:
#Functions

def evaluate(model, X_test, y_test):
    """Print a short performance report for a fitted model and return its "accuracy".

    Parameters
    ----------
    model : object exposing ``predict(X)``.
    X_test : features handed to ``model.predict``.
    y_test : true target values, aligned with the predictions.

    Returns
    -------
    ``100 - MAPE`` where MAPE is the mean absolute percentage error in percent.

    Side effects: prints the mean absolute error, the accuracy figure and the
    ROC AUC of the predictions.
    """
    y_pred = model.predict(X_test)
    abs_err = abs(y_pred - y_test)

    # NOTE: the percentage error divides by y_test, so any target equal to 0
    # (e.g. a 0/1 label) makes the ratio inf/nan and the accuracy non-finite.
    pct_err = abs_err / y_test
    accuracy = 100 - 100 * np.mean(pct_err)

    # AUC is computed from whatever predict() returned (not predict_proba).
    roc = roc_auc_score(y_test, y_pred)

    print('Model Performance')
    mean_abs_err = np.mean(abs_err)
    print('Average Error: {:0.4f} degrees'.format(mean_abs_err))
    print('Accuracy = {:0.2f}%'.format(accuracy))
    print(f'AUC = {roc}')
    return accuracy

In [3]:
# Load the raw feature tables and the training labels. All three frames are
# indexed by `respondent_id`, and the paths are relative to the notebook location.
train = pd.read_csv('../Data/training_set_features.csv', index_col='respondent_id')
test = pd.read_csv('../Data/test_set_features.csv', index_col ='respondent_id')
labels = pd.read_csv('../Data/training_set_labels.csv', index_col='respondent_id')

In [4]:
# Respondents in the '65+ Years' age group with a missing employment status are
# recorded as 'Not in Labor Force'.
# The rule is applied to BOTH frames so that train and test go through the same
# preprocessing (previously only `train` was patched).
# This must run while `age_group` still holds the raw string labels, because the
# mask compares against '65+ Years'.
for frame in (train, test):
    senior_without_status = (frame['age_group'] == '65+ Years') & frame['employment_status'].isnull()
    frame.loc[senior_without_status, 'employment_status'] = 'Not in Labor Force'

In [5]:
# Column groups -------------------------------------------------------------
# Numeric columns are discovered from the training frame's dtypes.
num_cols = list(train.select_dtypes('number').columns)

# Unordered categorical features.
cat_cols = [
    'race', 'sex', 'marital_status', 'rent_or_own',
    'hhs_geo_region', 'census_msa',
    'employment_industry', 'employment_occupation',
]

# Categorical features that have a natural order.
ord_cols = ['age_group', 'education', 'income_poverty', 'employment_status']

# Imputation (applied to train AND test) --------------------------------------
# Missing numeric values become the sentinel -1.
for col in num_cols:
    for frame in (train, test):
        frame[col] = frame[col].fillna(value=-1)

# Missing categorical / ordinal values become the literal string 'None'.
for col in (cat_cols + ord_cols):
    for frame in (train, test):
        frame[col] = frame[col].fillna(value='None')

# Independent copy of the label table.
test_labels = labels.copy()

In [6]:
# Integer codes for the ordered categorical features.
# 'None' is the placeholder written by the imputation step and is coded as -1
# in every column (the age_group table previously had no 'None' entry).
ordinal_maps = {
    'age_group': {
        'None': -1,
        '18 - 34 Years': 1,
        '35 - 44 Years': 2,
        '45 - 54 Years': 3,
        '55 - 64 Years': 4,
        '65+ Years': 5,
    },
    'education': {
        'None': -1,
        '< 12 Years': 1,
        '12 Years': 2,
        'Some College': 3,
        'College Graduate': 4,
    },
    'income_poverty': {
        'None': -1,
        'Below Poverty': 1,
        '<= $75,000, Above Poverty': 2,
        '> $75,000': 3,
    },
    'employment_status': {
        'None': -1,
        'Unemployed': 1,
        'Employed': 2,
        'Not in Labor Force': 3,
    },
}

# Apply the SAME tables to train and test. Previously test['age_group'] was
# never encoded, so it stayed as strings while train['age_group'] held integers.
for frame in (train, test):
    for col, mapping in ordinal_maps.items():
        # Series.map turns every value that is not a key into NaN, so re-running
        # this cell on already-encoded integers would wipe the column. Skip
        # columns that are numeric already to keep the cell safe to re-run.
        if pd.api.types.is_numeric_dtype(frame[col]):
            continue
        frame[col] = frame[col].map(mapping)

In [7]:
# Sanity check: show the distinct values stored in each ordinal column of train.
for x in ord_cols:
    print(x, train[x].unique())

age_group [4 2 1 5 3]
education [ 1  2  4  3 -1]
income_poverty [ 1  2  3 -1]
employment_status [ 3  2  1 -1]


In [8]:
# Column index of the training frame at this point in the notebook.
all_cols = train.columns

# Independent copy of the training frame as it stands here
# (not referenced again in the cells shown in this part of the notebook).
train_test = train.copy()

In [9]:
# One single-column label frame per target. The double brackets keep each
# selection a DataFrame rather than a Series.
h1n1_labels = labels[['h1n1_vaccine']]
seas_labels = labels[['seasonal_vaccine']]

# Transformation

In [10]:
# Rebind cat_cols to the columns of `train` that have object dtype.
# NOTE(review): this replaces the hand-written list of the same name with a
# pandas Index, so the name changes type from here on.
cat_cols = train.select_dtypes('object').columns

In [11]:
# --- H1N1 target -------------------------------------------------------------
# Work on a private copy of the features and standardise its numeric columns
# with a scaler fitted on that same copy.
h1n1_scaler = StandardScaler()
h1n1_train = train.copy()
h1n1_train[num_cols] = h1n1_scaler.fit_transform(h1n1_train[num_cols])
h1n1_train_trans = h1n1_train  # alias of h1n1_train, not a second copy

# --- Seasonal target ---------------------------------------------------------
# Same treatment with its own copy and its own scaler instance.
seas_scaler = StandardScaler()
seas_train = train.copy()
seas_train[num_cols] = seas_scaler.fit_transform(seas_train[num_cols])
seas_train_trans = seas_train  # alias of seas_train, not a second copy

In [12]:
# Positional indices of every column of `train` whose dtype is not float;
# these are later handed to CatBoost as the categorical features.
# NOTE(review): the dtypes are read from `train`, not from the scaled frames
# (h1n1_train_trans / seas_train_trans) that are actually used for modelling.
# An integer numeric column in `train` would be listed here even though it is
# float after scaling — confirm every numeric column of `train` is float.
categorical_features_indices = np.where(train.dtypes != float)[0]

# H1N1

## CatBoost and Optuna

In [13]:
# Feature matrix and label frame for the H1N1 target.
X = h1n1_train_trans
y = h1n1_labels

In [14]:
# Hold out 20% of the rows; the shuffle is seeded with random_state=42.
# NOTE(review): no `stratify=` argument is passed, so the class balance of the
# two splits is left to the shuffle — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [15]:
from catboost import CatBoostClassifier
from catboost import Pool, cv
from sklearn.metrics import roc_curve, roc_auc_score
import optuna

In [16]:
# Wrap the training split in a CatBoost Pool. `cat_features` receives the
# positional column indices stored in `categorical_features_indices`.
train_dataset = Pool(data=X_train,
                    label=y_train,
                    cat_features=categorical_features_indices)

In [17]:
def objective(trial):
    """Optuna objective: cross-validated AUC of a CatBoost model.

    Samples one CatBoost configuration from ``trial``, runs 7-fold
    ``catboost.cv`` on the module-level ``train_dataset`` pool and returns the
    best (maximum over boosting iterations) mean test AUC.

    Parameters
    ----------
    trial : optuna trial object used to draw the hyper-parameters.

    Returns
    -------
    The maximum of the ``'test-AUC-mean'`` column of the cv result.

    Notes
    -----
    * The order of the ``suggest_*`` calls is kept exactly as before so that a
      seeded sampler walks through the same sequence of draws.
    * The tree depth is registered with Optuna under the name ``"max_depth"``
      but passed to CatBoost as ``depth``; the name is kept so existing
      studies / ``best_params`` keys stay valid.
    """
    param = {
        'iterations': trial.suggest_categorical('iterations', [100, 200, 300, 500, 1000, 1200, 1500, 1700, 2000]),
        'learning_rate': trial.suggest_float("learning_rate", 0.001, 0.3),
        'random_strength': trial.suggest_int("random_strength", 1, 10),
        'bagging_temperature': trial.suggest_int("bagging_temperature", 0, 10),
        'max_bin': trial.suggest_categorical('max_bin', [4, 5, 6, 8, 10, 20, 30]),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'min_data_in_leaf': trial.suggest_int("min_data_in_leaf", 1, 10),
        # Overfitting-detector settings. The run logs report an 8-iteration
        # wait, i.e. the early_stopping_rounds passed to cv() below is the
        # value that takes effect.
        'od_type': "Iter",
        'od_wait': 100,
        "depth": trial.suggest_int("max_depth", 2, 10),
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform; both sample log-uniformly over the same range.
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 100, log=True),
        'one_hot_max_size': trial.suggest_categorical('one_hot_max_size', [5, 10, 12, 25, 100, 500, 1024]),
        # AUC is tracked as an extra metric; the optimised loss stays Logloss.
        'custom_metric': ['AUC'],
        "loss_function": "Logloss",
        'auto_class_weights': trial.suggest_categorical('auto_class_weights', ['Balanced', 'SqrtBalanced']),
    }

    scores = cv(train_dataset,
                param,
                fold_count=7,
                early_stopping_rounds=8,
                plot=False, verbose=False)

    # Best mean AUC reached at any boosting iteration.
    return scores['test-AUC-mean'].max()

In [18]:
# TPE sampler with a fixed seed (42) for the hyper-parameter search.
sampler = optuna.samplers.TPESampler(seed=42)
# The study maximises the value returned by `objective`.
study = optuna.create_study(direction="maximize", sampler=sampler)
# Evaluate 75 trials; each trial runs the cross-validation inside `objective`.
study.optimize(objective, n_trials=75)

[32m[I 2021-07-12 19:19:43,008][0m A new study created in memory with name: no-name-0f64e8c9-31bf-4b43-802a-5340d9a69b62[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 19:19:54,512][0m Trial 0 finished with value: 0.8617267124560974 and parameters: {'iterations': 200, 'learning_rate': 0.2127137007610176, 'random_strength': 1, 'bagging_temperature': 10, 'max_bin': 4, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 3, 'max_depth': 5, 'l2_leaf_reg': 0.0003636636071695854, 'one_hot_max_size': 5, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 0 with value: 0.8617267124560974.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 19:19:58,891][0m Trial 1 finished with value: 0.8657620717799731 and parameters: {'iterations': 100, 'learning_rate': 0.2728868002215558, 'random_strength': 3, 'bagging_temperature': 7, 'max_bin': 10, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 1, 'max_depth': 3, 'l2_leaf_reg': 2.833171661121252e-08, 'one_hot_max_size': 25, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 1 with value: 0.8657620717799731.[0m
[32m[I 2021-07-12 19:20:51,764][0m Trial 2 finished with value: 0.8641775823645944 and parameters: {'iterations': 200, 'learning_rate': 0.023139350868493017, 'random_strength': 4, 'bagging_temperature': 1, 'max_bin': 4, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 2, 'max_depth': 8, 'l2_leaf_reg': 0.4053689557345566, 'one_hot_max_size': 10, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 1 with value: 0.8657620717799731.[0m
[32m[I 2021-07-12 19:22:00,724][0m Trial 3 finished with value: 0.8665240467792362 and parameters: {'iterations': 300,

Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 19:26:30,009][0m Trial 5 finished with value: 0.8652869918685953 and parameters: {'iterations': 2000, 'learning_rate': 0.04211876229965198, 'random_strength': 4, 'bagging_temperature': 1, 'max_bin': 4, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 10, 'max_depth': 7, 'l2_leaf_reg': 2.4563933358965466e-05, 'one_hot_max_size': 12, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 3 with value: 0.8665240467792362.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 19:27:33,829][0m Trial 6 finished with value: 0.8685308445125622 and parameters: {'iterations': 1700, 'learning_rate': 0.06805652352870738, 'random_strength': 8, 'bagging_temperature': 2, 'max_bin': 8, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 10, 'max_depth': 5, 'l2_leaf_reg': 8.326558771196998, 'one_hot_max_size': 10, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8685308445125622.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 19:27:47,035][0m Trial 7 finished with value: 0.8269043874300017 and parameters: {'iterations': 500, 'learning_rate': 0.29002979089405717, 'random_strength': 10, 'bagging_temperature': 9, 'max_bin': 30, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 7, 'max_depth': 10, 'l2_leaf_reg': 2.5167504351014815e-07, 'one_hot_max_size': 10, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 6 with value: 0.8685308445125622.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 19:28:05,109][0m Trial 8 finished with value: 0.8669506196491532 and parameters: {'iterations': 200, 'learning_rate': 0.1020605518986092, 'random_strength': 4, 'bagging_temperature': 1, 'max_bin': 20, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 2, 'max_depth': 6, 'l2_leaf_reg': 0.5011128402145567, 'one_hot_max_size': 1024, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 6 with value: 0.8685308445125622.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 19:28:14,015][0m Trial 9 finished with value: 0.8623673466408182 and parameters: {'iterations': 1700, 'learning_rate': 0.20909628762201568, 'random_strength': 5, 'bagging_temperature': 1, 'max_bin': 30, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 5, 'max_depth': 4, 'l2_leaf_reg': 3.6284971517641624e-05, 'one_hot_max_size': 500, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8685308445125622.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 19:28:58,323][0m Trial 10 finished with value: 0.8660517377485574 and parameters: {'iterations': 1200, 'learning_rate': 0.11792927815981727, 'random_strength': 8, 'bagging_temperature': 4, 'max_bin': 8, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 10, 'max_depth': 2, 'l2_leaf_reg': 52.78127742712393, 'one_hot_max_size': 10, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8685308445125622.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 19:29:14,132][0m Trial 11 finished with value: 0.8644912836304952 and parameters: {'iterations': 1700, 'learning_rate': 0.10717601628426324, 'random_strength': 7, 'bagging_temperature': 3, 'max_bin': 20, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 7, 'max_depth': 6, 'l2_leaf_reg': 0.09031452697053467, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8685308445125622.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 19:29:30,553][0m Trial 12 finished with value: 0.8694317933700908 and parameters: {'iterations': 1000, 'learning_rate': 0.09696834942006458, 'random_strength': 1, 'bagging_temperature': 0, 'max_bin': 20, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 8, 'max_depth': 6, 'l2_leaf_reg': 82.18923684531856, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 12 with value: 0.8694317933700908.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 19:29:46,653][0m Trial 13 finished with value: 0.8691234995634158 and parameters: {'iterations': 1000, 'learning_rate': 0.16225876293182054, 'random_strength': 1, 'bagging_temperature': 0, 'max_bin': 5, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 8, 'max_depth': 4, 'l2_leaf_reg': 76.86761980153686, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 12 with value: 0.8694317933700908.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 19:30:03,220][0m Trial 14 finished with value: 0.8677750595328201 and parameters: {'iterations': 1000, 'learning_rate': 0.176690419370986, 'random_strength': 1, 'bagging_temperature': 0, 'max_bin': 5, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 8, 'max_depth': 2, 'l2_leaf_reg': 76.84051435981505, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 12 with value: 0.8694317933700908.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 19:30:14,858][0m Trial 15 finished with value: 0.8654425343589313 and parameters: {'iterations': 1000, 'learning_rate': 0.1594825865773152, 'random_strength': 2, 'bagging_temperature': 0, 'max_bin': 5, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 8, 'max_depth': 4, 'l2_leaf_reg': 0.016180743832298193, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 12 with value: 0.8694317933700908.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 19:30:32,877][0m Trial 16 finished with value: 0.8685886326373853 and parameters: {'iterations': 1000, 'learning_rate': 0.13927537167905407, 'random_strength': 2, 'bagging_temperature': 6, 'max_bin': 6, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 6, 'max_depth': 10, 'l2_leaf_reg': 80.10808600309316, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 12 with value: 0.8694317933700908.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 19:30:37,338][0m Trial 17 finished with value: 0.8579545219728042 and parameters: {'iterations': 1000, 'learning_rate': 0.19516803805477634, 'random_strength': 1, 'bagging_temperature': 4, 'max_bin': 20, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 9, 'max_depth': 7, 'l2_leaf_reg': 0.015240296244988225, 'one_hot_max_size': 25, 'auto_class_weights': 'Balanced'}. Best is trial 12 with value: 0.8694317933700908.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 19:30:45,083][0m Trial 18 finished with value: 0.8669500689116001 and parameters: {'iterations': 1500, 'learning_rate': 0.25054375952511293, 'random_strength': 2, 'bagging_temperature': 3, 'max_bin': 5, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 5, 'max_depth': 3, 'l2_leaf_reg': 7.886805835291059, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 12 with value: 0.8694317933700908.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 19:31:08,036][0m Trial 19 finished with value: 0.8683739285605598 and parameters: {'iterations': 1000, 'learning_rate': 0.08084422349918685, 'random_strength': 3, 'bagging_temperature': 0, 'max_bin': 20, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 8, 'max_depth': 4, 'l2_leaf_reg': 9.620418566065942, 'one_hot_max_size': 500, 'auto_class_weights': 'Balanced'}. Best is trial 12 with value: 0.8694317933700908.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 19:31:19,089][0m Trial 20 finished with value: 0.8560489295930164 and parameters: {'iterations': 1000, 'learning_rate': 0.1473326755890596, 'random_strength': 1, 'bagging_temperature': 2, 'max_bin': 5, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 6, 'max_depth': 7, 'l2_leaf_reg': 0.002512205918251716, 'one_hot_max_size': 12, 'auto_class_weights': 'Balanced'}. Best is trial 12 with value: 0.8694317933700908.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 19:31:40,592][0m Trial 21 finished with value: 0.8686050028460998 and parameters: {'iterations': 1000, 'learning_rate': 0.13684256710179518, 'random_strength': 2, 'bagging_temperature': 6, 'max_bin': 6, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 6, 'max_depth': 10, 'l2_leaf_reg': 75.59294922200559, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 12 with value: 0.8694317933700908.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 19:31:59,993][0m Trial 22 finished with value: 0.8683763062586144 and parameters: {'iterations': 1000, 'learning_rate': 0.12893631362993474, 'random_strength': 2, 'bagging_temperature': 6, 'max_bin': 6, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 7, 'max_depth': 9, 'l2_leaf_reg': 68.01154029535058, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 12 with value: 0.8694317933700908.[0m
[32m[I 2021-07-12 19:32:06,996][0m Trial 23 finished with value: 0.8658405090398995 and parameters: {'iterations': 100, 'learning_rate': 0.17217152923230145, 'random_strength': 3, 'bagging_temperature': 7, 'max_bin': 6, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 9, 'max_depth': 3, 'l2_leaf_reg': 3.3783221025021426, 'one_hot_max_size': 100, 'auto_class_weights': 'Balanced'}. Best is trial 12 with value: 0.8694317933700908.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 19:32:25,550][0m Trial 24 finished with value: 0.8693311445320585 and parameters: {'iterations': 1000, 'learning_rate': 0.08564190418590055, 'random_strength': 1, 'bagging_temperature': 4, 'max_bin': 6, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 9, 'max_depth': 9, 'l2_leaf_reg': 29.523911961605414, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 12 with value: 0.8694317933700908.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 19:32:45,317][0m Trial 25 finished with value: 0.8693595537396718 and parameters: {'iterations': 2000, 'learning_rate': 0.07739164053723227, 'random_strength': 1, 'bagging_temperature': 2, 'max_bin': 6, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 9, 'max_depth': 9, 'l2_leaf_reg': 12.771230047127299, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 12 with value: 0.8694317933700908.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 19:33:09,432][0m Trial 26 finished with value: 0.8661795681906137 and parameters: {'iterations': 2000, 'learning_rate': 0.08292937208465356, 'random_strength': 3, 'bagging_temperature': 3, 'max_bin': 6, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 9, 'max_depth': 9, 'l2_leaf_reg': 0.2111837715232643, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 12 with value: 0.8694317933700908.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 19:33:39,119][0m Trial 27 finished with value: 0.8698765099983916 and parameters: {'iterations': 2000, 'learning_rate': 0.05196238428759904, 'random_strength': 1, 'bagging_temperature': 2, 'max_bin': 6, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 9, 'max_depth': 9, 'l2_leaf_reg': 14.852455469272511, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 27 with value: 0.8698765099983916.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 19:34:13,489][0m Trial 28 finished with value: 0.8681752518202718 and parameters: {'iterations': 2000, 'learning_rate': 0.053574965216909146, 'random_strength': 5, 'bagging_temperature': 2, 'max_bin': 6, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 10, 'max_depth': 8, 'l2_leaf_reg': 1.4028408894912066, 'one_hot_max_size': 25, 'auto_class_weights': 'Balanced'}. Best is trial 27 with value: 0.8698765099983916.[0m
[32m[I 2021-07-12 20:00:49,961][0m Trial 29 finished with value: 0.8654507296099375 and parameters: {'iterations': 2000, 'learning_rate': 0.002018714382510056, 'random_strength': 1, 'bagging_temperature': 2, 'max_bin': 20, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 9, 'max_depth': 9, 'l2_leaf_reg': 0.035779683477384014, 'one_hot_max_size': 5, 'auto_class_weights': 'Balanced'}. Best is trial 27 with value: 0.8698765099983916.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:06:03,094][0m Trial 30 finished with value: 0.867130307358835 and parameters: {'iterations': 2000, 'learning_rate': 0.02675417141678304, 'random_strength': 2, 'bagging_temperature': 3, 'max_bin': 6, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 7, 'max_depth': 8, 'l2_leaf_reg': 0.0018313564447508617, 'one_hot_max_size': 12, 'auto_class_weights': 'Balanced'}. Best is trial 27 with value: 0.8698765099983916.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:06:22,107][0m Trial 31 finished with value: 0.8690950824639646 and parameters: {'iterations': 2000, 'learning_rate': 0.08852470101790186, 'random_strength': 1, 'bagging_temperature': 4, 'max_bin': 6, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 9, 'max_depth': 9, 'l2_leaf_reg': 16.363859642593123, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 27 with value: 0.8698765099983916.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:06:43,980][0m Trial 32 finished with value: 0.8699709379541793 and parameters: {'iterations': 300, 'learning_rate': 0.07257717548601127, 'random_strength': 1, 'bagging_temperature': 5, 'max_bin': 6, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 8, 'max_depth': 10, 'l2_leaf_reg': 18.73179168604146, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 32 with value: 0.8699709379541793.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:07:09,439][0m Trial 33 finished with value: 0.8683917946953559 and parameters: {'iterations': 300, 'learning_rate': 0.06823686404608309, 'random_strength': 2, 'bagging_temperature': 5, 'max_bin': 10, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 8, 'max_depth': 10, 'l2_leaf_reg': 1.6878287473907805, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 32 with value: 0.8699709379541793.[0m
[32m[I 2021-07-12 20:10:54,882][0m Trial 34 finished with value: 0.8679986119102477 and parameters: {'iterations': 300, 'learning_rate': 0.039656563138266133, 'random_strength': 3, 'bagging_temperature': 1, 'max_bin': 6, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 10, 'max_depth': 10, 'l2_leaf_reg': 0.5896278620260849, 'one_hot_max_size': 5, 'auto_class_weights': 'Balanced'}. Best is trial 32 with value: 0.8699709379541793.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:11:17,597][0m Trial 35 finished with value: 0.8693737968161886 and parameters: {'iterations': 300, 'learning_rate': 0.06169374707418619, 'random_strength': 1, 'bagging_temperature': 5, 'max_bin': 4, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 8, 'max_depth': 8, 'l2_leaf_reg': 4.018892218518083, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 32 with value: 0.8699709379541793.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:11:49,556][0m Trial 36 finished with value: 0.8685315512548419 and parameters: {'iterations': 300, 'learning_rate': 0.057092544348530595, 'random_strength': 3, 'bagging_temperature': 7, 'max_bin': 4, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 7, 'max_depth': 7, 'l2_leaf_reg': 3.5607960798334743, 'one_hot_max_size': 100, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 32 with value: 0.8699709379541793.[0m
[32m[I 2021-07-12 20:12:26,896][0m Trial 37 finished with value: 0.8632374887221582 and parameters: {'iterations': 300, 'learning_rate': 0.02649232746005173, 'random_strength': 4, 'bagging_temperature': 5, 'max_bin': 4, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 4, 'max_depth': 8, 'l2_leaf_reg': 0.15195604575177288, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 32 with value: 0.8699709379541793.[0m
[32m[I 2021-07-12 20:13:02,671][0m Trial 38 finished with value: 0.8470665947184706 and parameters: {'iterations': 30

Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:13:15,366][0m Trial 39 finished with value: 0.8612193900177142 and parameters: {'iterations': 1200, 'learning_rate': 0.10884017988565098, 'random_strength': 1, 'bagging_temperature': 10, 'max_bin': 4, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 8, 'max_depth': 7, 'l2_leaf_reg': 1.8884629102714177e-06, 'one_hot_max_size': 25, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 32 with value: 0.8699709379541793.[0m
[32m[I 2021-07-12 20:13:41,469][0m Trial 40 finished with value: 0.8681619285656612 and parameters: {'iterations': 200, 'learning_rate': 0.06287493445353562, 'random_strength': 3, 'bagging_temperature': 8, 'max_bin': 10, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 7, 'max_depth': 6, 'l2_leaf_reg': 1.5113168401030426, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 32 with value: 0.8699709379541793.[0m
[32m[I 2021-07-12 20:14:18,997][0m Trial 41 finished with value: 0.8703111982144548 and parameters: {'iterations'

Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:14:56,965][0m Trial 42 finished with value: 0.8702052983447004 and parameters: {'iterations': 300, 'learning_rate': 0.04163501713130602, 'random_strength': 1, 'bagging_temperature': 6, 'max_bin': 8, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 10, 'max_depth': 10, 'l2_leaf_reg': 21.852606807006246, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 41 with value: 0.8703111982144548.[0m
[32m[I 2021-07-12 20:15:34,029][0m Trial 43 finished with value: 0.8633670274652944 and parameters: {'iterations': 300, 'learning_rate': 0.014926066247918979, 'random_strength': 2, 'bagging_temperature': 6, 'max_bin': 8, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 10, 'max_depth': 10, 'l2_leaf_reg': 22.3179286921454, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 41 with value: 0.8703111982144548.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:16:55,939][0m Trial 44 finished with value: 0.8689785514428439 and parameters: {'iterations': 500, 'learning_rate': 0.04027719685026368, 'random_strength': 1, 'bagging_temperature': 5, 'max_bin': 8, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 10, 'max_depth': 10, 'l2_leaf_reg': 30.49839965659867, 'one_hot_max_size': 10, 'auto_class_weights': 'Balanced'}. Best is trial 41 with value: 0.8703111982144548.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:17:44,383][0m Trial 45 finished with value: 0.8691866394818214 and parameters: {'iterations': 1500, 'learning_rate': 0.033854346538953736, 'random_strength': 2, 'bagging_temperature': 6, 'max_bin': 8, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 10, 'max_depth': 9, 'l2_leaf_reg': 0.9115106092480676, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 41 with value: 0.8703111982144548.[0m
[32m[I 2021-07-12 20:18:17,925][0m Trial 46 finished with value: 0.8671086045224304 and parameters: {'iterations': 300, 'learning_rate': 0.016724719665844878, 'random_strength': 1, 'bagging_temperature': 8, 'max_bin': 8, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 9, 'max_depth': 5, 'l2_leaf_reg': 4.234474060732237, 'one_hot_max_size': 1024, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 41 with value: 0.8703111982144548.[0m
[32m[I 2021-07-12 20:18:44,586][0m Trial 47 finished with value: 0.8631294236702731 and parameters: {'iterations': 

Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:19:32,842][0m Trial 49 finished with value: 0.8540437854816068 and parameters: {'iterations': 1700, 'learning_rate': 0.11852398060184076, 'random_strength': 1, 'bagging_temperature': 5, 'max_bin': 8, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 10, 'max_depth': 10, 'l2_leaf_reg': 0.30750700765758515, 'one_hot_max_size': 12, 'auto_class_weights': 'Balanced'}. Best is trial 41 with value: 0.8703111982144548.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:19:59,117][0m Trial 50 finished with value: 0.8692511281998998 and parameters: {'iterations': 300, 'learning_rate': 0.0707516562027285, 'random_strength': 2, 'bagging_temperature': 6, 'max_bin': 20, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 8, 'max_depth': 8, 'l2_leaf_reg': 8.782897441433569, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 41 with value: 0.8703111982144548.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:20:33,018][0m Trial 51 finished with value: 0.8703266987708048 and parameters: {'iterations': 300, 'learning_rate': 0.05289609246540887, 'random_strength': 1, 'bagging_temperature': 5, 'max_bin': 8, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 8, 'max_depth': 9, 'l2_leaf_reg': 88.79810792634227, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 51 with value: 0.8703266987708048.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:21:02,547][0m Trial 52 finished with value: 0.8695459409152039 and parameters: {'iterations': 300, 'learning_rate': 0.05029689030864542, 'random_strength': 1, 'bagging_temperature': 4, 'max_bin': 8, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 7, 'max_depth': 9, 'l2_leaf_reg': 97.00035845034988, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 51 with value: 0.8703266987708048.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:21:36,159][0m Trial 53 finished with value: 0.8700613668619763 and parameters: {'iterations': 300, 'learning_rate': 0.0488027241875225, 'random_strength': 1, 'bagging_temperature': 4, 'max_bin': 8, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 7, 'max_depth': 9, 'l2_leaf_reg': 88.43784954318768, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 51 with value: 0.8703266987708048.[0m
[32m[I 2021-07-12 20:22:12,240][0m Trial 54 finished with value: 0.8572310017863245 and parameters: {'iterations': 300, 'learning_rate': 0.007146820667334665, 'random_strength': 2, 'bagging_temperature': 5, 'max_bin': 8, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 6, 'max_depth': 9, 'l2_leaf_reg': 24.077374257884557, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 51 with value: 0.8703266987708048.[0m
[32m[I 2021-07-12 20:26:17,009][0m Trial 55 finished with value: 0.869031902099889 and parameters: {'iterations': 300, 'lea

Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:31:13,682][0m Trial 57 finished with value: 0.8708849878265229 and parameters: {'iterations': 1200, 'learning_rate': 0.023793510396254353, 'random_strength': 1, 'bagging_temperature': 6, 'max_bin': 8, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 8, 'max_depth': 9, 'l2_leaf_reg': 89.35313522855303, 'one_hot_max_size': 500, 'auto_class_weights': 'Balanced'}. Best is trial 57 with value: 0.8708849878265229.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:32:34,572][0m Trial 58 finished with value: 0.8708143434747891 and parameters: {'iterations': 1200, 'learning_rate': 0.01822957506875239, 'random_strength': 1, 'bagging_temperature': 6, 'max_bin': 8, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 8, 'max_depth': 9, 'l2_leaf_reg': 97.82938585510763, 'one_hot_max_size': 500, 'auto_class_weights': 'Balanced'}. Best is trial 57 with value: 0.8708849878265229.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:34:06,598][0m Trial 59 finished with value: 0.8702076895716202 and parameters: {'iterations': 1200, 'learning_rate': 0.02082199594349247, 'random_strength': 4, 'bagging_temperature': 6, 'max_bin': 8, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 7, 'max_depth': 8, 'l2_leaf_reg': 58.44987194542407, 'one_hot_max_size': 500, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 57 with value: 0.8708849878265229.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:35:30,552][0m Trial 60 finished with value: 0.8665282790592255 and parameters: {'iterations': 1200, 'learning_rate': 0.01883304034580447, 'random_strength': 6, 'bagging_temperature': 6, 'max_bin': 8, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 5, 'max_depth': 8, 'l2_leaf_reg': 1.0004224452913733e-05, 'one_hot_max_size': 500, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 57 with value: 0.8708849878265229.[0m
[32m[I 2021-07-12 20:37:46,052][0m Trial 61 finished with value: 0.8679180780989961 and parameters: {'iterations': 1200, 'learning_rate': 0.010345188550750159, 'random_strength': 7, 'bagging_temperature': 6, 'max_bin': 8, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 7, 'max_depth': 9, 'l2_leaf_reg': 88.97065459388654, 'one_hot_max_size': 500, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 57 with value: 0.8708849878265229.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:38:58,933][0m Trial 62 finished with value: 0.8698932760673382 and parameters: {'iterations': 1200, 'learning_rate': 0.02430413379390573, 'random_strength': 4, 'bagging_temperature': 7, 'max_bin': 8, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 6, 'max_depth': 8, 'l2_leaf_reg': 52.75648991084858, 'one_hot_max_size': 500, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 57 with value: 0.8708849878265229.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:40:05,614][0m Trial 63 finished with value: 0.8699305551371824 and parameters: {'iterations': 1200, 'learning_rate': 0.0336115990378984, 'random_strength': 5, 'bagging_temperature': 8, 'max_bin': 8, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 7, 'max_depth': 9, 'l2_leaf_reg': 81.37284481123424, 'one_hot_max_size': 500, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 57 with value: 0.8708849878265229.[0m
[32m[I 2021-07-12 20:42:22,318][0m Trial 64 finished with value: 0.854993345101896 and parameters: {'iterations': 1200, 'learning_rate': 0.0015058492840720299, 'random_strength': 2, 'bagging_temperature': 6, 'max_bin': 8, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 8, 'max_depth': 9, 'l2_leaf_reg': 95.19396324074165, 'one_hot_max_size': 500, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 57 with value: 0.8708849878265229.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:43:13,733][0m Trial 65 finished with value: 0.8693041351392141 and parameters: {'iterations': 1200, 'learning_rate': 0.03781797041249919, 'random_strength': 8, 'bagging_temperature': 7, 'max_bin': 8, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 8, 'max_depth': 9, 'l2_leaf_reg': 43.42161174357205, 'one_hot_max_size': 500, 'auto_class_weights': 'Balanced'}. Best is trial 57 with value: 0.8708849878265229.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:44:11,302][0m Trial 66 finished with value: 0.870176918590554 and parameters: {'iterations': 1200, 'learning_rate': 0.02277186854470771, 'random_strength': 1, 'bagging_temperature': 5, 'max_bin': 8, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 6, 'max_depth': 8, 'l2_leaf_reg': 9.453222005991888, 'one_hot_max_size': 500, 'auto_class_weights': 'Balanced'}. Best is trial 57 with value: 0.8708849878265229.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:44:19,690][0m Trial 67 finished with value: 0.8661315980759896 and parameters: {'iterations': 1200, 'learning_rate': 0.25318097053845473, 'random_strength': 3, 'bagging_temperature': 6, 'max_bin': 8, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 5, 'max_depth': 7, 'l2_leaf_reg': 9.842831480316075, 'one_hot_max_size': 500, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 57 with value: 0.8708849878265229.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:45:45,971][0m Trial 68 finished with value: 0.8702422951404098 and parameters: {'iterations': 1200, 'learning_rate': 0.014717315902471627, 'random_strength': 1, 'bagging_temperature': 5, 'max_bin': 8, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 6, 'max_depth': 8, 'l2_leaf_reg': 2.0595949110735376, 'one_hot_max_size': 500, 'auto_class_weights': 'Balanced'}. Best is trial 57 with value: 0.8708849878265229.[0m
[32m[I 2021-07-12 20:48:03,488][0m Trial 69 finished with value: 0.8698416087675277 and parameters: {'iterations': 1200, 'learning_rate': 0.01009517206815392, 'random_strength': 2, 'bagging_temperature': 7, 'max_bin': 8, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 6, 'max_depth': 8, 'l2_leaf_reg': 2.132790088056709, 'one_hot_max_size': 500, 'auto_class_weights': 'Balanced'}. Best is trial 57 with value: 0.8708849878265229.[0m
[32m[I 2021-07-12 20:50:07,692][0m Trial 70 finished with value: 0.8540493013126795 and parameters: {'iterations': 1200, '

Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:51:05,768][0m Trial 71 finished with value: 0.8707212423342684 and parameters: {'iterations': 1200, 'learning_rate': 0.02359362482526857, 'random_strength': 1, 'bagging_temperature': 5, 'max_bin': 8, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 6, 'max_depth': 8, 'l2_leaf_reg': 7.0471990636228306, 'one_hot_max_size': 500, 'auto_class_weights': 'Balanced'}. Best is trial 57 with value: 0.8708849878265229.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:51:53,189][0m Trial 72 finished with value: 0.870344136914314 and parameters: {'iterations': 1200, 'learning_rate': 0.028387651750897042, 'random_strength': 1, 'bagging_temperature': 5, 'max_bin': 8, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 4, 'max_depth': 8, 'l2_leaf_reg': 5.6327160448886895, 'one_hot_max_size': 500, 'auto_class_weights': 'Balanced'}. Best is trial 57 with value: 0.8708849878265229.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:52:43,942][0m Trial 73 finished with value: 0.8693384248935571 and parameters: {'iterations': 1200, 'learning_rate': 0.02091958137615044, 'random_strength': 1, 'bagging_temperature': 5, 'max_bin': 8, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 4, 'max_depth': 8, 'l2_leaf_reg': 0.7183954893316693, 'one_hot_max_size': 500, 'auto_class_weights': 'Balanced'}. Best is trial 57 with value: 0.8708849878265229.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:53:30,550][0m Trial 74 finished with value: 0.8694136207219428 and parameters: {'iterations': 1200, 'learning_rate': 0.029893633727196826, 'random_strength': 2, 'bagging_temperature': 4, 'max_bin': 8, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 3, 'max_depth': 7, 'l2_leaf_reg': 3.132524331015214, 'one_hot_max_size': 500, 'auto_class_weights': 'Balanced'}. Best is trial 57 with value: 0.8708849878265229.[0m


In [19]:
# Take the best trial of the finished study, build a CatBoost classifier from
# its sampled parameters and fit it on the current training split.
trial = study.best_trial

final_model = CatBoostClassifier(
    **trial.params,
    cat_features=categorical_features_indices,
    verbose=False,
)
final_model.fit(X_train, y_train)

# Keep a task-specific handle on the fitted model, then display the parameters.
final_h1n1_model = final_model
trial.params

{'iterations': 1200,
 'learning_rate': 0.023793510396254353,
 'random_strength': 1,
 'bagging_temperature': 6,
 'max_bin': 8,
 'grow_policy': 'Lossguide',
 'min_data_in_leaf': 8,
 'max_depth': 9,
 'l2_leaf_reg': 89.35313522855303,
 'one_hot_max_size': 500,
 'auto_class_weights': 'Balanced'}

In [33]:
# `params` is captured here but the constructor below uses literal values
# (they match the best-trial parameters displayed for the H1N1 study).
params = trial.params

# Build the H1N1 classifier with the tuned settings and fit it on
# h1n1_train_trans / h1n1_labels in a single expression (fit returns the model).
final_h1n1_model = CatBoostClassifier(
    iterations=1200,
    learning_rate=0.023793510396254353,
    random_strength=1,
    bagging_temperature=6,
    max_bin=8,
    grow_policy='Lossguide',
    min_data_in_leaf=8,
    max_depth=9,
    l2_leaf_reg=89.35313522855303,
    one_hot_max_size=500,
    auto_class_weights='Balanced',
    cat_features=categorical_features_indices,
    verbose=False,
).fit(h1n1_train_trans, h1n1_labels)

In [21]:
# Rebuild the H1N1 classifier from the best trial's parameters and fit it
# immediately. Without the fit, this cell would leave `final_h1n1_model` bound
# to an unfitted estimator, and any later predict call would fail.
params = trial.params
final_h1n1_model = CatBoostClassifier(cat_features=categorical_features_indices,
                                     verbose=False,
                                     **params).fit(h1n1_train_trans, h1n1_labels)

# Seasonal

## CatBoost and Optuna

In [22]:
# Features and target for the seasonal-vaccine model.
X, y = seas_train_trans, seas_labels

In [23]:
# Shuffled 80/20 train/test split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=True,
    random_state=42,
    test_size=0.2,
)

In [24]:
from catboost import CatBoostClassifier
from catboost import Pool, cv
from sklearn.metrics import roc_curve, roc_auc_score
import optuna

In [25]:
# CatBoost Pool over the training split, with the categorical column indices
# attached; this is the dataset the cross-validation in `objective` runs on.
train_dataset = Pool(
    X_train,
    label=y_train,
    cat_features=categorical_features_indices,
)

In [26]:
def objective(trial):
    """Optuna objective: cross-validated AUC of CatBoost for one sampled configuration.

    Draws a set of CatBoost hyper-parameters from ``trial``, runs a 7-fold
    ``cv`` on the module-level ``train_dataset`` and returns the maximum of the
    ``test-AUC-mean`` column of the resulting scores.

    Parameters
    ----------
    trial
        Optuna trial object exposing the ``suggest_*`` sampling methods.

    Returns
    -------
    The highest mean test AUC reached across boosting iterations.
    """
    param = {
        'iterations': trial.suggest_categorical('iterations', [100, 200, 300, 500, 1000, 1200, 1500, 1700, 2000]),
        'learning_rate': trial.suggest_float("learning_rate", 0.001, 0.3),
        'random_strength': trial.suggest_int("random_strength", 1, 10),
        'bagging_temperature': trial.suggest_int("bagging_temperature", 0, 10),
        'max_bin': trial.suggest_categorical('max_bin', [4, 5, 6, 8, 10, 20, 30]),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'min_data_in_leaf': trial.suggest_int("min_data_in_leaf", 1, 10),
        # NOTE(review): the run log reports "8 iterations wait", i.e. the
        # early_stopping_rounds=8 passed to cv() below appears to take
        # precedence over od_wait=100 -- confirm and keep only one of the two.
        'od_type': "Iter",
        'od_wait': 100,
        # Sampled under the name "max_depth", so trial.params exposes it with that key.
        "depth": trial.suggest_int("max_depth", 2, 10),
        # suggest_float(log=True) replaces the deprecated suggest_loguniform;
        # the sampled distribution (log-uniform on [1e-8, 100]) is the same.
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 100, log=True),
        'one_hot_max_size': trial.suggest_categorical('one_hot_max_size', [5, 10, 12, 25, 100, 500, 1024]),
        'custom_metric': ['AUC'],
        "loss_function": "Logloss",
        'auto_class_weights': trial.suggest_categorical('auto_class_weights', ['Balanced', 'SqrtBalanced']),
    }

    scores = cv(train_dataset,
                param,
                fold_count=7,
                early_stopping_rounds=8,
                plot=False, verbose=False)

    return scores['test-AUC-mean'].max()

In [27]:
# Seeded TPE sampler; the study maximises the value returned by `objective`
# over 75 trials.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(sampler=sampler, direction="maximize")
study.optimize(objective, n_trials=75)

[32m[I 2021-07-12 20:53:53,796][0m A new study created in memory with name: no-name-d01ccdb2-195e-46e4-9ba1-c16b55870384[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:54:07,833][0m Trial 0 finished with value: 0.8580664207414844 and parameters: {'iterations': 200, 'learning_rate': 0.2127137007610176, 'random_strength': 1, 'bagging_temperature': 10, 'max_bin': 4, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 3, 'max_depth': 5, 'l2_leaf_reg': 0.0003636636071695854, 'one_hot_max_size': 5, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 0 with value: 0.8580664207414844.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 20:54:15,446][0m Trial 1 finished with value: 0.858374522877302 and parameters: {'iterations': 100, 'learning_rate': 0.2728868002215558, 'random_strength': 3, 'bagging_temperature': 7, 'max_bin': 10, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 1, 'max_depth': 3, 'l2_leaf_reg': 2.833171661121252e-08, 'one_hot_max_size': 25, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 1 with value: 0.858374522877302.[0m
[32m[I 2021-07-12 20:55:12,541][0m Trial 2 finished with value: 0.8588582664911347 and parameters: {'iterations': 200, 'learning_rate': 0.023139350868493017, 'random_strength': 4, 'bagging_temperature': 1, 'max_bin': 4, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 2, 'max_depth': 8, 'l2_leaf_reg': 0.4053689557345566, 'one_hot_max_size': 10, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 2 with value: 0.8588582664911347.[0m
[32m[I 2021-07-12 20:56:21,743][0m Trial 3 finished with value: 0.8610563616945875 and parameters: {'iterations': 300, '

Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:01:03,388][0m Trial 5 finished with value: 0.8615125630020055 and parameters: {'iterations': 2000, 'learning_rate': 0.04211876229965198, 'random_strength': 4, 'bagging_temperature': 1, 'max_bin': 4, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 10, 'max_depth': 7, 'l2_leaf_reg': 2.4563933358965466e-05, 'one_hot_max_size': 12, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 5 with value: 0.8615125630020055.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:02:17,365][0m Trial 6 finished with value: 0.8626982952022045 and parameters: {'iterations': 1700, 'learning_rate': 0.06805652352870738, 'random_strength': 8, 'bagging_temperature': 2, 'max_bin': 8, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 10, 'max_depth': 5, 'l2_leaf_reg': 8.326558771196998, 'one_hot_max_size': 10, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:02:28,260][0m Trial 7 finished with value: 0.8289886849535966 and parameters: {'iterations': 500, 'learning_rate': 0.29002979089405717, 'random_strength': 10, 'bagging_temperature': 9, 'max_bin': 30, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 7, 'max_depth': 10, 'l2_leaf_reg': 2.5167504351014815e-07, 'one_hot_max_size': 10, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:02:46,142][0m Trial 8 finished with value: 0.8604401292291295 and parameters: {'iterations': 200, 'learning_rate': 0.1020605518986092, 'random_strength': 4, 'bagging_temperature': 1, 'max_bin': 20, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 2, 'max_depth': 6, 'l2_leaf_reg': 0.5011128402145567, 'one_hot_max_size': 1024, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:02:53,901][0m Trial 9 finished with value: 0.8585526929942423 and parameters: {'iterations': 1700, 'learning_rate': 0.20909628762201568, 'random_strength': 5, 'bagging_temperature': 1, 'max_bin': 30, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 5, 'max_depth': 4, 'l2_leaf_reg': 3.6284971517641624e-05, 'one_hot_max_size': 500, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:03:46,376][0m Trial 10 finished with value: 0.8601306178955141 and parameters: {'iterations': 1200, 'learning_rate': 0.11792927815981727, 'random_strength': 8, 'bagging_temperature': 4, 'max_bin': 8, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 10, 'max_depth': 2, 'l2_leaf_reg': 52.78127742712393, 'one_hot_max_size': 10, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:06:28,248][0m Trial 11 finished with value: 0.8618460264950877 and parameters: {'iterations': 2000, 'learning_rate': 0.07350367008679506, 'random_strength': 7, 'bagging_temperature': 3, 'max_bin': 6, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 10, 'max_depth': 7, 'l2_leaf_reg': 0.008006098282358874, 'one_hot_max_size': 12, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:09:05,911][0m Trial 12 finished with value: 0.8600482003609284 and parameters: {'iterations': 2000, 'learning_rate': 0.08535872539360286, 'random_strength': 8, 'bagging_temperature': 4, 'max_bin': 6, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 8, 'max_depth': 10, 'l2_leaf_reg': 0.012804987361617112, 'one_hot_max_size': 12, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:10:25,910][0m Trial 13 finished with value: 0.8621995380870038 and parameters: {'iterations': 1700, 'learning_rate': 0.17741651583128776, 'random_strength': 7, 'bagging_temperature': 3, 'max_bin': 5, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 9, 'max_depth': 7, 'l2_leaf_reg': 87.84778701678394, 'one_hot_max_size': 12, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:10:38,885][0m Trial 14 finished with value: 0.8619930553819785 and parameters: {'iterations': 1700, 'learning_rate': 0.16471437624320762, 'random_strength': 9, 'bagging_temperature': 3, 'max_bin': 5, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 8, 'max_depth': 6, 'l2_leaf_reg': 78.91747521574642, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:12:05,687][0m Trial 15 finished with value: 0.8611329413248965 and parameters: {'iterations': 1700, 'learning_rate': 0.15956244616996276, 'random_strength': 6, 'bagging_temperature': 6, 'max_bin': 8, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 8, 'max_depth': 8, 'l2_leaf_reg': 10.525473468319174, 'one_hot_max_size': 12, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:12:30,343][0m Trial 16 finished with value: 0.8617843489947518 and parameters: {'iterations': 1000, 'learning_rate': 0.22332966525101075, 'random_strength': 8, 'bagging_temperature': 0, 'max_bin': 5, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 9, 'max_depth': 4, 'l2_leaf_reg': 6.9731727924752605, 'one_hot_max_size': 10, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:12:39,956][0m Trial 17 finished with value: 0.850731433003163 and parameters: {'iterations': 1700, 'learning_rate': 0.1309149622238913, 'random_strength': 6, 'bagging_temperature': 3, 'max_bin': 5, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 6, 'max_depth': 9, 'l2_leaf_reg': 0.03235202112761483, 'one_hot_max_size': 25, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:12:58,129][0m Trial 18 finished with value: 0.8610337803805386 and parameters: {'iterations': 1500, 'learning_rate': 0.16188549710511987, 'random_strength': 9, 'bagging_temperature': 2, 'max_bin': 8, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 9, 'max_depth': 7, 'l2_leaf_reg': 65.26127944006275, 'one_hot_max_size': 500, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:13:08,905][0m Trial 19 finished with value: 0.8570945417694531 and parameters: {'iterations': 1700, 'learning_rate': 0.2441503326701509, 'random_strength': 7, 'bagging_temperature': 5, 'max_bin': 8, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 5, 'max_depth': 5, 'l2_leaf_reg': 0.23360444417161425, 'one_hot_max_size': 100, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:13:35,708][0m Trial 20 finished with value: 0.860410625017923 and parameters: {'iterations': 1700, 'learning_rate': 0.18597370482311476, 'random_strength': 9, 'bagging_temperature': 0, 'max_bin': 20, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 9, 'max_depth': 2, 'l2_leaf_reg': 8.096325026785987, 'one_hot_max_size': 10, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:13:50,377][0m Trial 21 finished with value: 0.8619297421014328 and parameters: {'iterations': 1700, 'learning_rate': 0.1789818633951753, 'random_strength': 9, 'bagging_temperature': 3, 'max_bin': 5, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 8, 'max_depth': 6, 'l2_leaf_reg': 87.67444084903545, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:14:09,116][0m Trial 22 finished with value: 0.8625935392654015 and parameters: {'iterations': 1700, 'learning_rate': 0.13867679509964037, 'random_strength': 10, 'bagging_temperature': 2, 'max_bin': 5, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 7, 'max_depth': 6, 'l2_leaf_reg': 95.15931033341283, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m
[32m[I 2021-07-12 21:14:17,715][0m Trial 23 finished with value: 0.8591583807999434 and parameters: {'iterations': 100, 'learning_rate': 0.12753733298821815, 'random_strength': 10, 'bagging_temperature': 2, 'max_bin': 5, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 6, 'max_depth': 5, 'l2_leaf_reg': 3.0634717625756895, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:15:50,048][0m Trial 24 finished with value: 0.862153818481956 and parameters: {'iterations': 1500, 'learning_rate': 0.06671125601164339, 'random_strength': 8, 'bagging_temperature': 2, 'max_bin': 5, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 7, 'max_depth': 4, 'l2_leaf_reg': 19.100983531655803, 'one_hot_max_size': 12, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:16:01,501][0m Trial 25 finished with value: 0.8582174780638968 and parameters: {'iterations': 1200, 'learning_rate': 0.13725203020901788, 'random_strength': 7, 'bagging_temperature': 4, 'max_bin': 5, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 7, 'max_depth': 7, 'l2_leaf_reg': 0.08062965344821288, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:16:49,048][0m Trial 26 finished with value: 0.8617043950089808 and parameters: {'iterations': 1700, 'learning_rate': 0.09757280255555012, 'random_strength': 6, 'bagging_temperature': 2, 'max_bin': 8, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 9, 'max_depth': 6, 'l2_leaf_reg': 1.717208628407799, 'one_hot_max_size': 10, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:17:04,426][0m Trial 27 finished with value: 0.8613415422468915 and parameters: {'iterations': 300, 'learning_rate': 0.18444733834488405, 'random_strength': 10, 'bagging_temperature': 0, 'max_bin': 5, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 10, 'max_depth': 3, 'l2_leaf_reg': 83.43916159559782, 'one_hot_max_size': 25, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:18:05,103][0m Trial 28 finished with value: 0.8536151249483162 and parameters: {'iterations': 1000, 'learning_rate': 0.24950383107946503, 'random_strength': 8, 'bagging_temperature': 4, 'max_bin': 10, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 7, 'max_depth': 9, 'l2_leaf_reg': 0.001332234553166288, 'one_hot_max_size': 5, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:18:15,556][0m Trial 29 finished with value: 0.8556686419242091 and parameters: {'iterations': 1700, 'learning_rate': 0.2052992515158666, 'random_strength': 5, 'bagging_temperature': 6, 'max_bin': 8, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 9, 'max_depth': 5, 'l2_leaf_reg': 2.8749704005565047e-06, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:19:09,526][0m Trial 30 finished with value: 0.8621173852162267 and parameters: {'iterations': 1700, 'learning_rate': 0.10628201573058149, 'random_strength': 9, 'bagging_temperature': 2, 'max_bin': 20, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 4, 'max_depth': 7, 'l2_leaf_reg': 18.913471712699458, 'one_hot_max_size': 12, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:20:30,323][0m Trial 31 finished with value: 0.8623859710572536 and parameters: {'iterations': 1500, 'learning_rate': 0.055369535969299895, 'random_strength': 1, 'bagging_temperature': 2, 'max_bin': 5, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 7, 'max_depth': 4, 'l2_leaf_reg': 12.877546605845946, 'one_hot_max_size': 12, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:22:13,850][0m Trial 32 finished with value: 0.8616755716140901 and parameters: {'iterations': 1500, 'learning_rate': 0.04759973255850625, 'random_strength': 1, 'bagging_temperature': 3, 'max_bin': 5, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 6, 'max_depth': 3, 'l2_leaf_reg': 1.8557651797879717, 'one_hot_max_size': 12, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:24:31,731][0m Trial 33 finished with value: 0.8622646076776189 and parameters: {'iterations': 1500, 'learning_rate': 0.029096743342316322, 'random_strength': 1, 'bagging_temperature': 1, 'max_bin': 5, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 8, 'max_depth': 4, 'l2_leaf_reg': 25.903450608164942, 'one_hot_max_size': 12, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m
[32m[I 2021-07-12 21:28:10,436][0m Trial 34 finished with value: 0.8391094321329582 and parameters: {'iterations': 1500, 'learning_rate': 0.001350636236642784, 'random_strength': 1, 'bagging_temperature': 1, 'max_bin': 4, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 7, 'max_depth': 3, 'l2_leaf_reg': 0.5147894478738813, 'one_hot_max_size': 5, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:29:20,563][0m Trial 35 finished with value: 0.8622921781796767 and parameters: {'iterations': 1500, 'learning_rate': 0.030432324086581936, 'random_strength': 2, 'bagging_temperature': 0, 'max_bin': 5, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 8, 'max_depth': 4, 'l2_leaf_reg': 19.555382189571798, 'one_hot_max_size': 100, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:29:51,037][0m Trial 36 finished with value: 0.8618036877768341 and parameters: {'iterations': 1500, 'learning_rate': 0.06584750113690342, 'random_strength': 2, 'bagging_temperature': 0, 'max_bin': 10, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 6, 'max_depth': 4, 'l2_leaf_reg': 0.13163342028219222, 'one_hot_max_size': 100, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 6 with value: 0.8626982952022045.[0m
[32m[I 2021-07-12 21:29:59,875][0m Trial 37 finished with value: 0.8485410528699785 and parameters: {'iterations': 100, 'learning_rate': 0.02322069737145118, 'random_strength': 2, 'bagging_temperature': 0, 'max_bin': 5, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 5, 'max_depth': 5, 'l2_leaf_reg': 3.7979696562817704, 'one_hot_max_size': 100, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m
[32m[I 2021-07-12 21:30:14,842][0m Trial 38 finished with value: 0.8399064308232056 and parameters: {'iteration

Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:30:40,910][0m Trial 39 finished with value: 0.8602347972255283 and parameters: {'iterations': 1500, 'learning_rate': 0.05236980132011715, 'random_strength': 3, 'bagging_temperature': 10, 'max_bin': 6, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 8, 'max_depth': 5, 'l2_leaf_reg': 1.2773639880074984e-08, 'one_hot_max_size': 100, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m
[32m[I 2021-07-12 21:31:16,028][0m Trial 40 finished with value: 0.860804396154949 and parameters: {'iterations': 500, 'learning_rate': 0.03100470821157585, 'random_strength': 3, 'bagging_temperature': 1, 'max_bin': 30, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 6, 'max_depth': 3, 'l2_leaf_reg': 0.0010185419341471238, 'one_hot_max_size': 500, 'auto_class_weights': 'Balanced'}. Best is trial 6 with value: 0.8626982952022045.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:34:08,129][0m Trial 41 finished with value: 0.8629466236010676 and parameters: {'iterations': 1500, 'learning_rate': 0.03291808362514427, 'random_strength': 1, 'bagging_temperature': 1, 'max_bin': 5, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 8, 'max_depth': 4, 'l2_leaf_reg': 17.823861529588896, 'one_hot_max_size': 12, 'auto_class_weights': 'Balanced'}. Best is trial 41 with value: 0.8629466236010676.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:34:42,009][0m Trial 42 finished with value: 0.8622157698917007 and parameters: {'iterations': 1500, 'learning_rate': 0.08510344243321766, 'random_strength': 2, 'bagging_temperature': 2, 'max_bin': 5, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 7, 'max_depth': 4, 'l2_leaf_reg': 29.766196411765232, 'one_hot_max_size': 25, 'auto_class_weights': 'Balanced'}. Best is trial 41 with value: 0.8629466236010676.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:36:48,179][0m Trial 43 finished with value: 0.8631053039011282 and parameters: {'iterations': 1500, 'learning_rate': 0.04012435893700535, 'random_strength': 1, 'bagging_temperature': 1, 'max_bin': 5, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 8, 'max_depth': 5, 'l2_leaf_reg': 5.294668875077849, 'one_hot_max_size': 10, 'auto_class_weights': 'Balanced'}. Best is trial 43 with value: 0.8631053039011282.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:38:06,345][0m Trial 44 finished with value: 0.8630613839113904 and parameters: {'iterations': 1500, 'learning_rate': 0.05531247394315447, 'random_strength': 1, 'bagging_temperature': 2, 'max_bin': 5, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 10, 'max_depth': 5, 'l2_leaf_reg': 4.812763331771353, 'one_hot_max_size': 10, 'auto_class_weights': 'Balanced'}. Best is trial 43 with value: 0.8631053039011282.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:39:03,359][0m Trial 45 finished with value: 0.862400486065324 and parameters: {'iterations': 300, 'learning_rate': 0.08690384441013776, 'random_strength': 3, 'bagging_temperature': 1, 'max_bin': 5, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 10, 'max_depth': 5, 'l2_leaf_reg': 0.7137253551249758, 'one_hot_max_size': 10, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 43 with value: 0.8631053039011282.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:42:45,346][0m Trial 46 finished with value: 0.8628933483520032 and parameters: {'iterations': 1200, 'learning_rate': 0.014493923563224098, 'random_strength': 1, 'bagging_temperature': 1, 'max_bin': 30, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 10, 'max_depth': 6, 'l2_leaf_reg': 2.214848742330733, 'one_hot_max_size': 10, 'auto_class_weights': 'Balanced'}. Best is trial 43 with value: 0.8631053039011282.[0m
[32m[I 2021-07-12 21:43:30,840][0m Trial 47 finished with value: 0.8541127823942691 and parameters: {'iterations': 200, 'learning_rate': 0.012910694697924281, 'random_strength': 1, 'bagging_temperature': 1, 'max_bin': 30, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 10, 'max_depth': 6, 'l2_leaf_reg': 2.4486093316049944, 'one_hot_max_size': 10, 'auto_class_weights': 'Balanced'}. Best is trial 43 with value: 0.8631053039011282.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:45:31,258][0m Trial 48 finished with value: 0.8627654560398942 and parameters: {'iterations': 1200, 'learning_rate': 0.041194605680506806, 'random_strength': 4, 'bagging_temperature': 1, 'max_bin': 30, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 10, 'max_depth': 5, 'l2_leaf_reg': 0.04655801780693433, 'one_hot_max_size': 10, 'auto_class_weights': 'Balanced'}. Best is trial 43 with value: 0.8631053039011282.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:47:10,712][0m Trial 49 finished with value: 0.862397179203323 and parameters: {'iterations': 1200, 'learning_rate': 0.03871760759732019, 'random_strength': 3, 'bagging_temperature': 0, 'max_bin': 30, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 10, 'max_depth': 5, 'l2_leaf_reg': 0.028692188682064083, 'one_hot_max_size': 10, 'auto_class_weights': 'Balanced'}. Best is trial 43 with value: 0.8631053039011282.[0m
[32m[I 2021-07-12 21:52:01,002][0m Trial 50 finished with value: 0.8540069731781059 and parameters: {'iterations': 1200, 'learning_rate': 0.0028899658645015203, 'random_strength': 4, 'bagging_temperature': 1, 'max_bin': 30, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 9, 'max_depth': 6, 'l2_leaf_reg': 0.0036411142599251448, 'one_hot_max_size': 10, 'auto_class_weights': 'Balanced'}. Best is trial 43 with value: 0.8631053039011282.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:56:02,989][0m Trial 51 finished with value: 0.8628756527676906 and parameters: {'iterations': 1200, 'learning_rate': 0.01504640877766807, 'random_strength': 1, 'bagging_temperature': 1, 'max_bin': 30, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 10, 'max_depth': 5, 'l2_leaf_reg': 0.17642891146334738, 'one_hot_max_size': 10, 'auto_class_weights': 'Balanced'}. Best is trial 43 with value: 0.8631053039011282.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 21:58:45,557][0m Trial 52 finished with value: 0.8621483604846938 and parameters: {'iterations': 1200, 'learning_rate': 0.018974004817276437, 'random_strength': 1, 'bagging_temperature': 1, 'max_bin': 30, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 10, 'max_depth': 5, 'l2_leaf_reg': 0.0001916546391962657, 'one_hot_max_size': 10, 'auto_class_weights': 'Balanced'}. Best is trial 43 with value: 0.8631053039011282.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 22:00:34,421][0m Trial 53 finished with value: 0.8629180320383908 and parameters: {'iterations': 1200, 'learning_rate': 0.03995391356123135, 'random_strength': 2, 'bagging_temperature': 0, 'max_bin': 30, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 10, 'max_depth': 5, 'l2_leaf_reg': 0.18658475591846327, 'one_hot_max_size': 10, 'auto_class_weights': 'Balanced'}. Best is trial 43 with value: 0.8631053039011282.[0m
[32m[I 2021-07-12 22:04:59,936][0m Trial 54 finished with value: 0.8447672306161191 and parameters: {'iterations': 1200, 'learning_rate': 0.0010442452069274518, 'random_strength': 2, 'bagging_temperature': 0, 'max_bin': 30, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 9, 'max_depth': 6, 'l2_leaf_reg': 0.2062490594461113, 'one_hot_max_size': 10, 'auto_class_weights': 'Balanced'}. Best is trial 43 with value: 0.8631053039011282.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 22:06:26,926][0m Trial 55 finished with value: 0.8631648076658863 and parameters: {'iterations': 1200, 'learning_rate': 0.05538294788958143, 'random_strength': 1, 'bagging_temperature': 0, 'max_bin': 30, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 1, 'max_depth': 5, 'l2_leaf_reg': 4.767209427703021, 'one_hot_max_size': 10, 'auto_class_weights': 'Balanced'}. Best is trial 55 with value: 0.8631648076658863.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 22:07:14,244][0m Trial 56 finished with value: 0.8633351414446357 and parameters: {'iterations': 1200, 'learning_rate': 0.05702761208876678, 'random_strength': 1, 'bagging_temperature': 0, 'max_bin': 30, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 3, 'max_depth': 5, 'l2_leaf_reg': 4.356777742355234, 'one_hot_max_size': 10, 'auto_class_weights': 'Balanced'}. Best is trial 56 with value: 0.8633351414446357.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 22:08:03,952][0m Trial 57 finished with value: 0.863336657058032 and parameters: {'iterations': 1200, 'learning_rate': 0.05795395250208695, 'random_strength': 2, 'bagging_temperature': 0, 'max_bin': 30, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 1, 'max_depth': 5, 'l2_leaf_reg': 4.941110328481645, 'one_hot_max_size': 10, 'auto_class_weights': 'Balanced'}. Best is trial 57 with value: 0.863336657058032.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 22:09:04,593][0m Trial 58 finished with value: 0.8629067074901505 and parameters: {'iterations': 500, 'learning_rate': 0.059136423132727045, 'random_strength': 1, 'bagging_temperature': 0, 'max_bin': 6, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 1, 'max_depth': 3, 'l2_leaf_reg': 5.868433906504972, 'one_hot_max_size': 10, 'auto_class_weights': 'Balanced'}. Best is trial 57 with value: 0.863336657058032.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 22:09:46,968][0m Trial 59 finished with value: 0.8631533679169249 and parameters: {'iterations': 2000, 'learning_rate': 0.07579484026530653, 'random_strength': 2, 'bagging_temperature': 0, 'max_bin': 30, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 2, 'max_depth': 4, 'l2_leaf_reg': 1.0891068535290322, 'one_hot_max_size': 10, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 57 with value: 0.863336657058032.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 22:10:26,586][0m Trial 60 finished with value: 0.8626299061178557 and parameters: {'iterations': 1200, 'learning_rate': 0.08217679215182559, 'random_strength': 2, 'bagging_temperature': 8, 'max_bin': 30, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 2, 'max_depth': 5, 'l2_leaf_reg': 0.9231705050800255, 'one_hot_max_size': 10, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 57 with value: 0.863336657058032.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 22:11:17,096][0m Trial 61 finished with value: 0.8635541167908904 and parameters: {'iterations': 2000, 'learning_rate': 0.07310876332831831, 'random_strength': 1, 'bagging_temperature': 0, 'max_bin': 30, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 1, 'max_depth': 4, 'l2_leaf_reg': 4.454291285733577, 'one_hot_max_size': 10, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 61 with value: 0.8635541167908904.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 22:12:00,397][0m Trial 62 finished with value: 0.8630409759861067 and parameters: {'iterations': 2000, 'learning_rate': 0.06879461265432088, 'random_strength': 2, 'bagging_temperature': 0, 'max_bin': 30, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 1, 'max_depth': 4, 'l2_leaf_reg': 4.382839171825742, 'one_hot_max_size': 10, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 61 with value: 0.8635541167908904.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 22:12:39,298][0m Trial 63 finished with value: 0.8632651328142185 and parameters: {'iterations': 2000, 'learning_rate': 0.07823185256722659, 'random_strength': 1, 'bagging_temperature': 0, 'max_bin': 30, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 3, 'max_depth': 5, 'l2_leaf_reg': 38.87603193605458, 'one_hot_max_size': 10, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 61 with value: 0.8635541167908904.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 22:13:30,126][0m Trial 64 finished with value: 0.862472659857478 and parameters: {'iterations': 2000, 'learning_rate': 0.07585102516978505, 'random_strength': 3, 'bagging_temperature': 0, 'max_bin': 30, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 3, 'max_depth': 4, 'l2_leaf_reg': 8.812523352964348, 'one_hot_max_size': 10, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 61 with value: 0.8635541167908904.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 22:14:16,895][0m Trial 65 finished with value: 0.8627246789998193 and parameters: {'iterations': 2000, 'learning_rate': 0.07617327057858764, 'random_strength': 2, 'bagging_temperature': 0, 'max_bin': 30, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 2, 'max_depth': 4, 'l2_leaf_reg': 29.742653095812166, 'one_hot_max_size': 10, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 61 with value: 0.8635541167908904.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 22:14:44,647][0m Trial 66 finished with value: 0.8627901381418585 and parameters: {'iterations': 2000, 'learning_rate': 0.09555279814873477, 'random_strength': 1, 'bagging_temperature': 0, 'max_bin': 30, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 3, 'max_depth': 5, 'l2_leaf_reg': 1.0333791092746931, 'one_hot_max_size': 10, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 61 with value: 0.8635541167908904.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 22:15:20,198][0m Trial 67 finished with value: 0.862571997350848 and parameters: {'iterations': 2000, 'learning_rate': 0.11249779318263847, 'random_strength': 1, 'bagging_temperature': 0, 'max_bin': 30, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 1, 'max_depth': 6, 'l2_leaf_reg': 32.715786869464225, 'one_hot_max_size': 10, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 61 with value: 0.8635541167908904.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 22:16:19,721][0m Trial 68 finished with value: 0.8637094552038512 and parameters: {'iterations': 2000, 'learning_rate': 0.0647020282001146, 'random_strength': 2, 'bagging_temperature': 0, 'max_bin': 30, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 4, 'max_depth': 5, 'l2_leaf_reg': 50.04446925880975, 'one_hot_max_size': 10, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 68 with value: 0.8637094552038512.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 22:16:42,315][0m Trial 69 finished with value: 0.8616633213697209 and parameters: {'iterations': 2000, 'learning_rate': 0.09486130139781393, 'random_strength': 2, 'bagging_temperature': 0, 'max_bin': 30, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 4, 'max_depth': 3, 'l2_leaf_reg': 44.43731256224508, 'one_hot_max_size': 500, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 68 with value: 0.8637094552038512.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 22:17:40,427][0m Trial 70 finished with value: 0.8627644850923304 and parameters: {'iterations': 2000, 'learning_rate': 0.06093541348374671, 'random_strength': 3, 'bagging_temperature': 0, 'max_bin': 30, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 2, 'max_depth': 4, 'l2_leaf_reg': 98.6318505997737, 'one_hot_max_size': 5, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 68 with value: 0.8637094552038512.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 22:18:27,986][0m Trial 71 finished with value: 0.8629219070845731 and parameters: {'iterations': 2000, 'learning_rate': 0.07472070663276764, 'random_strength': 2, 'bagging_temperature': 0, 'max_bin': 30, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 2, 'max_depth': 5, 'l2_leaf_reg': 9.81275491678058, 'one_hot_max_size': 10, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 68 with value: 0.8637094552038512.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 22:19:18,308][0m Trial 72 finished with value: 0.8631210524060586 and parameters: {'iterations': 2000, 'learning_rate': 0.048552385823553854, 'random_strength': 1, 'bagging_temperature': 0, 'max_bin': 20, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 4, 'max_depth': 5, 'l2_leaf_reg': 0.44088219820618896, 'one_hot_max_size': 10, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 68 with value: 0.8637094552038512.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 22:20:12,117][0m Trial 73 finished with value: 0.8623630733076553 and parameters: {'iterations': 2000, 'learning_rate': 0.04820652592477086, 'random_strength': 1, 'bagging_temperature': 0, 'max_bin': 20, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 4, 'max_depth': 6, 'l2_leaf_reg': 0.3089687450075541, 'one_hot_max_size': 10, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 68 with value: 0.8637094552038512.[0m


Stopped by overfitting detector  (8 iterations wait)


[32m[I 2021-07-12 22:20:51,649][0m Trial 74 finished with value: 0.8624337826045686 and parameters: {'iterations': 2000, 'learning_rate': 0.08950725519297613, 'random_strength': 2, 'bagging_temperature': 0, 'max_bin': 20, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 3, 'max_depth': 5, 'l2_leaf_reg': 1.4122647681571165, 'one_hot_max_size': 10, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 68 with value: 0.8637094552038512.[0m


In [28]:
# Take the best Optuna trial and build an (unfitted) CatBoost classifier from its parameters.
trial = study.best_trial
best_trial_params = trial.params
final_model = CatBoostClassifier(
    cat_features=categorical_features_indices,
    verbose=False,
    **best_trial_params,
)
# Last expression: display the winning hyperparameters.
best_trial_params

{'iterations': 2000,
 'learning_rate': 0.0647020282001146,
 'random_strength': 2,
 'bagging_temperature': 0,
 'max_bin': 30,
 'grow_policy': 'Depthwise',
 'min_data_in_leaf': 4,
 'max_depth': 5,
 'l2_leaf_reg': 50.04446925880975,
 'one_hot_max_size': 10,
 'auto_class_weights': 'SqrtBalanced'}

In [34]:
params = trial.params

# Hyperparameters written out literally; they match the best-trial parameters
# displayed by cell In [28].
seas_hyperparams = dict(
    iterations=2000,
    learning_rate=0.0647020282001146,
    random_strength=2,
    bagging_temperature=0,
    max_bin=30,
    grow_policy='Depthwise',
    min_data_in_leaf=4,
    max_depth=5,
    l2_leaf_reg=50.04446925880975,
    one_hot_max_size=10,
    auto_class_weights='SqrtBalanced',
)

# Build the seasonal-vaccine classifier and fit it on the transformed seasonal training set.
final_seas_model = CatBoostClassifier(
    cat_features=categorical_features_indices,
    verbose=False,
    **seas_hyperparams,
)
final_seas_model = final_seas_model.fit(seas_train_trans, seas_labels)

In [30]:
# Rebuild the seasonal model directly from the best trial's parameters.
# The model must be fitted here: this cell rebinds `final_seas_model`, and the
# cells below call `predict_proba` on it, which requires a trained model.
params = trial.params
final_seas_model = CatBoostClassifier(cat_features=categorical_features_indices,
                                      verbose=False,
                                      **params)
final_seas_model.fit(seas_train_trans, seas_labels)

### Train Performance (ROC AUC)

In [35]:
def _scaled_copy(frame, scaler):
    """Return a copy of `frame` with its `num_cols` columns replaced by `scaler.transform` output."""
    scaled = frame.copy()
    scaled[num_cols] = scaler.transform(scaled[num_cols])
    return scaled


# One scaled copy of the training frame per target, each using that target's own scaler.
h1n1_train_data = _scaled_copy(train_test, h1n1_scaler)
seas_train_data = _scaled_copy(train_test, seas_scaler)

In [36]:
y_true = np.array(labels)

# Column index 1 of predict_proba (presumably the positive-class probability),
# kept as an (n, 1) column per target.
y_predicted_h1n1 = final_h1n1_model.predict_proba(h1n1_train_data)[:, 1].reshape(-1, 1)
y_predicted_seas = final_seas_model.predict_proba(seas_train_data)[:, 1].reshape(-1, 1)

# Side-by-side: column 0 = H1N1 model, column 1 = seasonal model.
y_predicted = np.hstack((y_predicted_h1n1, y_predicted_seas))

In [37]:
# ROC AUC of the two-column predictions against the label array built in the previous cell.
roc_auc_score(y_true, y_predicted)

0.9265800144791575

# Submission

In [38]:
# Reload the raw test features from disk and keep an untouched copy;
# `test` itself is modified in place by the preprocessing cell that follows.
test = pd.read_csv(
    '../Data/test_set_features.csv',
    index_col='respondent_id',
)
full_test = test.copy(deep=True)

In [39]:
# Preprocess the reloaded test features: impute missing values and encode the
# ordinal columns as integers. Numeric columns are detected first, before the
# ordinal columns are turned into numbers below.
num_cols = list(test.select_dtypes('number').columns)

cat_cols = [
    'race',
    'sex',
    'marital_status',
    'rent_or_own',
    'hhs_geo_region',
    'census_msa',
    'employment_industry',
    'employment_occupation'
]

ord_cols = [
    'age_group',
    'education',
    'income_poverty',
    'employment_status'
]

# Mirror the rule applied to the training features in cell In [4]: respondents
# aged 65+ with a missing employment status are treated as 'Not in Labor Force'.
# This must run before the generic 'None' imputation and the age_group mapping.
# NOTE(review): assumes the models were trained on data carrying this rule — confirm.
senior_no_status = (test['age_group'] == '65+ Years') & (test['employment_status'].isnull())
test.loc[senior_no_status, 'employment_status'] = 'Not in Labor Force'

#Impute Test
# Missing numerics become -1; missing categoricals/ordinals become the string 'None'.
test[num_cols] = test[num_cols].fillna(value=-1)
test[cat_cols + ord_cols] = test[cat_cols + ord_cols].fillna(value='None')

# Integer codes for the ordinal columns. Every map includes 'None': -1 so an
# imputed missing value never falls through Series.map and turns into NaN.
ordinal_maps = {
    'age_group': {
        'None': -1,
        '18 - 34 Years': 1,
        '35 - 44 Years': 2,
        '45 - 54 Years': 3,
        '55 - 64 Years': 4,
        '65+ Years': 5
    },
    'education': {
        '< 12 Years': 1,
        '12 Years': 2,
        'Some College': 3,
        'College Graduate': 4,
        'None': -1
    },
    'income_poverty': {
        'None': -1,
        'Below Poverty': 1,
        '<= $75,000, Above Poverty': 2,
        '> $75,000': 3
    },
    'employment_status': {
        'None': -1,
        'Unemployed': 1,
        'Employed': 2,
        'Not in Labor Force': 3
    },
}

for col, codes in ordinal_maps.items():
    test[col] = test[col].map(codes)

In [40]:
# One copy of the preprocessed test frame per target, each scaled with that target's scaler.
test_h1n1, test_seas = test.copy(), test.copy()

for _frame, _scaler in ((test_h1n1, h1n1_scaler), (test_seas, seas_scaler)):
    _frame[num_cols] = _scaler.transform(_frame[num_cols])

In [41]:
# Column index 1 of predict_proba (presumably the positive-class probability),
# selected with a list index so each result is already an (n, 1) column.
y_h1n1 = final_h1n1_model.predict_proba(test_h1n1)[:, [1]]
y_seas = final_seas_model.predict_proba(test_seas)[:, [1]]

# Column 0 = H1N1 model, column 1 = seasonal model.
y_comb = np.concatenate([y_h1n1, y_seas], axis=1)

In [42]:
# Preview the stacked test-set probabilities (column 0: H1N1 model, column 1: seasonal model).
y_comb

array([[0.28747632, 0.15994814],
       [0.11570729, 0.01653142],
       [0.41235066, 0.79593744],
       ...,
       [0.38341579, 0.13381159],
       [0.05137968, 0.34525411],
       [0.81370657, 0.58536735]])

In [43]:
# Label the two probability columns and index them like the test set.
submission_cols = ['h1n1_vaccine', 'seasonal_vaccine']
results = pd.DataFrame(y_comb, index=test.index, columns=submission_cols)

# Join onto the raw test features, then keep only the two prediction columns.
submission = pd.concat([full_test, results], axis=1)[submission_cols]

In [44]:
# Display the two-column submission frame.
submission

Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0.287476,0.159948
26708,0.115707,0.016531
26709,0.412351,0.795937
26710,0.868239,0.903654
26711,0.616027,0.497508
...,...,...
53410,0.618118,0.489700
53411,0.405331,0.300675
53412,0.383416,0.133812
53413,0.051380,0.345254


In [45]:
# Write the submission to a CSV whose name carries today's date.
# NOTE(review): the file name says "Neural Network" although final_seas_model is a
# CatBoostClassifier — confirm the intended name before relying on it.
today = datetime.today().date()
submission_path = f'../Submissions/Neural Network Submission {today}.csv'
submission.to_csv(submission_path)

In [None]:
# NOTE(review): unexecuted repeat of the display in cell In [44]; candidate for removal.
submission