# Baseline
Setting up a more robust baseline notebook, suitable for use with all of the "Big Three" (XGBoost, CatBoost, LightGBM) libraries and on either Google Colab or the local machine.

# Setup

In [1]:
# Manual run switches (these used to live in the config dict).
COLAB: bool = False          # set to True when executing on Google Colab
GPU_AVAILABLE: bool = True   # manual flag: whether a GPU is available
# Gradient-boosting libraries this notebook works with.
libraries = ["xgboost", "lightgbm", "catboost"]

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime
import random

In [3]:
# IPython setup: render matplotlib figures inline and switch off Jedi-based tab completion
%matplotlib inline
%config Completer.use_jedi = False
# notebook name exposed to Weights & Biases: "stacking_manual_<YYYYMMDD>.ipynb" (today's date);
# the wandb run name built later strips the ".ipynb" suffix from this value
os.environ['WANDB_NOTEBOOK_NAME'] = f"stacking_manual_{datetime.now().strftime('%Y%m%d')}.ipynb"

In [4]:
# handle Google Colab-specific library installation/updating
if COLAB:
    # much of the below inspired by or cribbed from the May 2021 Kaggle Tabular Playground winner, at 
    # https://colab.research.google.com/gist/academicsuspect/0aac7bd6e506f5f70295bfc9a3dc2250/tabular-may-baseline.ipynb?authuser=1#scrollTo=LJoVKJb5wN0L
    
    # Kaggle API for downloading the datasets
#     !pip install --upgrade -q kaggle

    # weights and biases
    !pip install -qqqU wandb
    
    # Optuna for parameter search
    !pip install -q optuna

    # upgrade sklearn
    !pip install --upgrade scikit-learn

#     !pip install category_encoders
    
    if 'catboost' in libraries:
        !pip install catboost
    
    if 'xgboost' in libraries:
        if gpu_available: 
            # this part is from https://github.com/rapidsai/gputreeshap/issues/24
            !pip install cmake --upgrade
            # !pip install sklearn --upgrade
            !git clone --recursive https://github.com/dmlc/xgboost
            %cd /content/xgboost
            !mkdir build
            %cd build
            !cmake .. -DUSE_CUDA=ON
            !make -j4
            %cd /content/xgboost/python-package
            !python setup.py install --use-cuda --use-nccl
            !/opt/bin/nvidia-smi
            !pip install shap
        else:
            !pip install --upgrade xgboost
    if 'lightgbm' in libraries:
        if gpu_available:
            # lighgbm gpu compatible
            !git clone --recursive https://github.com/Microsoft/LightGBM
            ! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;
        else:
            !pip install --upgrade lightgbm
        

        

Now, non-stdlib imports

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
import wandb
# NOTE: both lines below bind the same name; the lightgbm callback (imported last) wins
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
from sklearn.impute import SimpleImputer #, KNNImputer
# import timm

import seaborn as sns

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import StackingClassifier #, RandomForestClassifier
from sklearn.preprocessing import StandardScaler #, MinMaxScaler, MaxAbsScaler, RobustScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
# from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft


Now, datapath setup

In [6]:
# # This is the code for reading the train.csv and converting it to a .feather file
# df = pd.read_csv(datapath/'train.csv', index_col='id', low_memory=False)
# df.index.name = None
# df.to_feather(path='./dataset_df.feather')

In [7]:
# Pick the dataset directory for the environment the notebook is running in.
if not COLAB:
    # local machine
    datapath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/')
else:
    # Colab: mount Google Drive first, then point at the data folder inside it
    from google.colab import drive
    drive.mount('/content/drive')
    datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/sep2021/')
    


In [8]:
def seed_everything(seed):
    """Seed Python's `random`, NumPy's global RNG and the PYTHONHASHSEED env var with `seed`."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# fix the seeds once for the whole notebook
seed_everything(seed=42)

## Ex-Model Config

In [9]:
# Run-level settings for preprocessing and cross-validation.
# Model hyperparameters are deliberately NOT kept here.
exmodel_config = dict(
    scaler=StandardScaler,  # TODO: experiment with others (but imputation may be slow)
    scale_b4_impute=False,
    imputer=SimpleImputer(strategy='median', add_indicator=True),
    random_state=42,
    feature_generation=['NaN_counts', 'SummaryStats', 'NaN_OneHots'],
    cross_val_strategy=KFold,  # None for holdout, or the relevant sklearn class
    kfolds=5,  # if 1, that means just doing holdout
    test_size=0.2,
)

## Data Setup

**TODO** Write some conditional logic here to automate this step -- possibly as part of an `sklearn.pipeline.Pipeline`.

In [10]:
# if exmodel_config['scaler']:
#     scaler = exmodel_config['scaler']()
#     scaler.fit_transform()

In [11]:
# Load the preprocessed training features and the target straight into NumPy arrays
# (no intermediate DataFrame / target object is kept around).
X_source = 'X_NaNcounts_SummaryStats_imputed-Median-wIndicators-StandardScaled.feather'
X = np.array(pd.read_feather(datapath/X_source))
y = np.array(load(datapath/'y.joblib'))

# record the dataset's shape and provenance in the run config
exmodel_config.update({
    'feature_count': X.shape[1],
    'instance_count': X.shape[0],
    'feature_generator': "Summary statistics",
    'X_source': X_source,
})

In [12]:
# Preprocessed test set: remember its location in the run config, then load it.
test_set_path = str(datapath.joinpath('X_test_NaNcounts_SummaryStats_imputed-Median-wIndicators-StandardScaled.feather'))
exmodel_config['test_set'] = test_set_path
X_test = pd.read_feather(path=test_set_path)

## Weights and Biases Run Config

Below is the configuration for a Weights and Biases (`wandb`) run.

In [13]:
# Metadata attached to every Weights & Biases run started from this notebook.
wandb_config = dict(
    # notebook filename without its 6-character ".ipynb" suffix, plus the current HHMMSS time
    name=f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}",
    tags=['stacking-manual', 'attempt'],
    notes="Attempting a manual stacking baseline, using best-to-date hyperparams",
)

# Training

# Hyperparameters

In [14]:
 # optuna 20210924
best_xgboost_params = dict(
    n_estimators=4205,
    max_depth=9,
    learning_rate=0.002953166980699093,
    reg_alpha=4.496528786028185,
    reg_lambda=0.07084201124334108,
    subsample=0.611948848824097,
    min_child_weight=0.8353153853897145,
    colsample_bytree=0.8562542843394833,
    gamma=11.710075953347133,
)

# LightGBM: best as of 20210923, for valid_auc of 0.8156
best_lightgbm_params = dict(
    n_estimators=4429,
    max_depth=3,
    learning_rate=0.02952568423554658,
    reg_alpha=9.285152197932742,
    reg_lambda=7.784744078293992,
    subsample=0.8628511640697093,
    boosting_type='gbdt',
    min_child_samples=47,
    num_leaves=57,
    colsample_bytree=0.573325963741589,
)

# CatBoost: 20210921-25 search on colab (only 15 trials though)
best_catboost_params = dict(
    iterations=3302,
    depth=5,
    learning_rate=0.017183208677599107,
    random_strength=41,
    l2_leaf_reg=30,
    border_count=251,
    bagging_temperature=9.898390369028036,
    od_type='IncToDec',
)

In [25]:
def _build_model(library, params, random_state):
    """Return an unfitted classifier for `library` ('xgboost', 'lightgbm' or 'catboost')."""
    if library == 'xgboost':
        return XGBClassifier(
            booster='gbtree',
            random_state=random_state,
            n_jobs=-1,
            verbosity=1,
            objective='binary:logistic',
            **params)
    if library == 'lightgbm':
        return LGBMClassifier(
            objective='binary',
            random_state=random_state,
            n_jobs=-1,
            eval_metric='auc',
            **params)
    if library == 'catboost':
        return CatBoostClassifier(
            # honour the notebook-level GPU flag instead of always requesting a GPU
            task_type='GPU' if GPU_AVAILABLE else 'CPU',
            random_state=random_state,
            **params)
    raise ValueError(f"Unsupported library {library!r}; expected 'xgboost', 'lightgbm' or 'catboost'")


def cross_validate_model(library:str, params:dict, X, y, X_test, start_fold=0, 
                         exmodel_config=exmodel_config, wandb_config=wandb_config, 
                         random_state=42, wandb_tracked=False):
    """
    Train one model per cross-validation fold and collect out-of-fold (OOF) and test-set predictions.

    The splitter class is taken from `exmodel_config['cross_val_strategy']` and the number of
    folds from `exmodel_config['kfolds']`. Each fold model is saved under
    `datapath/models/<run name>_<library>_<kfolds>folds/`, and the OOF / test predictions
    under `datapath/preds/`.

    :param library: 'xgboost', 'lightgbm' or 'catboost'
    :param params: hyperparameters forwarded to the library's classifier
    :param X: training features as a numpy array (rows are indexed with the fold indices)
    :param y: training targets as a numpy array aligned with `X`
    :param X_test: test-set features; every fold model predicts on them
    :param start_fold: folds with a smaller index are skipped (e.g. already trained)
    :param exmodel_config: dict with the general config; it is NOT modified by this function
    :param wandb_config: dict with the 'name', 'tags' and 'notes' of the wandb run
    :param random_state: seed for the fold splitter and the models
    :param wandb_tracked: when True, the run is logged to Weights & Biases
    :return: (oof_preds, test_preds) -- `oof_preds` is a numpy array in the ORIGINAL row order
             of `X` (NaN for rows whose fold was skipped via `start_fold`), so results from
             runs with different seeds line up row by row; `test_preds` is the mean of the
             trained fold models' test-set probabilities
    :raises ValueError: for an unsupported `library` or a `start_fold` that skips every fold
    """
    n_splits = exmodel_config['kfolds']
    run_name = wandb_config['name']

    # validate up front so a bad argument cannot leave a half-started wandb run behind
    if library not in ('xgboost', 'lightgbm', 'catboost'):
        raise ValueError(f"Unsupported library {library!r}; expected 'xgboost', 'lightgbm' or 'catboost'")
    if not 0 <= start_fold < n_splits:
        raise ValueError(f"start_fold={start_fold} must be in [0, {n_splits}) so at least one fold is trained")

    if wandb_tracked:
        # log a per-run copy: mutating the shared dict leaked one run's library/params
        # into the config of every later run
        run_config = {**exmodel_config, 'library': library, f'{library}_params': str(params)}
        wandb.init(
            project="202109_Kaggle_tabular_playground",
            save_code=True,
            tags=wandb_config['tags'],
            name=run_name,
            notes=wandb_config['notes'],
            config=run_config
        )

    # prepare for k-fold cross-validation
    kfold = exmodel_config['cross_val_strategy'](n_splits=n_splits, shuffle=True, random_state=random_state)

    # setup for serialization (parents=True so a fresh data directory works too)
    model_path = Path(datapath/f"models/{run_name}_{library}_{n_splits}folds/")
    model_path.mkdir(parents=True, exist_ok=True)
    preds_path = Path(datapath/"preds")
    preds_path.mkdir(parents=True, exist_ok=True)

    # OOF predictions are written at their original row positions; appending them fold by
    # fold would return them in shuffled-fold order, which differs for every random_state
    oof_preds = np.full(X.shape[0], np.nan)

    # running sum of the fold models' test-set predictions (averaged after the loop)
    test_preds = np.zeros((X_test.shape[0]))
    folds_trained = 0

    for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
        if fold < start_fold: # skip folds that are already trained
            continue

        print(f"FOLD {fold}")
        print("---------------------------------------------------")
        X_train, X_valid = X[train_ids], X[valid_ids]
        y_train, y_valid = y[train_ids], y[valid_ids]

        model = _build_model(library, params, random_state)
        model.fit(X_train, y_train)

        y_valid_preds = model.predict_proba(X_valid)[:,1]
        oof_preds[valid_ids] = y_valid_preds

        # add the fold's predictions to the model's test-set predictions (will divide later)
        test_preds += model.predict_proba(X_test)[:,1]
        folds_trained += 1

        # give the valid AUC score, for edification
        fold_valid_auc = roc_auc_score(y_valid, y_valid_preds)
        print(f"Valid AUC for fold {fold} is {fold_valid_auc}")
        # use the `library` argument: exmodel_config has no 'library' key on untracked runs
        dump(model, Path(model_path/f"{library}_fold{fold}_model.joblib"))

    # score only the rows that actually received an OOF prediction
    scored = ~np.isnan(oof_preds)
    model_valid_auc = roc_auc_score(y[scored], oof_preds[scored])
    print(f"Valid AUC score for {library} model is {model_valid_auc}")

    # finalize test preds: average over the folds that were trained, not over all kfolds
    test_preds /= folds_trained

    # save OOF preds and test-set preds
    dump(oof_preds, preds_path/f"{run_name}_{library}_{n_splits}folds_rs{random_state}_oof_preds.joblib")
    dump(test_preds, preds_path/f"{run_name}_{library}_{n_splits}folds_rs{random_state}_test_preds.joblib")

    if wandb_tracked:
        wandb.log({'model_valid_auc': model_valid_auc,
                   'oof_preds': oof_preds,
                   'test_preds': test_preds,
                  })
        wandb.finish()
    return oof_preds, test_preds
        

# Predictions

## Level One

In [16]:
# Level-one prediction tables: one column per base model
# (OOF predictions for training the blender, test predictions for inference).
oof_lv1 = pd.DataFrame()
test_lv1 = pd.DataFrame()

# XGBoost, seed 42
oof_xgb, test_xgb = cross_validate_model(
    'xgboost', best_xgboost_params, X, y, X_test,
    exmodel_config=exmodel_config,
    wandb_config=wandb_config,
    random_state=42,
    wandb_tracked=True,
)
oof_lv1['xgboost'], test_lv1['xgboost'] = oof_xgb, test_xgb

[34m[1mwandb[0m: Currently logged in as: [33mhushifang[0m (use `wandb login --relogin` to force relogin)


FOLD 0
---------------------------------------------------








Valid AUC for fold 0 is 0.8137315974508983
FOLD 1
---------------------------------------------------








Valid AUC for fold 1 is 0.815414714437866
FOLD 2
---------------------------------------------------








Valid AUC for fold 2 is 0.8142660530547183
FOLD 3
---------------------------------------------------








Valid AUC for fold 3 is 0.814687853062167
FOLD 4
---------------------------------------------------








Valid AUC for fold 4 is 0.8152669138492216
Valid AUC score for xgboost model is 0.8146662938898428




VBox(children=(Label(value=' 0.08MB of 0.08MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
model_valid_auc,0.81467


0,1
model_valid_auc,▁


In [26]:
# CatBoost, seed 42
oof_cat1, test_cat1 = cross_validate_model(
    'catboost', best_catboost_params, X, y, X_test,
    exmodel_config=exmodel_config,
    wandb_config=wandb_config,
    random_state=42,
    wandb_tracked=True,
)
oof_lv1['catboost_1'], test_lv1['catboost_1'] = oof_cat1, test_cat1

FOLD 0
---------------------------------------------------
0:	learn: 0.6866599	total: 14.8ms	remaining: 48.9s
1:	learn: 0.6801131	total: 30.2ms	remaining: 49.9s
2:	learn: 0.6741074	total: 45.4ms	remaining: 49.9s
3:	learn: 0.6683258	total: 60.7ms	remaining: 50.1s
4:	learn: 0.6627659	total: 75ms	remaining: 49.5s
5:	learn: 0.6574141	total: 89.4ms	remaining: 49.1s
6:	learn: 0.6522663	total: 105ms	remaining: 49.6s
7:	learn: 0.6472868	total: 119ms	remaining: 49s
8:	learn: 0.6425209	total: 132ms	remaining: 48.3s
9:	learn: 0.6390949	total: 145ms	remaining: 47.7s
10:	learn: 0.6346153	total: 157ms	remaining: 47.1s
11:	learn: 0.6314154	total: 172ms	remaining: 47.1s
12:	learn: 0.6272087	total: 184ms	remaining: 46.6s
13:	learn: 0.6242215	total: 198ms	remaining: 46.5s
14:	learn: 0.6202690	total: 213ms	remaining: 46.7s
15:	learn: 0.6174843	total: 226ms	remaining: 46.5s
16:	learn: 0.6137620	total: 242ms	remaining: 46.8s
17:	learn: 0.6101871	total: 262ms	remaining: 47.8s
18:	learn: 0.6076715	total: 276



VBox(children=(Label(value=' 1.93MB of 1.93MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
model_valid_auc,0.81167


0,1
model_valid_auc,▁


In [20]:
# CatBoost again, with a different seed (1983)
oof_cat2, test_cat2 = cross_validate_model(
    'catboost', best_catboost_params, X, y, X_test,
    exmodel_config=exmodel_config,
    wandb_config=wandb_config,
    random_state=1983,
    wandb_tracked=True,
)
oof_lv1['catboost_2'], test_lv1['catboost_2'] = oof_cat2, test_cat2

FOLD 0
---------------------------------------------------
0:	learn: 0.6866714	total: 15.6ms	remaining: 51.6s
1:	learn: 0.6804339	total: 31.4ms	remaining: 51.8s
2:	learn: 0.6760872	total: 45.3ms	remaining: 49.8s
3:	learn: 0.6702243	total: 59.8ms	remaining: 49.3s
4:	learn: 0.6645851	total: 72.8ms	remaining: 48s
5:	learn: 0.6591544	total: 88.4ms	remaining: 48.6s
6:	learn: 0.6536521	total: 102ms	remaining: 48s
7:	learn: 0.6486306	total: 118ms	remaining: 48.6s
8:	learn: 0.6437007	total: 133ms	remaining: 48.7s
9:	learn: 0.6390446	total: 150ms	remaining: 49.3s
10:	learn: 0.6345676	total: 164ms	remaining: 49.2s
11:	learn: 0.6302664	total: 179ms	remaining: 49.1s
12:	learn: 0.6271784	total: 194ms	remaining: 49.1s
13:	learn: 0.6231329	total: 208ms	remaining: 48.8s
14:	learn: 0.6202535	total: 222ms	remaining: 48.7s
15:	learn: 0.6163797	total: 234ms	remaining: 48.2s
16:	learn: 0.6127255	total: 253ms	remaining: 49s
17:	learn: 0.6089917	total: 268ms	remaining: 48.9s
18:	learn: 0.6056062	total: 280ms



VBox(children=(Label(value=' 1.98MB of 1.98MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
model_valid_auc,0.81163


0,1
model_valid_auc,▁


In [27]:
# LightGBM with the second seed (1983)
oof_lgb2, test_lgb2 = cross_validate_model(
    'lightgbm', best_lightgbm_params, X, y, X_test,
    exmodel_config=exmodel_config,
    wandb_config=wandb_config,
    random_state=1983,
    wandb_tracked=True,
)
oof_lv1['lightgbm_2'], test_lv1['lightgbm_2'] = oof_lgb2, test_lgb2

FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8151092444571697
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8151047373997116
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.815672329592551
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8182009448735955
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8142021047470133
Valid AUC score for lightgbm model is 0.8156503194185875




VBox(children=(Label(value=' 2.22MB of 2.22MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
model_valid_auc,0.81565


0,1
model_valid_auc,▁


In [24]:
# LightGBM, seed 42
oof_lgb1, test_lgb1 = cross_validate_model(library='lightgbm', X=X, y=y, X_test=X_test, 
                                         wandb_config=wandb_config,
                                         random_state=42,
                                         params=best_lightgbm_params,
                                         exmodel_config=exmodel_config, 
                                         wandb_tracked=True
                                        )
oof_lv1['lightgbm_1'] = oof_lgb1
# FIX: the key was misspelled 'lightgbm)1', so the test-set table had a column name
# that did not match its OOF counterpart 'lightgbm_1'
test_lv1['lightgbm_1'] = test_lgb1

FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8146739205467073
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8163359353225333
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8152348311302269
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8156395913224025
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8165437789149925
Valid AUC score for lightgbm model is 0.8156810521798477




VBox(children=(Label(value=' 2.21MB of 2.21MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
model_valid_auc,0.81568


0,1
model_valid_auc,▁


# Stacking

## Via `sklearn.ensemble.StackingClassifier`

In [28]:
# xgboost_estimators = [(f'xgboost_fold{fold}', xgboost_models[fold]) for fold in range(5)]

In [29]:
# leaving this default for first try
# final_estimator = 

In [30]:
def stacker(estimators:dict, library:str, X=X, y=y): #, load_models:bool=False, load_path:Path=None):
    """
    Fit a `sklearn.ensemble.StackingClassifier` on a set of per-fold models and save it.

    :param estimators: dict of the form {fold:int : model}; every entry becomes a base
                       estimator named '<library>_fold<fold>' (in ascending fold order)
    :param library: name of the library the models come from, used for estimator and file names
    :param X: training features (defaults to the notebook-level `X`)
    :param y: training targets (defaults to the notebook-level `y`)
    :return: the fitted StackingClassifier
    :raises ValueError: if `estimators` is empty
    """
    if not estimators:
        raise ValueError("`estimators` must contain at least one fold model")
    # iterate over the folds actually supplied instead of assuming exactly folds 0..4
    estimators_list = [(f'{library}_fold{fold}', estimators[fold]) for fold in sorted(estimators)]
    blender = StackingClassifier(estimators=estimators_list,
                                 cv=5,
                                 stack_method='predict_proba',
                                 n_jobs=2,
                                 passthrough=False,
                                 verbose=1
                                )
    print(f"Starting fitting at {datetime.now().strftime('%Y%m%d_%H%M%S')}")
    blender.fit(X,y)
    print(f"Fitting complete at {datetime.now().strftime('%Y%m%d_%H%M%S')}")
    # FIX: `config_run` is not defined in this notebook; the run metadata lives in `wandb_config`.
    # The target directory is created first, otherwise `dump` fails on a missing folder.
    stack_dir = datapath/f"models/{wandb_config['name']}_{exmodel_config['kfolds']}folds"
    stack_dir.mkdir(parents=True, exist_ok=True)
    dump(blender, filename=stack_dir/f"{library}_stack.joblib")
    print(f"Blender model saved at {datetime.now().strftime('%Y%m%d_%H%M%S')}")
    return blender
    

In [31]:
# might encapsulate this in a new version of the above train function later
exmodel_config['ensemble'] = 'stacking'

random_state = exmodel_config['random_state'] # 42

# NOTE(review): `model_configurator` is not defined anywhere in this notebook (it presumably
# lived in a cell that has since been removed). It has to be restored -- or the
# `model_config[...]` lookups below replaced by explicit values -- before this cell can run
# on a fresh kernel.

# XGBoost: reuses `best_xgboost_params` from the Hyperparameters cell
# (an identical copy of that dict used to be re-declared here)
model_config = model_configurator('xgboost')
xgboost_model = XGBClassifier(
            tree_method=model_config['tree_method'],
            random_state=random_state,
            verbosity=model_config['verbosity'], 
            objective=model_config['objective'],
            **best_xgboost_params
        )

# LightGBM: reuses `best_lightgbm_params` from the Hyperparameters cell (same remark as above)
model_config = model_configurator('lightgbm')
lightgbm_model = LGBMClassifier(
            random_state=random_state,
            objective=model_config['objective'],
            device_type=model_config['device_type'],
            n_jobs=-1,
            **best_lightgbm_params
        )

model_config = model_configurator('catboost', gpu_available=False) # set GPU false to avoid parallel threads blocking GPU

# optuna 20210924 100 trials
# NOTE: this REPLACES the `best_catboost_params` declared in the Hyperparameters cell
best_catboost_params = {
    'iterations': 2028,
    'depth': 5,
    'learning_rate': 0.01894945292658026,
    'random_strength': 4,
    'od_wait': 1895,
    'reg_lambda': 36.31771727215521,
    'border_count': 263,
    'min_child_samples': 5,
    'leaf_estimation_iterations': 2,
    'subsample': 0.5882959573476197
}

catboost_model = CatBoostClassifier(
            task_type=model_config['task_type'],
            random_state=random_state,
            **best_catboost_params
        ) 

estimators_list = [
    ('xgboost', xgboost_model),
    ('lightgbm', lightgbm_model),
    ('catboost', catboost_model)
]

final_estimator = LogisticRegression(max_iter=1000)
exmodel_config['blender_final_estimator'] = str(final_estimator)
exmodel_config['blender-passthrough'] = False

# Start the run only AFTER the blender settings were added to the config; before, the two
# keys above were set after `wandb.init` had already been handed the dict.
# FIX: `config_run` is not defined in this notebook; the run metadata lives in `wandb_config`.
wandb.init(
        project="202109_Kaggle_tabular_playground",
        save_code=True,
        tags=wandb_config['tags'],
        name=wandb_config['name'],
        notes=wandb_config['notes'],
        config=exmodel_config)   

blender = StackingClassifier(estimators=estimators_list,
                             final_estimator=final_estimator,
                             cv=5,
                             stack_method='predict_proba',
                             n_jobs=4, # 4 is max allowable for CPU
                             passthrough=exmodel_config['blender-passthrough'],
                             verbose=1
                            )


           
    

[34m[1mwandb[0m: Currently logged in as: [33mhushifang[0m (use `wandb login --relogin` to force relogin)


In [32]:
# Log the blender's settings, read from the object itself so they cannot drift from what was
# actually configured. FIX: the key used to be misspelled 'blender-stack_mdethod'.
wandb.log({'blender-final_estimator': str(blender.final_estimator),
           'blender-stack_method': blender.stack_method,
           'blender-cv': blender.cv
          })

In [33]:
# Fit the stacked model on the full training set, printing timestamps before and after.
print(f"Starting fitting at {datetime.now().strftime('%Y%m%d_%H%M%S')}")
# On the open question below: per scikit-learn's StackingClassifier docs, an integer `cv`
# makes the classifier create the folds itself, so no manual splitting should be needed
# here -- confirm against the installed sklearn version.
blender.fit(X,y) # unsure of this -- given kwarg cv=5, is it producing the splits? Or do I have to somehow?
print(f"Fitting complete at {datetime.now().strftime('%Y%m%d_%H%M%S')}")

Starting fitting at 20210922_213434
Fitting complete at 20210923_015139


In [34]:
# Log the parameters of the first two base estimators (xgboost, lightgbm) as strings.
# CatBoost (index 2) is left out here.
wandb.log({
    key: str(blender.estimators[position][1].get_params())
    for position, key in enumerate(('xgboost_params', 'lightgbm_params'))
})

In [35]:
# Serialize the fitted blender into its own folder under models/.
# FIX: `config_run` is not defined in this notebook; the run name lives in `wandb_config`.
model_path = Path(datapath/f"models/{wandb_config['name']}/")
# parents=True so this also works when the models/ folder does not exist yet
model_path.mkdir(parents=True, exist_ok=True)
dump(blender, filename=model_path/f"{wandb_config['name']}_stack.joblib")
print(f"Blender model saved at {datetime.now().strftime('%Y%m%d_%H%M%S')}")

Blender model saved at 20210923_015144


In [36]:
# Training-set metrics of the fitted blender, computed on the same data it was fitted on.
train_preds = blender.predict_proba(X)[:, 1]
train_loss = log_loss(y_true=y, y_pred=train_preds)
train_auc = roc_auc_score(y, train_preds)
wandb.log(dict(train_loss=train_loss, train_auc=train_auc))
print(f"train_loss is {train_loss}, train_auc is {train_auc}")



train_loss is 0.49399568426265916, train_auc is 0.8414049580581542


In [37]:
# train_preds[:20]

# Test set preprocessing


(Here's where encapsulating the transformations in a pipeline would come in handy. But I'll do it manually for now.)

In [38]:
# features = [x for x in test_df.columns if x != 'claim']
# X_test = test_df[features] # this is just for naming consistency

Now, let's get the features the model was trained on and subset the test set's features accordingly

In [39]:
# generation polynomial features
# poly = PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)
# X_test_poly = poly.fit_transform(X_test)

In [40]:
# X_test_poly_names = poly.get_feature_names(X_test.columns)
# X_poly_names[100:150]
# features = pd.read_csv('X_candidates_20210827.csv').columns

In [41]:
# checks = [feature in X_test_poly_names for feature in features]
# checks

In [42]:
# X_test_final = pd.DataFrame(X_test_poly, columns=X_test_poly_names)

In [43]:
# X_test_final = X_test_final[features[1:]]
# X_test_final = X_test

In [44]:
# X_test['nan_count'] = X_test.isnull().sum(axis=1)

In [45]:
# imputer = SimpleImputer(strategy='median', add_indicator=True)
# X_test_imputed_np = imputer.fit_transform(X_test)

In [46]:
# X_test_imputed = pd.DataFrame(X_test_imputed, columns=[str(x) for x in range(X_test_imputed.shape[1])])
# X_test_imputed.to_feather(path=datapath/'X_test_NaNcounts_imputed-Median-wIndicators.feather')

In [47]:
# scaler = exmodel_config['scaler']()
# X_test_imputed_scaled_np = scaler.fit_transform(X_test_imputed)
# X_test_imputed_scaled = pd.DataFrame(X_test_imputed_scaled_np, columns=X_test_imputed.columns)
# X_test_imputed_scaled.to_feather(path=datapath/'X_test_NaNcounts_imputed-Median-wIndicators_StandardScaled.feather')
# X_scaled_df = pd.DataFrame(X_scaled, columns=X_poly_names)

In [48]:
# Record which preprocessed test file the predictions are generated from.
test_set_path = str(datapath.joinpath('X_test_NaNcounts_SummaryStats_imputed-Median-wIndicators-StandardScaled.feather'))
wandb.log(dict(test_set=test_set_path))

In [49]:
# Load the preprocessed (NaN counts + summary stats, median-imputed, standard-scaled) test set.
X_test_imputed_scaled = pd.read_feather(
    path=datapath.joinpath('X_test_NaNcounts_SummaryStats_imputed-Median-wIndicators-StandardScaled.feather')
)

## Prediction Generation

In [50]:
# Predict the positive-class probability for the test set and persist the predictions.
preds_path = Path(datapath/"preds/")
# make sure the output folder exists before writing into it
preds_path.mkdir(parents=True, exist_ok=True)

blender_preds = blender.predict_proba(X_test_imputed_scaled)[:,1]
# FIX: `config_run` is not defined in this notebook; the run name lives in `wandb_config`
dump(blender_preds, preds_path/f"{wandb_config['name']}_stack.joblib")



['/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/preds/stacking_off-shelf_20210922_213426_stack.joblib']

# Submission

In [51]:
# The sample submission file is used as the template for the submission.
sample_df = pd.read_csv(datapath.joinpath('sample_solution.csv'))

In [52]:
# overwrite the template's `claim` column with the blender's test-set probabilities
sample_df.loc[:, 'claim'] = blender_preds

In [53]:
# preview the first rows of the submission frame
sample_df.head()

Unnamed: 0,id,claim
0,957919,0.570854
1,957920,0.127431
2,957921,0.642251
3,957922,0.125008
4,957923,0.147727


In [54]:
# Folder for submission CSVs, created if it does not exist yet.
submission_path = datapath.joinpath('submissions')
submission_path.mkdir(exist_ok=True)

In [55]:
# Write the submission file, named after the wandb run.
# FIX: `config_run` is not defined in this notebook; the run name lives in `wandb_config`.
sample_df.to_csv(submission_path/f"{wandb_config['name']}_blended.csv", index=False)

In [59]:
# str(blender.estimators[2][1].get_all_params())
# blender.estimators[2][1]

<catboost.core.CatBoostClassifier at 0x7f227c7b81c0>

In [56]:
# Attach the leaderboard AUC (hard-coded value) to the run.
wandb.log(dict(leaderboard_auc=0.81725))

In [57]:
# close the Weights & Biases run that was opened for the stacking experiment
wandb.finish()

VBox(children=(Label(value=' 0.23MB of 0.23MB uploaded (0.06MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
blender-cv,5
blender-final_estimator,LogisticRegression(m...
blender-stack_mdethod,predict_proba
leaderboard_auc,0.81725
lightgbm_params,{'boosting_type': 'g...
test_set,/media/sf/easystore/...
train_auc,0.8414
train_loss,0.494
xgboost_params,{'objective': 'binar...


0,1
blender-cv,▁
leaderboard_auc,▁
train_auc,▁
train_loss,▁
