# Baseline
Setting up a more robust baseline notebook, suitable for use with all of the "Big Three" (XGBoost, CatBoost, LightGBM) libraries and on either Google Colab or the local machine.

# Setup

In [2]:
# two manual flags (ex-config)
COLAB = False
USE_GPU = True
# libraries = ['xgboost', 'lightgbm', 'catboost']
libraries = ['pytorch-widedeep']

In [3]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime
import random

In [4]:
%matplotlib inline
%config Completer.use_jedi = False
os.environ['WANDB_NOTEBOOK_NAME'] = f"stacking_manual_{datetime.now().strftime('%Y%m%d')}.ipynb"

In [5]:
# handle Google Colab-specific library installation/updating
if COLAB:
    # much of the below inspired by or cribbed from the May 2021 Kaggle Tabular Playground winner, at 
    # https://colab.research.google.com/gist/academicsuspect/0aac7bd6e506f5f70295bfc9a3dc2250/tabular-may-baseline.ipynb?authuser=1#scrollTo=LJoVKJb5wN0L
    
    # Kaggle API for downloading the datasets
#     !pip install --upgrade -q kaggle

    # weights and biases
    !pip install -qqqU wandb
    
    # Optuna for parameter search
    !pip install -q optuna

    # upgrade sklearn
    !pip install --upgrade scikit-learn

#     !pip install category_encoders
    
    if 'catboost' in libraries:
        !pip install catboost
    
    if 'xgboost' in libraries:
        if USE_GPU: 
            # this part is from https://github.com/rapidsai/gputreeshap/issues/24
            !pip install cmake --upgrade
            # !pip install sklearn --upgrade
            !git clone --recursive https://github.com/dmlc/xgboost
            %cd /content/xgboost
            !mkdir build
            %cd build
            !cmake .. -DUSE_CUDA=ON
            !make -j4
            %cd /content/xgboost/python-package
            !python setup.py install --use-cuda --use-nccl
            !/opt/bin/nvidia-smi
            !pip install shap
        else:
            !pip install --upgrade xgboost
    if 'lightgbm' in libraries:
        if USE_GPU:
            # lighgbm gpu compatible
            !git clone --recursive https://github.com/Microsoft/LightGBM
            ! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;
        else:
            !pip install --upgrade lightgbm
        

        

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
from sklearn.impute import SimpleImputer #, KNNImputer
# import timm

import seaborn as sns

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.preprocessing import StandardScaler #, MinMaxScaler, MaxAbsScaler, RobustScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
# from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft


In [29]:
from pytorch_widedeep import Trainer
from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.models import Wide, TabMlp, WideDeep
from pytorch_widedeep.metrics import Accuracy

Now, datapath setup

In [7]:
# # This is the code for reading the train.csv and converting it to a .feather file
# df = pd.read_csv(datapath/'train.csv', index_col='id', low_memory=False)
# df.index.name = None
# df.to_feather(path='./dataset_df.feather')

In [8]:
if COLAB:
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    
    # handling datapath
    datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/oct2021/')
    
else:
    # if on local machine
#     datapath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/')  
    root = Path('/home/sf/code/kaggle/tabular_playgrounds/oct2021/')
    datapath = root/'datasets'
    edapath = root/'EDA'
    modelpath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/oct2021/models/')
    predpath = root/'preds'
    subpath = root/'submissions'
    
    for pth in [root, datapath, edapath, modelpath, predpath, subpath]:
        pth.mkdir(exist_ok=True)
    


In [9]:
SEED = 42

# Function to seed everything
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(seed=SEED)

## Ex-Model Config

In [10]:
# meta-config for preprocessing and cross-validation, but NOT for model parameters
exmodel_config = {
#     "feature_selector": SelectKBest,
#     "k_best": 80,
#     "feature_selection_scoring": f_regression,
#     'random_state': SEED,
#     'feature_generation': ['NaN_counts', 'SummaryStats', 'NaN_OneHots'],
#     'subsample': 1,
    'cross_val_strategy': KFold, # None for holdout, or the relevant sklearn class
    'kfolds': 5, # if 1, that means just doing holdout
    'test_size': 0.2,
#     'features_created': False,
#     'feature_creator': None,
}

## Data Setup

**TODO** Write some conditional logic here to automate it -- possibly as part of a sklearn.*pipeline

In [11]:
# if exmodel_config['scaler']:
#     scaler = exmodel_config['scaler']()
#     scaler.fit_transform()

In [12]:
train_source = datapath/'train.feather'
df = pd.read_feather(path=train_source)
df.index.name = 'id'
y_train = df.target
features = [x for x in df.columns if x != 'target']
X_train = df[features]
# X.index.name = 'id'
# y.index.name = 'id'
X = np.array(X_train)
y = np.array(y_train)

# del df, X_train, y_train


# exmodel_config['feature_count'] = len(X.columns)
exmodel_config['feature_count'] = X.shape[1]
exmodel_config['instance_count'] = X.shape[0]

# exmodel_config['feature_generator'] = None
# exmodel_config['feature_generator'] = "Summary statistics"

exmodel_config['train_source'] = str(train_source)

In [13]:
test_source = datapath/'test.feather'
exmodel_config['test_source'] = str(test_source)
X_test = pd.read_feather(path=test_source)
# X_test = X_test.iloc[:, 1:]

In [14]:
X_test = np.array(X_test)

## Weights and Biases Run Config

Below is the configuration for a Weights and Biases (`wandb`) run.

In [16]:
# wandb config:
wandb_config = {
    'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
    'tags': ['baseline', 'deep-learning'],
    'notes': "Going to try getting pytorch-widedeep working, initially with TabMLP",
}

## Deep Learning Data Setup

Due to the importance of identifying categorical variables for deep learning on tabular data (namely, the generation of embeddings containing meaningful information about them), I'm going to try using `fastai`'s `cont_cat_split` on the original dataset (post-imputation and generation of features based on summary statistics) and then proceeding with the other transforms.

In [19]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [21]:
cardinalities = X_train.nunique(axis=0)

In [23]:
cardinalities.sort_values(ascending=False)

f144    607935
f169    606385
f226    594319
f207    582192
f125    579880
f237    578749
f168    576993
f72     573057
f142    560152
f225    558377
f217    557482
f224    557335
f227    553207
f174    542053
f166    537722
f215    535865
f161    532853
f23     531719
f191    529250
f184    513613
f122    508674
f94     504544
f198    501658
f173    500033
f114    499970
f179    497596
f138    497263
f157    492134
f192    490036
f141    489019
f124    486410
f58     482130
f209    476044
f193    465107
f158    463093
f171    461900
f214    460571
f163    453071
f2      452401
f146    451339
f182    451296
f219    449261
f154    446224
f162    445373
f143    444524
f91     439202
f188    437466
f139    436631
f84     435536
f206    431717
f8      428659
f90     426170
f6      421827
f223    420899
f200    420271
f160    419967
f153    412726
f195    412007
f123    411379
f93     410485
f234    408673
f194    403041
f4      401939
f152    399983
f34     395805
f126    390685
f39     38

So we have several features with cardinalities in the range (50,000, 100,000), and a bunch of binary indicator variables. Jeremy Howard is opposed to treating features with more than 10k values as categorical, and gets nervous over cardinalities over 5k, so I think I'll just treat the card-2 variables as wide and the others as deep (for now); I might experiment with embeddings for the 50k-100k cardinality features later.

In [14]:
# max_card_embed = 10000
# max_card_cat = 100000
# exmodel_config['max_card_for_embedding'] = max_card_embed
# exmodel_config['max_card_for_categorical'] = max_card_cat


# X_orig = X.iloc[:, :118] # excluding summary, meta-statistics
# X_meta = X.iloc[:, 118:] # including summary, meta-statistics

In [15]:
# exmodel_config['max_card_for_categorical']

100000

In [16]:
# low_card_features = [f for f in X.columns if X[f].nunique() <= 50000]
# high_card_features = [f for f in X.columns if X[f].nunique() > 50000]

In [17]:
# len(low_card_features)

129

In [18]:
# len(high_card_features)

117

# WideDeep

## (Example)

In [19]:
# df = pd.read_csv(datapath/"adult.csv.zip")
# df["income_label"] = (df["income"].apply(lambda x: ">50K" in x)).astype(int)
# df.drop("income", axis=1, inplace=True)
# df_train, df_test = train_test_split(df, test_size=0.2, stratify=df.income_label)

In [20]:
# for f in df.columns:
#     print(f"{f}: {df[f].nunique()}")
#     print(f"NaNs: {df[f].isna().sum()}\n")

In [25]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Columns: 285 entries, f0 to f284
dtypes: float64(240), int64(45)
memory usage: 2.1 GB


In [26]:
# wide_cols_pre = [f for f in X.columns if X[f].nunique() <= max_card_cat and X[f].nunique() > 2]
# wide_cols_onehot = [f for f in X.columns if X[f].nunique() == 2]
wide_cols = [f for f in X_train.columns if X_train[f].nunique() == 2]
cont_cols = [f for f in X_train.columns if X_train[f].nunique() > 2]
# embed_cols = [f for f in X.columns if X[f].nunique() <= max_card_embed and X[f].nunique() > 2]

In [72]:
X_train_np = np.array(X_train)

In [None]:
wide_cols_np = np.where(X_train_np)

In [27]:
len(wide_cols), len(cont_cols)#, len(embed_cols)

(45, 240)

In [67]:
wide_cols

[16,
 21,
 37,
 42,
 45,
 47,
 53,
 58,
 68,
 70,
 75,
 80,
 96,
 98,
 110,
 114,
 118,
 119,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185,
 186,
 187,
 188,
 189,
 190,
 191,
 192,
 193,
 194,
 195,
 196,
 197,
 198,
 199,
 200,
 201,
 202,
 203,
 204,
 205,
 206,
 207,
 208,
 209,
 210,
 211,
 212,
 213,
 214,
 215,
 216,
 217,
 218,
 219,
 220,
 221,
 222,
 223,
 224,
 225,
 226,
 227,
 228,
 229,
 230,
 231,
 232,
 233,
 234,
 235,
 236,
 237,
 238,
 239,
 240,
 241,
 242,
 243,
 244,
 245]

In [70]:
# X_np = np.array(X)

In [76]:
# del X_np

In [30]:
wide_preprocessor = WidePreprocessor(wide_cols=wide_cols)
X_wide_pre = wide_preprocessor.fit_transform(X_train)

  and should_run_async(code)


In [31]:
X_wide_pre.shape

(1000000, 45)

In [30]:
# X_wide_pre[:10,:]

array([[     1,  77995, 161433, ..., 794314, 794316, 794318],
       [     2,  77996, 161434, ..., 794314, 794316, 794318],
       [     3,  77997, 161435, ..., 794314, 794316, 794318],
       ...,
       [     8,  78002, 161440, ..., 794314, 794316, 794318],
       [     9,  78003, 161441, ..., 794314, 794316, 794318],
       [    10,  78004, 161442, ..., 794314, 794316, 794318]])

In [28]:
# X_wide_pre_df = pd.DataFrame(X_wide_pre)

In [29]:
# X_wide_pre_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,1,77995,161433,199700,281240,365284,452743,460202,536580,550629,581609,660306,693792,694221,736126,780964,793934,793949
1,2,77996,161434,199701,281241,365285,452744,460203,536581,550630,581610,660307,693793,694222,736127,780965,793935,793950
2,3,77997,161435,199702,281242,365286,452745,460204,536582,550631,581611,660308,693794,694223,736128,780966,793936,793951
3,4,77998,161436,199703,281243,365287,452746,460205,536583,550632,581612,660309,693795,694224,736129,780967,793937,793952
4,5,77999,161437,199704,281244,365288,452747,460206,536584,550633,581613,660310,693794,694225,736130,780968,793938,793953


In [30]:
# X.loc[:, wide_cols_onehot].head()

  and should_run_async(code)


Unnamed: 0_level_0,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1
0,-0.127178,-0.126936,-0.128208,-0.128498,-0.127846,-0.128334,-0.128263,-0.127711,-0.127186,-0.127076,-0.12793,-0.128637,-0.128095,-0.127072,-0.128284,-0.12801,-0.127939,-0.127508,-0.128137,-0.128057,-0.128052,-0.127309,-0.127639,-0.128792,-0.128271,-0.127648,-0.12801,-0.127254,-0.127888,-0.127791,-0.128992,-0.128368,-0.128212,-0.127182,-0.127546,-0.127669,-0.127453,-0.127968,-0.128494,-0.12769,-0.127808,-0.127892,-0.128057,-0.12809,-0.128174,-0.128804,-0.128343,-0.128019,-0.12774,-0.128506,7.814957,-0.127466,-0.128086,-0.12793,-0.127918,-0.128107,-0.128553,-0.128057,-0.127825,-0.128498,-0.127956,-0.128322,-0.127867,-0.128574,-0.127884,-0.127884,-0.128187,-0.128745,-0.128464,-0.127242,-0.12817,-0.127059,-0.128401,-0.12854,-0.128061,-0.128536,-0.127195,-0.128031,-0.127732,-0.127487,-0.127597,-0.128183,-0.128779,-0.127762,-0.128031,-0.128343,-0.12747,-0.128448,-0.128015,-0.12817,-0.128275,-0.128212,-0.128065,-0.127884,-0.128662,-0.127339,-0.127254,-0.127352,-0.127968,-0.128355,-0.12761,-0.126843,-0.128666,-0.12697,-0.127766,-0.128431,-0.127757,-0.127572,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
1,-0.127178,-0.126936,-0.128208,-0.128498,-0.127846,-0.128334,-0.128263,-0.127711,-0.127186,-0.127076,-0.12793,-0.128637,-0.128095,-0.127072,-0.128284,-0.12801,-0.127939,-0.127508,-0.128137,-0.128057,-0.128052,-0.127309,-0.127639,-0.128792,-0.128271,-0.127648,-0.12801,-0.127254,-0.127888,-0.127791,-0.128992,-0.128368,-0.128212,-0.127182,-0.127546,-0.127669,-0.127453,-0.127968,-0.128494,-0.12769,-0.127808,-0.127892,-0.128057,-0.12809,-0.128174,-0.128804,-0.128343,-0.128019,-0.12774,-0.128506,-0.12796,-0.127466,-0.128086,-0.12793,-0.127918,-0.128107,-0.128553,-0.128057,-0.127825,-0.128498,-0.127956,-0.128322,-0.127867,-0.128574,-0.127884,-0.127884,-0.128187,-0.128745,-0.128464,-0.127242,-0.12817,-0.127059,-0.128401,-0.12854,-0.128061,-0.128536,-0.127195,-0.128031,-0.127732,-0.127487,-0.127597,-0.128183,-0.128779,-0.127762,-0.128031,-0.128343,-0.12747,-0.128448,-0.128015,-0.12817,-0.128275,-0.128212,-0.128065,-0.127884,-0.128662,-0.127339,-0.127254,-0.127352,-0.127968,-0.128355,-0.12761,-0.126843,-0.128666,-0.12697,-0.127766,-0.128431,-0.127757,-0.127572,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
2,-0.127178,-0.126936,-0.128208,-0.128498,-0.127846,-0.128334,-0.128263,-0.127711,-0.127186,-0.127076,-0.12793,-0.128637,-0.128095,-0.127072,7.795214,-0.12801,-0.127939,-0.127508,-0.128137,-0.128057,-0.128052,-0.127309,-0.127639,-0.128792,-0.128271,-0.127648,-0.12801,-0.127254,-0.127888,-0.127791,-0.128992,-0.128368,-0.128212,-0.127182,-0.127546,-0.127669,7.846019,-0.127968,-0.128494,-0.12769,-0.127808,-0.127892,-0.128057,-0.12809,-0.128174,-0.128804,-0.128343,-0.128019,-0.12774,-0.128506,-0.12796,-0.127466,-0.128086,-0.12793,-0.127918,-0.128107,-0.128553,-0.128057,-0.127825,-0.128498,-0.127956,-0.128322,-0.127867,-0.128574,-0.127884,-0.127884,-0.128187,-0.128745,7.784252,-0.127242,-0.12817,-0.127059,-0.128401,-0.12854,-0.128061,-0.128536,-0.127195,-0.128031,-0.127732,-0.127487,-0.127597,-0.128183,-0.128779,-0.127762,-0.128031,-0.128343,-0.12747,-0.128448,-0.128015,7.80212,-0.128275,-0.128212,-0.128065,-0.127884,-0.128662,-0.127339,7.858285,-0.127352,-0.127968,-0.128355,-0.12761,-0.126843,-0.128666,-0.12697,-0.127766,-0.128431,-0.127757,-0.127572,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
3,-0.127178,-0.126936,-0.128208,-0.128498,-0.127846,-0.128334,-0.128263,-0.127711,-0.127186,-0.127076,-0.12793,-0.128637,-0.128095,-0.127072,-0.128284,-0.12801,-0.127939,-0.127508,-0.128137,-0.128057,-0.128052,-0.127309,-0.127639,-0.128792,-0.128271,-0.127648,-0.12801,-0.127254,-0.127888,-0.127791,-0.128992,-0.128368,7.79956,-0.127182,-0.127546,-0.127669,-0.127453,-0.127968,-0.128494,-0.12769,-0.127808,-0.127892,-0.128057,-0.12809,-0.128174,-0.128804,-0.128343,-0.128019,-0.12774,-0.128506,7.814957,-0.127466,-0.128086,-0.12793,-0.127918,-0.128107,-0.128553,-0.128057,-0.127825,-0.128498,-0.127956,-0.128322,-0.127867,-0.128574,-0.127884,-0.127884,-0.128187,-0.128745,-0.128464,-0.127242,-0.12817,-0.127059,-0.128401,-0.12854,-0.128061,-0.128536,-0.127195,-0.128031,-0.127732,-0.127487,-0.127597,-0.128183,-0.128779,-0.127762,-0.128031,-0.128343,-0.12747,-0.128448,-0.128015,-0.12817,-0.128275,-0.128212,-0.128065,-0.127884,-0.128662,-0.127339,-0.127254,-0.127352,-0.127968,-0.128355,-0.12761,-0.126843,-0.128666,-0.12697,-0.127766,-0.128431,-0.127757,-0.127572,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
4,-0.127178,-0.126936,-0.128208,-0.128498,-0.127846,-0.128334,-0.128263,-0.127711,-0.127186,-0.127076,-0.12793,-0.128637,-0.128095,-0.127072,7.795214,-0.12801,-0.127939,-0.127508,7.80417,7.809044,-0.128052,-0.127309,-0.127639,-0.128792,-0.128271,-0.127648,-0.12801,-0.127254,-0.127888,-0.127791,-0.128992,-0.128368,-0.128212,-0.127182,-0.127546,-0.127669,-0.127453,-0.127968,-0.128494,-0.12769,-0.127808,-0.127892,-0.128057,7.806991,-0.128174,-0.128804,-0.128343,-0.128019,-0.12774,-0.128506,-0.12796,-0.127466,-0.128086,-0.12793,-0.127918,-0.128107,-0.128553,-0.128057,-0.127825,-0.128498,-0.127956,-0.128322,-0.127867,-0.128574,-0.127884,-0.127884,7.801096,-0.128745,-0.128464,-0.127242,-0.12817,-0.127059,-0.128401,-0.12854,-0.128061,-0.128536,-0.127195,-0.128031,7.82889,-0.127487,-0.127597,-0.128183,-0.128779,-0.127762,-0.128031,-0.128343,-0.12747,-0.128448,-0.128015,-0.12817,-0.128275,-0.128212,-0.128065,-0.127884,-0.128662,-0.127339,7.858285,-0.127352,-0.127968,-0.128355,-0.12761,-0.126843,-0.128666,-0.12697,-0.127766,-0.128431,-0.127757,-0.127572,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,7.821398,-0.12703


In [31]:
# X_wide = X_wide_pre_df.join(X.loc[:,wide_cols_onehot])

  and should_run_async(code)


In [None]:
# X_wide.info()

In [32]:
X_wide = X_wide_pre

In [33]:
wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=1)

In [34]:
tab_preprocessor = TabPreprocessor(continuous_cols=cont_cols)#, embed_cols=embed_cols, )

In [35]:
# X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 957919 entries, 0 to 957918
Columns: 246 entries, 0 to 245
dtypes: float64(246)
memory usage: 1.8 GB


In [35]:
X_tab = tab_preprocessor.fit_transform(X_train)

In [36]:
X_tab.shape

  and should_run_async(code)


(1000000, 240)

In [38]:
deeptabular = TabMlp(
    mlp_hidden_dims=[64,32],
    column_idx=tab_preprocessor.column_idx,
#     embed_input=tab_preprocessor.embeddings_input,
    continuous_cols=cont_cols,
)

In [39]:
model = WideDeep(wide=wide, deeptabular=deeptabular)

In [42]:
# X_wide = np.array(X_wide)

In [40]:
X_wide.shape

(1000000, 45)

In [41]:
X_tab.shape

(1000000, 240)

<!-- 39774, 758737, 552968 -->

In [42]:
trainer = Trainer(model, objective='binary', metrics=[Accuracy], seed=42, )

In [43]:
y = np.array(y)

In [44]:
trainer.fit(
    X_wide=X_wide,
    X_tab=X_tab,
    target=y,
    n_epochs=30,
    batch_size=1024,
    val_split=0.2,
)

epoch 1: 100%|██████████| 782/782 [00:10<00:00, 76.13it/s, loss=0.697, metrics={'acc': 0.6645}]
valid: 100%|██████████| 196/196 [00:02<00:00, 80.95it/s, loss=0.515, metrics={'acc': 0.7484}] 
epoch 2: 100%|██████████| 782/782 [00:09<00:00, 81.18it/s, loss=0.501, metrics={'acc': 0.7569}]
valid: 100%|██████████| 196/196 [00:02<00:00, 79.99it/s, loss=0.487, metrics={'acc': 0.7639}] 
epoch 3: 100%|██████████| 782/782 [00:09<00:00, 78.82it/s, loss=0.492, metrics={'acc': 0.7614}]
valid: 100%|██████████| 196/196 [00:02<00:00, 78.82it/s, loss=0.485, metrics={'acc': 0.7639}] 
epoch 4: 100%|██████████| 782/782 [00:10<00:00, 77.82it/s, loss=0.49, metrics={'acc': 0.7621}] 
valid: 100%|██████████| 196/196 [00:01<00:00, 101.71it/s, loss=0.484, metrics={'acc': 0.7649}]
epoch 5: 100%|██████████| 782/782 [00:09<00:00, 81.42it/s, loss=0.488, metrics={'acc': 0.7628}]
valid: 100%|██████████| 196/196 [00:01<00:00, 101.85it/s, loss=0.484, metrics={'acc': 0.765}] 
epoch 6: 100%|██████████| 782/782 [00:09<00:0

In [49]:
X_test = pd.read_feather(datapath/test_source)

  and should_run_async(code)


In [51]:
# X_test = X_test.to_numpy()
X_wide_te = wide_preprocessor.fit_transform(X_test)
X_tab_te = tab_preprocessor.fit_transform(X_test)

  and should_run_async(code)


In [52]:
X_wide_te[:10]

  and should_run_async(code)


array([[ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
        33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
        65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89],
       [ 2,  4,  5,  7, 10, 12, 13, 15, 17, 19, 21, 24, 26, 27, 30, 31,
        34, 35, 37, 40, 41, 43, 45, 47, 50, 52, 54, 56, 57, 60, 61, 63,
        65, 67, 70, 72, 73, 75, 77, 79, 81, 84, 86, 88, 89],
       [ 1,  4,  5,  8,  9, 12, 13, 15, 17, 20, 22, 23, 26, 27, 29, 32,
        34, 35, 38, 39, 41, 43, 45, 47, 49, 51, 54, 56, 57, 60, 62, 63,
        66, 68, 70, 72, 73, 75, 77, 79, 81, 83, 86, 88, 89],
       [ 1,  4,  5,  7, 10, 11, 13, 15, 17, 20, 22, 23, 26, 27, 29, 31,
        34, 36, 38, 40, 41, 43, 45, 47, 49, 52, 53, 56, 57, 59, 61, 64,
        66, 67, 70, 72, 73, 75, 77, 79, 82, 83, 86, 88, 89],
       [ 2,  4,  5,  8, 10, 12, 13, 16, 17, 19, 22, 24, 26, 27, 29, 32,
        33, 35, 38, 40, 41, 43, 45, 47, 50, 52, 53, 56, 58, 59, 62, 63,
        65, 68, 69, 72, 73, 75, 77, 

In [53]:
X_tab_te[:10]

array([[-0.62606923, -0.18969476, -0.92787077, ..., -1.60943639,
         0.22706685, -1.07477182],
       [-0.57253077,  0.2098222 , -0.82945475, ..., -0.27047229,
        -0.8826086 , -0.40752455],
       [-0.95243505, -0.03722716,  1.08705115, ..., -0.24144445,
         0.5487114 , -0.00957377],
       ...,
       [-0.08712007,  1.51269865,  0.38352908, ..., -0.24936747,
        -1.37592574, -1.2029806 ],
       [-0.1412585 ,  1.40645559,  2.19939253, ..., -1.62665483,
         2.64911658,  1.50194527],
       [-0.52762128,  2.71380286, -0.24626641, ..., -0.26179802,
         0.12574403, -0.6659266 ]])

In [54]:
X_wide_te.shape

(500000, 45)

In [55]:
X_tab_te.shape

(500000, 240)

In [75]:
# X_test = pd.read_feather(datapath/'X_test_NaNcounts_SummaryStats_imputed-Median-wIndicators-StandardScaled.feather')

# low_card_features = [f for f in X_test.columns if X_test[f].nunique() <= 50000]
# high_card_features = [f for f in X_test.columns if X_test[f].nunique() > 50000]

# wide_cols_pre = [f for f in X_test.columns if X_test[f].nunique() <= max_card_cat and X_test[f].nunique() > 2]
# wide_cols_onehot = [f for f in X_test.columns if X_test[f].nunique() == 2]
# cont_cols = high_card_features
# embed_cols = [f for f in X_test.columns if X_test[f].nunique() <= max_card_embed and X_test[f].nunique() > 2]

In [78]:
# # X_test = X_test.to_numpy()
# X_wide_te = wide_preprocessor.transform(X_test)
# X_tab_te = tab_preprocessor.transform(X_test)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
# X_wide_te = wide_preprocessor

In [None]:
# preds = trainer.predict(X_wide=X_wide_te, X_tab=X_tab_te)

In [None]:
# preds[:20]

In [56]:
preds_proba = trainer.predict_proba(X_wide=X_wide_te, X_tab=X_tab_te, batch_size=1024)

predict: 100%|██████████| 489/489 [00:02<00:00, 176.22it/s]


In [71]:
preds_proba_train = trainer.predict_proba(X_wide=X_wide, X_tab=X_tab)

predict: 100%|██████████| 977/977 [00:04<00:00, 212.77it/s]


In [69]:
roc_auc_score(y_score=preds_proba_train[:,1], y_true=y)

  and should_run_async(code)


0.8562719341662627

In [59]:
preds_proba_train[:20]

array([[0.44021851, 0.55978149],
       [0.72445405, 0.27554595],
       [0.13637155, 0.86362845],
       [0.45089591, 0.54910409],
       [0.10132414, 0.89867586],
       [0.45762271, 0.54237729],
       [0.13202792, 0.86797208],
       [0.03556085, 0.96443915],
       [0.20678174, 0.79321826],
       [0.13734156, 0.86265844],
       [0.90057671, 0.09942328],
       [0.34125483, 0.65874517],
       [0.18667585, 0.81332415],
       [0.31571954, 0.68428046],
       [0.69393456, 0.30606544],
       [0.84645522, 0.15354478],
       [0.97542024, 0.02457974],
       [0.92568088, 0.07431912],
       [0.04913533, 0.95086467],
       [0.84242821, 0.15757181]])

In [60]:
preds_proba[:20]

array([[0.82947814, 0.17052183],
       [0.15631253, 0.84368747],
       [0.670138  , 0.329862  ],
       [0.78820115, 0.21179883],
       [0.27101159, 0.72898841],
       [0.3710376 , 0.6289624 ],
       [0.57676411, 0.42323589],
       [0.14067805, 0.85932195],
       [0.75468326, 0.24531674],
       [0.66803461, 0.33196539],
       [0.76909131, 0.23090869],
       [0.07448471, 0.92551529],
       [0.20164472, 0.79835528],
       [0.14791131, 0.85208869],
       [0.17014694, 0.82985306],
       [0.34611297, 0.65388703],
       [0.16073269, 0.83926731],
       [0.12745643, 0.87254357],
       [0.35510087, 0.64489913],
       [0.75042468, 0.24957532]])

In [61]:
dump(preds_proba, predpath/'widedeep_30epochs_bs1024_64x32tabmlp_20211008_test_preds.joblib')

['/home/sf/code/kaggle/tabular_playgrounds/oct2021/preds/widedeep_30epochs_bs1024_64x32tabmlp_20211008_test_preds.joblib']

In [63]:
sample_df = pd.read_csv(datapath/'sample_submission.csv.zip')

In [64]:
sample_df.loc[:, 'target'] = preds_proba[:,1]

In [65]:
sample_df.head()

Unnamed: 0,id,target
0,1000000,0.170522
1,1000001,0.843687
2,1000002,0.329862
3,1000003,0.211799
4,1000004,0.728988


In [None]:
# submission_path = datapath/'submissions'
# submission_path.mkdir(exist_ok=True)

In [66]:
sample_df.to_csv(subpath/f"{wandb_config['name']}_widedeep_30epochs_bs1024_64x32tabmlp_20211008_test_preds.csv", index=False)

  and should_run_async(code)


## Weights and Biases Run Config

Below is the configuration for a Weights and Biases (`wandb`) run.

In [15]:
# wandb config:
config_run = {
    'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
    'tags': ['stacking-sklearn', 'attempt'],
    'notes': "Trying a fastai tabular MLP model, for ensembling with the GBMs",
}

# Training

In [None]:
learner.LR

In [23]:
def train(X_train, X_valid, y_train, y_valid, model_config, 
                                              random_state=42,
                                              exmodel_config=exmodel_config, 
                                              config_run=config_run):#, scaler): # passed in via config dict for now
    """
    Basic training function. Note that some of the options passed via the argument are
    in fact hard-coded in, to avoid inconveniences.
    :param X_train: the training set features
    :param X_valid: the validation set features
    :param y_train: the training set targets
    :param y_valid: the validation set targets
    :param random_staKFold: for reproducibility
    :param exmodel_config: dict containing configuration details including the library 
                            (thus model) used, preprocessing, and cross-validation
    :param model_config: dict containing hyperparameter specifications for the model
    :param config_run: dict containing wandb run configuration (name, etc)
    """
    
    # As of 20210920, best CatBoost config is:
    best_20210920_catboost_params = {
        'iterations': 3493,
        'depth': 5,
        'learning_rate': 0.09397459954141321,
        'random_strength': 43,
        'l2_leaf_reg': 26,
        'border_count': 239,
        'bagging_temperature': 12.532400413798356,
        'od_type': 'Iter'
    }
    
    # catboost 20210921 on colab (only 15 trials though)
    best_catboost_params = {
        'iterations': 3302,
        'depth': 5,
        'learning_rate': 0.017183208677599107,
        'random_strength': 41,
        'l2_leaf_reg': 30,
        'border_count': 251,
        'bagging_temperature': 9.898390369028036, 
        'od_type': 'IncToDec'
    }
    
    # optuna 20210921
    best_xgboost_params = {
        'n_estimators': 1119,
        'max_depth': 6,
        'learning_rate': 0.04123392555159452,
        'reg_alpha': 4.511876752318655,
        'reg_lambda': 4.074347238862406,
        'subsample': 0.8408586950521992
    }
    
    wandb.init(
        project="202109_Kaggle_tabular_playground",
        save_code=True,
        tags=config_run['tags'],
        name=config_run['name'],
        notes=config_run['notes'],
        config=exmodel_config)   
        
    if exmodel_config['library'] == 'xgboost':
        model = XGBClassifier(
            tree_method=model_config['tree_method'],
            random_state=random_state,
            n_jobs=model_config['n_jobs'], 
            verbosity=model_config['verbosity'], 
            objective=model_config['objective'],
            **best_xgboost_params
            # #             eval_metric=model_config['eval_metric'],

            # comment out the below for a fairly default model
#             booster=model_config['booster'],
#             max_depth=model_config['max_depth'],
#             learning_rate=model_config['learning_rate'], 
#             subsample=model_config['subsample'],
#             reg_alpha=model_config['reg_alpha'],
#             reg_lambda=model_config['reg_lambda'],
#             n_estimators=model_config['n_estimators'],
        )
        
        model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()],
#                                     eval_metric=model_config['eval_metric'],
                 )


    elif exmodel_config['library'] == 'lightgbm':
        model = LGBMClassifier(
#             boosting_type=model_config['boosting_type'],
#             max_depth=model_config['max_depth']
            # TODO
            random_state=random_state,
            n_jobs=model_config['n_jobs'],
            objective=model_config['objective'],
#             eval_metric=model_config['eval_metric'],
            boosting_type=model_config['boosting_type'],
            device_type=model_config['device_type'],
            
            # comment out the below for a basically default model
            n_estimators=model_config['n_estimators'],
            learning_rate=model_config['learning_rate'],
            max_depth=model_config['max_depth'],
            reg_alpha=model_config['reg_alpha'],
            reg_lambda=model_config['reg_lambda'],
            subsample=model_config['subsample'],
        )
        
        model.fit(X_train, y_train, callbacks=[wandb.lightgbm.wandb_callback()],
#                                     eval_metric=model_config['eval_metric'],
                 )
        
    elif exmodel_config['library'] == 'catboost':
        print("CatBoost, therefore no WandB callback.")
        model = CatBoostClassifier(
#             n_estimators=config['n_estimators'],
#             learning_rate=config['learning_rate'],
#             max_depth=config['max_depth'],
            task_type=model_config['task_type'],
    #         n_jobs=config['n_jobs'],
    #         verbosity=config['verbosity'],
    #         subsample=config['subsample'],
#             n_estimators=model_config['n_estimators'],
            random_state=random_state,
            # objective='Logloss', # default, accepts only one
#             custom_metrics=model_config['custom_metrics'],
    #         bootstrap_type=config['bootstrap_type'],
    #         device:config['device']
            **best_catboost_params
        ) 
        model.fit(X_train, y_train)
        
#     y_train_pred = model.predict(X_train)
    y_train_pred = model.predict_proba(X_train)[:,1]

    train_loss = log_loss(y_train, y_train_pred)
    train_auc = roc_auc_score(y_train, y_train_pred)
    wandb.log({'train_loss': train_loss, 'train_auc': train_auc})

    if exmodel_config['library'] == 'catboost':
        print(model.get_all_params())
        wandb.log(model.get_all_params())
    else:
        wandb.log(model.get_params()) # logging model parameters, trying bare-invocation rather than params: model.get_params()
    
    # trying with predict_proba
    y_pred = model.predict_proba(X_valid)[:,1]
#     y_pred = model.predict(X_valid)

    valid_loss = log_loss(y_valid, y_pred)
    valid_auc = roc_auc_score(y_valid, y_pred)
    wandb.log({'valid_loss':valid_loss, 'valid_auc':valid_auc})
    print(f"Valid log-loss is {valid_loss}\nValid AUC is {valid_auc}")   
#     wandb.finish()   
    return model
    

In [24]:
def cross_validation(model_config, X=X, y=y, start_fold=0, exmodel_config=exmodel_config, random_state=42):
    """
    Function to handle model training process in the context of cross-validation -- via hold-out or via k-fold.
    If exmodel_config['cross_val_strategy'] == None, then any kfolds= input is ignored; otherwise, the number specified is used.
    
    :param kfolds: int specifying number of k-folds to use in cross-validation
    :param exmodel_config: dict containing general config including for cross-validation -- `kfold=1` implies hold-out
    """
    if exmodel_config['kfolds'] == 1:
        print("Proceeding with holdout")
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                      test_size=exmodel_config['test_size'], 
                                                      random_state=random_state,
                                                     )
        model = train(X_train, X_valid, y_train, y_valid, exmodel_config=exmodel_config, 
                                                    model_config=model_config,
                                                    config_run=config_run)
        wandb.finish()
        
    else:
        X, y = X.to_numpy(), y.to_numpy()
        kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=True, random_state=random_state)
        models = {}
        model_path = Path(datapath/f"models/{config_run['name']}_{exmodel_config['kfolds']}folds/")
        (model_path).mkdir(exist_ok=True)
        for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
            if fold < start_fold:
                continue
            else:
                print(f"FOLD {fold}")
                print("---------------------------------------------------")
                X_train, X_valid = X[train_ids], X[valid_ids]
                y_train, y_valid = y[train_ids], y[valid_ids]
                model = train(X_train, X_valid, y_train, y_valid, exmodel_config=exmodel_config, 
                                                    model_config=model_config,
                                                    config_run=config_run)
                wandb.log({'fold': fold})
                models[fold] = model
                dump(model, Path(model_path/f"{exmodel_config['library']}_fold{fold}_model.joblib"))
                wandb.finish()
        return models
        

# Interface

## Runs

In [25]:
# library = 'xgboost'
# exmodel_config['library'] = library
# model_config = model_configurator(library)
# xgboost_models = cross_validation(model_config)

In [26]:
# for scaler in [StandardScaler, MaxAbsScaler, MinMaxScaler, RobustScaler]:
#     exmodel_config['scaler'] = scaler
#     scaler = scaler()
#     X_scaled = scaler.fit_transform(X)
#     X = pd.DataFrame(X_scaled, columns=X.columns)
#     exmodel_config['library'] = 'lightgbm'
#     model_config = model_configurator('lightgbm')
#     cross_validation(model_config)

In [27]:
# library = 'lightgbm'
# exmodel_config['library'] = library
# model_config = model_configurator(library)
# lightgbm_models = cross_validation(model_config)

# Stacking

## Via `sklearn.ensemble.StackingClassifier`

In [28]:
# xgboost_estimators = [(f'xgboost_fold{fold}', xgboost_models[fold]) for fold in range(5)]

In [29]:
# leaving this default for first try
# final_estimator = 

In [30]:
def stacker(estimators:dict, library:str, X=X, y=y): #, load_models:bool=False, load_path:Path=None):
    """
    A wrapper that will take a dict of the form {fold:int : model} and a string representing the library (for file-naming), 
    then run `sklearn.ensemble.StackingClassifier` with it, and save the stacked model afterward
    """
    estimators_list = [(f'{library}_fold{fold}', estimators[fold]) for fold in range(5)]
    blender = StackingClassifier(estimators=estimators_list,
                                 cv=5,
                                 stack_method='predict_proba',
                                 n_jobs=2,
                                 passthrough=False,
                                 verbose=1
                                )
    print(f"Starting fitting at {datetime.now().strftime('%Y%m%d_%H%M%S')}")
    blender.fit(X,y)
    print(f"Fitting complete at {datetime.now().strftime('%Y%m%d_%H%M%S')}")
    dump(blender, filename=datapath/f"models/{config_run['name']}_{exmodel_config['kfolds']}folds/{library}_stack.joblib")
    print(f"Blender model saved at {datetime.now().strftime('%Y%m%d_%H%M%S')}")
    return blender
    

In [31]:
# might encapsulate this in a new version of the above train function later
exmodel_config['ensemble'] = 'stacking'

wandb.init(
        project="202109_Kaggle_tabular_playground",
        save_code=True,
        tags=config_run['tags'],
        name=config_run['name'],
        notes=config_run['notes'],
        config=exmodel_config)   

random_state = exmodel_config['random_state'] # 42


# # optuna 20210921
# best_xgboost_params = {
#     'n_estimators': 1119,
#     'max_depth': 6,
#     'learning_rate': 0.04123392555159452,
#     'reg_alpha': 4.511876752318655,
#     'reg_lambda': 4.074347238862406,
#     'subsample': 0.8408586950521992
# }

# model_config = model_configurator('xgboost')
# xgboost_model = XGBClassifier(
#             tree_method=model_config['tree_method'],
#             random_state=random_state,
# #             n_jobs=model_config['n_jobs'], 
#             verbosity=model_config['verbosity'], 
#             objective=model_config['objective'],
#             **best_xgboost_params
#             # #             eval_metric=model_config['eval_metric'],

#             # comment out the below for a fairly default model
# #             booster=model_config['booster'],
# #             max_depth=model_config['max_depth'],
# #             learning_rate=model_config['learning_rate'], 
# #             subsample=model_config['subsample'],
# #             reg_alpha=model_config['reg_alpha'],
# #             reg_lambda=model_config['reg_lambda'],
# #             n_estimators=model_config['n_estimators'],
#         )

# model_config = model_configurator('lightgbm')
# lightgbm_model = LGBMClassifier(
#             random_state=random_state,
# #             n_jobs=model_config['n_jobs'],
#             objective=model_config['objective'],
#             boosting_type=model_config['boosting_type'],
#             device_type=model_config['device_type'],
            
#             # comment out the below for a basically default model
#             n_estimators=model_config['n_estimators'],
#             learning_rate=model_config['learning_rate'],
#             max_depth=model_config['max_depth'],
#             reg_alpha=model_config['reg_alpha'],
#             reg_lambda=model_config['reg_lambda'],
#             subsample=model_config['subsample'],
#         )

model_config = model_configurator('catboost', gpu_available=False) # set GPU false to avoid parallel threads blocking GPU

# # As of 20210920, best CatBoost config is:
# best_20210920_catboost_params = {
#     'iterations': 3493,
#     'depth': 5,
#     'learning_rate': 0.09397459954141321,
#     'random_strength': 43,
#     'l2_leaf_reg': 26,
#     'border_count': 239,
#     'bagging_temperature': 12.532400413798356,
#     'od_type': 'Iter'
# }

# catboost 20210921 on colab (only 15 trials though)
best_catboost_params = {
    'iterations': 3302,
    'depth': 5,
    'learning_rate': 0.017183208677599107,
    'random_strength': 41,
    'l2_leaf_reg': 30,
    'border_count': 251,
    'bagging_temperature': 9.898390369028036, 
    'od_type': 'IncToDec'
}
    

catboost_model = CatBoostClassifier(
            task_type=model_config['task_type'],
#             n_estimators=model_config['n_estimators'],
            random_state=random_state,
            
            **best_catboost_params
        ) 



estimators_list = [
#     ('xgboost', xgboost_model),
#     ('lightgbm', lightgbm_model),
    ('catboost', catboost_model)
]

# wandb.log({'estimators': estimators_list})

final_estimator = LogisticRegression(max_iter=1000)
exmodel_config['blender_final_estimator'] = str(final_estimator)
exmodel_config['blender-passthrough'] = False

blender = StackingClassifier(estimators=estimators_list,
                             final_estimator=final_estimator,
                             cv=5,
                             stack_method='predict_proba',
                             n_jobs=-1, # 4 is max allowable for CPU
                             passthrough=exmodel_config['blender-passthrough'],
                             verbose=1
                            )


           
    

[34m[1mwandb[0m: Currently logged in as: [33mhushifang[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.2 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [32]:
wandb.log({'blender-final_estimator': str(blender.final_estimator),
#            'blender-final_estimator_params': str(blender.final_estimator.get_params()),
           'blender-stack_mdethod': 'predict_proba',
           'blender-cv': 5
          })

In [None]:
print(f"Starting fitting at {datetime.now().strftime('%Y%m%d_%H%M%S')}")
blender.fit(X,y) # unsure of this -- given kwarg cv=5, is it producing the splits? Or do I have to somehow?
print(f"Fitting complete at {datetime.now().strftime('%Y%m%d_%H%M%S')}")

Starting fitting at 20210922_112912


In [None]:
# wandb.log({'xgboost_params':str(blender.estimators[0][1].get_params()),
#            'lightgbm_params':str(blender.estimators[1][1].get_params()),
# #            'catboost_params':str(blender.estimators[2][1].get_all_params()),
#           })

In [None]:
model_path = Path(datapath/f"models/{config_run['name']}/")
(model_path).mkdir(exist_ok=True)
dump(blender, filename=model_path/f"{config_run['name']}_stack.joblib")
print(f"Blender model saved at {datetime.now().strftime('%Y%m%d_%H%M%S')}")

In [None]:
train_preds = blender.predict_proba(X)[:,1]
train_loss = log_loss(y_pred=train_preds, y_true=y)
train_auc = roc_auc_score(y, train_preds)
wandb.log({'train_loss': train_loss, 'train_auc': train_auc})
print(f"train_loss is {train_loss}, train_auc is {train_auc}")

In [None]:
# train_preds[:20]

# Test set preprocessing


(Here's where encapsulating the transformations in a pipeline would come in handy. But I'll do it manually for now.)

In [None]:
# features = [x for x in test_df.columns if x != 'claim']
# X_test = test_df[features] # this is just for naming consistency

Now, let's get the features the model was trained on and subset the test set's features accordingly

In [None]:
# generation polynomial features
# poly = PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)
# X_test_poly = poly.fit_transform(X_test)

In [None]:
# X_test_poly_names = poly.get_feature_names(X_test.columns)
# X_poly_names[100:150]
# features = pd.read_csv('X_candidates_20210827.csv').columns

In [None]:
# checks = [feature in X_test_poly_names for feature in features]
# checks

In [None]:
# X_test_final = pd.DataFrame(X_test_poly, columns=X_test_poly_names)

In [None]:
# X_test_final = X_test_final[features[1:]]
# X_test_final = X_test

In [None]:
# X_test['nan_count'] = X_test.isnull().sum(axis=1)

In [None]:
# imputer = SimpleImputer(strategy='median', add_indicator=True)
# X_test_imputed_np = imputer.fit_transform(X_test)

In [None]:
# X_test_imputed = pd.DataFrame(X_test_imputed, columns=[str(x) for x in range(X_test_imputed.shape[1])])
# X_test_imputed.to_feather(path=datapath/'X_test_NaNcounts_imputed-Median-wIndicators.feather')

In [None]:
# scaler = exmodel_config['scaler']()
# X_test_imputed_scaled_np = scaler.fit_transform(X_test_imputed)
# X_test_imputed_scaled = pd.DataFrame(X_test_imputed_scaled_np, columns=X_test_imputed.columns)
# X_test_imputed_scaled.to_feather(path=datapath/'X_test_NaNcounts_imputed-Median-wIndicators_StandardScaled.feather')
# X_scaled_df = pd.DataFrame(X_scaled, columns=X_poly_names)

In [None]:
test_set_path = str(datapath/'X_test_NaNcounts_SummaryStats_imputed-Median-wIndicators-StandardScaled.feather')
wandb.log({'test_set': test_set_path})

In [None]:
# X_test_imputed_scaled = pd.read_feather(path=datapath/'X_test_NaNcounts_imputed-Median-wIndicators_StandardScaled.feather')
X_test_imputed_scaled = pd.read_feather(path=datapath/'X_test_NaNcounts_SummaryStats_imputed-Median-wIndicators-StandardScaled.feather')

## Prediction Generation

In [None]:
preds_path = Path(datapath/"preds/")

blender_preds = blender.predict_proba(X_test_imputed_scaled)[:,1]
dump(blender_preds, preds_path/f"{config_run['name']}_stack.joblib")

# Submission

In [None]:
sample_df = pd.read_csv(datapath/'sample_solution.csv')

In [None]:
sample_df.loc[:, 'claim'] = blender_preds

In [None]:
sample_df.head()

In [None]:
submission_path = datapath/'submissions'
submission_path.mkdir(exist_ok=True)

In [55]:
sample_df.to_csv(submission_path/f"{config_run['name']}_blended.csv", index=False)

In [59]:
# str(blender.estimators[2][1].get_all_params())
# blender.estimators[2][1]

<catboost.core.CatBoostClassifier at 0x7f227c7b81c0>

In [56]:
wandb.log({'leaderboard_auc': ,
           'catboost_params': str(best_catboost_params),
          })

In [57]:
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
blender-final_estimator,LogisticRegression(m...
blender-stack_mdethod,predict_proba
blender-cv,5
_runtime,16221
_timestamp,1632331738
_step,4
xgboost_params,{'objective': 'binar...
lightgbm_params,{'boosting_type': 'g...
train_loss,0.48961
train_auc,0.84839


0,1
blender-cv,▁
_runtime,▁████
_timestamp,▁████
_step,▁▃▅▆█
train_loss,▁
train_auc,▁
leaderboard_auc,▁


## Manual Stacking

In [73]:
X.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,227,228,229,230,231,232,233,234,235,236
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.425545,-2.357891,-0.637206,-0.866657,-0.111568,-4.829243,-1.171229,-0.603397,-0.596871,-0.516828,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
1,0.2476,-0.323982,1.223569,0.361863,1.071182,-0.36114,0.082051,-0.74659,0.899454,0.469668,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
2,2.032371,-2.43568,-0.48896,0.341193,1.069656,0.118532,0.537069,-0.044075,-0.763516,1.056879,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
3,1.438373,-2.337605,-0.508914,-0.829607,1.485682,3.592008,-1.189087,-0.339152,-0.735281,-0.529158,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
4,0.602333,1.076218,-0.648438,0.463365,0.275053,-0.157989,0.727338,-0.905498,0.052478,-0.511066,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,7.821398,-0.12703


In [119]:
X1 = X.copy()

In [120]:
X1.shape

(957919, 237)

In [121]:
# generate probability predictions for the XGBoost model's folds
for fold in xgboost_models.keys():
#     X1[f"xgboost_fold{fold}_pred"] = xgboost_models[fold].predict(X)
    X1[f"xgboost_fold{fold}_pred"] = xgboost_models[fold].predict_proba(X)[:,1]
#     xgboost_preds[fold] = xgboost_models[fold].predict(X_test_imputed_scaled)



In [122]:
X1.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,232,233,234,235,236,xgboost_fold0_pred,xgboost_fold1_pred,xgboost_fold2_pred,xgboost_fold3_pred,xgboost_fold4_pred
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.425545,-2.357891,-0.637206,-0.866657,-0.111568,-4.829243,-1.171229,-0.603397,-0.596871,-0.516828,...,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703,0.582566,0.58095,0.576743,0.569523,0.595877
1,0.2476,-0.323982,1.223569,0.361863,1.071182,-0.36114,0.082051,-0.74659,0.899454,0.469668,...,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703,0.152252,0.150803,0.148316,0.155218,0.147297
2,2.032371,-2.43568,-0.48896,0.341193,1.069656,0.118532,0.537069,-0.044075,-0.763516,1.056879,...,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703,0.794083,0.789945,0.788326,0.787177,0.797979
3,1.438373,-2.337605,-0.508914,-0.829607,1.485682,3.592008,-1.189087,-0.339152,-0.735281,-0.529158,...,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703,0.774001,0.76851,0.774555,0.782187,0.773245
4,0.602333,1.076218,-0.648438,0.463365,0.275053,-0.157989,0.727338,-0.905498,0.052478,-0.511066,...,-0.127985,-0.128494,-0.12862,7.821398,-0.12703,0.759366,0.755764,0.763769,0.758034,0.758038
