# Baseline
Setting up a more robust baseline notebook, suitable for use with all of the "Big Three" (XGBoost, CatBoost, LightGBM) libraries and on either Google Colab or the local machine.

# Setup

In [1]:
# two manual flags (ex-config)
colab = False
gpu_available = True
# libraries = ['xgboost', 'lightgbm', 'catboost']
libraries = ['fastai']

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime

In [3]:
%matplotlib inline
%config Completer.use_jedi = False
os.environ['WANDB_NOTEBOOK_NAME'] = f"widedeep_{datetime.now().strftime('%Y%m%d')}.ipynb"

In [4]:
# handle Google Colab-specific library installation/updating
if colab:
    # much of the below inspired by or cribbed from the May 2021 Kaggle Tabular Playground winner, at 
    # https://colab.research.google.com/gist/academicsuspect/0aac7bd6e506f5f70295bfc9a3dc2250/tabular-may-baseline.ipynb?authuser=1#scrollTo=LJoVKJb5wN0L
    
    # Kaggle API for downloading the datasets
    !pip install --upgrade -q kaggle

    # weights and biases
    !pip install -qqqU wandb
    
    # Optuna for parameter search
    !pip install -q optuna

    # upgrade sklearn
    !pip install --upgrade scikit-learn

    !pip install category_encoders
    
    if 'catboost' in libraries:
        !pip install catboost
    
    if 'xgboost' in libraries:
        if gpu_available: 
            # this part is from https://github.com/rapidsai/gputreeshap/issues/24
            !pip install cmake --upgrade
            # !pip install sklearn --upgrade
            !git clone --recursive https://github.com/dmlc/xgboost
            %cd /content/xgboost
            !mkdir build
            %cd build
            !cmake .. -DUSE_CUDA=ON
            !make -j4
            %cd /content/xgboost/python-package
            !python setup.py install --use-cuda --use-nccl
            !/opt/bin/nvidia-smi
            !pip install shap
        else:
            !pip install --upgrade xgboost
    if 'lightgbm' in libraries:
        if gpu_available:
            # lighgbm gpu compatible
            !git clone --recursive https://github.com/Microsoft/LightGBM
            ! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;
        else:
            !pip install --upgrade lightgbm
        

        

Now, non-stdlib imports

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
import wandb
# from wandb.xgboost import wandb_callback
# from wandb.lightgbm import wandb_callback
from sklearn.impute import SimpleImputer #, KNNImputer
# import timm

import seaborn as sns

# from catboost import CatBoostClassifier
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
from fastai.tabular.all import *
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.preprocessing import StandardScaler #, MinMaxScaler, MaxAbsScaler, RobustScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
# from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft


  return torch._C._cuda_getDeviceCount() > 0


In [6]:
torch.cuda.is_available()

False

Now, datapath setup

In [7]:
# # This is the code for reading the train.csv and converting it to a .feather file
# df = pd.read_csv(datapath/'train.csv', index_col='id', low_memory=False)
# df.index.name = None
# df.to_feather(path='./dataset_df.feather')

In [8]:
if colab:
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    
    # handling datapath
    datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/sep2021/')
    
else:
    # if on local machine
    datapath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/')    
    


## Ex-Model Config

In [9]:
# meta-config for preprocessing and cross-validation, but NOT for model parameters
exmodel_config = {
    # model config
#     "model": XGBClassifier,
#     "n_estimators": 100, 
#     "max_depth": 3,
#     "learning_rate": 0.1,
#     "test_size": 0.2,
#     "reg_lambda": None, 
    "library": 'widedeep',
    "scaler": StandardScaler, # TODO: experiment with others (but imputation may be slow)
    "scale_b4_impute": False,
    "imputer": SimpleImputer(strategy='median', add_indicator=True),
#     "knn_imputer_n_neighbors": None, # None if a different imputer is used
#     "feature_selector": SelectKBest,
#     "k_best": 80,
#     "feature_selection_scoring": f_regression,
    'random_state': 42,
    'feature_generation': ['NaN_counts', 'SummaryStats', 'NaN_OneHots'],
    'features_categorized': True,
#     'subsample': 1,
#     'cross_val_strategy': KFold, # None for holdout, or the relevant sklearn class
#     'kfolds': 5, # if 1, that means just doing holdout
#     'test_size': 0.2,
#     'features_created': False,
#     'feature_creator': None,
}

## Data Setup

Due to the importance of identifying categorical variables for deep learning on tabular data (namely, the generation of embeddings containing meaningful information about them), I'm going to try using `fastai`'s `cont_cat_split` on the original dataset (post-imputation and generation of features based on summary statistics) and then proceeding with the other transforms.

In [10]:
# # here's how to load the original, unaltered dataset and separate features from targets
# df = pd.read_feather(path=datapath/'dataset_df.feather') # this is the unaltered original dataset
# features = [x for x in df.columns if x != 'claim']
# X = df[features]
# y = df.claim



# load the version of the dataset with imputations; X and y were stored separately, as feather and joblib respectively
X = pd.read_feather(datapath/'X_NaNcounts_SummaryStats_imputed-Median-wIndicators-StandardScaled.feather') 
y = load(datapath/'y.joblib')    
X.index.name = 'id'
y.index.name = 'id'

exmodel_config['feature_count'] = len(X.columns)


In [11]:
len(X.columns)

246

In [12]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [13]:
X.nunique(axis=0)

0      147965
1      117427
2      274624
3      265618
4      125414
5      286473
6      201975
7      209765
8      368026
9      370266
10     157265
11     325095
12     117373
13     231413
14     308572
15     313089
16      77994
17     172559
18     160073
19     318317
20     313905
21      83438
22     148460
23     176988
24     208329
25     333767
26     290999
27     269916
28     233936
29     135072
30     263378
31     297891
32     281103
33     245721
34     357008
35     316292
36     210948
37      38267
38     229893
39     223437
40     307516
41     264543
42      81540
43     213949
44     245352
45      84044
46     212671
47      87459
48     245196
49     127050
50     305728
51     212595
52     342517
53       7459
54     117101
55     240290
56     246959
57     163484
58      76378
59     158582
60     245796
61     266661
62     194153
63     267355
64     142378
65     149027
66     213419
67     209319
68      14049
69     210462
70      30980
71    

I'll follow JH's advice here and set 10k as the ceiling for the `max_card=`; this might be something to experiment with later using Optuna. (Esp. as JH says that numbers in the 5000s and higher make him very nervous.)

In [14]:
max_card_embed = 10000
max_card_cat = 100000
exmodel_config['max_card_for_embedding'] = max_card_embed
exmodel_config['max_card_for_categorical'] = max_card_cat


# X_orig = X.iloc[:, :118] # excluding summary, meta-statistics
# X_meta = X.iloc[:, 118:] # including summary, meta-statistics

In [15]:
exmodel_config['max_card_for_categorical']

100000

In [16]:
low_card_features = [f for f in X.columns if X[f].nunique() <= 50000]
high_card_features = [f for f in X.columns if X[f].nunique() > 50000]

In [17]:
len(low_card_features)

129

In [18]:
len(high_card_features)

117

# WideDeep

## (Example)

In [19]:
# df = pd.read_csv(datapath/"adult.csv.zip")
# df["income_label"] = (df["income"].apply(lambda x: ">50K" in x)).astype(int)
# df.drop("income", axis=1, inplace=True)
# df_train, df_test = train_test_split(df, test_size=0.2, stratify=df.income_label)

In [20]:
# for f in df.columns:
#     print(f"{f}: {df[f].nunique()}")
#     print(f"NaNs: {df[f].isna().sum()}\n")

In [21]:
from pytorch_widedeep import Trainer
from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.models import Wide, TabMlp, WideDeep
from pytorch_widedeep.metrics import Accuracy

In [22]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 957919 entries, 0 to 957918
Columns: 246 entries, 0 to 245
dtypes: float64(246)
memory usage: 1.8 GB


  and should_run_async(code)


In [73]:
# wide_cols_pre = [f for f in X.columns if X[f].nunique() <= max_card_cat and X[f].nunique() > 2]
# wide_cols_onehot = [f for f in X.columns if X[f].nunique() == 2]
wide_cols = [f for f in X.columns if X[f].nunique() <= max_card_cat]
cont_cols = high_card_features
embed_cols = [f for f in X.columns if X[f].nunique() <= max_card_embed and X[f].nunique() > 2]

In [74]:
len(wide_cols), len(cont_cols), len(embed_cols)

(136, 117, 4)

In [67]:
wide_cols

[16,
 21,
 37,
 42,
 45,
 47,
 53,
 58,
 68,
 70,
 75,
 80,
 96,
 98,
 110,
 114,
 118,
 119,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185,
 186,
 187,
 188,
 189,
 190,
 191,
 192,
 193,
 194,
 195,
 196,
 197,
 198,
 199,
 200,
 201,
 202,
 203,
 204,
 205,
 206,
 207,
 208,
 209,
 210,
 211,
 212,
 213,
 214,
 215,
 216,
 217,
 218,
 219,
 220,
 221,
 222,
 223,
 224,
 225,
 226,
 227,
 228,
 229,
 230,
 231,
 232,
 233,
 234,
 235,
 236,
 237,
 238,
 239,
 240,
 241,
 242,
 243,
 244,
 245]

In [70]:
# X_np = np.array(X)

In [76]:
del X_np

In [75]:
wide_preprocessor = WidePreprocessor(wide_cols=wide_cols)
X_wide_pre = wide_preprocessor.fit_transform(X)

In [77]:
X_wide_pre.shape

(957919, 136)

In [30]:
X_wide_pre[:10,:]

array([[     1,  77995, 161433, ..., 794314, 794316, 794318],
       [     2,  77996, 161434, ..., 794314, 794316, 794318],
       [     3,  77997, 161435, ..., 794314, 794316, 794318],
       ...,
       [     8,  78002, 161440, ..., 794314, 794316, 794318],
       [     9,  78003, 161441, ..., 794314, 794316, 794318],
       [    10,  78004, 161442, ..., 794314, 794316, 794318]])

In [28]:
# X_wide_pre_df = pd.DataFrame(X_wide_pre)

In [29]:
# X_wide_pre_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,1,77995,161433,199700,281240,365284,452743,460202,536580,550629,581609,660306,693792,694221,736126,780964,793934,793949
1,2,77996,161434,199701,281241,365285,452744,460203,536581,550630,581610,660307,693793,694222,736127,780965,793935,793950
2,3,77997,161435,199702,281242,365286,452745,460204,536582,550631,581611,660308,693794,694223,736128,780966,793936,793951
3,4,77998,161436,199703,281243,365287,452746,460205,536583,550632,581612,660309,693795,694224,736129,780967,793937,793952
4,5,77999,161437,199704,281244,365288,452747,460206,536584,550633,581613,660310,693794,694225,736130,780968,793938,793953


In [30]:
# X.loc[:, wide_cols_onehot].head()

  and should_run_async(code)


Unnamed: 0_level_0,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1
0,-0.127178,-0.126936,-0.128208,-0.128498,-0.127846,-0.128334,-0.128263,-0.127711,-0.127186,-0.127076,-0.12793,-0.128637,-0.128095,-0.127072,-0.128284,-0.12801,-0.127939,-0.127508,-0.128137,-0.128057,-0.128052,-0.127309,-0.127639,-0.128792,-0.128271,-0.127648,-0.12801,-0.127254,-0.127888,-0.127791,-0.128992,-0.128368,-0.128212,-0.127182,-0.127546,-0.127669,-0.127453,-0.127968,-0.128494,-0.12769,-0.127808,-0.127892,-0.128057,-0.12809,-0.128174,-0.128804,-0.128343,-0.128019,-0.12774,-0.128506,7.814957,-0.127466,-0.128086,-0.12793,-0.127918,-0.128107,-0.128553,-0.128057,-0.127825,-0.128498,-0.127956,-0.128322,-0.127867,-0.128574,-0.127884,-0.127884,-0.128187,-0.128745,-0.128464,-0.127242,-0.12817,-0.127059,-0.128401,-0.12854,-0.128061,-0.128536,-0.127195,-0.128031,-0.127732,-0.127487,-0.127597,-0.128183,-0.128779,-0.127762,-0.128031,-0.128343,-0.12747,-0.128448,-0.128015,-0.12817,-0.128275,-0.128212,-0.128065,-0.127884,-0.128662,-0.127339,-0.127254,-0.127352,-0.127968,-0.128355,-0.12761,-0.126843,-0.128666,-0.12697,-0.127766,-0.128431,-0.127757,-0.127572,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
1,-0.127178,-0.126936,-0.128208,-0.128498,-0.127846,-0.128334,-0.128263,-0.127711,-0.127186,-0.127076,-0.12793,-0.128637,-0.128095,-0.127072,-0.128284,-0.12801,-0.127939,-0.127508,-0.128137,-0.128057,-0.128052,-0.127309,-0.127639,-0.128792,-0.128271,-0.127648,-0.12801,-0.127254,-0.127888,-0.127791,-0.128992,-0.128368,-0.128212,-0.127182,-0.127546,-0.127669,-0.127453,-0.127968,-0.128494,-0.12769,-0.127808,-0.127892,-0.128057,-0.12809,-0.128174,-0.128804,-0.128343,-0.128019,-0.12774,-0.128506,-0.12796,-0.127466,-0.128086,-0.12793,-0.127918,-0.128107,-0.128553,-0.128057,-0.127825,-0.128498,-0.127956,-0.128322,-0.127867,-0.128574,-0.127884,-0.127884,-0.128187,-0.128745,-0.128464,-0.127242,-0.12817,-0.127059,-0.128401,-0.12854,-0.128061,-0.128536,-0.127195,-0.128031,-0.127732,-0.127487,-0.127597,-0.128183,-0.128779,-0.127762,-0.128031,-0.128343,-0.12747,-0.128448,-0.128015,-0.12817,-0.128275,-0.128212,-0.128065,-0.127884,-0.128662,-0.127339,-0.127254,-0.127352,-0.127968,-0.128355,-0.12761,-0.126843,-0.128666,-0.12697,-0.127766,-0.128431,-0.127757,-0.127572,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
2,-0.127178,-0.126936,-0.128208,-0.128498,-0.127846,-0.128334,-0.128263,-0.127711,-0.127186,-0.127076,-0.12793,-0.128637,-0.128095,-0.127072,7.795214,-0.12801,-0.127939,-0.127508,-0.128137,-0.128057,-0.128052,-0.127309,-0.127639,-0.128792,-0.128271,-0.127648,-0.12801,-0.127254,-0.127888,-0.127791,-0.128992,-0.128368,-0.128212,-0.127182,-0.127546,-0.127669,7.846019,-0.127968,-0.128494,-0.12769,-0.127808,-0.127892,-0.128057,-0.12809,-0.128174,-0.128804,-0.128343,-0.128019,-0.12774,-0.128506,-0.12796,-0.127466,-0.128086,-0.12793,-0.127918,-0.128107,-0.128553,-0.128057,-0.127825,-0.128498,-0.127956,-0.128322,-0.127867,-0.128574,-0.127884,-0.127884,-0.128187,-0.128745,7.784252,-0.127242,-0.12817,-0.127059,-0.128401,-0.12854,-0.128061,-0.128536,-0.127195,-0.128031,-0.127732,-0.127487,-0.127597,-0.128183,-0.128779,-0.127762,-0.128031,-0.128343,-0.12747,-0.128448,-0.128015,7.80212,-0.128275,-0.128212,-0.128065,-0.127884,-0.128662,-0.127339,7.858285,-0.127352,-0.127968,-0.128355,-0.12761,-0.126843,-0.128666,-0.12697,-0.127766,-0.128431,-0.127757,-0.127572,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
3,-0.127178,-0.126936,-0.128208,-0.128498,-0.127846,-0.128334,-0.128263,-0.127711,-0.127186,-0.127076,-0.12793,-0.128637,-0.128095,-0.127072,-0.128284,-0.12801,-0.127939,-0.127508,-0.128137,-0.128057,-0.128052,-0.127309,-0.127639,-0.128792,-0.128271,-0.127648,-0.12801,-0.127254,-0.127888,-0.127791,-0.128992,-0.128368,7.79956,-0.127182,-0.127546,-0.127669,-0.127453,-0.127968,-0.128494,-0.12769,-0.127808,-0.127892,-0.128057,-0.12809,-0.128174,-0.128804,-0.128343,-0.128019,-0.12774,-0.128506,7.814957,-0.127466,-0.128086,-0.12793,-0.127918,-0.128107,-0.128553,-0.128057,-0.127825,-0.128498,-0.127956,-0.128322,-0.127867,-0.128574,-0.127884,-0.127884,-0.128187,-0.128745,-0.128464,-0.127242,-0.12817,-0.127059,-0.128401,-0.12854,-0.128061,-0.128536,-0.127195,-0.128031,-0.127732,-0.127487,-0.127597,-0.128183,-0.128779,-0.127762,-0.128031,-0.128343,-0.12747,-0.128448,-0.128015,-0.12817,-0.128275,-0.128212,-0.128065,-0.127884,-0.128662,-0.127339,-0.127254,-0.127352,-0.127968,-0.128355,-0.12761,-0.126843,-0.128666,-0.12697,-0.127766,-0.128431,-0.127757,-0.127572,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
4,-0.127178,-0.126936,-0.128208,-0.128498,-0.127846,-0.128334,-0.128263,-0.127711,-0.127186,-0.127076,-0.12793,-0.128637,-0.128095,-0.127072,7.795214,-0.12801,-0.127939,-0.127508,7.80417,7.809044,-0.128052,-0.127309,-0.127639,-0.128792,-0.128271,-0.127648,-0.12801,-0.127254,-0.127888,-0.127791,-0.128992,-0.128368,-0.128212,-0.127182,-0.127546,-0.127669,-0.127453,-0.127968,-0.128494,-0.12769,-0.127808,-0.127892,-0.128057,7.806991,-0.128174,-0.128804,-0.128343,-0.128019,-0.12774,-0.128506,-0.12796,-0.127466,-0.128086,-0.12793,-0.127918,-0.128107,-0.128553,-0.128057,-0.127825,-0.128498,-0.127956,-0.128322,-0.127867,-0.128574,-0.127884,-0.127884,7.801096,-0.128745,-0.128464,-0.127242,-0.12817,-0.127059,-0.128401,-0.12854,-0.128061,-0.128536,-0.127195,-0.128031,7.82889,-0.127487,-0.127597,-0.128183,-0.128779,-0.127762,-0.128031,-0.128343,-0.12747,-0.128448,-0.128015,-0.12817,-0.128275,-0.128212,-0.128065,-0.127884,-0.128662,-0.127339,7.858285,-0.127352,-0.127968,-0.128355,-0.12761,-0.126843,-0.128666,-0.12697,-0.127766,-0.128431,-0.127757,-0.127572,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,7.821398,-0.12703


In [31]:
# X_wide = X_wide_pre_df.join(X.loc[:,wide_cols_onehot])

  and should_run_async(code)


In [None]:
# X_wide.info()

In [78]:
X_wide = X_wide_pre

In [34]:
wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=1)

In [35]:
tab_preprocessor = TabPreprocessor(embed_cols=embed_cols, continuous_cols=cont_cols)

In [35]:
# X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 957919 entries, 0 to 957918
Columns: 246 entries, 0 to 245
dtypes: float64(246)
memory usage: 1.8 GB


In [36]:
X_tab = tab_preprocessor.fit_transform(X)

In [37]:
X_tab.shape

  and should_run_async(code)


(957919, 121)

In [38]:
deeptabular = TabMlp(
    mlp_hidden_dims=[64,32],
    column_idx=tab_preprocessor.column_idx,
    embed_input=tab_preprocessor.embeddings_input,
    continuous_cols=cont_cols,
)

In [39]:
model = WideDeep(wide=wide, deeptabular=deeptabular)

In [42]:
# X_wide = np.array(X_wide)

In [40]:
X_wide.shape

(957919, 136)

In [41]:
X_tab.shape

(957919, 121)

<!-- 39774, 758737, 552968 -->

In [42]:
trainer = Trainer(model, objective='binary', metrics=[Accuracy], seed=42)

In [43]:
y = np.array(y)

In [44]:
trainer.fit(
    X_wide=X_wide,
    X_tab=X_tab,
    target=y,
    n_epochs=5,
    batch_size=1024,
    val_split=0.2,
)

epoch 1: 100%|██████████| 749/749 [02:36<00:00,  4.79it/s, loss=0.899, metrics={'acc': 0.6769}]
valid: 100%|██████████| 188/188 [00:02<00:00, 92.52it/s, loss=0.836, metrics={'acc': 0.6942}] 
epoch 2: 100%|██████████| 749/749 [02:32<00:00,  4.90it/s, loss=0.787, metrics={'acc': 0.7015}]
valid: 100%|██████████| 188/188 [00:01<00:00, 125.08it/s, loss=0.798, metrics={'acc': 0.6938}]
epoch 3: 100%|██████████| 749/749 [02:32<00:00,  4.90it/s, loss=0.722, metrics={'acc': 0.7144}]
valid: 100%|██████████| 188/188 [00:01<00:00, 128.40it/s, loss=0.776, metrics={'acc': 0.6945}]
epoch 4: 100%|██████████| 749/749 [02:34<00:00,  4.86it/s, loss=0.671, metrics={'acc': 0.7268}]
valid: 100%|██████████| 188/188 [00:01<00:00, 124.43it/s, loss=0.762, metrics={'acc': 0.6958}]
epoch 5: 100%|██████████| 749/749 [02:34<00:00,  4.86it/s, loss=0.627, metrics={'acc': 0.7383}]
valid: 100%|██████████| 188/188 [00:01<00:00, 127.39it/s, loss=0.752, metrics={'acc': 0.6929}]


In [50]:
X_test = pd.read_feather(datapath/'X_test_NaNcounts_SummaryStats_imputed-Median-wIndicators-StandardScaled.feather')

In [79]:
X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245
0,1.751704,0.968278,-0.427066,-0.840621,0.095301,0.464038,-0.824375,-1.08538,-0.767857,0.095726,0.29427,1.587074,0.099936,-0.53782,0.253845,0.896228,1.888856,1.569833,-0.19659,-0.724887,-0.658471,-0.094833,0.237521,0.487661,-1.018911,-0.085047,-0.699558,1.819097,-0.745413,-0.209014,-0.536655,-0.551061,-0.62629,-0.145177,0.954797,0.030325,-0.333978,-0.716172,1.831318,-0.815223,-0.68016,-0.808847,0.148292,-0.167851,-1.015892,0.543848,-1.664666,0.323791,-0.874336,-2.071137,-0.619346,-0.885334,-0.576842,1.463713,-0.026447,0.659552,0.069464,-0.238719,-0.66655,-1.811843,-0.989331,0.020936,0.333912,-0.957043,0.84888,0.7856,0.792306,-0.637652,-0.425662,-0.833097,0.628813,1.638262,-0.412419,-0.525297,-0.646473,-1.325236,-0.591492,-0.062932,-0.08176,1.964202,0.407308,1.453569,-0.36235,-0.064352,1.046524,-1.046313,-0.836077,-0.154928,-0.846819,-0.477627,-2.453846,-0.139056,-0.180401,-1.779057,-0.148828,-0.005655,-0.464128,-0.859371,-0.412277,-1.517857,1.980152,-0.750306,-0.413239,-0.386818,-0.115089,-0.745005,0.362949,-0.463772,-0.794857,-0.132273,-0.003437,-0.509415,-1.246808,0.210003,-0.127254,-0.629358,-0.618295,-0.231429,-0.44261,-0.034637,-0.155002,1.330175,-0.027994,-0.151745,-0.281398,0.550393,0.013257,-0.884058,-0.126828,-0.127478,-0.126687,-0.126175,-0.127363,-0.127658,-0.127822,-0.127182,-0.126877,-0.127519,-0.127724,-0.129077,-0.127806,-0.126315,-0.128695,-0.127879,-0.128247,-0.12841,-0.128272,-0.126894,-0.128378,-0.127494,-0.127617,-0.126431,-0.127379,-0.129345,-0.128067,-0.127978,-0.127912,-0.127617,-0.127009,-0.127231,-0.126803,-0.127691,-0.127576,-0.128231,-0.128247,-0.127281,-0.128182,-0.128744,-0.127511,-0.128051,-0.12783,-0.126704,-0.128321,-0.128459,-0.127847,-0.128524,-0.128728,-0.128378,-0.128541,-0.126547,-0.12801,-0.128647,-0.128647,-0.128736,-0.1271,-0.12696,-0.127428,-0.128801,-0.12792,-0.128002,-0.127757,-0.129515,-0.127568,-0.128321,-0.128002,-0.12902,-0.12792,-0.12756,-0.128573,-0.128524,-0.127428,-0.128076,-0.129134,-0.127486,-0.12863,-0.128084,7.792981,-0.128051,-0.126621,-0.127593,-0.128141,-0.127797,-0.126976,-0.127937,-0.127264,-0.128027,-0.127896,-0.127363,-0.126819,-0.128149,-0.128573,-0.127912,-0.127609,-0.127108,-0.126605,-0.128149,-0.12949,-0.128231,-0.127773,-0.12663,-0.128704,-0.128598,-0.127461,-0.128263,-0.127904,-0.128059,-0.128476,-0.126506,-0.127461,-0.128573,-0.126365,-0.127896,-0.128182,-0.129045,-0.126423,-0.127428
1,0.913581,0.184047,-0.353273,2.503619,0.188649,-0.562836,-0.584997,0.522611,-0.512844,-0.432564,0.782779,1.459134,0.362927,-0.732927,-0.531157,-0.643969,-0.175145,2.140006,0.499012,-0.705995,-0.318749,0.955799,0.596087,-0.741604,1.844986,-0.592684,0.231928,-0.557752,1.615232,-0.83224,0.529083,1.395408,0.063284,1.477203,-0.607693,0.066323,-1.191912,-0.462801,0.050357,-0.813357,-0.662537,-0.648801,-0.2248,-0.963386,2.254599,0.15879,-1.297993,-0.919179,-0.790977,-2.045965,0.498839,2.187002,0.483492,1.009265,1.649467,1.032962,-0.038555,0.753564,-0.811298,-0.311674,-0.560421,-0.695927,0.053301,0.827917,0.72776,0.262516,-1.189286,0.069433,0.148949,1.200276,1.606983,0.797347,-0.563326,-0.351463,-0.841337,0.074523,-0.540233,-0.000196,-0.767644,-0.78441,1.101888,1.437974,2.901721,-0.596012,2.645387,-1.054988,-0.641274,1.965607,-1.008591,-0.348907,0.046766,-0.524472,-0.142755,0.628105,-0.970454,-0.405823,0.481852,0.40286,-0.551052,-0.627485,-0.519872,-0.932559,-0.346737,-0.478548,0.623596,-1.076239,0.268719,-0.830292,1.413474,0.986151,-0.556124,-0.542603,1.006487,0.302786,-0.506379,0.022709,0.854729,-0.68123,-0.935935,-1.163156,-0.287454,1.005741,-0.274192,-0.292826,-0.385354,0.429414,-0.243408,-1.006832,-0.126828,-0.127478,-0.126687,-0.126175,-0.127363,-0.127658,-0.127822,-0.127182,-0.126877,-0.127519,-0.127724,-0.129077,-0.127806,-0.126315,-0.128695,-0.127879,-0.128247,-0.12841,-0.128272,-0.126894,-0.128378,-0.127494,-0.127617,-0.126431,-0.127379,-0.129345,-0.128067,-0.127978,-0.127912,-0.127617,-0.127009,-0.127231,-0.126803,-0.127691,-0.127576,-0.128231,-0.128247,-0.127281,-0.128182,-0.128744,-0.127511,-0.128051,-0.12783,-0.126704,-0.128321,-0.128459,-0.127847,-0.128524,-0.128728,-0.128378,-0.128541,-0.126547,-0.12801,-0.128647,-0.128647,-0.128736,-0.1271,-0.12696,-0.127428,-0.128801,-0.12792,-0.128002,-0.127757,-0.129515,-0.127568,-0.128321,-0.128002,-0.12902,-0.12792,-0.12756,-0.128573,-0.128524,-0.127428,-0.128076,-0.129134,-0.127486,-0.12863,-0.128084,-0.128321,-0.128051,-0.126621,-0.127593,-0.128141,-0.127797,-0.126976,-0.127937,-0.127264,-0.128027,-0.127896,-0.127363,-0.126819,-0.128149,-0.128573,-0.127912,-0.127609,-0.127108,-0.126605,-0.128149,-0.12949,-0.128231,-0.127773,-0.12663,-0.128704,-0.128598,-0.127461,-0.128263,-0.127904,-0.128059,-0.128476,-0.126506,-0.127461,-0.128573,-0.126365,-0.127896,-0.128182,-0.129045,-0.126423,-0.127428
2,0.694558,0.679362,-0.515168,-0.731396,-0.063744,0.186457,0.827822,-0.17017,-0.597921,-0.51277,-0.820196,-0.864226,1.2298,-0.997208,-0.041742,2.474075,-0.516905,-0.905959,-0.719207,2.453757,-0.666196,1.416064,0.561363,-1.233139,0.363539,-0.153538,-0.753065,-0.977576,1.582552,-0.891516,0.854381,-1.686447,-0.630413,0.917298,-0.800475,-0.383654,-0.152517,-0.013698,1.42156,-0.800002,0.678951,1.34964,1.3544,1.019803,-0.421299,1.223441,-0.980069,0.097238,-0.966543,0.593753,0.006015,-1.014916,0.029021,0.630872,1.499162,0.339007,0.561246,0.740199,1.369262,0.835835,-0.563563,1.11885,0.413232,0.749485,0.887943,-0.530724,0.330726,-0.540148,-0.360259,-0.841787,-1.087088,-0.131217,-0.666113,0.167155,-0.85363,-0.749515,-0.645966,-0.910024,-0.337017,-0.803533,-0.105199,0.644774,0.27002,0.210557,-0.613863,0.267352,-1.001924,1.188931,1.446396,0.662903,0.260671,-0.527644,1.58762,0.708334,1.410587,-0.155128,-0.470698,-0.573225,-0.251247,0.240875,0.866243,-0.555523,2.232848,0.133247,0.993754,-0.477394,0.271009,-0.85542,1.423724,-0.708549,0.076225,-0.561605,-0.502072,-0.268384,-0.470481,-0.62975,0.601385,-1.034715,-0.44261,-0.034637,-0.948329,0.416796,-0.933648,-0.95011,-0.504031,-0.357964,-0.916449,0.855258,-0.126828,-0.127478,-0.126687,-0.126175,-0.127363,-0.127658,-0.127822,-0.127182,-0.126877,-0.127519,-0.127724,-0.129077,-0.127806,-0.126315,-0.128695,-0.127879,-0.128247,-0.12841,-0.128272,-0.126894,-0.128378,-0.127494,-0.127617,-0.126431,-0.127379,-0.129345,-0.128067,-0.127978,-0.127912,-0.127617,-0.127009,-0.127231,-0.126803,-0.127691,-0.127576,-0.128231,7.797444,-0.127281,-0.128182,-0.128744,-0.127511,-0.128051,-0.12783,-0.126704,-0.128321,-0.128459,-0.127847,-0.128524,-0.128728,-0.128378,-0.128541,-0.126547,-0.12801,-0.128647,-0.128647,-0.128736,-0.1271,-0.12696,-0.127428,-0.128801,-0.12792,-0.128002,-0.127757,-0.129515,-0.127568,-0.128321,-0.128002,-0.12902,-0.12792,-0.12756,-0.128573,-0.128524,-0.127428,-0.128076,-0.129134,-0.127486,-0.12863,-0.128084,-0.128321,-0.128051,-0.126621,-0.127593,-0.128141,-0.127797,-0.126976,-0.127937,-0.127264,-0.128027,-0.127896,-0.127363,-0.126819,-0.128149,-0.128573,-0.127912,-0.127609,-0.127108,-0.126605,-0.128149,-0.12949,-0.128231,-0.127773,-0.12663,-0.128704,-0.128598,-0.127461,-0.128263,-0.127904,-0.128059,-0.128476,-0.126506,-0.127461,-0.128573,-0.126365,-0.127896,-0.128182,-0.129045,-0.126423,-0.127428
3,-0.837726,0.339277,-0.47419,3.12201,0.42642,-0.044651,-0.972258,-0.145231,-0.783784,-0.528587,-1.927408,1.350249,0.182419,-0.891518,-1.018552,2.467541,0.007737,1.722104,0.055257,-0.243399,-0.685267,0.519907,-0.14196,2.132586,-0.351455,-0.595874,-0.763673,0.023498,-0.521678,-0.499183,0.33507,-0.417375,-0.33935,-0.40392,0.358163,1.752987,0.312649,0.256485,0.433688,-0.802652,-0.265331,1.563002,0.288298,0.139869,0.858205,-0.076926,0.098262,-0.7407,0.578541,0.316091,-0.630003,-0.509272,-0.495719,0.430361,-0.948177,-0.457535,-0.386012,0.636068,1.036644,-0.72428,-0.579233,-0.882536,-0.528785,-0.742628,1.073733,-2.3228,1.60569,0.006446,0.975049,1.199666,-0.136038,0.006952,2.837742,-0.520679,-0.816043,0.233348,0.207093,-0.830033,-0.437317,-0.404215,-0.900053,1.660584,-0.374089,-0.778953,0.363649,-0.663762,-0.900234,0.233728,-0.741002,-0.708002,0.066075,-0.579166,-1.102966,0.207542,0.839596,-0.299733,-0.490406,0.098282,-0.895793,0.435478,0.385685,1.514675,0.91722,-0.521825,-1.231826,3.613937,-0.952575,0.17935,1.583678,0.943601,-0.616461,-0.519192,-0.036459,-0.387144,-0.521264,2.560401,0.706337,-0.563179,-0.935935,-1.163156,2.407592,-1.77752,2.323779,2.397136,2.346159,0.367916,2.387944,0.196935,-0.126828,-0.127478,-0.126687,-0.126175,-0.127363,-0.127658,-0.127822,-0.127182,-0.126877,-0.127519,-0.127724,-0.129077,-0.127806,-0.126315,-0.128695,-0.127879,-0.128247,-0.12841,-0.128272,-0.126894,-0.128378,-0.127494,-0.127617,-0.126431,-0.127379,-0.129345,-0.128067,-0.127978,-0.127912,-0.127617,-0.127009,-0.127231,-0.126803,-0.127691,-0.127576,-0.128231,-0.128247,-0.127281,-0.128182,-0.128744,-0.127511,-0.128051,-0.12783,-0.126704,-0.128321,-0.128459,-0.127847,-0.128524,-0.128728,-0.128378,-0.128541,-0.126547,-0.12801,-0.128647,-0.128647,-0.128736,-0.1271,-0.12696,-0.127428,-0.128801,-0.12792,-0.128002,-0.127757,-0.129515,-0.127568,-0.128321,-0.128002,-0.12902,-0.12792,-0.12756,-0.128573,-0.128524,-0.127428,-0.128076,-0.129134,-0.127486,-0.12863,-0.128084,-0.128321,-0.128051,-0.126621,-0.127593,-0.128141,-0.127797,-0.126976,-0.127937,-0.127264,-0.128027,-0.127896,-0.127363,-0.126819,-0.128149,-0.128573,-0.127912,-0.127609,-0.127108,-0.126605,-0.128149,-0.12949,-0.128231,-0.127773,-0.12663,-0.128704,-0.128598,-0.127461,-0.128263,-0.127904,-0.128059,-0.128476,-0.126506,-0.127461,-0.128573,-0.126365,-0.127896,-0.128182,-0.129045,-0.126423,-0.127428
4,-0.237172,-2.442706,1.036035,0.701092,1.003695,-0.837415,1.118867,0.154011,-0.768622,0.073582,0.766561,0.014387,0.740279,-0.721846,-1.252427,0.353497,-0.203435,0.613535,-1.043149,0.954753,-0.560617,-1.661382,0.652344,0.644744,0.049008,0.142589,-0.758819,-0.977637,1.629602,1.400235,0.766681,1.18403,-0.747111,-0.428913,-0.797404,-0.449215,1.688384,1.673614,-0.761875,-0.778883,-0.707102,-0.853185,-3.116618,-1.02583,-0.827048,-0.201275,1.215369,-0.679285,-0.333675,0.94052,-0.227671,-0.681174,-0.538202,-1.802282,0.036005,-0.424086,-1.06159,-2.386683,-0.275487,1.328893,-0.097465,-0.415497,-1.031416,-0.120169,-1.135318,0.127382,0.090168,-0.049644,-0.994043,-0.848861,-0.608259,-0.725843,-0.343367,2.615374,-0.849832,0.081675,2.361217,-0.655917,-0.530237,1.913836,1.870015,0.443875,-0.623531,-0.698811,0.383363,-0.207545,-0.494442,-1.225568,-0.890671,0.035754,0.064838,-0.161638,-0.882829,0.16408,-0.957368,-0.604126,-0.181648,0.368971,-0.640781,1.024079,-0.304932,-0.969297,-0.616898,1.022784,-0.099087,0.475426,0.392832,-0.896007,-0.492508,0.061379,1.876532,-0.502208,-0.667787,-0.131009,0.352563,2.579061,-0.544035,-0.541321,-0.935935,-1.163156,1.856834,-1.175219,2.292012,1.8884,2.290951,0.569465,2.407292,0.399006,-0.126828,-0.127478,-0.126687,-0.126175,-0.127363,-0.127658,-0.127822,-0.127182,-0.126877,-0.127519,-0.127724,-0.129077,-0.127806,-0.126315,-0.128695,-0.127879,-0.128247,-0.12841,-0.128272,-0.126894,-0.128378,-0.127494,-0.127617,-0.126431,-0.127379,-0.129345,-0.128067,-0.127978,-0.127912,-0.127617,-0.127009,-0.127231,-0.126803,-0.127691,-0.127576,-0.128231,-0.128247,-0.127281,-0.128182,-0.128744,-0.127511,-0.128051,-0.12783,-0.126704,-0.128321,-0.128459,-0.127847,-0.128524,-0.128728,-0.128378,-0.128541,-0.126547,-0.12801,-0.128647,-0.128647,-0.128736,-0.1271,-0.12696,-0.127428,-0.128801,-0.12792,-0.128002,-0.127757,-0.129515,-0.127568,-0.128321,-0.128002,-0.12902,-0.12792,-0.12756,-0.128573,-0.128524,-0.127428,-0.128076,-0.129134,-0.127486,-0.12863,-0.128084,-0.128321,-0.128051,-0.126621,-0.127593,-0.128141,-0.127797,-0.126976,-0.127937,-0.127264,-0.128027,-0.127896,-0.127363,-0.126819,-0.128149,-0.128573,-0.127912,-0.127609,-0.127108,-0.126605,-0.128149,-0.12949,-0.128231,-0.127773,-0.12663,-0.128704,-0.128598,-0.127461,-0.128263,-0.127904,-0.128059,-0.128476,-0.126506,-0.127461,-0.128573,-0.126365,-0.127896,-0.128182,-0.129045,-0.126423,-0.127428


In [80]:
X_test.columns

  and should_run_async(code)


Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '236', '237', '238', '239', '240', '241', '242', '243', '244', '245'],
      dtype='object', length=246)

In [81]:
X.columns

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '236', '237', '238', '239', '240', '241', '242', '243', '244', '245'],
      dtype='object', length=246)

In [83]:
# X_test = X_test.to_numpy()
X_wide_te = wide_preprocessor.fit_transform(X_test)
X_tab_te = tab_preprocessor.fit_transform(X_test)

In [84]:
X_wide_te[:10]

  and should_run_async(code)


array([[     1,  68341, 128605, ..., 686901, 686903, 686905],
       [     2,  68342, 128606, ..., 686901, 686903, 686905],
       [     3,  68343, 128607, ..., 686901, 686903, 686905],
       ...,
       [     8,  68348, 128612, ..., 686901, 686903, 686905],
       [     9,  68349, 128613, ..., 686901, 686903, 686905],
       [    10,  68350, 128614, ..., 686901, 686903, 686905]])

In [85]:
X_tab_te[:10]

array([[ 1.        ,  1.        ,  1.        , ...,  0.55039265,
         0.01325731, -0.88405807],
       [ 2.        ,  2.        ,  2.        , ...,  0.4294142 ,
        -0.2434078 , -1.0068317 ],
       [ 3.        ,  3.        ,  1.        , ..., -0.35796436,
        -0.91644899,  0.85525809],
       ...,
       [ 8.        ,  8.        ,  2.        , ..., -0.20433381,
        -0.65738129, -0.80560633],
       [ 9.        ,  9.        ,  1.        , ..., -0.26027519,
        -0.68821454, -1.08294345],
       [10.        , 10.        ,  4.        , ..., -0.68056968,
        -0.74512913,  0.60969703]])

In [86]:
X_wide_te.shape

(493474, 136)

In [87]:
X_tab_te.shape

(493474, 121)

In [75]:
# X_test = pd.read_feather(datapath/'X_test_NaNcounts_SummaryStats_imputed-Median-wIndicators-StandardScaled.feather')

# low_card_features = [f for f in X_test.columns if X_test[f].nunique() <= 50000]
# high_card_features = [f for f in X_test.columns if X_test[f].nunique() > 50000]

# wide_cols_pre = [f for f in X_test.columns if X_test[f].nunique() <= max_card_cat and X_test[f].nunique() > 2]
# wide_cols_onehot = [f for f in X_test.columns if X_test[f].nunique() == 2]
# cont_cols = high_card_features
# embed_cols = [f for f in X_test.columns if X_test[f].nunique() <= max_card_embed and X_test[f].nunique() > 2]

In [78]:
# # X_test = X_test.to_numpy()
# X_wide_te = wide_preprocessor.transform(X_test)
# X_tab_te = tab_preprocessor.transform(X_test)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
# X_wide_te = wide_preprocessor

In [None]:
# preds = trainer.predict(X_wide=X_wide_te, X_tab=X_tab_te)

In [None]:
# preds[:20]

In [92]:
preds_proba = trainer.predict_proba(X_wide=X_wide_te, X_tab=X_tab_te, batch_size=1024)

  and should_run_async(code)
predict:  24%|██▍       | 116/482 [00:01<00:04, 89.41it/s] 


IndexError: index out of range in self

In [90]:
preds_proba_train = trainer.predict_proba(X_wide=X_wide, X_tab=X_tab)

predict: 100%|██████████| 936/936 [00:04<00:00, 193.74it/s]


In [62]:
X_wide_te[:10]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [58]:
preds_proba_train[:20]

array([[0.01788539, 0.98211461],
       [0.79904032, 0.20095971],
       [0.02058744, 0.97941256],
       [0.34795648, 0.65204352],
       [0.10245168, 0.89754832],
       [0.72765213, 0.27234787],
       [0.55998504, 0.44001493],
       [0.30380297, 0.69619703],
       [0.73843539, 0.26156464],
       [0.87854004, 0.12145996],
       [0.40143102, 0.59856898],
       [0.61878359, 0.38121644],
       [0.34835207, 0.65164793],
       [0.77924246, 0.22075753],
       [0.93200088, 0.06799912],
       [0.03524351, 0.96475649],
       [0.06104219, 0.93895781],
       [0.8391974 , 0.16080263],
       [0.5203411 , 0.4796589 ],
       [0.02382565, 0.97617435]])

In [59]:
preds_proba[:20]

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.]])

In [None]:
dump(preds_proba, datapath/'preds/widedeep_5epochs_bs1024_64x32tabmlp_20210930_probas.joblib')

## Weights and Biases Run Config

Below is the configuration for a Weights and Biases (`wandb`) run.

In [15]:
# wandb config:
config_run = {
    'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
    'tags': ['stacking-sklearn', 'attempt'],
    'notes': "Trying a fastai tabular MLP model, for ensembling with the GBMs",
}

# Training

In [None]:
learner.LR

In [23]:
def train(X_train, X_valid, y_train, y_valid, model_config, 
                                              random_state=42,
                                              exmodel_config=exmodel_config, 
                                              config_run=config_run):#, scaler): # passed in via config dict for now
    """
    Basic training function. Note that some of the options passed via the argument are
    in fact hard-coded in, to avoid inconveniences.
    :param X_train: the training set features
    :param X_valid: the validation set features
    :param y_train: the training set targets
    :param y_valid: the validation set targets
    :param random_staKFold: for reproducibility
    :param exmodel_config: dict containing configuration details including the library 
                            (thus model) used, preprocessing, and cross-validation
    :param model_config: dict containing hyperparameter specifications for the model
    :param config_run: dict containing wandb run configuration (name, etc)
    """
    
    # As of 20210920, best CatBoost config is:
    best_20210920_catboost_params = {
        'iterations': 3493,
        'depth': 5,
        'learning_rate': 0.09397459954141321,
        'random_strength': 43,
        'l2_leaf_reg': 26,
        'border_count': 239,
        'bagging_temperature': 12.532400413798356,
        'od_type': 'Iter'
    }
    
    # catboost 20210921 on colab (only 15 trials though)
    best_catboost_params = {
        'iterations': 3302,
        'depth': 5,
        'learning_rate': 0.017183208677599107,
        'random_strength': 41,
        'l2_leaf_reg': 30,
        'border_count': 251,
        'bagging_temperature': 9.898390369028036, 
        'od_type': 'IncToDec'
    }
    
    # optuna 20210921
    best_xgboost_params = {
        'n_estimators': 1119,
        'max_depth': 6,
        'learning_rate': 0.04123392555159452,
        'reg_alpha': 4.511876752318655,
        'reg_lambda': 4.074347238862406,
        'subsample': 0.8408586950521992
    }
    
    wandb.init(
        project="202109_Kaggle_tabular_playground",
        save_code=True,
        tags=config_run['tags'],
        name=config_run['name'],
        notes=config_run['notes'],
        config=exmodel_config)   
        
    if exmodel_config['library'] == 'xgboost':
        model = XGBClassifier(
            tree_method=model_config['tree_method'],
            random_state=random_state,
            n_jobs=model_config['n_jobs'], 
            verbosity=model_config['verbosity'], 
            objective=model_config['objective'],
            **best_xgboost_params
            # #             eval_metric=model_config['eval_metric'],

            # comment out the below for a fairly default model
#             booster=model_config['booster'],
#             max_depth=model_config['max_depth'],
#             learning_rate=model_config['learning_rate'], 
#             subsample=model_config['subsample'],
#             reg_alpha=model_config['reg_alpha'],
#             reg_lambda=model_config['reg_lambda'],
#             n_estimators=model_config['n_estimators'],
        )
        
        model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()],
#                                     eval_metric=model_config['eval_metric'],
                 )


    elif exmodel_config['library'] == 'lightgbm':
        model = LGBMClassifier(
#             boosting_type=model_config['boosting_type'],
#             max_depth=model_config['max_depth']
            # TODO
            random_state=random_state,
            n_jobs=model_config['n_jobs'],
            objective=model_config['objective'],
#             eval_metric=model_config['eval_metric'],
            boosting_type=model_config['boosting_type'],
            device_type=model_config['device_type'],
            
            # comment out the below for a basically default model
            n_estimators=model_config['n_estimators'],
            learning_rate=model_config['learning_rate'],
            max_depth=model_config['max_depth'],
            reg_alpha=model_config['reg_alpha'],
            reg_lambda=model_config['reg_lambda'],
            subsample=model_config['subsample'],
        )
        
        model.fit(X_train, y_train, callbacks=[wandb.lightgbm.wandb_callback()],
#                                     eval_metric=model_config['eval_metric'],
                 )
        
    elif exmodel_config['library'] == 'catboost':
        print("CatBoost, therefore no WandB callback.")
        model = CatBoostClassifier(
#             n_estimators=config['n_estimators'],
#             learning_rate=config['learning_rate'],
#             max_depth=config['max_depth'],
            task_type=model_config['task_type'],
    #         n_jobs=config['n_jobs'],
    #         verbosity=config['verbosity'],
    #         subsample=config['subsample'],
#             n_estimators=model_config['n_estimators'],
            random_state=random_state,
            # objective='Logloss', # default, accepts only one
#             custom_metrics=model_config['custom_metrics'],
    #         bootstrap_type=config['bootstrap_type'],
    #         device:config['device']
            **best_catboost_params
        ) 
        model.fit(X_train, y_train)
        
#     y_train_pred = model.predict(X_train)
    y_train_pred = model.predict_proba(X_train)[:,1]

    train_loss = log_loss(y_train, y_train_pred)
    train_auc = roc_auc_score(y_train, y_train_pred)
    wandb.log({'train_loss': train_loss, 'train_auc': train_auc})

    if exmodel_config['library'] == 'catboost':
        print(model.get_all_params())
        wandb.log(model.get_all_params())
    else:
        wandb.log(model.get_params()) # logging model parameters, trying bare-invocation rather than params: model.get_params()
    
    # trying with predict_proba
    y_pred = model.predict_proba(X_valid)[:,1]
#     y_pred = model.predict(X_valid)

    valid_loss = log_loss(y_valid, y_pred)
    valid_auc = roc_auc_score(y_valid, y_pred)
    wandb.log({'valid_loss':valid_loss, 'valid_auc':valid_auc})
    print(f"Valid log-loss is {valid_loss}\nValid AUC is {valid_auc}")   
#     wandb.finish()   
    return model
    

In [24]:
def cross_validation(model_config, X=X, y=y, start_fold=0, exmodel_config=exmodel_config, random_state=42):
    """
    Function to handle model training process in the context of cross-validation -- via hold-out or via k-fold.
    If exmodel_config['cross_val_strategy'] == None, then any kfolds= input is ignored; otherwise, the number specified is used.
    
    :param kfolds: int specifying number of k-folds to use in cross-validation
    :param exmodel_config: dict containing general config including for cross-validation -- `kfold=1` implies hold-out
    """
    if exmodel_config['kfolds'] == 1:
        print("Proceeding with holdout")
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                      test_size=exmodel_config['test_size'], 
                                                      random_state=random_state,
                                                     )
        model = train(X_train, X_valid, y_train, y_valid, exmodel_config=exmodel_config, 
                                                    model_config=model_config,
                                                    config_run=config_run)
        wandb.finish()
        
    else:
        X, y = X.to_numpy(), y.to_numpy()
        kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=True, random_state=random_state)
        models = {}
        model_path = Path(datapath/f"models/{config_run['name']}_{exmodel_config['kfolds']}folds/")
        (model_path).mkdir(exist_ok=True)
        for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
            if fold < start_fold:
                continue
            else:
                print(f"FOLD {fold}")
                print("---------------------------------------------------")
                X_train, X_valid = X[train_ids], X[valid_ids]
                y_train, y_valid = y[train_ids], y[valid_ids]
                model = train(X_train, X_valid, y_train, y_valid, exmodel_config=exmodel_config, 
                                                    model_config=model_config,
                                                    config_run=config_run)
                wandb.log({'fold': fold})
                models[fold] = model
                dump(model, Path(model_path/f"{exmodel_config['library']}_fold{fold}_model.joblib"))
                wandb.finish()
        return models
        

# Interface

## Runs

In [25]:
# library = 'xgboost'
# exmodel_config['library'] = library
# model_config = model_configurator(library)
# xgboost_models = cross_validation(model_config)

In [26]:
# for scaler in [StandardScaler, MaxAbsScaler, MinMaxScaler, RobustScaler]:
#     exmodel_config['scaler'] = scaler
#     scaler = scaler()
#     X_scaled = scaler.fit_transform(X)
#     X = pd.DataFrame(X_scaled, columns=X.columns)
#     exmodel_config['library'] = 'lightgbm'
#     model_config = model_configurator('lightgbm')
#     cross_validation(model_config)

In [27]:
# library = 'lightgbm'
# exmodel_config['library'] = library
# model_config = model_configurator(library)
# lightgbm_models = cross_validation(model_config)

# Stacking

## Via `sklearn.ensemble.StackingClassifier`

In [28]:
# xgboost_estimators = [(f'xgboost_fold{fold}', xgboost_models[fold]) for fold in range(5)]

In [29]:
# leaving this default for first try
# final_estimator = 

In [30]:
def stacker(estimators:dict, library:str, X=X, y=y): #, load_models:bool=False, load_path:Path=None):
    """
    A wrapper that will take a dict of the form {fold:int : model} and a string representing the library (for file-naming), 
    then run `sklearn.ensemble.StackingClassifier` with it, and save the stacked model afterward
    """
    estimators_list = [(f'{library}_fold{fold}', estimators[fold]) for fold in range(5)]
    blender = StackingClassifier(estimators=estimators_list,
                                 cv=5,
                                 stack_method='predict_proba',
                                 n_jobs=2,
                                 passthrough=False,
                                 verbose=1
                                )
    print(f"Starting fitting at {datetime.now().strftime('%Y%m%d_%H%M%S')}")
    blender.fit(X,y)
    print(f"Fitting complete at {datetime.now().strftime('%Y%m%d_%H%M%S')}")
    dump(blender, filename=datapath/f"models/{config_run['name']}_{exmodel_config['kfolds']}folds/{library}_stack.joblib")
    print(f"Blender model saved at {datetime.now().strftime('%Y%m%d_%H%M%S')}")
    return blender
    

In [31]:
# might encapsulate this in a new version of the above train function later
exmodel_config['ensemble'] = 'stacking'

wandb.init(
        project="202109_Kaggle_tabular_playground",
        save_code=True,
        tags=config_run['tags'],
        name=config_run['name'],
        notes=config_run['notes'],
        config=exmodel_config)   

random_state = exmodel_config['random_state'] # 42


# # optuna 20210921
# best_xgboost_params = {
#     'n_estimators': 1119,
#     'max_depth': 6,
#     'learning_rate': 0.04123392555159452,
#     'reg_alpha': 4.511876752318655,
#     'reg_lambda': 4.074347238862406,
#     'subsample': 0.8408586950521992
# }

# model_config = model_configurator('xgboost')
# xgboost_model = XGBClassifier(
#             tree_method=model_config['tree_method'],
#             random_state=random_state,
# #             n_jobs=model_config['n_jobs'], 
#             verbosity=model_config['verbosity'], 
#             objective=model_config['objective'],
#             **best_xgboost_params
#             # #             eval_metric=model_config['eval_metric'],

#             # comment out the below for a fairly default model
# #             booster=model_config['booster'],
# #             max_depth=model_config['max_depth'],
# #             learning_rate=model_config['learning_rate'], 
# #             subsample=model_config['subsample'],
# #             reg_alpha=model_config['reg_alpha'],
# #             reg_lambda=model_config['reg_lambda'],
# #             n_estimators=model_config['n_estimators'],
#         )

# model_config = model_configurator('lightgbm')
# lightgbm_model = LGBMClassifier(
#             random_state=random_state,
# #             n_jobs=model_config['n_jobs'],
#             objective=model_config['objective'],
#             boosting_type=model_config['boosting_type'],
#             device_type=model_config['device_type'],
            
#             # comment out the below for a basically default model
#             n_estimators=model_config['n_estimators'],
#             learning_rate=model_config['learning_rate'],
#             max_depth=model_config['max_depth'],
#             reg_alpha=model_config['reg_alpha'],
#             reg_lambda=model_config['reg_lambda'],
#             subsample=model_config['subsample'],
#         )

model_config = model_configurator('catboost', gpu_available=False) # set GPU false to avoid parallel threads blocking GPU

# # As of 20210920, best CatBoost config is:
# best_20210920_catboost_params = {
#     'iterations': 3493,
#     'depth': 5,
#     'learning_rate': 0.09397459954141321,
#     'random_strength': 43,
#     'l2_leaf_reg': 26,
#     'border_count': 239,
#     'bagging_temperature': 12.532400413798356,
#     'od_type': 'Iter'
# }

# catboost 20210921 on colab (only 15 trials though)
best_catboost_params = {
    'iterations': 3302,
    'depth': 5,
    'learning_rate': 0.017183208677599107,
    'random_strength': 41,
    'l2_leaf_reg': 30,
    'border_count': 251,
    'bagging_temperature': 9.898390369028036, 
    'od_type': 'IncToDec'
}
    

catboost_model = CatBoostClassifier(
            task_type=model_config['task_type'],
#             n_estimators=model_config['n_estimators'],
            random_state=random_state,
            
            **best_catboost_params
        ) 



estimators_list = [
#     ('xgboost', xgboost_model),
#     ('lightgbm', lightgbm_model),
    ('catboost', catboost_model)
]

# wandb.log({'estimators': estimators_list})

final_estimator = LogisticRegression(max_iter=1000)
exmodel_config['blender_final_estimator'] = str(final_estimator)
exmodel_config['blender-passthrough'] = False

blender = StackingClassifier(estimators=estimators_list,
                             final_estimator=final_estimator,
                             cv=5,
                             stack_method='predict_proba',
                             n_jobs=-1, # 4 is max allowable for CPU
                             passthrough=exmodel_config['blender-passthrough'],
                             verbose=1
                            )


           
    

[34m[1mwandb[0m: Currently logged in as: [33mhushifang[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.2 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [32]:
wandb.log({'blender-final_estimator': str(blender.final_estimator),
#            'blender-final_estimator_params': str(blender.final_estimator.get_params()),
           'blender-stack_mdethod': 'predict_proba',
           'blender-cv': 5
          })

In [None]:
print(f"Starting fitting at {datetime.now().strftime('%Y%m%d_%H%M%S')}")
blender.fit(X,y) # unsure of this -- given kwarg cv=5, is it producing the splits? Or do I have to somehow?
print(f"Fitting complete at {datetime.now().strftime('%Y%m%d_%H%M%S')}")

Starting fitting at 20210922_112912


In [None]:
# wandb.log({'xgboost_params':str(blender.estimators[0][1].get_params()),
#            'lightgbm_params':str(blender.estimators[1][1].get_params()),
# #            'catboost_params':str(blender.estimators[2][1].get_all_params()),
#           })

In [None]:
model_path = Path(datapath/f"models/{config_run['name']}/")
(model_path).mkdir(exist_ok=True)
dump(blender, filename=model_path/f"{config_run['name']}_stack.joblib")
print(f"Blender model saved at {datetime.now().strftime('%Y%m%d_%H%M%S')}")

In [None]:
train_preds = blender.predict_proba(X)[:,1]
train_loss = log_loss(y_pred=train_preds, y_true=y)
train_auc = roc_auc_score(y, train_preds)
wandb.log({'train_loss': train_loss, 'train_auc': train_auc})
print(f"train_loss is {train_loss}, train_auc is {train_auc}")

In [None]:
# train_preds[:20]

# Test set preprocessing


(Here's where encapsulating the transformations in a pipeline would come in handy. But I'll do it manually for now.)

In [None]:
# features = [x for x in test_df.columns if x != 'claim']
# X_test = test_df[features] # this is just for naming consistency

Now, let's get the features the model was trained on and subset the test set's features accordingly

In [None]:
# generation polynomial features
# poly = PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)
# X_test_poly = poly.fit_transform(X_test)

In [None]:
# X_test_poly_names = poly.get_feature_names(X_test.columns)
# X_poly_names[100:150]
# features = pd.read_csv('X_candidates_20210827.csv').columns

In [None]:
# checks = [feature in X_test_poly_names for feature in features]
# checks

In [None]:
# X_test_final = pd.DataFrame(X_test_poly, columns=X_test_poly_names)

In [None]:
# X_test_final = X_test_final[features[1:]]
# X_test_final = X_test

In [None]:
# X_test['nan_count'] = X_test.isnull().sum(axis=1)

In [None]:
# imputer = SimpleImputer(strategy='median', add_indicator=True)
# X_test_imputed_np = imputer.fit_transform(X_test)

In [None]:
# X_test_imputed = pd.DataFrame(X_test_imputed, columns=[str(x) for x in range(X_test_imputed.shape[1])])
# X_test_imputed.to_feather(path=datapath/'X_test_NaNcounts_imputed-Median-wIndicators.feather')

In [None]:
# scaler = exmodel_config['scaler']()
# X_test_imputed_scaled_np = scaler.fit_transform(X_test_imputed)
# X_test_imputed_scaled = pd.DataFrame(X_test_imputed_scaled_np, columns=X_test_imputed.columns)
# X_test_imputed_scaled.to_feather(path=datapath/'X_test_NaNcounts_imputed-Median-wIndicators_StandardScaled.feather')
# X_scaled_df = pd.DataFrame(X_scaled, columns=X_poly_names)

In [None]:
test_set_path = str(datapath/'X_test_NaNcounts_SummaryStats_imputed-Median-wIndicators-StandardScaled.feather')
wandb.log({'test_set': test_set_path})

In [None]:
# X_test_imputed_scaled = pd.read_feather(path=datapath/'X_test_NaNcounts_imputed-Median-wIndicators_StandardScaled.feather')
X_test_imputed_scaled = pd.read_feather(path=datapath/'X_test_NaNcounts_SummaryStats_imputed-Median-wIndicators-StandardScaled.feather')

## Prediction Generation

In [None]:
preds_path = Path(datapath/"preds/")

blender_preds = blender.predict_proba(X_test_imputed_scaled)[:,1]
dump(blender_preds, preds_path/f"{config_run['name']}_stack.joblib")

# Submission

In [None]:
sample_df = pd.read_csv(datapath/'sample_solution.csv')

In [None]:
sample_df.loc[:, 'claim'] = blender_preds

In [None]:
sample_df.head()

In [None]:
submission_path = datapath/'submissions'
submission_path.mkdir(exist_ok=True)

In [55]:
sample_df.to_csv(submission_path/f"{config_run['name']}_blended.csv", index=False)

In [59]:
# str(blender.estimators[2][1].get_all_params())
# blender.estimators[2][1]

<catboost.core.CatBoostClassifier at 0x7f227c7b81c0>

In [56]:
wandb.log({'leaderboard_auc': ,
           'catboost_params': str(best_catboost_params),
          })

In [57]:
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
blender-final_estimator,LogisticRegression(m...
blender-stack_mdethod,predict_proba
blender-cv,5
_runtime,16221
_timestamp,1632331738
_step,4
xgboost_params,{'objective': 'binar...
lightgbm_params,{'boosting_type': 'g...
train_loss,0.48961
train_auc,0.84839


0,1
blender-cv,▁
_runtime,▁████
_timestamp,▁████
_step,▁▃▅▆█
train_loss,▁
train_auc,▁
leaderboard_auc,▁


## Manual Stacking

In [73]:
X.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,227,228,229,230,231,232,233,234,235,236
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.425545,-2.357891,-0.637206,-0.866657,-0.111568,-4.829243,-1.171229,-0.603397,-0.596871,-0.516828,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
1,0.2476,-0.323982,1.223569,0.361863,1.071182,-0.36114,0.082051,-0.74659,0.899454,0.469668,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
2,2.032371,-2.43568,-0.48896,0.341193,1.069656,0.118532,0.537069,-0.044075,-0.763516,1.056879,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
3,1.438373,-2.337605,-0.508914,-0.829607,1.485682,3.592008,-1.189087,-0.339152,-0.735281,-0.529158,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703
4,0.602333,1.076218,-0.648438,0.463365,0.275053,-0.157989,0.727338,-0.905498,0.052478,-0.511066,...,-0.128368,-0.127677,-0.128242,-0.127867,-0.127119,-0.127985,-0.128494,-0.12862,7.821398,-0.12703


In [119]:
X1 = X.copy()

In [120]:
X1.shape

(957919, 237)

In [121]:
# generate probability predictions for the XGBoost model's folds
for fold in xgboost_models.keys():
#     X1[f"xgboost_fold{fold}_pred"] = xgboost_models[fold].predict(X)
    X1[f"xgboost_fold{fold}_pred"] = xgboost_models[fold].predict_proba(X)[:,1]
#     xgboost_preds[fold] = xgboost_models[fold].predict(X_test_imputed_scaled)



In [122]:
X1.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,232,233,234,235,236,xgboost_fold0_pred,xgboost_fold1_pred,xgboost_fold2_pred,xgboost_fold3_pred,xgboost_fold4_pred
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.425545,-2.357891,-0.637206,-0.866657,-0.111568,-4.829243,-1.171229,-0.603397,-0.596871,-0.516828,...,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703,0.582566,0.58095,0.576743,0.569523,0.595877
1,0.2476,-0.323982,1.223569,0.361863,1.071182,-0.36114,0.082051,-0.74659,0.899454,0.469668,...,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703,0.152252,0.150803,0.148316,0.155218,0.147297
2,2.032371,-2.43568,-0.48896,0.341193,1.069656,0.118532,0.537069,-0.044075,-0.763516,1.056879,...,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703,0.794083,0.789945,0.788326,0.787177,0.797979
3,1.438373,-2.337605,-0.508914,-0.829607,1.485682,3.592008,-1.189087,-0.339152,-0.735281,-0.529158,...,-0.127985,-0.128494,-0.12862,-0.127854,-0.12703,0.774001,0.76851,0.774555,0.782187,0.773245
4,0.602333,1.076218,-0.648438,0.463365,0.275053,-0.157989,0.727338,-0.905498,0.052478,-0.511066,...,-0.127985,-0.128494,-0.12862,7.821398,-0.12703,0.759366,0.755764,0.763769,0.758034,0.758038
