# Baseline
Setting up a more robust baseline notebook, suitable for use with all of the "Big Three" (XGBoost, CatBoost, LightGBM) libraries and on either Google Colab or the local machine.

# Setup

In [1]:
# two manual flags (ex-config)
COLAB = False
USE_GPU = True
# libraries = ['xgboost', 'lightgbm', 'catboost']
libraries = ['xgboost', 'lightgbm', 'catboost']

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime
import random

In [3]:
%matplotlib inline
%config Completer.use_jedi = False
os.environ['WANDB_NOTEBOOK_NAME'] = f"stacking_manual_{datetime.now().strftime('%Y%m%d')}.ipynb"

In [4]:
# handle Google Colab-specific library installation/updating
if COLAB:
    # much of the below inspired by or cribbed from the May 2021 Kaggle Tabular Playground winner, at 
    # https://colab.research.google.com/gist/academicsuspect/0aac7bd6e506f5f70295bfc9a3dc2250/tabular-may-baseline.ipynb?authuser=1#scrollTo=LJoVKJb5wN0L
    
    # Kaggle API for downloading the datasets
#     !pip install --upgrade -q kaggle

    # weights and biases
    !pip install -qqqU wandb
    
    # Optuna for parameter search
    !pip install -q optuna

    # upgrade sklearn
    !pip install --upgrade scikit-learn

#     !pip install category_encoders
    
    if 'catboost' in libraries:
        !pip install catboost
    
    if 'xgboost' in libraries:
        if USE_GPU: 
            # this part is from https://github.com/rapidsai/gputreeshap/issues/24
            !pip install cmake --upgrade
            # !pip install sklearn --upgrade
            !git clone --recursive https://github.com/dmlc/xgboost
            %cd /content/xgboost
            !mkdir build
            %cd build
            !cmake .. -DUSE_CUDA=ON
            !make -j4
            %cd /content/xgboost/python-package
            !python setup.py install --use-cuda --use-nccl
            !/opt/bin/nvidia-smi
            !pip install shap
        else:
            !pip install --upgrade xgboost
    if 'lightgbm' in libraries:
        if USE_GPU:
            # lighgbm gpu compatible
            !git clone --recursive https://github.com/Microsoft/LightGBM
            ! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;
        else:
            !pip install --upgrade lightgbm
        

        

Now, non-stdlib imports

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
from sklearn.impute import SimpleImputer #, KNNImputer
# import timm

import seaborn as sns

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.preprocessing import StandardScaler #, MinMaxScaler, MaxAbsScaler, RobustScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
# from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft


Now, datapath setup

In [6]:
# # This is the code for reading the train.csv and converting it to a .feather file
# df = pd.read_csv(datapath/'train.csv', index_col='id', low_memory=False)
# df.index.name = None
# df.to_feather(path='./dataset_df.feather')

In [7]:
if COLAB:
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    
    # handling datapath
    datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/oct2021/')
    
else:
    # if on local machine
#     datapath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/')  
    root = Path('/home/sf/code/kaggle/tabular_playgrounds/oct2021/')
    datapath = root/'datasets'
    edapath = root/'EDA'
    modelpath = root/'models'
    predpath = root/'preds'
    subpath = root/'submissions'
    
    for pth in [root, datapath, edapath, modelpath, predpath, subpath]:
        pth.mkdir(exist_ok=True)
    


In [8]:
# Function to seed everything
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(seed=42)

## Ex-Model Config

In [9]:
# meta-config for preprocessing and cross-validation, but NOT for model parameters
exmodel_config = {
#     "feature_selector": SelectKBest,
#     "k_best": 80,
#     "feature_selection_scoring": f_regression,
#     'random_state': SEED,
#     'feature_generation': ['NaN_counts', 'SummaryStats', 'NaN_OneHots'],
#     'subsample': 1,
    'cross_val_strategy': KFold, # None for holdout, or the relevant sklearn class
    'kfolds': 5, # if 1, that means just doing holdout
    'test_size': 0.2,
#     'features_created': False,
#     'feature_creator': None,
}

## Data Setup

**TODO** Write some conditional logic here to automate it -- possibly as part of a sklearn.*pipeline

In [10]:
# if exmodel_config['scaler']:
#     scaler = exmodel_config['scaler']()
#     scaler.fit_transform()

In [23]:
train_source = datapath/'train.feather'
df = pd.read_feather(path=train_source)
df.index.name = 'id'
y_train = df.target
features = [x for x in df.columns if x != 'target']
X_train = df[features]
# X.index.name = 'id'
# y.index.name = 'id'
X = np.array(X_train)
y = np.array(y_train)

# del df, X_train, y_train


# exmodel_config['feature_count'] = len(X.columns)
exmodel_config['feature_count'] = X.shape[1]
exmodel_config['instance_count'] = X.shape[0]

# exmodel_config['feature_generator'] = None
# exmodel_config['feature_generator'] = "Summary statistics"

exmodel_config['train_source'] = str(train_source)

In [31]:
test_source = datapath/'test.feather'
exmodel_config['test_source'] = str(test_source)
X_test = pd.read_feather(path=test_source)
# X_test = X_test.iloc[:, 1:]

In [33]:
X_test = np.array(X_test)

## Weights and Biases Run Config

Below is the configuration for a Weights and Biases (`wandb`) run.

In [14]:
# wandb config:
wandb_config = {
    'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
    'tags': ['stacking-manual', 'baseline'],
    'notes': "Doing per-architecture baseline runs on 5-fold models with equal-weight for folds using default hyperparams",
}

# Training

# Hyperparameters

In [15]:
#  # optuna 20210924
# best_xgboost_params = {
#     'n_estimators': 4205,
#     'max_depth': 9,
#     'learning_rate': 0.002953166980699093,
#     'reg_alpha': 4.496528786028185,
#     'reg_lambda': 0.07084201124334108,
#     'subsample': 0.611948848824097,
#     'min_child_weight': 0.8353153853897145,
#     'colsample_bytree': 0.8562542843394833,
#     'gamma': 11.710075953347133
# }

# # best as of 20210923, for valid_auc of 0.8156
# best_lightgbm_params = {
#     'n_estimators': 4429,
#     'max_depth': 3,
#     'learning_rate': 0.02952568423554658,
#     'reg_alpha': 9.285152197932742,
#     'reg_lambda': 7.784744078293992,
#     'subsample': 0.8628511640697093,
#     'boosting_type': 'gbdt',
#     'min_child_samples': 47,
#     'num_leaves': 57,
#     'colsample_bytree': 0.573325963741589
# }

# # catboost 20210921-25 on colab (only 15 trials though)
# best_catboost_params = {
#     'iterations': 3302,
#     'depth': 5,
#     'learning_rate': 0.017183208677599107,
#     'random_strength': 41,
#     'l2_leaf_reg': 30,
#     'border_count': 251,
#     'bagging_temperature': 9.898390369028036, 
#     'od_type': 'IncToDec'
# }

In [18]:
X.shape

(1000000, 285)

In [19]:
X_test.shape

(500000, 286)

In [20]:
X_test[0,-1]

0.0

In [47]:
def cross_validate_model(library:str, params:dict={}, X=X, y=y, X_test=X_test, start_fold=0, 
                         exmodel_config=exmodel_config, wandb_config=wandb_config, 
                         random_state=42, wandb_tracked=True):
    """
    Function to handle model training process in the context of cross-validation -- via hold-out or via k-fold.
    If exmodel_config['cross_val_strategy'] == None, then any kfolds= input is ignored; otherwise, the number specified is used.
    
    :param kfolds: int specifying number of k-folds to use in cross-validation
    :param exmodel_config: dict containing general config including for cross-validation -- `kfold=1` implies hold-out
    """
#     if exmodel_config['kfolds'] == 1:
#         print("Proceeding with holdout")
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                          test_size=0.2, 
                                                          random_state=random_state)                 
    
    # prepare for k-fold cross-validation
    kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=True, random_state=random_state)
    
    if wandb_tracked:
        exmodel_config['library'] = library
        exmodel_config[f'{library}_params'] = str(params)
        wandb.init(
            project="202110_Kaggle_tabular_playground",
            save_code=True,
            tags=wandb_config['tags'],
            name=wandb_config['name'],
            notes=wandb_config['notes'],
            config=exmodel_config
    )   
    
    # setup for serialization
    runpath = Path(modelpath/"{wandb_config['name']}_{library}_{exmodel_config['kfolds']}folds/")
    (runpath).mkdir(exist_ok=True)
    
    
    # initialize lists for out-of-fold preds and ground truth
    oof_preds, oof_y = [], []
    
    # initialize a numpy.ndarray containing the fold-model's preds for test set
    test_preds = np.zeros((X_test.shape[0]))
    
    for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
        if fold < start_fold: # skip folds that are already trained
            continue
        else:
            print(f"FOLD {fold}")
            print("---------------------------------------------------")
            X_train, X_valid = X[train_ids], X[valid_ids]
            y_train, y_valid = y[train_ids], y[valid_ids]
    
        # define models
        if library == 'xgboost':
            model = XGBClassifier(
#                 booster='gbtree',
                random_state=random_state,
                n_jobs=-1, 
                verbosity=1, 
                objective='binary:logistic',
                **params)
            if wandb_tracked:
                model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()])
            else:
                model.fit(X_train, y_train)


        elif library == 'lightgbm':
            model = LGBMClassifier(
                objective='binary',
                random_state=random_state,
                n_jobs=-1,
                eval_metric='auc',
#                 device_type='gpu',
                **params)
            if wandb_tracked:
                model.fit(X_train, y_train, callbacks=[wandb.lightgbm.wandb_callback()],)
            else:
                model.fit(X_train, y_train)

            
        elif library == 'catboost':
            model = CatBoostClassifier(
#                 task_type='GPU',
                random_state=random_state,
                **params) 
        
            model.fit(X_train, y_train)
        
        # take the training set predictions, if desired
#         y_train_pred = model.predict_proba(X_train)[:,1]
#         train_loss = log_loss(y_train, y_train_pred)
#         train_auc = roc_auc_score(y_train, y_train_pred)
#         wandb.log({'train_loss': train_loss, 'train_auc': train_auc})

        # log the parameters, if desired
#         if exmodel_config['library'] == 'catboost':
#             print(model.get_all_params())
#             wandb.log(model.get_all_params())
#         else:
#             wandb.log(model.get_params()) # logging model parameters, trying bare-invocation rather than params: model.get_params()

        y_valid_preds = model.predict_proba(X_valid)[:,1]
        
        # add the fold-model's OOF preds and ground truths to the out-of-loop lists
        oof_preds.extend(y_valid_preds)
        oof_y.extend(y_valid)
        
        # add the fold's predictions to the model's test-set predictions (will divide later)
        test_preds += model.predict_proba(X_test)[:,1]

#         valid_loss = log_loss(y_valid, y_pred)
        # give the valid AUC score, for edification
        fold_valid_auc = roc_auc_score(y_valid, y_valid_preds)
        print(f"Valid AUC for fold {fold} is {fold_valid_auc}")   
        dump(model, Path(runpath/f"{exmodel_config['library']}_fold{fold}_rs{random_state}_model.joblib"))

    model_valid_auc = roc_auc_score(oof_y, oof_preds)
    print(f"Valid AUC score for {library} model is {model_valid_auc}")
    
    # finalize test preds
    test_preds /= exmodel_config['kfolds']
    
    # save OOF preds and test-set preds
    dump(oof_preds, Path(predpath/f"{wandb_config['name']}_{library}_{exmodel_config['kfolds']}folds_rs{random_state}_oof_preds.joblib"))
    dump(test_preds, Path(predpath/f"{wandb_config['name']}_{library}_{exmodel_config['kfolds']}folds_rs{random_state}_test_preds.joblib"))

    if wandb_tracked:
        wandb.log({'model_valid_auc': model_valid_auc,
                   'oof_preds': oof_preds,
                   'test_preds': test_preds,
                  })
        wandb.finish()
    return oof_preds, test_preds #/ exmodel_config['kfolds']
        

In [36]:
_, lightgbm_preds = cross_validate_model(library='lightgbm')

VBox(children=(Label(value=' 2.31MB of 2.31MB uploaded (0.46MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

[34m[1mwandb[0m: wandb version 0.12.3 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8496537118591476
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8484784905755038
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8495964071389925
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8491138775676337
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8491283147870375
Valid AUC score for lightgbm model is 0.8491902735111283




VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
model_valid_auc,0.84919


0,1
model_valid_auc,▁


In [48]:
_, catboost_preds = cross_validate_model(library='catboost')

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

[34m[1mwandb[0m: wandb version 0.12.3 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


FOLD 0
---------------------------------------------------


Custom logger is already specified. Specify more than one logger at same time is not thread safe.

Learning rate set to 0.178877
0:	learn: 0.6268947	total: 199ms	remaining: 3m 19s
1:	learn: 0.5860842	total: 340ms	remaining: 2m 49s
2:	learn: 0.5588807	total: 475ms	remaining: 2m 37s
3:	learn: 0.5426063	total: 593ms	remaining: 2m 27s
4:	learn: 0.5307459	total: 714ms	remaining: 2m 22s
5:	learn: 0.5233528	total: 845ms	remaining: 2m 19s
6:	learn: 0.5175378	total: 985ms	remaining: 2m 19s
7:	learn: 0.5135472	total: 1.11s	remaining: 2m 17s
8:	learn: 0.5101456	total: 1.24s	remaining: 2m 16s
9:	learn: 0.5074840	total: 1.36s	remaining: 2m 14s
10:	learn: 0.5056426	total: 1.5s	remaining: 2m 15s
11:	learn: 0.5040335	total: 1.63s	remaining: 2m 14s
12:	learn: 0.5025528	total: 1.76s	remaining: 2m 13s
13:	learn: 0.5013587	total: 1.87s	remaining: 2m 11s
14:	learn: 0.5003271	total: 2s	remaining: 2m 11s
15:	learn: 0.4991698	total: 2.13s	remaining: 2m 10s
16:	learn: 0.4982659	total: 2.25s	remaining: 2m 10s
17:	learn: 0.4974238	total: 2.37s	remaining: 2m 9s
18:	learn: 0.4964738	total: 2.5s	remaining: 2m 8s



VBox(children=(Label(value=' 0.80MB of 0.80MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
model_valid_auc,0.85496


0,1
model_valid_auc,▁


In [52]:
_, xgboost_preds = cross_validate_model(library='xgboost')

[34m[1mwandb[0m: wandb version 0.12.3 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


FOLD 0
---------------------------------------------------




AttributeError: /home/sf/anaconda3/envs/tabular-gpu/lib/libxgboost.so: undefined symbol: XGDMatrixCreateFromDense

# Submission

In [42]:
sample_df = pd.read_csv(datapath/'sample_submission.csv.zip')

In [None]:
sample_df.loc[:, 'target'] = xgboost_preds

In [50]:
sample_df.head()

Unnamed: 0,id,target
0,1000000,0.783878
1,1000001,0.250317
2,1000002,0.911016
3,1000003,0.884747
4,1000004,0.270889


In [None]:
# submission_path = datapath/'submissions'
# submission_path.mkdir(exist_ok=True)

In [51]:
sample_df.to_csv(subpath/f"{wandb_config['name']}_catboost_{exmodel_config['kfolds']}folds_rs{42}_baseline_preds.csv", index=False)

In [59]:
# str(blender.estimators[2][1].get_all_params())
# blender.estimators[2][1]

<catboost.core.CatBoostClassifier at 0x7f227c7b81c0>

In [56]:
wandb.log({'leaderboard_auc': 0.81725,
#            'catboost_params': str(best_catboost_params),
          })

In [57]:
wandb.finish()

VBox(children=(Label(value=' 0.23MB of 0.23MB uploaded (0.06MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
blender-cv,5
blender-final_estimator,LogisticRegression(m...
blender-stack_mdethod,predict_proba
leaderboard_auc,0.81725
lightgbm_params,{'boosting_type': 'g...
test_set,/media/sf/easystore/...
train_auc,0.8414
train_loss,0.494
xgboost_params,{'objective': 'binar...


0,1
blender-cv,▁
leaderboard_auc,▁
train_auc,▁
train_loss,▁


In [16]:
# oof_y_pd = pd.Series(oof_y)

# Predictions

## Level One

In [17]:
# oof_xgb2, test_xgb2 = cross_validate_model(library='xgboost', X=X, y=y, X_test=X_test, 
#                                                  wandb_config=wandb_config,
#                                                  random_state=1983,
#                                                  params=best_xgboost_params,
#                                                  exmodel_config=exmodel_config, 
#                                                  wandb_tracked=True
#                                                 )
# oof_lv1['xgboost_2'] = oof_xgb2
# test_lv1['xgboost_2'] = test_xgb2

In [18]:
# oof_lv1, test_lv1 = pd.DataFrame(), pd.DataFrame()
# oof_xgb, test_xgb = cross_validate_model(library='xgboost', X=X, y=y, X_test=X_test, 
#                                          wandb_config=wandb_config,
#                                          random_state=42,
#                                          params=best_xgboost_params,
#                                          exmodel_config=exmodel_config, 
#                                          wandb_tracked=True
#                                         )
# oof_lv1['xgboost'] = oof_xgb
# test_lv1['xgboost'] = test_xgb

In [19]:
# oof_cat1, test_cat1 = cross_validate_model(library='catboost', X=X, y=y, X_test=X_test, 
#                                          wandb_config=wandb_config,
#                                          random_state=42,
#                                          params=best_catboost_params,
#                                          exmodel_config=exmodel_config, 
#                                          wandb_tracked=True
#                                         )
# oof_lv1['catboost_1'] = oof_cat1
# test_lv1['catboost_1'] = test_cat1

In [20]:
# oof_cat2, test_cat2 = cross_validate_model(library='catboost', X=X, y=y, X_test=X_test, 
#                                                  wandb_config=wandb_config,
#                                                  random_state=1983,
#                                                  params=best_catboost_params,
#                                                  exmodel_config=exmodel_config, 
#                                                  wandb_tracked=True
#                                                 )
# oof_lv1['catboost_2'] = oof_cat2
# test_lv1['catboost_2'] = test_cat2

In [21]:
# oof_lgb2, test_lgb2 = cross_validate_model(library='lightgbm', X=X, y=y, X_test=X_test, 
#                                                  wandb_config=wandb_config,
#                                                  random_state=1983,
#                                                  params=best_lightgbm_params,
#                                                  exmodel_config=exmodel_config, 
#                                                  wandb_tracked=True
#                                                 )
# oof_lv1['lightgbm_2'] = oof_lgb2
# test_lv1['lightgbm_2'] = test_lgb2

In [22]:
# oof_lgb1, test_lgb1 = cross_validate_model(library='lightgbm', X=X, y=y, X_test=X_test, 
#                                          wandb_config=wandb_config,
#                                          random_state=42,
#                                          params=best_lightgbm_params,
#                                          exmodel_config=exmodel_config, 
#                                          wandb_tracked=True
#                                         )
# oof_lv1['lightgbm_1'] = oof_lgb1
# test_lv1['lightgbm)1'] = test_lgb1

In [23]:
# oof_lv1, test_lv1 = pd.DataFrame(), pd.DataFrame()
# preds_path = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/preds/')

In [24]:
# oof_lv1['xgboost_42'] = load(preds_path/'validAUC_0.8146662938898428_stacking_manual_20210925_212129_xgboost_5folds_rs42_oof_preds.joblib')
# test_lv1['xgboost_42'] = load(preds_path/'stacking_manual_20210925_212129_xgboost_5folds_rs42_test_preds.joblib')

In [25]:
# oof_lv1['xgboost_1983'] = load(preds_path/'validAUC_0.8146252172737458_stacking_manual_20210926_211701_xgboost_5folds_rs1983_oof_preds.joblib')
# test_lv1['xgboost_1983'] = load(preds_path/'stacking_manual_20210926_211701_xgboost_5folds_rs1983_test_preds.joblib')

In [26]:
# oof_lv1['lightgbm_42'] = load(preds_path/'validAUC_0.8156810521798477_stacking_manual_20210925_212129_lightgbm_5folds_rs42_oof_preds.joblib')
# test_lv1['lightgbm_42'] = load(preds_path/'stacking_manual_20210925_212129_lightgbm_5folds_rs42_test_preds.joblib')

In [27]:
# oof_lv1['lightgbm_1983'] = load(preds_path/'validAUC_0.8156503194185875_stacking_manual_20210925_212129_lightgbm_5folds_rs1983_oof_preds.joblib')
# test_lv1['lightgbm_1983'] = load(preds_path/'stacking_manual_20210925_212129_lightgbm_5folds_rs1983_test_preds.joblib')

In [28]:
# oof_lv1['catboost_42'] = load(preds_path/'validAUC_0.8116727090290558_stacking_manual_20210925_212129_catboost_5folds_rs42_oof_preds.joblib')
# test_lv1['catboost_42'] = load(preds_path/'stacking_manual_20210925_212129_catboost_5folds_rs42_test_preds.joblib')

In [29]:
# oof_lv1['catboost_1983'] = load(preds_path/'validAUC_0.8116312032218072_stacking_manual_20210925_212129_catboost_5folds_rs1983_oof_preds.joblib')
# test_lv1['catboost_1983'] = load(preds_path/'stacking_manual_20210925_212129_catboost_5folds_rs1983_test_preds.joblib')

In [30]:
# oof_lv1.iloc[:20, :]

In [31]:
# oof_y_pd.iloc[:20]

- Why is it that the random seed seems far more important than the model type in making a prediction???

In [32]:
# oof_lv1.to_csv('oof_lv1.csv', index=False)
# test_lv1.to_csv('test_lv1.csv', index=False)

In [33]:
# oof_lv1.to_feather(preds_path/'oof_lv1.feather')
# test_lv1.to_feather(preds_path/'test_lv1.feather')

In [34]:
# oof_lv1.head()

NameError: name 'oof_lv1' is not defined

In [35]:
oof_lv1 = pd.read_feather(preds_path/'oof_lv1.feather')#, columns=[str(x) for x in range()])
test_lv1 = pd.read_feather(preds_path/'test_lv1.feather')
oof_y = load(preds_path/'oof_y_pd-series.joblib')

# oof_lv1.index.name = 'id'
# test_lv1.index.name = 'id'
# oof_y.index.name = 'id'
oof_lv1 = np.array(oof_lv1)
test_lv1 = np.array(test_lv1)
oof_y = np.array(oof_y)

## Level Two

In [36]:

oof_lv2_preds, test_lv2_preds = cross_validate_model(library='xgboost', X=oof_lv1, y=oof_y, X_test=test_lv1, 
                                         wandb_config=wandb_config,
                                         random_state=42,
                                         params=best_xgboost_params,
                                         exmodel_config=exmodel_config, 
                                         wandb_tracked=True
                                        )

[34m[1mwandb[0m: Currently logged in as: [33mhushifang[0m (use `wandb login --relogin` to force relogin)


FOLD 0
---------------------------------------------------








Valid AUC for fold 0 is 0.8161896906852846
FOLD 1
---------------------------------------------------








Valid AUC for fold 1 is 0.8158227520546544
FOLD 2
---------------------------------------------------








Valid AUC for fold 2 is 0.8149862496669764
FOLD 3
---------------------------------------------------








Valid AUC for fold 3 is 0.8156043438710472
FOLD 4
---------------------------------------------------








Valid AUC for fold 4 is 0.8155944249661089
Valid AUC score for xgboost model is 0.8156007376847357




VBox(children=(Label(value=' 0.10MB of 0.10MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
model_valid_auc,0.8156


0,1
model_valid_auc,▁


In [37]:
dump(oof_lv2_preds, preds_path/'oof_lv2_xgboost_preds.joblib')
dump(test_lv2_preds, preds_path/'test_lv2_xgboost_preds.joblib')

['/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/preds/test_lv2_preds.joblib']

In [48]:
oof_xgb_f0_rs1983 = load('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/models/stacking_manual_20210926_211701_xgboost_5folds/xgboost_fold0_model.joblib')
oof_xgb_f0_rs42 = load('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/models/stacking_manual_20210925_212129_xgboost_5folds/xgboost_fold0_model.joblib')

In [None]:

oof_lv2_cat_preds, test_lv2_cat_preds = cross_validate_model(library='catboost', X=oof_lv1, y=oof_y, X_test=test_lv1, 
                                         wandb_config=wandb_config,
                                         random_state=42,
                                         params=best_catboost_params,
                                         exmodel_config=exmodel_config, 
                                         wandb_tracked=True
                                        )

FOLD 0
---------------------------------------------------
0:	learn: 0.6865535	total: 7.78ms	remaining: 25.7s
1:	learn: 0.6802746	total: 14.5ms	remaining: 23.9s
2:	learn: 0.6744054	total: 21.8ms	remaining: 24s
3:	learn: 0.6684288	total: 28.3ms	remaining: 23.3s
4:	learn: 0.6626083	total: 35.5ms	remaining: 23.4s
5:	learn: 0.6571100	total: 42ms	remaining: 23.1s
6:	learn: 0.6519722	total: 48.4ms	remaining: 22.8s
7:	learn: 0.6470768	total: 55.5ms	remaining: 22.9s
8:	learn: 0.6421232	total: 62.8ms	remaining: 23s
9:	learn: 0.6375136	total: 71ms	remaining: 23.4s
10:	learn: 0.6328126	total: 77.3ms	remaining: 23.1s
11:	learn: 0.6287030	total: 83.6ms	remaining: 22.9s
12:	learn: 0.6261433	total: 90.1ms	remaining: 22.8s
13:	learn: 0.6227046	total: 96.4ms	remaining: 22.6s
14:	learn: 0.6192884	total: 103ms	remaining: 22.5s
15:	learn: 0.6154601	total: 109ms	remaining: 22.3s
16:	learn: 0.6124289	total: 115ms	remaining: 22.2s
17:	learn: 0.6086097	total: 121ms	remaining: 22.1s
18:	learn: 0.6053399	total:

In [None]:
dump(oof_lv2_cat_preds, preds_path/'oof_lv2_catboost_preds.joblib')
dump(test_lv2_cat_preds, preds_path/'test_lv2_catboost_preds.joblib')

In [57]:

oof_lv2_lgb_preds, test_lv2_lgb_preds = cross_validate_model(library='lightgbm', X=oof_lv1, y=oof_y, X_test=test_lv1, 
                                         wandb_config=wandb_config,
                                         random_state=42,
                                         params=best_lightgbm_params,
                                         exmodel_config=exmodel_config, 
                                         wandb_tracked=True
                                        )

FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8153502256098097
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8151090643209462
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8143548186372677
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8147151176211568
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8150620785589353
Valid AUC score for lightgbm model is 0.8149022673864115




VBox(children=(Label(value=' 0.48MB of 0.48MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
model_valid_auc,0.8149


0,1
model_valid_auc,▁


In [58]:
dump(oof_lv2_lgb_preds, preds_path/'oof_lv2_lightgbm_preds.joblib')
dump(test_lv2_lgb_preds, preds_path/'test_lv2_lightgbm_preds.joblib')

['/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/preds/test_lv2_lightgbm_preds.joblib']

In [59]:
oof_lv2, test_lv2 = pd.DataFrame(), pd.DataFrame()
oof_lv2['xgboost'] = oof_lv2_preds
oof_lv2['catboost'] = oof_lv2_cat_preds
oof_lv2['lightgbm'] = oof_lv2_lgb_preds

test_lv2['xgboost'] = test_lv2_preds
test_lv2['catboost'] = test_lv2_cat_preds
test_lv2['lightgbm'] = test_lv2_lgb_preds

In [60]:
oof_lv2.head()

Unnamed: 0,xgboost,catboost,lightgbm
0,0.598283,0.592973,0.613436
1,0.757375,0.754756,0.744127
2,0.144481,0.141614,0.140979
3,0.637471,0.637243,0.64084
4,0.147584,0.146836,0.145494


In [65]:
oof_lv1_df = pd.read_feather(preds_path/'oof_lv1.feather')

In [66]:
oof_lv1_df.head()

Unnamed: 0,xgboost_42,xgboost_1983,lightgbm_42,lightgbm_1983,catboost_42,catboost_1983
0,0.593499,0.761973,0.602013,0.77048,0.607259,0.733238
1,0.624203,0.75478,0.648943,0.776359,0.624454,0.767116
2,0.767918,0.77565,0.770889,0.787858,0.75281,0.744239
3,0.736804,0.126186,0.734552,0.107402,0.72138,0.118478
4,0.090639,0.139272,0.079626,0.14085,0.098337,0.133733


In [67]:
oof_lv2_full = oof_lv2.join(oof_lv1_df)

In [69]:
test_lv2_full = test_lv2.join(pd.read_feather(preds_path/'test_lv1.feather'))

In [68]:
oof_lv2_full.head()

Unnamed: 0,xgboost,catboost,lightgbm,xgboost_42,xgboost_1983,lightgbm_42,lightgbm_1983,catboost_42,catboost_1983
0,0.598283,0.592973,0.613436,0.593499,0.761973,0.602013,0.77048,0.607259,0.733238
1,0.757375,0.754756,0.744127,0.624203,0.75478,0.648943,0.776359,0.624454,0.767116
2,0.144481,0.141614,0.140979,0.767918,0.77565,0.770889,0.787858,0.75281,0.744239
3,0.637471,0.637243,0.64084,0.736804,0.126186,0.734552,0.107402,0.72138,0.118478
4,0.147584,0.146836,0.145494,0.090639,0.139272,0.079626,0.14085,0.098337,0.133733


In [70]:
test_lv2_full.head()

Unnamed: 0,xgboost,catboost,lightgbm,xgboost_42,xgboost_1983,lightgbm_42,lightgbm_1983,catboost_42,catboost_1983
0,0.594096,0.586394,0.590837,0.6024,0.6005,0.596474,0.597061,0.613762,0.609026
1,0.122359,0.122358,0.128885,0.119548,0.118213,0.12359,0.12735,0.122456,0.122698
2,0.643096,0.637357,0.654246,0.629702,0.628639,0.646495,0.652845,0.596505,0.594301
3,0.124502,0.122938,0.125442,0.128551,0.128885,0.121361,0.12176,0.117042,0.11618
4,0.144049,0.143363,0.149342,0.146628,0.144621,0.145941,0.145415,0.143914,0.144842


In [71]:
oof_lv2_np = oof_lv2_full.to_numpy()
test_lv2_np = test_lv2_full.to_numpy()

In [75]:
oof_y_np = oof_y

## Level Three (Logistic Regression)

In [72]:
from sklearn import model_selection

In [73]:
kfolds = model_selection.StratifiedKFold(n_splits=5, shuffle=False)

In [76]:
oof_preds, oof_y = [], []

In [77]:
test_preds = np.zeros((X_test.shape[0]))

In [79]:
X = oof_lv2_np
y = oof_y_np
X_test = test_lv2_np

In [None]:
library = 'sklearn (LogisticRegressor(max_iter=1000))'
exmodel_config['library'] = library
wandb.init(
    project="202109_Kaggle_tabular_playground",
    save_code=True,
    tags=wandb_config['tags'],
    name=wandb_config['name'],
    notes=wandb_config['notes'],
    config=exmodel_config
)   

# # prepare for k-fold cross-validation
# kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=True, random_state=random_state)

# setup for serialization
model_path = Path(datapath/f"models/{wandb_config['name']}_{library}_{exmodel_config['kfolds']}folds/")
(model_path).mkdir(exist_ok=True)

In [82]:
for fold, (train_idx, valid_idx) in enumerate(kfolds.split(X,y)):
    X_train, X_valid = X[train_idx], X[valid_idx]
    y_train, y_valid = y[train_idx], y[valid_idx]
    
    print(f"FOLD {fold}")
    print("---------------------")
    
    model = LogisticRegression(max_iter=1000)
    
    model.fit(X_train, y_train)
    preds = model.predict_proba(X_valid)[:,1]
    
    oof_preds.extend(preds)
    oof_y.extend(y_valid)
    
    test_preds += model.predict_proba(X_test)[:,1]
    
    valid_auc = roc_auc_score(y_valid, preds)
    print(f"ROC AUC of fold {fold} is {valid_auc}")
    
#     dump(preds, /'lv_3)

valid_auc_total = roc_auc_score(oof_y, oof_preds)
print(f"Overall ROC_AUC is {valid_auc_total}")

dump(oof_preds, preds_path/'oof_lv3_preds.joblib')
dump(oof_y, preds_path/'oof_lv3_y.joblib')

test_preds /= 5

dump(test_preds, preds_path/'test_lv3_preds.joblib')

FOLD 0
---------------------
ROC AUC of fold 0 is 0.8146730078703983
FOLD 1
---------------------
ROC AUC of fold 1 is 0.8163570962547366
FOLD 2
---------------------
ROC AUC of fold 2 is 0.8152767993551605
FOLD 3
---------------------
ROC AUC of fold 3 is 0.8157294548970742
FOLD 4
---------------------
ROC AUC of fold 4 is 0.8165344261672212
Overall ROC_AUC is 0.8157065996211509


['/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/preds/test_lv3_preds.joblib']

In [49]:
# X_test_imputed_scaled = pd.read_feather(path=datapath/'X_test_NaNcounts_imputed-Median-wIndicators_StandardScaled.feather')
# X_test_imputed_scaled = pd.read_feather(path=datapath/'X_test_NaNcounts_SummaryStats_imputed-Median-wIndicators-StandardScaled.feather')

## Prediction Generation

In [50]:
# preds_path = Path(datapath/"preds/")

# blender_preds = blender.predict_proba(X_test_imputed_scaled)[:,1]
# dump(blender_preds, preds_path/f"{config_run['name']}_stack.joblib")



['/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/preds/stacking_off-shelf_20210922_213426_stack.joblib']