# Baseline
Setting up a more robust baseline notebook, suitable for use with all of the "Big Three" (XGBoost, CatBoost, LightGBM) libraries and on either Google Colab or the local machine.

# Setup

In [1]:
# two manual flags (ex-config)
COLAB = False
USE_GPU = True
# libraries = ['xgboost', 'lightgbm', 'catboost']
libraries = ['xgboost', 'lightgbm', 'catboost']

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime
import random

In [3]:
%matplotlib inline
%config Completer.use_jedi = False
os.environ['WANDB_NOTEBOOK_NAME'] = f"stacking_manual_{datetime.now().strftime('%Y%m%d')}.ipynb"

In [4]:
# handle Google Colab-specific library installation/updating
if COLAB:
    # much of the below inspired by or cribbed from the May 2021 Kaggle Tabular Playground winner, at 
    # https://colab.research.google.com/gist/academicsuspect/0aac7bd6e506f5f70295bfc9a3dc2250/tabular-may-baseline.ipynb?authuser=1#scrollTo=LJoVKJb5wN0L
    
    # Kaggle API for downloading the datasets
#     !pip install --upgrade -q kaggle

    # weights and biases
    !pip install -qqqU wandb
    
    # Optuna for parameter search
    !pip install -q optuna

    # upgrade sklearn
    !pip install --upgrade scikit-learn

#     !pip install category_encoders
    
    if 'catboost' in libraries:
        !pip install catboost
    
    if 'xgboost' in libraries:
        if USE_GPU: 
            # this part is from https://github.com/rapidsai/gputreeshap/issues/24
            !pip install cmake --upgrade
            # !pip install sklearn --upgrade
            !git clone --recursive https://github.com/dmlc/xgboost
            %cd /content/xgboost
            !mkdir build
            %cd build
            !cmake .. -DUSE_CUDA=ON
            !make -j4
            %cd /content/xgboost/python-package
            !python setup.py install --use-cuda --use-nccl
            !/opt/bin/nvidia-smi
            !pip install shap
        else:
            !pip install --upgrade xgboost
    if 'lightgbm' in libraries:
        if USE_GPU:
            # lighgbm gpu compatible
            !git clone --recursive https://github.com/Microsoft/LightGBM
            ! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;
        else:
            !pip install --upgrade lightgbm
        

        

Now, non-stdlib imports

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
from sklearn.impute import SimpleImputer #, KNNImputer
# import timm

import seaborn as sns

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.preprocessing import StandardScaler #, MinMaxScaler, MaxAbsScaler, RobustScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
# from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft


Now, datapath setup

In [6]:
# # This is the code for reading the train.csv and converting it to a .feather file
# df = pd.read_csv(datapath/'train.csv', index_col='id', low_memory=False)
# df.index.name = None
# df.to_feather(path='./dataset_df.feather')

In [39]:
if COLAB:
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    
    # handling datapath
    datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/oct2021/')
    
else:
    # if on local machine
#     datapath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/')  
    root = Path('/home/sf/code/kaggle/tabular_playgrounds/oct2021/')
    datapath = root/'datasets'
    edapath = root/'EDA'
    modelpath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/oct2021/models/')
    predpath = root/'preds'
    subpath = root/'submissions'
    
    for pth in [root, datapath, edapath, modelpath, predpath, subpath]:
        pth.mkdir(exist_ok=True)
    


In [8]:
SEED = 42

# Function to seed everything
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(seed=SEED)

## Ex-Model Config

In [9]:
# meta-config for preprocessing and cross-validation, but NOT for model parameters
exmodel_config = {
#     "feature_selector": SelectKBest,
#     "k_best": 80,
#     "feature_selection_scoring": f_regression,
#     'random_state': SEED,
#     'feature_generation': ['NaN_counts', 'SummaryStats', 'NaN_OneHots'],
#     'subsample': 1,
    'cross_val_strategy': KFold, # None for holdout, or the relevant sklearn class
    'kfolds': 5, # if 1, that means just doing holdout
    'test_size': 0.2,
#     'features_created': False,
#     'feature_creator': None,
}

## Data Setup

**TODO** Write some conditional logic here to automate it -- possibly as part of a sklearn.*pipeline

In [10]:
# if exmodel_config['scaler']:
#     scaler = exmodel_config['scaler']()
#     scaler.fit_transform()

In [11]:
train_source = datapath/'train.feather'
df = pd.read_feather(path=train_source)
df.index.name = 'id'
y_train = df.target
features = [x for x in df.columns if x != 'target']
X_train = df[features]
# X.index.name = 'id'
# y.index.name = 'id'
X = np.array(X_train)
y = np.array(y_train)

# del df, X_train, y_train


# exmodel_config['feature_count'] = len(X.columns)
exmodel_config['feature_count'] = X.shape[1]
exmodel_config['instance_count'] = X.shape[0]

# exmodel_config['feature_generator'] = None
# exmodel_config['feature_generator'] = "Summary statistics"

exmodel_config['train_source'] = str(train_source)

In [12]:
test_source = datapath/'test.feather'
exmodel_config['test_source'] = str(test_source)
X_test = pd.read_feather(path=test_source)
# X_test = X_test.iloc[:, 1:]

In [13]:
X_test = np.array(X_test)

## Weights and Biases Run Config

Below is the configuration for a Weights and Biases (`wandb`) run.

In [14]:
# wandb config:
wandb_config = {
    'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
    'tags': ['stacking-manual'],
    'notes': "Using best-to-date params on GBM classifiers from XGBoost, LightGBM, and CatBoost on original, unaltered dataset. Manual stacking ensemble, with two random-state versions of each architecture in first layer, then one in second, then a LogisticRegressor for third.",
}

# Training

# Hyperparameters

In [15]:
# optuna 20211004, thru 106 trials on unaltered original dataset
best_xgboost_params = {
    'n_estimators': 3878,
    'max_depth': 4,
    'learning_rate': 0.024785857161974977,
    'reg_alpha': 26.867682044658245,
    'reg_lambda': 10.839759074147148,
    'subsample': 0.8208581489835881,
    'min_child_weight': 8.829122644339664,
    'colsample_bytree': 0.906420714280384,
    'gamma': 1.472322916021486
}

# best as of 20211005, thru 65 trials on unaltered original dataset
best_lightgbm_params = {
    'n_estimators': 6631,
    'max_depth': 10,
    'learning_rate': 0.004677044539666842,
    'reg_alpha': 19.334971246299116,
    'reg_lambda': 0.024384251140153856,
    'subsample': 0.5082183652689569,
    'boosting_type': 'gbdt',
    'min_child_samples': 9,
    'num_leaves': 233,
    'colsample_bytree': 0.5008014086989773
}

# catboost 20211001 on colab with 100 trials on GPU, unaltered original dataset
best_catboost_params = {
    'iterations': 29338,
    'max_depth': 9,
    'learning_rate': 0.004769831650275205,
    'random_strength': 7,
    'od_wait': 1968,
    'reg_lambda': 28.435563240493586,
    'border_count': 162,
    'min_child_samples': 14,
    'leaf_estimation_iterations': 1
}

In [76]:
def cross_validate_model(library:str, params:dict={}, X=X, y=y, X_test=X_test, start_fold=0, 
                         exmodel_config=exmodel_config, wandb_config=wandb_config, 
                         random_state=42, wandb_tracked=True):
    """
    Function to handle model training process in the context of cross-validation -- via hold-out or via k-fold.
    If exmodel_config['cross_val_strategy'] == None, then any kfolds= input is ignored; otherwise, the number specified is used.
    
    :param kfolds: int specifying number of k-folds to use in cross-validation
    :param exmodel_config: dict containing general config including for cross-validation -- `kfold=1` implies hold-out
    """
#     if exmodel_config['kfolds'] == 1:
#         print("Proceeding with holdout")
#     X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
#                                                           test_size=0.2, 
#                                                           random_state=SEED)                 
    
    # prepare for k-fold cross-validation; random-state here is notebook-wide, not per-model
    kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=True, random_state=SEED)
    
    if wandb_tracked:
        exmodel_config['library'] = library
        exmodel_config[f'{library}_params'] = str(params)
        wandb.init(
            project="202110_Kaggle_tabular_playground",
            save_code=True,
            tags=wandb_config['tags'],
            name=wandb_config['name'],
            notes=wandb_config['notes'],
            config=exmodel_config
    )   
    
    # setup for serialization
    runpath = Path(modelpath/f"{wandb_config['name']}_{library}_{exmodel_config['kfolds']}folds/")
    (runpath).mkdir(exist_ok=True)
    
    
    # initialize lists for out-of-fold preds and ground truth
    oof_preds, oof_y = [], []
    
    # initialize a numpy.ndarray containing the fold-model's preds for test set
    test_preds = np.zeros((X_test.shape[0]))
    
    for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
        if fold < start_fold: # skip folds that are already trained
            continue
        else:
            print(f"FOLD {fold}")
            print("---------------------------------------------------")
            X_train, X_valid = X[train_ids], X[valid_ids]
            y_train, y_valid = y[train_ids], y[valid_ids]
    
        # define models
        if library == 'xgboost':
            model = XGBClassifier(
                booster='gbtree',
                tree_method='gpu_hist',
                random_state=random_state,
                n_jobs=-1, 
                verbosity=1, 
                objective='binary:logistic',
                **params)
            if wandb_tracked:
                model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()])
            else:
                model.fit(X_train, y_train)


        elif library == 'lightgbm':
            model = LGBMClassifier(
                objective='binary',
                random_state=random_state,
                device_type='cpu',
                n_jobs=-1,
#                 eval_metric='auc',
#                 device_type='gpu',
#                 max_bin=63, # 15 might be even better for GPU perf, but depends on dataset -- see https://lightgbm.readthedocs.io/en/latest/GPU-Performance.html
#                 gpu_use_dp=False, # forces use of single precision rather than double for better perf, esp on consumer Nvidia chips
                **params)
            if wandb_tracked:
                model.fit(X_train, y_train, callbacks=[wandb.lightgbm.wandb_callback()],)
            else:
                model.fit(X_train, y_train)

            
        elif library == 'catboost':
            model = CatBoostClassifier(
                task_type='GPU',
                silent=True,
                random_state=random_state,
                **params) 
        
            model.fit(X_train, y_train)
        
        # take the training set predictions, if desired
#         y_train_pred = model.predict_proba(X_train)[:,1]
#         train_loss = log_loss(y_train, y_train_pred)
#         train_auc = roc_auc_score(y_train, y_train_pred)
#         wandb.log({'train_loss': train_loss, 'train_auc': train_auc})

        # log the parameters, if desired
#         if exmodel_config['library'] == 'catboost':
#             print(model.get_all_params())
#             wandb.log(model.get_all_params())
#         else:
#             wandb.log(model.get_params()) # logging model parameters, trying bare-invocation rather than params: model.get_params()

        y_valid_preds = model.predict_proba(X_valid)[:,1]
        
        # add the fold-model's OOF preds and ground truths to the out-of-loop lists
        oof_preds.extend(y_valid_preds)
        oof_y.extend(y_valid)
        
        # add the fold's predictions to the model's test-set predictions (will divide later)
        test_preds += model.predict_proba(X_test)[:,1]

#         valid_loss = log_loss(y_valid, y_pred)
        # give the valid AUC score, for edification
        fold_valid_auc = roc_auc_score(y_valid, y_valid_preds)
        print(f"Valid AUC for fold {fold} is {fold_valid_auc}")   
        dump(model, Path(runpath/f"{exmodel_config['library']}_fold{fold}_rs{random_state}_model.joblib"))

    model_valid_auc = roc_auc_score(oof_y, oof_preds)
    print(f"Valid AUC score for {library} model is {model_valid_auc}")
    
    # finalize test preds
    test_preds /= exmodel_config['kfolds']
    
    # save OOF preds and test-set preds
    dump(oof_preds, Path(predpath/f"{wandb_config['name']}_{library}_{exmodel_config['kfolds']}folds_rs{random_state}_oof_preds.joblib"))
    dump(test_preds, Path(predpath/f"{wandb_config['name']}_{library}_{exmodel_config['kfolds']}folds_rs{random_state}_test_preds.joblib"))
    dump(oof_y, predpath/f"{exmodel_config['kfolds']}folds_rs{SEED}_oof_y.joblib")
    
    if wandb_tracked:
        wandb.log({'model_valid_auc': model_valid_auc,
                   'oof_preds': oof_preds,
                   'test_preds': test_preds,
                   'model_params': str(model.get_params()),
                  })
        wandb.finish()
    return oof_preds, test_preds
        

In [160]:
train_source = datapath/'train.feather'
df = pd.read_feather(path=train_source)
df.index.name = 'id'
y_train = df.target
features = [x for x in df.columns if x != 'target']
X_train = df[features]
# X.index.name = 'id'
# y.index.name = 'id'
X_orig = np.array(X_train)
y_orig = np.array(y_train)

In [164]:
kfold = KFold(5, shuffle=True, random_state=SEED)
fold_valid_ids = {}
for fold, (train_ids, valid_ids) in enumerate(kfold.split(X_train,y_train)):
    fold_valid_ids[fold] = valid_ids

In [165]:
fold_valid_ids

{0: array([     0,      7,     12, ..., 999988, 999997, 999998]),
 1: array([     6,      9,     10, ..., 999989, 999991, 999992]),
 2: array([     2,     11,     26, ..., 999987, 999995, 999999]),
 3: array([     1,      4,     16, ..., 999966, 999984, 999990]),
 4: array([     3,      5,      8, ..., 999993, 999994, 999996])}

In [None]:
dump(fold_valid_ids, datapath/'KFold_5folds_rs42_)

In [None]:
def cross_split_checker(library:str, params:dict={}, X=X, y=y, X_test=X_test, start_fold=0, 
                         exmodel_config=exmodel_config, wandb_config=wandb_config, 
                         random_state=42, wandb_tracked=False):
    """
    Function to handle model training process in the context of cross-validation -- via hold-out or via k-fold.
    If exmodel_config['cross_val_strategy'] == None, then any kfolds= input is ignored; otherwise, the number specified is used.
    
    :param kfolds: int specifying number of k-folds to use in cross-validation
    :param exmodel_config: dict containing general config including for cross-validation -- `kfold=1` implies hold-out
    """
#     if exmodel_config['kfolds'] == 1:
#         print("Proceeding with holdout")
#     X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
#                                                           test_size=0.2, 
#                                                           random_state=SEED)                 
    
    # prepare for k-fold cross-validation; random-state here is notebook-wide, not per-model
    kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=True, random_state=SEED)
    
#     if wandb_tracked:
#         exmodel_config['library'] = library
#         exmodel_config[f'{library}_params'] = str(params)
#         wandb.init(
#             project="202110_Kaggle_tabular_playground",
#             save_code=True,
#             tags=wandb_config['tags'],
#             name=wandb_config['name'],
#             notes=wandb_config['notes'],
#             config=exmodel_config
#     )   
    
    # setup for serialization
#     runpath = Path(modelpath/f"{wandb_config['name']}_{library}_{exmodel_config['kfolds']}folds/")
#     (runpath).mkdir(exist_ok=True)
    
    
    # initialize lists for out-of-fold preds and ground truth
    oof_preds, oof_y = [], []
    
    # initialize a numpy.ndarray containing the fold-model's preds for test set
#     test_preds = np.zeros((X_test.shape[0]))
    
    for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
        if fold < start_fold: # skip folds that are already trained
            continue
        else:
            print(f"FOLD {fold}")
            print("---------------------------------------------------")
            X_train, X_valid = X[train_ids], X[valid_ids]
            y_train, y_valid = y[train_ids], y[valid_ids]
    
        # define models
#         if library == 'xgboost':
#             model = XGBClassifier(
#                 booster='gbtree',
#                 tree_method='gpu_hist',
#                 random_state=random_state,
#                 n_jobs=-1, 
#                 verbosity=1, 
#                 objective='binary:logistic',
#                 **params)
#             if wandb_tracked:
#                 model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()])
#             else:
#                 model.fit(X_train, y_train)


#         elif library == 'lightgbm':
#             model = LGBMClassifier(
#                 objective='binary',
#                 random_state=random_state,
#                 device_type='cpu',
#                 n_jobs=-1,
# #                 eval_metric='auc',
# #                 device_type='gpu',
# #                 max_bin=63, # 15 might be even better for GPU perf, but depends on dataset -- see https://lightgbm.readthedocs.io/en/latest/GPU-Performance.html
# #                 gpu_use_dp=False, # forces use of single precision rather than double for better perf, esp on consumer Nvidia chips
#                 **params)
#             if wandb_tracked:
#                 model.fit(X_train, y_train, callbacks=[wandb.lightgbm.wandb_callback()],)
#             else:
#                 model.fit(X_train, y_train)

            
#         elif library == 'catboost':
#             model = CatBoostClassifier(
#                 task_type='GPU',
#                 silent=True,
#                 random_state=random_state,
#                 **params) 
        
#             model.fit(X_train, y_train)
        
        # take the training set predictions, if desired
#         y_train_pred = model.predict_proba(X_train)[:,1]
#         train_loss = log_loss(y_train, y_train_pred)
#         train_auc = roc_auc_score(y_train, y_train_pred)
#         wandb.log({'train_loss': train_loss, 'train_auc': train_auc})

        # log the parameters, if desired
#         if exmodel_config['library'] == 'catboost':
#             print(model.get_all_params())
#             wandb.log(model.get_all_params())
#         else:
#             wandb.log(model.get_params()) # logging model parameters, trying bare-invocation rather than params: model.get_params()

#         y_valid_preds = model.predict_proba(X_valid)[:,1]
        
        # add the fold-model's OOF preds and ground truths to the out-of-loop lists
#         oof_preds.extend(y_valid_preds)
        oof_y.extend(y_valid)
        
        # add the fold's predictions to the model's test-set predictions (will divide later)
        test_preds += model.predict_proba(X_test)[:,1]

#         valid_loss = log_loss(y_valid, y_pred)
        # give the valid AUC score, for edification
        fold_valid_auc = roc_auc_score(y_valid, y_valid_preds)
        print(f"Valid AUC for fold {fold} is {fold_valid_auc}")   
        dump(model, Path(runpath/f"{exmodel_config['library']}_fold{fold}_rs{random_state}_model.joblib"))

    model_valid_auc = roc_auc_score(oof_y, oof_preds)
    print(f"Valid AUC score for {library} model is {model_valid_auc}")
    
    # finalize test preds
    test_preds /= exmodel_config['kfolds']
    
    # save OOF preds and test-set preds
    dump(oof_preds, Path(predpath/f"{wandb_config['name']}_{library}_{exmodel_config['kfolds']}folds_rs{random_state}_oof_preds.joblib"))
    dump(test_preds, Path(predpath/f"{wandb_config['name']}_{library}_{exmodel_config['kfolds']}folds_rs{random_state}_test_preds.joblib"))
    dump(oof_y, predpath/f"{exmodel_config['kfolds']}folds_rs{SEED}_oof_y.joblib")
    
    if wandb_tracked:
        wandb.log({'model_valid_auc': model_valid_auc,
                   'oof_preds': oof_preds,
                   'test_preds': test_preds,
                   'model_params': str(model.get_params()),
                  })
        wandb.finish()
    return oof_preds, test_preds
        

In [17]:
# _, lightgbm_preds = cross_validate_model(library='lightgbm')

In [18]:
# _, catboost_preds = cross_validate_model(library='catboost')

In [19]:
# xgboost_oof_preds, xgboost_test_preds = cross_validate_model(library='xgboost')

# Single Submission

In [20]:
# sample_df = pd.read_csv(datapath/'sample_submission.csv.zip')

In [21]:
# sample_df.loc[:, 'target'] = xgboost_preds

In [22]:
# sample_df.head()

In [23]:
# submission_path = datapath/'submissions'
# submission_path.mkdir(exist_ok=True)

In [24]:
# sample_df.to_csv(subpath/f"{wandb_config['name']}_xgboost_{exmodel_config['kfolds']}folds_rs{42}_baseline_preds.csv", index=False)

In [25]:
# str(blender.estimators[2][1].get_all_params())
# blender.estimators[2][1]

In [26]:
# wandb.log({'leaderboard_auc': 0.81725,
# #            'catboost_params': str(best_catboost_params),
#           })

In [27]:
# wandb.finish()

In [28]:
# oof_y_pd = pd.Series(oof_y)

# Predictions

## Level One

In [29]:
oof_lv1, test_lv1 = pd.DataFrame(), pd.DataFrame() # initialize dataframes

### Generating

In [30]:
oof_lv1_lgb42, test_lv1_lgb42 = cross_validate_model(library='lightgbm', X=X, y=y, X_test=X_test, 
                                         wandb_config=wandb_config,
                                         random_state=42,
                                         params=best_lightgbm_params,
                                         exmodel_config=exmodel_config, 
                                         wandb_tracked=True
                                        )
oof_lv1['lgb42'] = oof_lv1_lgb42
test_lv1['lgb42'] = test_lv1_lgb42

[34m[1mwandb[0m: Currently logged in as: [33mhushifang[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8572718743878947
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8559994877532525
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8568325308014939
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8559842308123811
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8565528831795393
Valid AUC score for lightgbm model is 0.8565251751685374




VBox(children=(Label(value=' 0.08MB of 0.08MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
model_valid_auc,▁

0,1
model_params,{'boosting_type': 'g...
model_valid_auc,0.85653


In [31]:
oof_lv1_lgb1983, test_lv1_lgb1983 = cross_validate_model(library='lightgbm', X=X, y=y, X_test=X_test, 
                                                 wandb_config=wandb_config,
                                                 random_state=1983,
                                                 params=best_lightgbm_params,
                                                 exmodel_config=exmodel_config, 
                                                 wandb_tracked=True
                                                )
oof_lv1['lgb1983'] = oof_lv1_lgb1983
test_lv1['lgb1983'] = test_lv1_lgb1983

[34m[1mwandb[0m: wandb version 0.12.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8572161841869534
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8559350043052648
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8568529019126492
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8558748673652216
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8565261379114149
Valid AUC score for lightgbm model is 0.8564776078917813




VBox(children=(Label(value=' 0.08MB of 0.08MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
model_valid_auc,▁

0,1
model_params,{'boosting_type': 'g...
model_valid_auc,0.85648


In [32]:
oof_lv1_xgb42, test_lv1_xgb42 = cross_validate_model(library='xgboost', X=X, y=y, X_test=X_test, 
                                         wandb_config=wandb_config,
                                         random_state=42,
                                         params=best_xgboost_params,
                                         exmodel_config=exmodel_config, 
                                         wandb_tracked=True
                                        )

oof_lv1['xgb42'] = oof_lv1_xgb42
test_lv1['xgb42'] = test_lv1_xgb42

[34m[1mwandb[0m: wandb version 0.12.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


FOLD 0
---------------------------------------------------




Valid AUC for fold 0 is 0.8572554115376164
FOLD 1
---------------------------------------------------




Valid AUC for fold 1 is 0.8561654493709842
FOLD 2
---------------------------------------------------




Valid AUC for fold 2 is 0.8572168508119474
FOLD 3
---------------------------------------------------




Valid AUC for fold 3 is 0.8560833380957398
FOLD 4
---------------------------------------------------




Valid AUC for fold 4 is 0.8567086183230934
Valid AUC score for xgboost model is 0.8566841128860819




VBox(children=(Label(value=' 0.09MB of 0.09MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
model_valid_auc,▁

0,1
model_params,{'objective': 'binar...
model_valid_auc,0.85668


In [33]:
oof_lv1_xgb1983, test_lv1_xgb1983 = cross_validate_model(library='xgboost', X=X, y=y, X_test=X_test, 
                                                 wandb_config=wandb_config,
                                                 random_state=1983,
                                                 params=best_xgboost_params,
                                                 exmodel_config=exmodel_config, 
                                                 wandb_tracked=True
                                                )
oof_lv1['xgb1983'] = oof_lv1_xgb1983
test_lv1['xgb1983'] = test_lv1_xgb1983

[34m[1mwandb[0m: wandb version 0.12.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


FOLD 0
---------------------------------------------------




Valid AUC for fold 0 is 0.8573146071386168
FOLD 1
---------------------------------------------------




Valid AUC for fold 1 is 0.8561499733154669
FOLD 2
---------------------------------------------------




Valid AUC for fold 2 is 0.8571969682510598
FOLD 3
---------------------------------------------------




Valid AUC for fold 3 is 0.8560595010136646
FOLD 4
---------------------------------------------------




Valid AUC for fold 4 is 0.8566916357893442
Valid AUC score for xgboost model is 0.8566807985789633




VBox(children=(Label(value=' 0.10MB of 0.10MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
model_valid_auc,▁

0,1
model_params,{'objective': 'binar...
model_valid_auc,0.85668


In [34]:
oof_lv1_cat42, test_lv1_cat42 = cross_validate_model(library='catboost', X=X, y=y, X_test=X_test, 
                                         wandb_config=wandb_config,
                                         random_state=42,
                                         params=best_catboost_params,
                                         exmodel_config=exmodel_config, 
                                         wandb_tracked=True
                                        )
oof_lv1['cat42'] = oof_lv1_cat42
test_lv1['cat42'] = test_lv1_cat42

[34m[1mwandb[0m: wandb version 0.12.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8575766033930445
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8564289538689146
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8574776355547533
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.856314002656122
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8570333546927973
Valid AUC score for catboost model is 0.8569640294774554




VBox(children=(Label(value=' 0.11MB of 0.11MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
model_valid_auc,▁

0,1
model_params,{'iterations': 29338...
model_valid_auc,0.85696


In [54]:
oof_lv1_cat1983, test_lv1_cat1983 = cross_validate_model(library='catboost', X=X, y=y, X_test=X_test, 
                                                 wandb_config=wandb_config,
                                                 random_state=1983,
                                                 params=best_catboost_params,
                                                 exmodel_config=exmodel_config, 
                                                 wandb_tracked=False
                                                )


FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8575796749930964
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8564173360521683
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8575358682866415
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8563227427678826
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8570098742696105
Valid AUC score for catboost model is 0.856970844007867


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [64]:
# this cell just for after-the-fact error corrections

# oof_lv1 = pd.DataFrame()
# test_lv1 = pd.DataFrame()

# oof_lv1['lgb1983'] = oof_lv1_lgb1983
# test_lv1['lgb1983'] = test_lv1_lgb1983
# oof_lv1['lgb42'] = oof_lv1_lgb42
# test_lv1['lgb42'] = test_lv1_lgb42
# oof_lv1['cat1983'] = oof_lv1_cat1983
# test_lv1['cat1983'] = test_lv1_cat1983
# oof_lv1['cat42'] = oof_lv1_cat42
# test_lv1['cat42'] = test_lv1_cat42
# oof_lv1['xgb1983'] = oof_lv1_xgb1983
# test_lv1['xgb1983'] = test_lv1_xgb1983
# oof_lv1['xgb42'] = oof_lv1_xgb42
# test_lv1['xgb42'] = test_lv1_xgb42



### Loading Sets of Predictions

In [None]:
# oof_lv1, test_lv1 = pd.DataFrame(), pd.DataFrame()
# preds_path = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/preds/')

In [None]:
# oof_lv1['xgb42'] = load(predpath/'stacking_manual_20211005_085253_xgboost_5folds_rs42_oof_preds.joblib')
# test_lv1['xgb42'] = load(preds_path/'stacking_manual_20210925_212129_xgboost_5folds_rs42_test_preds.joblib')

In [None]:
# oof_lv_xgb42_y = load(predpath/'stacking_manual_20211005_085253_xgboost_5folds_rs42_oof_y.joblib')

In [None]:
# roc_auc_score(y_true=oof_lv_xgb42_y, y_score=oof_lv1['xgb42'])

In [None]:
# oof_lv1['xgb1983'] = load(preds_path/'validAUC_0.8146252172737458_stacking_manual_20210926_211701_xgboost_5folds_rs1983_oof_preds.joblib')
# test_lv1['xgb1983'] = load(preds_path/'stacking_manual_20210926_211701_xgboost_5folds_rs1983_test_preds.joblib')

In [None]:
# oof_lv1['lgb42'] = load(preds_path/'validAUC_0.8156810521798477_stacking_manual_20210925_212129_lightgbm_5folds_rs42_oof_preds.joblib')
# test_lv1['lgb42'] = load(preds_path/'stacking_manual_20210925_212129_lightgbm_5folds_rs42_test_preds.joblib')

In [None]:
# oof_lv1['lgb1983'] = load(preds_path/'validAUC_0.8156503194185875_stacking_manual_20210925_212129_lightgbm_5folds_rs1983_oof_preds.joblib')
# test_lv1['lgb1983'] = load(preds_path/'stacking_manual_20210925_212129_lightgbm_5folds_rs1983_test_preds.joblib')

In [None]:
# oof_lv1['cat42'] = load(preds_path/'validAUC_0.8116727090290558_stacking_manual_20210925_212129_catboost_5folds_rs42_oof_preds.joblib')
# test_lv1['cat42'] = load(preds_path/'stacking_manual_20210925_212129_catboost_5folds_rs42_test_preds.joblib')

In [None]:
# oof_lv1['cat1983'] = load(predpath/'stacking_manual_20211005_085253_catboost_5folds_rs1983_oof_preds.joblib')
# oof_cat1983_y = load(predpath/'stacking_manual_20211005_085253_catboost_5folds_rs1983_oof_y.joblib')
# roc_auc_score(y_true=oof_cat1983_y, y_score=oof_lv1['cat1983'])
# test_lv1['cat1983'] = load(preds_path/'stacking_manual_20210925_212129_catboost_5folds_rs1983_test_preds.joblib')

In [None]:
# oof_cat42_y = load(predpath/'stacking_manual_20211005_085253_catboost_5folds_rs42_oof_y.joblib')

In [None]:
# oof_cat42_y == oof_lv_xgb42_y

In [None]:
# oof_lv1.iloc[:20, :]

In [None]:
# oof_y_pd.iloc[:20]

- Why is it that the random seed seems far more important than the model type in making a prediction???

### Serialization

In [65]:
oof_lv1.head()

Unnamed: 0,lgb1983,lgb42,cat1983,cat42,xgb1983,xgb42
0,0.651299,0.642872,0.637863,0.629671,0.666962,0.627152
1,0.979994,0.978403,0.978872,0.978188,0.976313,0.975591
2,0.727006,0.717471,0.695742,0.758795,0.670068,0.674095
3,0.307745,0.322052,0.281502,0.280899,0.321891,0.294775
4,0.085114,0.08816,0.061077,0.062779,0.071851,0.069561


In [66]:
test_lv1.head()

Unnamed: 0,lgb1983,lgb42,cat1983,cat42,xgb1983,xgb42
0,0.716279,0.707717,0.742959,0.729703,0.74467,0.742503
1,0.227226,0.228462,0.236283,0.238824,0.255078,0.263515
2,0.907678,0.903571,0.908473,0.910906,0.903481,0.90982
3,0.812557,0.805153,0.844619,0.846002,0.86036,0.8614
4,0.282562,0.276703,0.267734,0.265967,0.262067,0.261821


In [67]:
oof_lv1.to_csv('oof_lv1.csv', index=False)
test_lv1.to_csv('test_lv1.csv', index=False)

In [68]:
oof_lv1.to_feather(predpath/f"{wandb_config['name']}_oof_lv1.feather")
test_lv1.to_feather(predpath/f"{wandb_config['name']}_test_lv1.feather")

### Lv1 Finalization

In [140]:
oof_lv1 = pd.read_feather(predpath/f"{wandb_config['name']}_oof_lv1.feather")#, columns=[str(x) for x in range()])
test_lv1 = pd.read_feather(predpath/f"{wandb_config['name']}_test_lv1.feather")
oof_y = load(predpath/f"{exmodel_config['kfolds']}folds_rs{SEED}_oof_y.joblib")

# oof_lv1.index.name = 'id'
# test_lv1.index.name = 'id'
# oof_y.index.name = 'id'
oof_lv1 = np.array(oof_lv1)
test_lv1 = np.array(test_lv1)
oof_y = np.array(oof_y)

In [141]:
oof_lv1

array([[0.651299  , 0.64287168, 0.63786275, 0.62967111, 0.66696191,
        0.62715232],
       [0.97999374, 0.97840349, 0.9788715 , 0.97818756, 0.97631311,
        0.9755913 ],
       [0.72700594, 0.7174706 , 0.69574226, 0.75879474, 0.67006797,
        0.6740948 ],
       ...,
       [0.15293226, 0.16239312, 0.16526066, 0.16129139, 0.14646634,
        0.14021994],
       [0.1820125 , 0.19328099, 0.2276023 , 0.23489056, 0.20038413,
        0.21633679],
       [0.12085927, 0.11898618, 0.1147604 , 0.11368849, 0.12350977,
        0.127176  ]])

In [142]:
test_lv1

array([[0.71627935, 0.70771669, 0.74295914, 0.72970335, 0.74466969,
        0.74250262],
       [0.22722624, 0.22846171, 0.23628303, 0.23882366, 0.25507812,
        0.26351548],
       [0.90767775, 0.90357087, 0.90847273, 0.9109058 , 0.90348123,
        0.90981996],
       ...,
       [0.30885597, 0.31995128, 0.2805131 , 0.28407661, 0.30922976,
        0.30733521],
       [0.53882482, 0.52314935, 0.4888139 , 0.47888754, 0.47377105,
        0.48541698],
       [0.4176311 , 0.43301102, 0.41166907, 0.41575487, 0.4223666 ,
        0.43216923]])

## Level Two

In [143]:
oof_lv2, test_lv2 = pd.DataFrame(), pd.DataFrame()

In [146]:
oof_y[:10]

array([1, 0, 1, 0, 0, 0, 1, 1, 1, 1])

In [148]:
oof_y_suspicious = oof_y.copy()

In [149]:
oof_y_new = load(predpath/'5folds_rs42_oof_y.joblib')

In [150]:
oof_y_suspicious == oof_y_new

array([ True, False, False, ..., False, False,  True])

In [151]:
oof_y = oof_y_new

In [155]:
oof_y_list = oof_y
oof_y = np.array(oof_y_list)

In [152]:
del oof_y_suspicious

In [157]:
%whos

Variable                Type                  Data/Info
-------------------------------------------------------
COLAB                   bool                  False
CatBoostClassifier      type                  <class 'catboost.core.CatBoostClassifier'>
KFold                   ABCMeta               <class 'sklearn.model_selection._split.KFold'>
LGBMClassifier          type                  <class 'lightgbm.sklearn.LGBMClassifier'>
LogisticRegression      type                  <class 'sklearn.linear_mo<...>stic.LogisticRegression'>
Path                    type                  <class 'pathlib.Path'>
SEED                    int                   42
SimpleImputer           type                  <class 'sklearn.impute._base.SimpleImputer'>
StandardScaler          type                  <class 'sklearn.preproces<...>ng._data.StandardScaler'>
StratifiedKFold         ABCMeta               <class 'sklearn.model_sel<...>._split.StratifiedKFold'>
USE_GPU                 bool                  True


In [156]:
oof_lv2_xgb42, test_lv2_xgb42 = cross_validate_model(library='xgboost', X=oof_lv1, y=oof_y, X_test=test_lv1, 
                                         wandb_config=wandb_config,
                                         random_state=42,
                                         params=best_xgboost_params,
                                         exmodel_config=exmodel_config, 
                                         wandb_tracked=False
                                        )

dump(oof_lv2_xgb42, predpath/f"{wandb_config['name']}_oof_lv2_xgboost42_preds.joblib")
dump(test_lv2_xgb42, predpath/f"{wandb_config['name']}_test_lv2_xgboost42_preds.joblib")



FOLD 0
---------------------------------------------------




Valid AUC for fold 0 is 0.49914453694384364
FOLD 1
---------------------------------------------------




Valid AUC for fold 1 is 0.5001056126413443
FOLD 2
---------------------------------------------------




Valid AUC for fold 2 is 0.5009500142412964
FOLD 3
---------------------------------------------------




Valid AUC for fold 3 is 0.5005702167455247
FOLD 4
---------------------------------------------------




Valid AUC for fold 4 is 0.4999237210906757
Valid AUC score for xgboost model is 0.5000931931876855


['/home/sf/code/kaggle/tabular_playgrounds/oct2021/preds/stacking_manual_20211005_205933_test_lv2_xgboost42_preds.joblib']

In [86]:
# oof_xgb_f0_rs1983 = load('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/models/stacking_manual_20210926_211701_xgboost_5folds/xgboost_fold0_model.joblib')
# oof_xgb_f0_rs42 = load('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/models/stacking_manual_20210925_212129_xgboost_5folds/xgboost_fold0_model.joblib')

In [87]:
oof_lv2_cat42, test_lv2_cat42 = cross_validate_model(library='catboost', X=oof_lv1, y=oof_y, X_test=test_lv1, 
                                         wandb_config=wandb_config,
                                         random_state=42,
                                         params=best_catboost_params,
                                         exmodel_config=exmodel_config, 
                                         wandb_tracked=False
                                        )

FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.5004641318928412
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.5008023639313376
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.5004957053252322
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.500164733148486
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.5007088478130824
Valid AUC score for catboost model is 0.5005080950500665


In [88]:
dump(oof_lv2_cat42, predpath/f"{wandb_config['name']}_oof_lv2_catboost42_preds.joblib")
dump(test_lv2_cat42, predpath/f"{wandb_config['name']}_test_lv2_catboost42_preds.joblib")

['/home/sf/code/kaggle/tabular_playgrounds/oct2021/preds/stacking_manual_20211005_205933_test_lv2_catboost42_preds.joblib']

In [89]:

oof_lv2_lgb42, test_lv2_lgb42 = cross_validate_model(library='lightgbm', X=oof_lv1, y=oof_y, X_test=test_lv1, 
                                         wandb_config=wandb_config,
                                         random_state=42,
                                         params=best_lightgbm_params,
                                         exmodel_config=exmodel_config, 
                                         wandb_tracked=False
                                        )

FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.4999526176098955
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.5000747505022409
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.5000905204593905
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.5005522766535733
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.5003393064934663
Valid AUC score for lightgbm model is 0.5001814029026819


In [90]:
dump(oof_lv2_lgb42, predpath/f"{wandb_config['name']}_oof_lv2_lightgbm42_preds.joblib")
dump(test_lv2_lgb42, predpath/f"{wandb_config['name']}_test_lv2_lightgbm42_preds.joblib")

['/home/sf/code/kaggle/tabular_playgrounds/oct2021/preds/stacking_manual_20211005_205933_test_lv2_lightgbm42_preds.joblib']

In [91]:
oof_lv2['xgboost'] = oof_lv2_xgb42
oof_lv2['catboost'] = oof_lv2_cat42
oof_lv2['lightgbm'] = oof_lv2_lgb42

test_lv2['xgboost'] = test_lv2_xgb42
test_lv2['catboost'] = test_lv2_cat42
test_lv2['lightgbm'] = test_lv2_lgb42

In [92]:
oof_lv2.head()

Unnamed: 0,xgboost,catboost,lightgbm
0,0.4973,0.492342,0.489108
1,0.494436,0.486861,0.475497
2,0.518005,0.544128,0.5421
3,0.503651,0.484256,0.504308
4,0.500344,0.495525,0.508661


In [94]:
oof_lv1_df = pd.read_feather(predpath/f"{wandb_config['name']}_oof_lv1.feather)

In [95]:
oof_lv1_df.head()

Unnamed: 0,lgb1983,lgb42,cat1983,cat42,xgb1983,xgb42
0,0.651299,0.642872,0.637863,0.629671,0.666962,0.627152
1,0.979994,0.978403,0.978872,0.978188,0.976313,0.975591
2,0.727006,0.717471,0.695742,0.758795,0.670068,0.674095
3,0.307745,0.322052,0.281502,0.280899,0.321891,0.294775
4,0.085114,0.08816,0.061077,0.062779,0.071851,0.069561


In [96]:
oof_lv2_full = oof_lv2.join(oof_lv1_df)

In [97]:
test_lv2_full = test_lv2.join(pd.read_feather(predpath/f"{wandb_config['name']}_oof_lv1.feather"))

In [131]:
test_lv1_df = pd.read_feather(predpath/f"{wandb_config['name']}_test_lv1.feather")

In [132]:
test_lv1_df.head()

Unnamed: 0,lgb1983,lgb42,cat1983,cat42,xgb1983,xgb42
0,0.716279,0.707717,0.742959,0.729703,0.74467,0.742503
1,0.227226,0.228462,0.236283,0.238824,0.255078,0.263515
2,0.907678,0.903571,0.908473,0.910906,0.903481,0.90982
3,0.812557,0.805153,0.844619,0.846002,0.86036,0.8614
4,0.282562,0.276703,0.267734,0.265967,0.262067,0.261821


In [133]:
test_lv1_df_corrected = pd.DataFrame()

In [134]:
test_lv1_df_corrected['xgb42'] = test_lv1_xgb42
test_lv1_df_corrected['xgb1983'] = test_lv1_xgb1983
test_lv1_df_corrected['cat42'] = test_lv1_cat42
test_lv1_df_corrected['cat1983'] = test_lv1_cat1983
test_lv1_df_corrected['lgb42'] = test_lv1_lgb42
test_lv1_df_corrected['lgb1983'] = test_lv1_lgb1983

In [135]:
oof_lv2_full.head()

Unnamed: 0,xgboost,catboost,lightgbm,lgb1983,lgb42,cat1983,cat42,xgb1983,xgb42
0,0.4973,0.492342,0.489108,0.651299,0.642872,0.637863,0.629671,0.666962,0.627152
1,0.494436,0.486861,0.475497,0.979994,0.978403,0.978872,0.978188,0.976313,0.975591
2,0.518005,0.544128,0.5421,0.727006,0.717471,0.695742,0.758795,0.670068,0.674095
3,0.503651,0.484256,0.504308,0.307745,0.322052,0.281502,0.280899,0.321891,0.294775
4,0.500344,0.495525,0.508661,0.085114,0.08816,0.061077,0.062779,0.071851,0.069561


In [136]:
test_lv2_full = test_lv2.join(test_lv1_df_corrected)

In [137]:
test_lv2_full.head()

Unnamed: 0,xgboost,catboost,lightgbm,xgb42,xgb1983,cat42,cat1983,lgb42,lgb1983
0,0.490498,0.489155,0.481449,0.742503,0.74467,0.729703,0.742959,0.707717,0.716279
1,0.509766,0.514715,0.519687,0.263515,0.255078,0.238824,0.236283,0.228462,0.227226
2,0.498902,0.478333,0.483331,0.90982,0.903481,0.910906,0.908473,0.903571,0.907678
3,0.494088,0.472826,0.472603,0.8614,0.86036,0.846002,0.844619,0.805153,0.812557
4,0.503125,0.509,0.522709,0.261821,0.262067,0.265967,0.267734,0.276703,0.282562


In [110]:
oof_lv2_np = oof_lv2_full.to_numpy()
test_lv2_np = test_lv2_full.to_numpy()

In [111]:
oof_y_np = oof_y

## Level Three (Logistic Regression)

In [113]:
from sklearn import model_selection

In [115]:
# kfolds = model_selection.StratifiedKFold(n_splits=5, shuffle=False) # no random_state if shuffle == False

In [125]:
oof_preds, oof_y = [], []

In [126]:
test_preds = np.zeros((X_test.shape[0]))

In [127]:
X = oof_lv2_np
y = oof_y_np
X_test = test_lv2_np

In [130]:
X[:10]

array([[0.49729979, 0.49234212, 0.48910773, 0.651299  , 0.64287168,
        0.63786275, 0.62967111, 0.66696191, 0.62715232],
       [0.4944357 , 0.48686108, 0.47549723, 0.97999374, 0.97840349,
        0.9788715 , 0.97818756, 0.97631311, 0.9755913 ],
       [0.51800519, 0.54412764, 0.54209993, 0.72700594, 0.7174706 ,
        0.69574226, 0.75879474, 0.67006797, 0.6740948 ],
       [0.50365067, 0.48425647, 0.50430834, 0.30774478, 0.3220523 ,
        0.28150182, 0.28089858, 0.32189095, 0.29477474],
       [0.50034362, 0.49552517, 0.50866063, 0.08511447, 0.08816023,
        0.06107668, 0.06277937, 0.0718514 , 0.06956097],
       [0.50675994, 0.56949121, 0.51309979, 0.9323953 , 0.93559867,
        0.93508565, 0.93551411, 0.9304139 , 0.93387043],
       [0.50627351, 0.5217009 , 0.53059488, 0.86189195, 0.86211665,
        0.83106644, 0.83630092, 0.84129363, 0.83574039],
       [0.49421409, 0.46431078, 0.4778398 , 0.17739634, 0.17881085,
        0.15259177, 0.1563106 , 0.16667581, 0.15861823],


In [128]:
library = 'sklearn (LogisticRegressor(max_iter=1000))'
# exmodel_config['library'] = library
# wandb.init(
#     project="202110_Kaggle_tabular_playground",
#     save_code=True,
#     tags=wandb_config['tags'],
#     name=wandb_config['name'],
#     notes=wandb_config['notes'],
#     config=exmodel_config
# )   

# prepare for k-fold cross-validation
# kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=True, random_state=SEED)

# setup for serialization
# model_path = Path(datapath/f"models/{wandb_config['name']}_{library}_{exmodel_config['kfolds']}folds/")
# (model_path).mkdir(exist_ok=True)

In [129]:
for fold, (train_idx, valid_idx) in enumerate(kfold.split(X,y)):
    X_train, X_valid = X[train_idx], X[valid_idx]
    y_train, y_valid = y[train_idx], y[valid_idx]
    
    print(f"FOLD {fold}")
    print("---------------------")
    
    model = LogisticRegression(max_iter=1000)
    
    model.fit(X_train, y_train)
    preds = model.predict_proba(X_valid)[:,1]
    
    oof_preds.extend(preds)
    oof_y.extend(y_valid)
    
    test_preds += model.predict_proba(X_test)[:,1]
    
    valid_auc = roc_auc_score(y_valid, preds)
    print(f"ROC AUC of fold {fold} is {valid_auc}")
    
#     dump(preds, /'lv_3)

valid_auc_total = roc_auc_score(oof_y, oof_preds)
print(f"Overall ROC_AUC is {valid_auc_total}")

dump(oof_preds, predpath/'oof_lv3_preds.joblib')
dump(oof_y, predpath/'oof_lv3_y.joblib')

test_preds /= 5

dump(test_preds, predpath/'test_lv3_preds.joblib')

FOLD 0
---------------------
ROC AUC of fold 0 is 0.5015212532875889
FOLD 1
---------------------
ROC AUC of fold 1 is 0.5006806231361965
FOLD 2
---------------------
ROC AUC of fold 2 is 0.5033895236738666
FOLD 3
---------------------
ROC AUC of fold 3 is 0.5038070526147008
FOLD 4
---------------------
ROC AUC of fold 4 is 0.5040978869942625
Overall ROC_AUC is 0.5025660215323697


['/home/sf/code/kaggle/tabular_playgrounds/oct2021/preds/test_lv3_preds.joblib']

In [None]:
# X_test_imputed_scaled = pd.read_feather(path=datapath/'X_test_NaNcounts_imputed-Median-wIndicators_StandardScaled.feather')
# X_test_imputed_scaled = pd.read_feather(path=datapath/'X_test_NaNcounts_SummaryStats_imputed-Median-wIndicators-StandardScaled.feather')

## Prediction Generation

In [None]:
# preds_path = Path(datapath/"preds/")

# blender_preds = blender.predict_proba(X_test_imputed_scaled)[:,1]
# dump(blender_preds, preds_path/f"{config_run['name']}_stack.joblib")

# Ensemble Submission

In [121]:
sample_df = pd.read_csv(datapath/'sample_submission.csv.zip')

In [122]:
sample_df.loc[:, 'target'] = test_preds

In [123]:
sample_df.head()

Unnamed: 0,id,target
0,1000000,0.495505
1,1000001,0.507115
2,1000002,0.499234
3,1000003,0.4982
4,1000004,0.501345


In [None]:
# submission_path = datapath/'submissions'
# submission_path.mkdir(exist_ok=True)

In [124]:
sample_df.to_csv(subpath/f"{wandb_config['name']}_3level-GBM-ensemble_{exmodel_config['kfolds']}folds_rs{42}_baseline_preds.csv", index=False)

In [56]:
wandb.log({'leaderboard_auc': ,
#            'catboost_params': str(best_catboost_params),
          })

In [57]:
wandb.finish()

VBox(children=(Label(value=' 0.23MB of 0.23MB uploaded (0.06MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
blender-cv,5
blender-final_estimator,LogisticRegression(m...
blender-stack_mdethod,predict_proba
leaderboard_auc,0.81725
lightgbm_params,{'boosting_type': 'g...
test_set,/media/sf/easystore/...
train_auc,0.8414
train_loss,0.494
xgboost_params,{'objective': 'binar...


0,1
blender-cv,▁
leaderboard_auc,▁
train_auc,▁
train_loss,▁
