# Baseline
This notebook sets up a more robust baseline, suitable for use with all of the "Big Three" gradient-boosting libraries (XGBoost, CatBoost, LightGBM) and runnable either on Google Colab or on a local machine.

# Setup

In [1]:
# Manual run switches (these used to live in the config dict).
COLAB, USE_GPU = False, True
# Libraries in play; the Colab install cell consults this list.
libraries = 'xgboost lightgbm catboost'.split()

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime
import random

In [3]:
# render matplotlib figures inline in the notebook output
%matplotlib inline
# switch off the jedi-based tab completer
%config Completer.use_jedi = False
# notebook name reported to Weights & Biases: "stacking_manual_<YYYYMMDD>.ipynb", dated at the time this cell runs
os.environ['WANDB_NOTEBOOK_NAME'] = f"stacking_manual_{datetime.now().strftime('%Y%m%d')}.ipynb"

In [4]:
# handle Google Colab-specific library installation/updating
# Nothing in this cell runs unless COLAB is True; `libraries` and USE_GPU choose which packages are installed and how.
if COLAB:
    # much of the below inspired by or cribbed from the May 2021 Kaggle Tabular Playground winner, at 
    # https://colab.research.google.com/gist/academicsuspect/0aac7bd6e506f5f70295bfc9a3dc2250/tabular-may-baseline.ipynb?authuser=1#scrollTo=LJoVKJb5wN0L
    
    # Kaggle API for downloading the datasets
#     !pip install --upgrade -q kaggle

    # weights and biases
    !pip install -qqqU wandb
    
    # Optuna for parameter search
    !pip install -q optuna

    # upgrade sklearn
    !pip install --upgrade scikit-learn

#     !pip install category_encoders
    
    if 'catboost' in libraries:
        !pip install catboost
    
    if 'xgboost' in libraries:
        if USE_GPU: 
            # Build XGBoost from source with CUDA enabled, then install its Python package.
            # this part is from https://github.com/rapidsai/gputreeshap/issues/24
            !pip install cmake --upgrade
            # !pip install sklearn --upgrade
            !git clone --recursive https://github.com/dmlc/xgboost
            %cd /content/xgboost
            !mkdir build
            %cd build
            !cmake .. -DUSE_CUDA=ON
            !make -j4
            %cd /content/xgboost/python-package
            !python setup.py install --use-cuda --use-nccl
            # print the GPU status
            !/opt/bin/nvidia-smi
            !pip install shap
        else:
            !pip install --upgrade xgboost
    if 'lightgbm' in libraries:
        if USE_GPU:
            # Build a GPU-enabled LightGBM from source.
            # NOTE(review): if the XGBoost GPU branch above ran, the last %cd left the working directory
            # at /content/xgboost/python-package, so the clone below lands there -- confirm that is acceptable.
            !git clone --recursive https://github.com/Microsoft/LightGBM
            ! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;
        else:
            !pip install --upgrade lightgbm
        

        

Now, non-stdlib imports

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
from sklearn.impute import SimpleImputer #, KNNImputer
# import timm

import seaborn as sns

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.preprocessing import StandardScaler #, MinMaxScaler, MaxAbsScaler, RobustScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
# from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft


In [6]:
from pytorch_widedeep import Trainer
from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.models import Wide, TabMlp, WideDeep, SAINT, TabTransformer, TabNet, TabFastFormer, TabResnet
from pytorch_widedeep.metrics import Accuracy
from torchmetrics import AUROC
import torch
from torch.optim import Adam, AdamW, Adagrad, SGD, RMSprop, LBFGS
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts, CyclicLR, OneCycleLR, StepLR, CosineAnnealingLR
from pytorch_widedeep.callbacks import EarlyStopping, LRHistory, ModelCheckpoint

Now, datapath setup

In [7]:
# # This is the code for reading the train.csv and converting it to a .feather file
# df = pd.read_csv(datapath/'train.csv', index_col='id', low_memory=False)
# df.index.name = None
# df.to_feather(path='./dataset_df.feather')

  and should_run_async(code)


In [8]:
# Resolve every directory the notebook reads from or writes to.
# All of root/datapath/edapath/modelpath/predpath/subpath are defined on BOTH platforms,
# because later cells (model serialization, prediction dumps, submissions) use them unconditionally.
if COLAB:
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')

    # on Colab the datasets sit directly in the project folder on Drive
    root = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/oct2021/')
    datapath = root
    modelpath = root/'models'
else:
    # if on local machine
    root = Path('/home/sf/code/kaggle/tabular_playgrounds/oct2021/')
    datapath = root/'datasets'
    # serialized models go to the external drive rather than the code tree
    modelpath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/oct2021/models/')

edapath = root/'EDA'
predpath = root/'preds'
subpath = root/'submissions'

# parents=True so a fresh machine (or a fresh Drive) does not fail on a missing intermediate directory
for pth in [root, datapath, edapath, modelpath, predpath, subpath]:
    pth.mkdir(parents=True, exist_ok=True)
    


In [9]:
SEED = 42

def seed_everything(seed):
    """
    Seed every random number generator this notebook relies on.

    :param seed: int used for Python's `random`, NumPy's legacy global RNG and PyTorch
    """
    random.seed(seed)
    np.random.seed(seed)
    # torch is imported and used for the wide-and-deep models, so its RNG must be seeded as well
    torch.manual_seed(seed)
    # exported for any child processes; the hash seed of the already-running interpreter is not changed by this
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(seed=SEED)

## Ex-Model Config

In [10]:
# Everything that is NOT a model hyperparameter: cross-validation (and, later, preprocessing) settings.
# Feature-selection / feature-generation keys from earlier experiments are currently disabled.
exmodel_config = dict(
    cross_val_strategy=KFold,  # sklearn splitter class; instantiated inside cross_validate_model
    kfolds=5,                  # handed to the splitter as n_splits
    test_size=0.2,             # NOTE(review): not read by any code visible in this notebook -- confirm whether still needed
)

## Data Setup

**TODO**: Write some conditional logic here to automate the data setup (e.g. optional scaling) -- possibly as part of an `sklearn.pipeline.Pipeline`.

In [11]:
# if exmodel_config['scaler']:
#     scaler = exmodel_config['scaler']()
#     scaler.fit_transform()

In [12]:
# Read the training frame and separate the label from the feature columns.
train_source = datapath.joinpath('train.feather')
df = pd.read_feather(train_source)
features = [col for col in df.columns if col != 'target']
# the label travels as an np.ndarray ...
y = np.array(df['target'])
# ... while the features stay a pd.DataFrame, because the trainer below needs the column names
X = df.loc[:, features]

In [13]:
# sanity check: (rows, feature columns) of the training features
X.shape

  and should_run_async(code)


(1000000, 285)

In [14]:
# X_train.columns

In [15]:
# wide_cols = [f for f in X_train.columns if X_train[f].nunique() == 2]

In [16]:
# wide_cols

In [17]:
# X_train.shape

In [18]:
# X = np.array(X_train)
# y = np.array(y_train)

# del df, X_train, y_train


# Record the dataset dimensions and its source file in the run config (this dict is what gets sent to wandb).
n_instances, n_features = X.shape
exmodel_config.update(
    feature_count=n_features,
    instance_count=n_instances,
    train_source=str(train_source),
)

In [19]:
# X_df = pd.DataFrame(X)

In [20]:
# print(X_df.iloc[:,0])

In [21]:
# print(X_df.iloc[:, list(X_df.columns)[0]])

In [22]:
# Read the test frame and record its source in the run config.
test_source = datapath/'test.feather'
exmodel_config['test_source'] = str(test_source)
X_test = pd.read_feather(path=test_source)
# Keep exactly the training feature columns, in training order. Selecting by name (rather than
# slicing off "the first column") drops `id` wherever it sits and cannot silently discard a real
# feature if the file layout changes.
X_test = X_test[features]

In [23]:
# X_test = np.array(X_test)

## Weights and Biases Run Config

Below is the configuration for a Weights and Biases (`wandb`) run.

In [24]:
# wandb config: the run name is the notebook filename's stem plus the time this cell ran
_notebook_stem = os.environ['WANDB_NOTEBOOK_NAME'][:-6]  # strip the 6-character ".ipynb" suffix
_started_at = datetime.now().strftime('%H%M%S')
wandb_config = dict(
    name=f"{_notebook_stem}_{_started_at}",
    tags=['baseline', 'deep_learning'],
    notes="Trying wide-deep",
)

# Training

# Hyperparameters

In [25]:
# Tuned hyperparameters, one dict per library; each is splatted into that library's classifier constructor.

# XGBoost -- Optuna, 2021-10-04, 106 trials on the unaltered original dataset
best_xgboost_params = dict(
    n_estimators=3878,
    max_depth=4,
    learning_rate=0.024785857161974977,
    reg_alpha=26.867682044658245,
    reg_lambda=10.839759074147148,
    subsample=0.8208581489835881,
    min_child_weight=8.829122644339664,
    colsample_bytree=0.906420714280384,
    gamma=1.472322916021486,
)

# LightGBM -- best as of 2021-10-05, 65 trials on the unaltered original dataset
best_lightgbm_params = dict(
    n_estimators=6631,
    max_depth=10,
    learning_rate=0.004677044539666842,
    reg_alpha=19.334971246299116,
    reg_lambda=0.024384251140153856,
    subsample=0.5082183652689569,
    boosting_type='gbdt',
    min_child_samples=9,
    num_leaves=233,
    colsample_bytree=0.5008014086989773,
)

# CatBoost -- 2021-10-01, 100 trials on a Colab GPU, unaltered original dataset
best_catboost_params = dict(
    iterations=29338,
    max_depth=9,
    learning_rate=0.004769831650275205,
    random_strength=7,
    od_wait=1968,
    reg_lambda=28.435563240493586,
    border_count=162,
    min_child_samples=14,
    leaf_estimation_iterations=1,
)

In [26]:
def cross_validate_model(library:str, params:dict=None, X=X, y=y, X_test=X_test, start_fold=0,
                         exmodel_config=exmodel_config, wandb_config=wandb_config,
                         random_state=42, shuffle_kfolds=True, wandb_tracked=True,
                         n_epochs=500, batch_size=1024):
    """
    Train one model per cross-validation fold and collect out-of-fold (OOF) and test-set predictions.

    Besides returning the predictions, the function prints per-fold and overall validation AUC,
    serializes every fold's model under `modelpath`, dumps the OOF predictions, OOF ground truth and
    averaged test predictions under `predpath`, and (optionally) logs the run to Weights & Biases.

    Note: the defaults for X, y, X_test, exmodel_config and wandb_config are captured when this cell
    is executed; re-run the cell (or pass the arguments explicitly) after changing those globals.
    Hold-out validation is not implemented here; `exmodel_config['cross_val_strategy']` must be a splitter class.

    :param library: 'xgboost', 'lightgbm', 'catboost', or any string containing 'widedeep'
                    (only a Wide + TabMlp model is coded for the latter)
    :param params: dict of model hyperparameters splatted into the GBM constructor (unused for widedeep); None means {}
    :param X: pd.DataFrame of training features
    :param y: np.ndarray of training labels
    :param X_test: pd.DataFrame of test features with the same columns as X
    :param start_fold: index of the first fold to train; earlier folds are skipped
    :param exmodel_config: dict of general config; 'cross_val_strategy' (splitter class) and 'kfolds' are read here
    :param wandb_config: dict with the 'name', 'tags' and 'notes' of the wandb run; 'name' also prefixes output files
    :param random_state: seed handed to the model (the fold split itself always uses the notebook-wide SEED)
    :param shuffle_kfolds: whether the splitter shuffles before splitting
    :param wandb_tracked: whether to open, log to and close a wandb run
    :param n_epochs: training epochs per fold for the widedeep model
    :param batch_size: batch size for widedeep training and inference
    :return: (oof_preds, test_preds) -- a list of OOF probabilities in fold order, and an np.ndarray of
             test probabilities averaged over the folds that were trained
    :raises ValueError: on an unsupported `library`, or if `start_fold` would skip every fold
    """
    params = {} if params is None else params
    is_widedeep = 'widedeep' in library
    if not is_widedeep and library not in ('xgboost', 'lightgbm', 'catboost'):
        raise ValueError(f"Unsupported library: {library!r}")

    n_splits = exmodel_config['kfolds']
    if start_fold >= n_splits:
        raise ValueError(f"start_fold={start_fold} skips all {n_splits} folds; nothing to train")

    # prepare for k-fold cross-validation; random-state here is notebook-wide, not per-model
    # shuffle on the initial sets, but not subsequently -- performing the same operation twice means a very different dataset
    splitter_cls = exmodel_config['cross_val_strategy']
    if shuffle_kfolds:
        kfold = splitter_cls(n_splits=n_splits, shuffle=True, random_state=SEED)
    else:
        kfold = splitter_cls(n_splits=n_splits, shuffle=False)

    if wandb_tracked:
        # log a per-run copy: writing into the shared dict would make every later run
        # inherit the '<library>_params' entries of all runs before it
        run_config = dict(exmodel_config)
        run_config['library'] = library
        run_config[f'{library}_params'] = str(params)
        wandb.init(
            project="202110_Kaggle_tabular_playground",
            save_code=True,
            tags=wandb_config['tags'],
            name=wandb_config['name'],
            notes=wandb_config['notes'],
            config=run_config,
        )

    # setup for serialization
    runpath = Path(modelpath/f"{wandb_config['name']}_{library}_{exmodel_config['kfolds']}folds/")
    runpath.mkdir(parents=True, exist_ok=True)

    # out-of-fold preds and ground truth, appended fold by fold (so both are in fold order, not row order)
    oof_preds, oof_y = [], []

    # running sum of the fold-models' test-set preds; averaged after the loop
    test_preds = np.zeros((X_test.shape[0]))

    if is_widedeep:
        # pytorch-widedeep preprocessing is fitted once on the full training frame, before the splits
        n_unique = X.nunique()  # computed once instead of twice per column
        wide_cols = [f for f in X.columns if n_unique[f] == 2]  # binary indicator vars are wide
        cont_cols = [f for f in X.columns if n_unique[f] > 2]   # others are cont

        # wide part
        wide_preprocessor = WidePreprocessor(wide_cols=wide_cols)
        X_wide = wide_preprocessor.fit_transform(X)

        # deep part
        tab_preprocessor = TabPreprocessor(continuous_cols=cont_cols)
        X_tab = tab_preprocessor.fit_transform(X)

        # transforming the test set
        X_test_wide = wide_preprocessor.transform(X_test)
        X_test_tab = tab_preprocessor.transform(X_test)
        # the fold loop below indexes all four of these positionally, as arrays
    else:
        # if using a GBM, simply convert the pd.DataFrames to np.ndarrays
        X = np.array(X)
        X_test = np.array(X_test)

    folds_trained = 0
    for fold, (train_ids, valid_ids) in enumerate(kfold.split(X, y)):
        if fold < start_fold:  # skip folds that are already trained
            continue

        print(f"FOLD {fold}")
        print("---------------------------------------------------")
        y_train, y_valid = y[train_ids], y[valid_ids]  # y is an np.ndarray; handling is the same regardless of model

        if is_widedeep:  # only coding for TabMlp right now
            # handle wide and deep tabs in parallel
            X_train_wide, X_valid_wide = X_wide[train_ids, :], X_wide[valid_ids, :]
            X_train_tab, X_valid_tab = X_tab[train_ids, :], X_tab[valid_ids, :]

            wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=1)
            deeptabular = TabMlp(
                mlp_hidden_dims=[64,32],
                column_idx=tab_preprocessor.column_idx,
                continuous_cols=cont_cols,
            )
            model = WideDeep(wide=wide, deeptabular=deeptabular)

            # pytorch hyperparams
            wide_opt = AdamW(model.wide.parameters(), lr=0.1)
            deep_opt = AdamW(model.deeptabular.parameters(), lr=0.1)

            # NOTE(review): these schedulers are built but NOT passed to the Trainer below. They are kept
            # because, per PyTorch's OneCycleLR behaviour, constructing one already rewrites its optimizer's
            # learning rate to the schedule's starting value -- removing them would presumably change training;
            # TODO confirm. steps_per_epoch here is the number of samples, not the number of batches.
            wide_sch = OneCycleLR(optimizer=wide_opt, max_lr=0.01, steps_per_epoch=X_train_wide.shape[0], epochs=n_epochs)
            deep_sch = OneCycleLR(optimizer=deep_opt, max_lr=0.01, steps_per_epoch=X_train_tab.shape[0], epochs=n_epochs)

            optimizers = {'wide': wide_opt, 'deeptabular': deep_opt }
            lr_schedulers = {'wide': wide_sch, 'deeptabular': deep_sch }

            callbacks = [
                LRHistory(n_epochs=n_epochs),
            ]

            trainer = Trainer(model=model,
                              objective='binary',
                              metrics=[Accuracy], # with AUROC got TypeError: '>' not supported between instances of 'NoneType' and 'int'
                              seed=random_state,
                              optimizers=optimizers,
                              callbacks=callbacks
                             )

            trainer.fit(
                X_wide=X_train_wide,
                X_tab=X_train_tab,
                target=y_train,
                n_epochs=n_epochs,
                batch_size=batch_size,
            )

            y_valid_preds = trainer.predict_proba(X_wide=X_valid_wide, X_tab=X_valid_tab, batch_size=batch_size)[:,1]
            fold_test_preds = trainer.predict_proba(X_wide=X_test_wide, X_tab=X_test_tab, batch_size=batch_size)[:,1]

        else:
            # handle datasets for GBMs
            X_train, X_valid = X[train_ids], X[valid_ids]

            if library == 'xgboost':
                model = XGBClassifier(
                    booster='gbtree',
                    tree_method='gpu_hist',
                    random_state=random_state,
                    n_jobs=-1,
                    verbosity=1,
                    objective='binary:logistic',
                    **params)
                fit_kwargs = {'callbacks': [wandb.xgboost.wandb_callback()]} if wandb_tracked else {}
            elif library == 'lightgbm':
                model = LGBMClassifier(
                    objective='binary',
                    random_state=random_state,
                    device_type='gpu',
                    max_bin=63, # 15 might be even better for GPU perf, but depends on dataset -- see https://lightgbm.readthedocs.io/en/latest/GPU-Performance.html
                    # gpu_use_dp=True requests DOUBLE-precision math on the GPU (LightGBM's default is single).
                    # NOTE(review): the earlier comment here claimed this forces single precision for speed;
                    # if speed is the goal this should be False. Value left as-is so results stay comparable.
                    gpu_use_dp=True,
                    **params)
                fit_kwargs = {'callbacks': [wandb.lightgbm.wandb_callback()]} if wandb_tracked else {}
            else:  # 'catboost' -- no wandb callback is attached for this library
                model = CatBoostClassifier(
                    task_type='GPU',
                    silent=True,
                    random_state=random_state,
                    **params)
                fit_kwargs = {}

            model.fit(X_train, y_train, **fit_kwargs)
            y_valid_preds = model.predict_proba(X_valid)[:,1]
            fold_test_preds = model.predict_proba(X_test)[:,1]

        # add the fold-model's OOF preds and ground truths to the out-of-loop lists
        oof_preds.extend(y_valid_preds)
        oof_y.extend(y_valid)

        # add the fold's predictions to the model's test-set predictions (divided after the loop)
        test_preds += fold_test_preds
        folds_trained += 1

        # give the valid AUC score, for edification
        fold_valid_auc = roc_auc_score(y_valid, y_valid_preds)
        print(f"Valid AUC for fold {fold} is {fold_valid_auc}")
        dump(model, Path(runpath/f"{library}_fold{fold}_rs{random_state}_model.joblib"))

    model_valid_auc = roc_auc_score(oof_y, oof_preds)
    print(f"Valid AUC score for {library} model is {model_valid_auc}")

    # finalize test preds: average over the folds that actually ran, which is fewer than
    # exmodel_config['kfolds'] whenever start_fold > 0
    test_preds /= folds_trained

    # save OOF preds and test-set preds (widedeep filenames additionally record the epoch count)
    epochs_tag = f"_{n_epochs}epochs" if is_widedeep else ""
    pred_stem = f"{wandb_config['name']}_{library}_{exmodel_config['kfolds']}folds{epochs_tag}_model-rs{random_state}"
    dump(oof_preds, Path(predpath/f"{pred_stem}_oof_preds.joblib"))
    dump(test_preds, Path(predpath/f"{pred_stem}_test_preds.joblib"))
    dump(oof_y, predpath/f"{exmodel_config['kfolds']}folds{epochs_tag}_seed-rs{SEED}_oof_y.joblib")

    if wandb_tracked:
        wandb.log({'model_valid_auc': model_valid_auc,
                   'oof_preds': oof_preds,
                   'test_preds': test_preds,
                   # str(model.parameters()) would only be a generator repr, so log the module repr for widedeep
                   'model_params': str(model) if is_widedeep else str(model.get_params()),
        })
        wandb.finish()
    return oof_preds, test_preds
        

In [None]:
# oof_lv1_lgb1983, test_lv1_lgb1983 = cross_validate_model(library='lightgbm', wandb_tracked=True, random_state=1983)

In [None]:
# oof_lv1_cat1983, test_lv1_cat1983 = cross_validate_model(library='catboost', wandb_tracked=True, random_state=1983)

In [None]:
# oof_lv1_xgb1983, test_lv1_xgb1983 = cross_validate_model(library='xgboost', wandb_tracked=True, random_state=1983)

In [None]:
# oof_lv1_tabmlp1983, test_lv1_tabmlp1983 = cross_validate_model(library='widedeep-TabMLP', wandb_tracked=False, random_state=1983)

In [None]:
# widedeep_oof_preds = load(predpath/f"{wandb_config['name']}_widedeep-TabMLP_{exmodel_config['kfolds']}folds_rs{42}_oof_preds.joblib")

In [None]:
# widedeep_test_preds = load(predpath/f"{wandb_config['name']}_widedeep-TabMLP_{exmodel_config['kfolds']}folds_rs{42}_test_preds.joblib")

In [None]:
# wide-and-deep (TabMlp) cross-validation with model seed 42, tracked in Weights & Biases
oof_lv1_tabmlp42, test_lv1_tabmlp42 = cross_validate_model(library='widedeep-TabMLP', wandb_tracked=True, random_state=42)

[34m[1mwandb[0m: Currently logged in as: [33mhushifang[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


FOLD 0
---------------------------------------------------


epoch 1: 100%|██████████| 782/782 [00:10<00:00, 74.41it/s, loss=0.948, metrics={'acc': 0.5859}]
epoch 2: 100%|██████████| 782/782 [00:09<00:00, 78.72it/s, loss=0.688, metrics={'acc': 0.6534}]
epoch 3: 100%|██████████| 782/782 [00:10<00:00, 76.81it/s, loss=0.556, metrics={'acc': 0.7197}]
epoch 4: 100%|██████████| 782/782 [00:10<00:00, 77.17it/s, loss=0.508, metrics={'acc': 0.7524}]
epoch 5: 100%|██████████| 782/782 [00:10<00:00, 76.32it/s, loss=0.495, metrics={'acc': 0.7602}]
epoch 6: 100%|██████████| 782/782 [00:09<00:00, 79.76it/s, loss=0.492, metrics={'acc': 0.7614}]
epoch 7: 100%|██████████| 782/782 [00:10<00:00, 76.78it/s, loss=0.491, metrics={'acc': 0.7622}]
epoch 8: 100%|██████████| 782/782 [00:10<00:00, 77.66it/s, loss=0.49, metrics={'acc': 0.7622}] 
epoch 9: 100%|██████████| 782/782 [00:10<00:00, 77.38it/s, loss=0.489, metrics={'acc': 0.7629}]
epoch 10: 100%|██████████| 782/782 [00:09<00:00, 79.10it/s, loss=0.488, metrics={'acc': 0.763}] 
epoch 11: 100%|██████████| 782/782 [00:

In [None]:
# extra copies of the seed-42 predictions, under filenames that spell out the 500-epoch setting
# (cross_validate_model already wrote its own dumps to predpath)
dump(oof_lv1_tabmlp42, predpath/f"{wandb_config['name']}_widedeep-TabMLP_{exmodel_config['kfolds']}folds_rs{42}_500epochs_oof_preds.joblib")
dump(test_lv1_tabmlp42, predpath/f"{wandb_config['name']}_widedeep-TabMLP_{exmodel_config['kfolds']}folds_rs{42}_500epochs_test_preds.joblib")

In [None]:
# same wide-and-deep run with model seed 1983 (the fold split still uses the notebook-wide SEED)
oof_lv1_tabmlp1983, test_lv1_tabmlp1983 = cross_validate_model(library='widedeep-TabMLP', wandb_tracked=True, random_state=1983)

In [33]:
# extra copies of the seed-1983 predictions, under filenames that spell out the 500-epoch setting
dump(oof_lv1_tabmlp1983, predpath/f"{wandb_config['name']}_widedeep-TabMLP_{exmodel_config['kfolds']}folds_rs{1983}_500epochs_oof_preds.joblib")
dump(test_lv1_tabmlp1983, predpath/f"{wandb_config['name']}_widedeep-TabMLP_{exmodel_config['kfolds']}folds_rs{1983}_500epochs_test_preds.joblib")

['/home/sf/code/kaggle/tabular_playgrounds/oct2021/preds/stacking_manual_20211012_194716_widedeep-TabMLP_5folds_rs1983_500epochs_test_preds.joblib']

In [None]:
# widedeep_oof_preds[:10]

In [33]:
# widedeep_test_preds[:10]

# Single Submission

In [39]:
# the sample submission serves as the template: its `target` column is overwritten below
sample_df = pd.read_csv(datapath/'sample_submission.csv.zip')

In [40]:
# fill in the seed-1983 TabMlp test predictions; assumes they are in the same row order as the sample file -- TODO confirm
sample_df.loc[:, 'target'] = test_lv1_tabmlp1983

In [41]:
# preview the first rows of the submission frame
sample_df.iloc[:20] # rs1983

Unnamed: 0,id,target
0,1000000,0.613051
1,1000001,0.300757
2,1000002,0.861658
3,1000003,0.831054
4,1000004,0.27175
5,1000005,0.180083
6,1000006,0.053755
7,1000007,0.297473
8,1000008,0.900432
9,1000009,0.903095


In [37]:
# NOTE(review): the saved output below comes from an earlier execution (In [37]) when the frame held the rs42 predictions
sample_df.iloc[:20] # rs42

Unnamed: 0,id,target
0,1000000,0.658592
1,1000001,0.331519
2,1000002,0.846932
3,1000003,0.837291
4,1000004,0.222636
5,1000005,0.212883
6,1000006,0.04912
7,1000007,0.347424
8,1000008,0.906956
9,1000009,0.894582


In [37]:
# submission_path = datapath/'submissions'
# submission_path.mkdir(exist_ok=True)

In [42]:
# write the submission CSV (without the index) into subpath; the filename records architecture, folds and model seed
sample_df.to_csv(subpath/f"{wandb_config['name']}_widedeep-TabMLP-500epochs-64x32_{exmodel_config['kfolds']}folds_rs{1983}_baseline_preds.csv", index=False)

  and should_run_async(code)


In [39]:
# str(blender.estimators[2][1].get_all_params())
# blender.estimators[2][1]

In [40]:
# wandb.log({'leaderboard_auc': 0.81725,
# #            'catboost_params': str(best_catboost_params),
#           })

In [41]:
# wandb.finish()

In [42]:
# oof_y_pd = pd.Series(oof_y)

# Predictions

Loading predictions from previous model runs for ensembling.

In [32]:
# Reload out-of-fold (oof_*) and test-set (test_*) predictions written by earlier runs.
# Names follow <oof|test>_lv1_<model><model seed>.

# XGBoost (the seed-42 pair is regenerated further down instead of loaded)
# oof_lv1_xgb42 = load(predpath/'stacking_manual_20211005_205933_xgboost_5folds_rs42_oof_preds.joblib')
# test_lv1_xgb42 = load(predpath/'stacking_manual_20211005_205933_xgboost_5folds_rs42_test_preds.joblib')
oof_lv1_xgb1983 = load(predpath/'stacking_manual_20211005_205933_xgboost_5folds_rs1983_oof_preds.joblib')
test_lv1_xgb1983 = load(predpath/'stacking_manual_20211005_205933_xgboost_5folds_rs1983_test_preds.joblib')

# LightGBM (the seed-42 pair is regenerated further down instead of loaded)
# oof_lv1_lgb42 = load(predpath/'stacking_manual_20211005_205933_lightgbm_5folds_rs42_oof_preds.joblib')
# test_lv1_lgb42 = load(predpath/'stacking_manual_20211005_205933_lightgbm_5folds_rs42_test_preds.joblib')
oof_lv1_lgb1983 = load(predpath/'stacking_manual_20211005_205933_lightgbm_5folds_rs1983_oof_preds.joblib')
test_lv1_lgb1983 = load(predpath/'stacking_manual_20211005_205933_lightgbm_5folds_rs1983_test_preds.joblib')

# CatBoost
oof_lv1_cat42 = load(predpath/'stacking_manual_20211007_090049_catboost_5folds_rs42_oof_preds.joblib')
test_lv1_cat42 = load(predpath/'stacking_manual_20211007_090049_catboost_5folds_rs42_test_preds.joblib')
# NOTE(review): this OOF file carries a 'validAUC_...' prefix while its test counterpart does not -- confirm both come from the same run
oof_lv1_cat1983 = load(predpath/'validAUC_0.856970844007867_stacking_manual_20211005_205933_catboost_5folds_rs1983_oof_preds.joblib')
test_lv1_cat1983 = load(predpath/'stacking_manual_20211005_205933_catboost_5folds_rs1983_test_preds.joblib')

# Wide-and-deep TabMlp
# NOTE(review): these are the 30-epoch predictions and they rebind the names produced by the
# 500-epoch seed-42 run earlier in the notebook -- confirm that is the intended version for stacking
oof_lv1_tabmlp42 = load(predpath/'stacking_manual_20211008_205340_widedeep-TabMLP_5folds_30epochs_rs42_oof_preds.joblib')
test_lv1_tabmlp42 = load(predpath/'stacking_manual_20211008_205340_widedeep-TabMLP_5folds_30epochs_rs42_test_preds.joblib')


## Level One

In [33]:
# Level-one containers: one column per base model, filled in by the cells below.
oof_lv1 = pd.DataFrame()   # out-of-fold predictions
test_lv1 = pd.DataFrame()  # test-set predictions

### Generating

In [47]:
# the two column counts should match; the saved output shows 285 vs 286, i.e. X_test still held the `id` column when this ran
X.shape, X_test.shape

((1000000, 285), (500000, 286))

In [48]:
# inspect the first test rows (the saved output still includes an `id` column)
X_test.head()

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f275,f276,f277,f278,f279,f280,f281,f282,f283,f284
0,1000000,0.178216,0.435617,0.01023,0.202074,0.39017,0.324221,0.221722,0.738894,0.582588,...,1,0,0,0,0,0,1,1,1,0
1,1000001,0.18125,0.476455,0.022413,0.283146,0.59802,0.349508,0.283467,0.721575,0.26899,...,0,0,0,0,0,0,0,0,0,0
2,1000002,0.159721,0.451202,0.259649,0.365274,0.594634,0.413502,0.249318,0.642339,0.411104,...,0,0,0,0,0,0,1,0,0,0
3,1000003,0.182424,0.520976,0.095344,0.327742,0.74183,0.358711,0.270077,0.601662,0.297742,...,0,0,0,0,0,1,1,0,0,0
4,1000004,0.229329,0.336513,0.023511,0.300913,0.668738,0.481586,0.54566,0.667849,0.546045,...,0,0,0,0,1,0,0,1,0,0


In [55]:
# Level-one LightGBM, model seed 42, with the tuned hyperparameters; tracked in wandb.
lgb42_run = dict(
    library='lightgbm',
    params=best_lightgbm_params,
    random_state=42,
    X=X,
    y=y,
    X_test=X_test,
    exmodel_config=exmodel_config,
    wandb_config=wandb_config,
    wandb_tracked=True,
)
oof_lv1_lgb42, test_lv1_lgb42 = cross_validate_model(**lgb42_run)

# register both prediction vectors as level-one columns
oof_lv1['lgb42'] = oof_lv1_lgb42
test_lv1['lgb42'] = test_lv1_lgb42

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

[34m[1mwandb[0m: wandb version 0.12.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8569902133831346
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8557177152217484
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8566120475807572
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8556486832608683
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8562722350166698
Valid AUC score for lightgbm model is 0.85624566920555




VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
model_valid_auc,▁

0,1
model_params,{'boosting_type': 'g...
model_valid_auc,0.85625


In [57]:
# Persist the seed-42 LightGBM predictions under predpath, named after the current run.
# `predpath / "..."` is already a path object, so it is passed to dump() directly.
dump(oof_lv1_lgb42, predpath/f"{wandb_config['name']}_lightgbm_{exmodel_config['kfolds']}folds_model-rs{42}_oof_preds.joblib")
dump(test_lv1_lgb42, predpath/f"{wandb_config['name']}_lightgbm_{exmodel_config['kfolds']}folds_model-rs{42}_test_preds.joblib")

['/home/sf/code/kaggle/tabular_playgrounds/oct2021/preds/stacking_manual_20211011_092728_lightgbm_5folds_model-rs42_test_preds.joblib']

In [35]:
# NOTE(review): the call that produces oof_lv1_lgb1983 / test_lv1_lgb1983 is commented out,
# so the two assignments at the bottom only work if those names already exist in the kernel.
# On a fresh kernel this cell raises NameError: re-enable the call or load the saved
# predictions before running it.
# oof_lv1_lgb1983, test_lv1_lgb1983 = cross_validate_model(library='lightgbm', X=X, y=y, X_test=X_test,
#                                                  wandb_config=wandb_config,
#                                                  random_state=1983,
#                                                  params=best_lightgbm_params,
#                                                  exmodel_config=exmodel_config,
#                                                  wandb_tracked=True
#                                                 )
# Register the seed-1983 LightGBM predictions as the 'lgb1983' column of the level-1 frames.
oof_lv1['lgb1983'] = oof_lv1_lgb1983
test_lv1['lgb1983'] = test_lv1_lgb1983

In [58]:
# Level-1 XGBoost, seed 42: same cross-validation routine as the LightGBM cell,
# switched to the xgboost library and its tuned parameters.
oof_lv1_xgb42, test_lv1_xgb42 = cross_validate_model(
    library='xgboost',
    X=X,
    y=y,
    X_test=X_test,
    wandb_config=wandb_config,
    random_state=42,
    params=best_xgboost_params,
    exmodel_config=exmodel_config,
    wandb_tracked=True,
)

# Register both prediction sets under the 'xgb42' column of the level-1 frames.
oof_lv1['xgb42'], test_lv1['xgb42'] = oof_lv1_xgb42, test_lv1_xgb42

[34m[1mwandb[0m: wandb version 0.12.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


FOLD 0
---------------------------------------------------




Valid AUC for fold 0 is 0.8572554115376164
FOLD 1
---------------------------------------------------




Valid AUC for fold 1 is 0.8561654493709842
FOLD 2
---------------------------------------------------




Valid AUC for fold 2 is 0.8572168508119474
FOLD 3
---------------------------------------------------




Valid AUC for fold 3 is 0.8560833380957398
FOLD 4
---------------------------------------------------




Valid AUC for fold 4 is 0.8567086183230934
Valid AUC score for xgboost model is 0.8566841128860819




VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
model_valid_auc,▁

0,1
model_params,{'objective': 'binar...
model_valid_auc,0.85668


In [59]:
# Persist the seed-42 XGBoost predictions under predpath, named after the current run.
# `predpath / "..."` is already a path object, so it is passed to dump() directly.
dump(oof_lv1_xgb42, predpath/f"{wandb_config['name']}_xgboost_{exmodel_config['kfolds']}folds_model-rs{42}_oof_preds.joblib")
dump(test_lv1_xgb42, predpath/f"{wandb_config['name']}_xgboost_{exmodel_config['kfolds']}folds_model-rs{42}_test_preds.joblib")

  and should_run_async(code)


['/home/sf/code/kaggle/tabular_playgrounds/oct2021/preds/stacking_manual_20211011_092728_xgboost_5folds_model-rs42_test_preds.joblib']

In [37]:
# NOTE(review): the call that produces oof_lv1_xgb1983 / test_lv1_xgb1983 is commented out,
# so the two assignments at the bottom only work if those names already exist in the kernel.
# On a fresh kernel this cell raises NameError: re-enable the call or load the saved
# predictions before running it.
# oof_lv1_xgb1983, test_lv1_xgb1983 = cross_validate_model(library='xgboost', X=X, y=y, X_test=X_test,
#                                                  wandb_config=wandb_config,
#                                                  random_state=1983,
#                                                  params=best_xgboost_params,
#                                                  exmodel_config=exmodel_config,
#                                                  wandb_tracked=True
#                                                 )
# Register the seed-1983 XGBoost predictions as the 'xgb1983' column of the level-1 frames.
oof_lv1['xgb1983'] = oof_lv1_xgb1983
test_lv1['xgb1983'] = test_lv1_xgb1983

In [38]:
# NOTE(review): the call that produces oof_lv1_cat42 / test_lv1_cat42 is commented out,
# so the two assignments at the bottom only work if those names already exist in the kernel.
# On a fresh kernel this cell raises NameError: re-enable the call or load the saved
# predictions before running it.
# oof_lv1_cat42, test_lv1_cat42 = cross_validate_model(library='catboost', X=X, y=y, X_test=X_test,
#                                          wandb_config=wandb_config,
#                                          random_state=42,
#                                          params=best_catboost_params,
#                                          exmodel_config=exmodel_config,
#                                          wandb_tracked=True
#                                         )
# Register the seed-42 CatBoost predictions as the 'cat42' column of the level-1 frames.
oof_lv1['cat42'] = oof_lv1_cat42
test_lv1['cat42'] = test_lv1_cat42

In [39]:
# NOTE(review): the call that produces oof_lv1_cat1983 / test_lv1_cat1983 is commented out,
# so the two assignments at the bottom only work if those names already exist in the kernel.
# On a fresh kernel this cell raises NameError: re-enable the call or load the saved
# predictions before running it.
# oof_lv1_cat1983, test_lv1_cat1983 = cross_validate_model(library='catboost', X=X, y=y, X_test=X_test,
#                                                  wandb_config=wandb_config,
#                                                  random_state=1983,
#                                                  params=best_catboost_params,
#                                                  exmodel_config=exmodel_config,
#                                                  wandb_tracked=False
#                                                 )

# Register the seed-1983 CatBoost predictions as the 'cat1983' column of the level-1 frames.
oof_lv1['cat1983'] = oof_lv1_cat1983
test_lv1['cat1983'] = test_lv1_cat1983

In [42]:
# The seed-1983 TabMLP predictions are read back from joblib files in predpath.
# The seed-42 ones (oof_lv1_tabmlp42 / test_lv1_tabmlp42) are not loaded in this cell
# and must already be present in the kernel.
oof_lv1_tabmlp1983 = load(predpath/'stacking_manual_20211011_092728_widedeep-TabMLP_5folds_30epochs_model-rs1983_oof_preds.joblib')
test_lv1_tabmlp1983 = load(predpath/'stacking_manual_20211011_092728_widedeep-TabMLP_5folds_30epochs_model-rs1983_test_preds.joblib')

# Append the columns in the same order as before: tabmlp42 first, then tabmlp1983.
oof_lv1['tabmlp42'], test_lv1['tabmlp42'] = oof_lv1_tabmlp42, test_lv1_tabmlp42
oof_lv1['tabmlp1983'], test_lv1['tabmlp1983'] = oof_lv1_tabmlp1983, test_lv1_tabmlp1983

  and should_run_async(code)


In [36]:
# this cell just for after-the-fact error corrections

# oof_lv1 = pd.DataFrame()
# test_lv1 = pd.DataFrame()

# oof_lv1['lgb1983'] = oof_lv1_lgb1983
# test_lv1['lgb1983'] = test_lv1_lgb1983
# oof_lv1['lgb42'] = oof_lv1_lgb42
# test_lv1['lgb42'] = test_lv1_lgb42
# oof_lv1['cat1983'] = oof_lv1_cat1983
# test_lv1['cat1983'] = test_lv1_cat1983
# oof_lv1['cat42'] = oof_lv1_cat42
# test_lv1['cat42'] = test_lv1_cat42
# oof_lv1['xgb1983'] = oof_lv1_xgb1983
# test_lv1['xgb1983'] = test_lv1_xgb1983
# oof_lv1['xgb42'] = oof_lv1_xgb42
# test_lv1['xgb42'] = test_lv1_xgb42



### Loading Sets of Predictions

In [37]:
# oof_lv1, test_lv1 = pd.DataFrame(), pd.DataFrame()
# preds_path = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/preds/')

In [38]:
# oof_lv1['xgb42'] = load(predpath/'stacking_manual_20211005_085253_xgboost_5folds_rs42_oof_preds.joblib')
# test_lv1['xgb42'] = load(preds_path/'stacking_manual_20210925_212129_xgboost_5folds_rs42_test_preds.joblib')

In [39]:
# oof_lv_xgb42_y = load(predpath/'stacking_manual_20211005_085253_xgboost_5folds_rs42_oof_y.joblib')

In [40]:
# roc_auc_score(y_true=oof_lv_xgb42_y, y_score=oof_lv1['xgb42'])

In [41]:
# oof_lv1['xgb1983'] = load(preds_path/'validAUC_0.8146252172737458_stacking_manual_20210926_211701_xgboost_5folds_rs1983_oof_preds.joblib')
# test_lv1['xgb1983'] = load(preds_path/'stacking_manual_20210926_211701_xgboost_5folds_rs1983_test_preds.joblib')

In [42]:
# oof_lv1['lgb42'] = load(preds_path/'validAUC_0.8156810521798477_stacking_manual_20210925_212129_lightgbm_5folds_rs42_oof_preds.joblib')
# test_lv1['lgb42'] = load(preds_path/'stacking_manual_20210925_212129_lightgbm_5folds_rs42_test_preds.joblib')

In [43]:
# oof_lv1['lgb1983'] = load(preds_path/'validAUC_0.8156503194185875_stacking_manual_20210925_212129_lightgbm_5folds_rs1983_oof_preds.joblib')
# test_lv1['lgb1983'] = load(preds_path/'stacking_manual_20210925_212129_lightgbm_5folds_rs1983_test_preds.joblib')

In [44]:
# oof_lv1['cat42'] = load(preds_path/'validAUC_0.8116727090290558_stacking_manual_20210925_212129_catboost_5folds_rs42_oof_preds.joblib')
# test_lv1['cat42'] = load(preds_path/'stacking_manual_20210925_212129_catboost_5folds_rs42_test_preds.joblib')

In [45]:
# oof_lv1['cat1983'] = load(predpath/'stacking_manual_20211005_085253_catboost_5folds_rs1983_oof_preds.joblib')
# oof_cat1983_y = load(predpath/'stacking_manual_20211005_085253_catboost_5folds_rs1983_oof_y.joblib')
# roc_auc_score(y_true=oof_cat1983_y, y_score=oof_lv1['cat1983'])
# test_lv1['cat1983'] = load(preds_path/'stacking_manual_20210925_212129_catboost_5folds_rs1983_test_preds.joblib')

In [46]:
# oof_cat42_y = load(predpath/'stacking_manual_20211005_085253_catboost_5folds_rs42_oof_y.joblib')

In [47]:
# oof_cat42_y == oof_lv_xgb42_y

In [48]:
# oof_lv1.iloc[:20, :]

In [49]:
# oof_y_pd.iloc[:20]

- Open question: why does the random seed seem to matter far more than the model type when making a prediction?

### Serialization

In [60]:
oof_lv1.head()

Unnamed: 0,lgb42,lgb1983,xgb42,xgb1983,cat42,cat1983,tabmlp42,tabmlp1983
0,0.589611,0.651299,0.627152,0.666962,0.617215,0.637863,0.517801,0.523073
1,0.977506,0.979994,0.975591,0.976313,0.998443,0.978872,0.966934,0.969927
2,0.712757,0.727006,0.674095,0.670068,0.692972,0.695742,0.764769,0.683457
3,0.284891,0.307745,0.294775,0.321891,0.289597,0.281502,0.197155,0.326599
4,0.081467,0.085114,0.069561,0.071851,0.025563,0.061077,0.071227,0.035203


In [61]:
test_lv1.head()

  and should_run_async(code)


Unnamed: 0,lgb42,lgb1983,xgb42,xgb1983,cat42,cat1983,tabmlp42,tabmlp1983
0,0.705919,0.716279,0.742503,0.74467,0.697371,0.742959,0.675174,0.667623
1,0.229481,0.227226,0.263515,0.255078,0.281691,0.236283,0.280769,0.241168
2,0.910173,0.907678,0.90982,0.903481,0.923829,0.908473,0.850372,0.862173
3,0.792741,0.812557,0.8614,0.86036,0.827925,0.844619,0.793903,0.827527
4,0.291263,0.282562,0.261821,0.262067,0.299697,0.267734,0.275253,0.271981


In [52]:
# oof_lv1.to_csv('oof_lv1.csv', index=False)
# test_lv1.to_csv('test_lv1.csv', index=False)

In [62]:
# Persist the assembled level-1 OOF and test prediction frames as Feather files in
# predpath, named after the current run (wandb_config['name']).
oof_lv1.to_feather(predpath/f"{wandb_config['name']}_oof_lv1.feather")
test_lv1.to_feather(predpath/f"{wandb_config['name']}_test_lv1.feather")

  and should_run_async(code)


### Lv1 Finalization

In [63]:
# Optional reload of level-1 frames saved by an earlier run (kept disabled):
# oof_lv1 = pd.read_feather(predpath/f"stacking_manual_20211005_205933_oof_lv1.feather")#, columns=[str(x) for x in range()])
# test_lv1 = pd.read_feather(predpath/f"stacking_manual_20211005_205933_test_lv1.feather")
# Load the saved out-of-fold target vector for this fold count and SEED.
# NOTE(review): presumably ordered to match the OOF prediction rows; the commented-out
# cells further down record an earlier, misordered version of this file. Confirm before reuse.
oof_y = load(predpath/f"{exmodel_config['kfolds']}folds_rs{SEED}_oof_y.joblib")

In [67]:
# del oof_lv1_lgb42, oof_lv1_lgb1983, oof_lv1_cat42, oof_lv1_cat1983, test_lv1_xgb42, test_lv1_xgb1983, test_lv1_lgb42, test_lv1_lgb1983

NameError: name 'oof_lv1_lgb42' is not defined

In [68]:
# oof_lv1['target'] = oof_y

In [69]:
# oof_lv1.iloc[:30,:]

In [70]:
# roc_auc_score(y_score=oof_lv1['lgb1983'], y_true=oof_y) # 0.5002913418201236 from original, bad version

**SO THERE'S SOMETHING WRONG WITH THE TARGET**

In [72]:
# def regenerate_targets(X=X_train, y=y_train):
#     kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=True, random_state=SEED)
#     oof_y = []
#     for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
# #         X_train, X_valid = X[train_ids], X[valid_ids]
#         y_train, y_valid = y[train_ids], y[valid_ids]
#         oof_y.extend(y_valid)
#     return oof_y

In [73]:
# oof_y_restored = regenerate_targets()

In [74]:
# oof_y[:20]

In [75]:
# oof_y_restored[:20]

In [76]:
# roc_auc_score(y_score=oof_lv1['lgb1983'], y_true=oof_y_restored) # 0.8564776078917813 from fixed version, via `regenerate_targets`

There we go

In [77]:
# dump(oof_y_restored, predpath/f"{exmodel_config['kfolds']}folds_rs{SEED}_oof_y.joblib")

In [78]:
# oof_lv1.index.name = 'id'
# test_lv1.index.name = 'id'
# oof_y.index.name = 'id'
# Rebind the level-1 frames and the OOF targets to plain NumPy arrays for the level-2
# models. The DataFrame column labels are not carried over by this conversion.
oof_lv1, test_lv1, oof_y = (np.array(obj) for obj in (oof_lv1, test_lv1, oof_y))

## Level Two

In [79]:
# Empty containers for the level-2 predictions; columns are added once each model has run.
oof_lv2 = pd.DataFrame()
test_lv2 = pd.DataFrame()

In [80]:
# Level-2 XGBoost: trained on the level-1 prediction matrix instead of the raw features.
# shuffle_kfolds=False presumably keeps the folds in the row order of oof_y. TODO confirm
# against cross_validate_model.
oof_lv2_xgb42, test_lv2_xgb42 = cross_validate_model(
    library='xgboost',
    X=oof_lv1,
    y=oof_y,
    X_test=test_lv1,
    wandb_config=wandb_config,
    random_state=42,
    params=best_xgboost_params,
    exmodel_config=exmodel_config,
    shuffle_kfolds=False,
    wandb_tracked=False,
)

# Persist both level-2 prediction sets, named after the current run.
dump(oof_lv2_xgb42, predpath/f"{wandb_config['name']}_oof_lv2_xgboost42_preds.joblib")
dump(test_lv2_xgb42, predpath/f"{wandb_config['name']}_test_lv2_xgboost42_preds.joblib")



FOLD 0
---------------------------------------------------




Valid AUC for fold 0 is 0.8577202863454728
FOLD 1
---------------------------------------------------




Valid AUC for fold 1 is 0.8564854813369533
FOLD 2
---------------------------------------------------




Valid AUC for fold 2 is 0.8575762043587294
FOLD 3
---------------------------------------------------




Valid AUC for fold 3 is 0.8564854476368184
FOLD 4
---------------------------------------------------




Valid AUC for fold 4 is 0.8570768676417139
Valid AUC score for xgboost model is 0.8570391151841034


['/home/sf/code/kaggle/tabular_playgrounds/oct2021/preds/stacking_manual_20211011_092728_test_lv2_xgboost42_preds.joblib']

In [81]:
# oof_xgb_f0_rs1983 = load('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/models/stacking_manual_20210926_211701_xgboost_5folds/xgboost_fold0_model.joblib')
# oof_xgb_f0_rs42 = load('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/models/stacking_manual_20210925_212129_xgboost_5folds/xgboost_fold0_model.joblib')

  and should_run_async(code)


In [82]:
# Level-2 CatBoost on the level-1 prediction matrix, unshuffled folds, no wandb tracking.
oof_lv2_cat42, test_lv2_cat42 = cross_validate_model(
    library='catboost',
    X=oof_lv1,
    y=oof_y,
    X_test=test_lv1,
    wandb_config=wandb_config,
    random_state=42,
    params=best_catboost_params,
    exmodel_config=exmodel_config,
    shuffle_kfolds=False,
    wandb_tracked=False,
)

FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8573228581387562
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8559621775783263
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8571996944525527
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.8560668401235401
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.8565969696863505
Valid AUC score for catboost model is 0.856613188427349


In [83]:
# Persist the level-2 CatBoost OOF and test predictions, named after the current run.
dump(oof_lv2_cat42, predpath/f"{wandb_config['name']}_oof_lv2_catboost42_preds.joblib")
dump(test_lv2_cat42, predpath/f"{wandb_config['name']}_test_lv2_catboost42_preds.joblib")

  and should_run_async(code)


['/home/sf/code/kaggle/tabular_playgrounds/oct2021/preds/stacking_manual_20211011_092728_test_lv2_catboost42_preds.joblib']

In [84]:

# Level-2 LightGBM on the level-1 prediction matrix, unshuffled folds, no wandb tracking.
oof_lv2_lgb42, test_lv2_lgb42 = cross_validate_model(
    library='lightgbm',
    X=oof_lv1,
    y=oof_y,
    X_test=test_lv1,
    wandb_config=wandb_config,
    random_state=42,
    params=best_lightgbm_params,
    exmodel_config=exmodel_config,
    shuffle_kfolds=False,
    wandb_tracked=False,
)

FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.8573649549394676
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.8560835095975796
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.8572661930389673
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.856140245272314
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.856681672252205
Valid AUC score for lightgbm model is 0.8566881890659169


In [85]:
# Persist the level-2 LightGBM OOF and test predictions, named after the current run.
dump(oof_lv2_lgb42, predpath/f"{wandb_config['name']}_oof_lv2_lightgbm42_preds.joblib")
dump(test_lv2_lgb42, predpath/f"{wandb_config['name']}_test_lv2_lightgbm42_preds.joblib")

  and should_run_async(code)


['/home/sf/code/kaggle/tabular_playgrounds/oct2021/preds/stacking_manual_20211011_092728_test_lv2_lightgbm42_preds.joblib']

In [86]:
# Collect the three level-2 models' predictions into the level-2 frames, one column per
# library, in the order xgboost, catboost, lightgbm.
oof_lv2['xgboost'], oof_lv2['catboost'], oof_lv2['lightgbm'] = (
    oof_lv2_xgb42,
    oof_lv2_cat42,
    oof_lv2_lgb42,
)
test_lv2['xgboost'], test_lv2['catboost'], test_lv2['lightgbm'] = (
    test_lv2_xgb42,
    test_lv2_cat42,
    test_lv2_lgb42,
)

In [87]:
oof_lv2.head()

Unnamed: 0,xgboost,catboost,lightgbm
0,0.591556,0.633439,0.596115
1,0.99908,0.99862,0.999482
2,0.672347,0.667398,0.6664
3,0.323725,0.313328,0.326222
4,0.02613,0.026472,0.036818


In [88]:
# oof_lv1_df = pd.read_feather(predpath/f"{wandb_config['name']}_oof_lv1.feather")

  and should_run_async(code)


In [89]:
# Wrap the level-1 OOF array back into a DataFrame so it can be joined to oof_lv2.
# Columns come out as integer positions (0..7), not the original model names.
oof_lv1 = pd.DataFrame(oof_lv1)

In [90]:
oof_lv1.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.589611,0.651299,0.627152,0.666962,0.617215,0.637863,0.517801,0.523073
1,0.977506,0.979994,0.975591,0.976313,0.998443,0.978872,0.966934,0.969927
2,0.712757,0.727006,0.674095,0.670068,0.692972,0.695742,0.764769,0.683457
3,0.284891,0.307745,0.294775,0.321891,0.289597,0.281502,0.197155,0.326599
4,0.081467,0.085114,0.069561,0.071851,0.025563,0.061077,0.071227,0.035203


In [91]:
# Level-3 training features: level-2 predictions followed by the level-1 predictions,
# joined on the row index.
oof_lv2_full = oof_lv2.join(oof_lv1)

  and should_run_async(code)


In [92]:
oof_lv2_full.head()

Unnamed: 0,xgboost,catboost,lightgbm,0,1,2,3,4,5,6,7
0,0.591556,0.633439,0.596115,0.589611,0.651299,0.627152,0.666962,0.617215,0.637863,0.517801,0.523073
1,0.99908,0.99862,0.999482,0.977506,0.979994,0.975591,0.976313,0.998443,0.978872,0.966934,0.969927
2,0.672347,0.667398,0.6664,0.712757,0.727006,0.674095,0.670068,0.692972,0.695742,0.764769,0.683457
3,0.323725,0.313328,0.326222,0.284891,0.307745,0.294775,0.321891,0.289597,0.281502,0.197155,0.326599
4,0.02613,0.026472,0.036818,0.081467,0.085114,0.069561,0.071851,0.025563,0.061077,0.071227,0.035203


In [93]:
# Wrap the level-1 test array back into a DataFrame so it can be joined to test_lv2.
# Columns come out as integer positions (0..7), not the original model names.
test_lv1 = pd.DataFrame(test_lv1)

  and should_run_async(code)


In [94]:
test_lv1.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.705919,0.716279,0.742503,0.74467,0.697371,0.742959,0.675174,0.667623
1,0.229481,0.227226,0.263515,0.255078,0.281691,0.236283,0.280769,0.241168
2,0.910173,0.907678,0.90982,0.903481,0.923829,0.908473,0.850372,0.862173
3,0.792741,0.812557,0.8614,0.86036,0.827925,0.844619,0.793903,0.827527
4,0.291263,0.282562,0.261821,0.262067,0.299697,0.267734,0.275253,0.271981


In [95]:
# Level-3 test features, built the same way as oof_lv2_full: level-2 columns first,
# then the level-1 columns, joined on the row index.
test_lv2_full = test_lv2.join(test_lv1)

  and should_run_async(code)


In [96]:
test_lv2_full.head()

Unnamed: 0,xgboost,catboost,lightgbm,0,1,2,3,4,5,6,7
0,0.700139,0.701442,0.690154,0.705919,0.716279,0.742503,0.74467,0.697371,0.742959,0.675174,0.667623
1,0.267479,0.289907,0.293804,0.229481,0.227226,0.263515,0.255078,0.281691,0.236283,0.280769,0.241168
2,0.917104,0.913674,0.916517,0.910173,0.907678,0.90982,0.903481,0.923829,0.908473,0.850372,0.862173
3,0.814998,0.815807,0.811139,0.792741,0.812557,0.8614,0.86036,0.827925,0.844619,0.793903,0.827527
4,0.301881,0.304549,0.302501,0.291263,0.282562,0.261821,0.262067,0.299697,0.267734,0.275253,0.271981


In [97]:
oof_lv2_full.head()

  and should_run_async(code)


Unnamed: 0,xgboost,catboost,lightgbm,0,1,2,3,4,5,6,7
0,0.591556,0.633439,0.596115,0.589611,0.651299,0.627152,0.666962,0.617215,0.637863,0.517801,0.523073
1,0.99908,0.99862,0.999482,0.977506,0.979994,0.975591,0.976313,0.998443,0.978872,0.966934,0.969927
2,0.672347,0.667398,0.6664,0.712757,0.727006,0.674095,0.670068,0.692972,0.695742,0.764769,0.683457
3,0.323725,0.313328,0.326222,0.284891,0.307745,0.294775,0.321891,0.289597,0.281502,0.197155,0.326599
4,0.02613,0.026472,0.036818,0.081467,0.085114,0.069561,0.071851,0.025563,0.061077,0.071227,0.035203


In [98]:
# NumPy views of the combined level-2 + level-1 feature frames for the level-3 model.
oof_lv2_np = oof_lv2_full.to_numpy()
test_lv2_np = test_lv2_full.to_numpy()

  and should_run_async(code)


In [99]:
type(oof_y)

numpy.ndarray

In [100]:
# Keep the target array under a second name: `oof_y` is rebound to an empty list
# in the level-3 section below.
oof_y_np = oof_y

## Level Three (Logistic Regression)

In [101]:
from sklearn import model_selection

In [102]:
# kfolds = model_selection.StratifiedKFold(n_splits=5, shuffle=False) # no random_state if shuffle == False

In [103]:
# Accumulators filled by the level-3 fold loop. Note this rebinds `oof_y`; the target
# array itself remains available as `oof_y_np`.
oof_preds, oof_y = [], []

In [104]:
# Running sum of per-fold test predictions, one slot per test row. Only the row count of
# X_test is used here; X_test itself is rebound to the level-2 test matrix in the next cell.
test_preds = np.zeros((X_test.shape[0]))

In [105]:
# Point the generic X / y / X_test names at the level-3 inputs. This overwrites the
# original training and test features held under those names.
X = oof_lv2_np
y = oof_y_np
X_test = test_lv2_np

In [106]:
X[:10]

array([[0.59155649, 0.63343854, 0.59611475, 0.58961118, 0.651299  ,
        0.62715232, 0.66696191, 0.61721529, 0.63786275, 0.51780111,
        0.52307266],
       [0.9990803 , 0.99862045, 0.99948165, 0.97750649, 0.97999374,
        0.9755913 , 0.97631311, 0.9984426 , 0.9788715 , 0.96693379,
        0.96992701],
       [0.67234731, 0.66739816, 0.66639993, 0.71275723, 0.72700594,
        0.6740948 , 0.67006797, 0.6929717 , 0.69574226, 0.7647689 ,
        0.68345737],
       [0.32372525, 0.31332825, 0.32622165, 0.28489091, 0.30774478,
        0.29477474, 0.32189095, 0.28959721, 0.28150182, 0.19715466,
        0.32659888],
       [0.02613011, 0.02647234, 0.03681793, 0.0814668 , 0.08511447,
        0.06956097, 0.0718514 , 0.02556273, 0.06107668, 0.07122669,
        0.03520279],
       [0.97676998, 0.97671638, 0.97270592, 0.93597717, 0.9323953 ,
        0.93387043, 0.9304139 , 0.97475635, 0.93508565, 0.91050041,
        0.92516482],
       [0.81803101, 0.81964988, 0.82615617, 0.85558287, 0.

In [107]:
# Label for the level-3 model; it is only referenced by the commented-out tracking and
# serialization code in this cell.
# NOTE(review): the label says 'LogisticRegressor', while the estimator fitted in the
# fold loop is LogisticRegression. Fix the label before re-enabling wandb logging.
library = 'sklearn (LogisticRegressor(max_iter=1000))'
# exmodel_config['library'] = library
# wandb.init(
#     project="202110_Kaggle_tabular_playground",
#     save_code=True,
#     tags=wandb_config['tags'],
#     name=wandb_config['name'],
#     notes=wandb_config['notes'],
#     config=exmodel_config
# )   

# prepare for k-fold cross-validation
# Folds are deliberately unshuffled here (the shuffled, seeded variant is kept for reference).
# kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=True, random_state=SEED)
kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=False)

# setup for serialization
# model_path = Path(datapath/f"models/{wandb_config['name']}_{library}_{exmodel_config['kfolds']}folds/")
# (model_path).mkdir(exist_ok=True)

In [108]:
# Level-3 blender: cross-validated logistic regression on the stacked prediction matrix X.
#
# The accumulators are (re)initialised in this cell so that running it a second time starts
# from empty lists and a zero vector, instead of appending to and adding onto earlier results.
oof_preds, oof_y = [], []
test_preds = np.zeros(X_test.shape[0])

for fold, (train_idx, valid_idx) in enumerate(kfold.split(X, y)):
    X_train, X_valid = X[train_idx], X[valid_idx]
    y_train, y_valid = y[train_idx], y[valid_idx]

    print(f"FOLD {fold}")
    print("---------------------")

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Positive-class probability for the held-out rows of this fold.
    preds = model.predict_proba(X_valid)[:,1]
    oof_preds.extend(preds)
    oof_y.extend(y_valid)

    # Sum the per-fold test predictions; turned into a mean after the loop.
    test_preds += model.predict_proba(X_test)[:,1]

    valid_auc = roc_auc_score(y_valid, preds)
    print(f"ROC AUC of fold {fold} is {valid_auc}")

# AUC over all held-out rows, with targets collected in the same order as the predictions.
valid_auc_total = roc_auc_score(oof_y, oof_preds)
print(f"Overall ROC_AUC is {valid_auc_total}")

dump(oof_preds, predpath/'oof_lv3_preds.joblib')
dump(oof_y, predpath/'oof_lv3_y.joblib')

# Average over the fold count that `kfold` was built with, not a literal 5, so the
# mean stays correct when exmodel_config['kfolds'] changes.
test_preds /= exmodel_config['kfolds']

dump(test_preds, predpath/'test_lv3_preds.joblib')

FOLD 0
---------------------
ROC AUC of fold 0 is 0.8566223260769172
FOLD 1
---------------------
ROC AUC of fold 1 is 0.8551355383715116
FOLD 2
---------------------
ROC AUC of fold 2 is 0.8561968791534109
FOLD 3
---------------------
ROC AUC of fold 3 is 0.8553880969102232
FOLD 4
---------------------
ROC AUC of fold 4 is 0.8558941728129841
Overall ROC_AUC is 0.855782905682136


['/home/sf/code/kaggle/tabular_playgrounds/oct2021/preds/test_lv3_preds.joblib']

In [109]:
# X_test_imputed_scaled = pd.read_feather(path=datapath/'X_test_NaNcounts_imputed-Median-wIndicators_StandardScaled.feather')
# X_test_imputed_scaled = pd.read_feather(path=datapath/'X_test_NaNcounts_SummaryStats_imputed-Median-wIndicators-StandardScaled.feather')

  and should_run_async(code)


## Prediction Generation

In [110]:
# preds_path = Path(datapath/"preds/")

# blender_preds = blender.predict_proba(X_test_imputed_scaled)[:,1]
# dump(blender_preds, preds_path/f"{config_run['name']}_stack.joblib")

# Ensemble Submission

In [111]:
# Load the submission template from datapath; its 'target' column is filled in below.
sample_df = pd.read_csv(datapath/'sample_submission.csv.zip')

In [112]:
# Fill the template with the fold-averaged level-3 test predictions.
sample_df.loc[:, 'target'] = test_preds

In [113]:
sample_df.head()

Unnamed: 0,id,target
0,1000000,0.714215
1,1000001,0.265864
2,1000002,0.935361
3,1000003,0.844627
4,1000004,0.317418


In [114]:
# submission_path = datapath/'submissions'
# submission_path.mkdir(exist_ok=True)

  and should_run_async(code)


In [115]:
# Write the three-level stack submission to subpath, named after the current run.
# The `{42}` placeholder in the f-string simply renders as the text "42".
sample_df.to_csv(subpath/f"{wandb_config['name']}_3level-GBM-ensemble_{exmodel_config['kfolds']}folds_rs{42}_preds.csv", index=False)

In [116]:
# wandb.log({'leaderboard_auc': ,
# #            'catboost_params': str(best_catboost_params),
#           })

In [117]:
# wandb.finish()

# Power Averaging

In [118]:
# Snapshot of the stack submission, taken before sample_df is reloaded and overwritten
# in the power-averaging sections below.
stack_sub = sample_df.copy()

In [119]:
# Second column of the submission frame by position, i.e. the 'target' column in the
# id/target layout shown above.
stack_preds = stack_sub.iloc[:,1]

In [120]:
# CatBoost seed-1983 test predictions, read from a file written by run 20211005_205933.
# NOTE(review): this is not the run name shown in this notebook's other outputs
# (20211011_092728). Confirm the older file is the intended one.
cat1983_preds = load(predpath/'stacking_manual_20211005_205933_catboost_5folds_rs1983_test_preds.joblib')

In [121]:
# cat_preds[:10]

In [122]:
# stack_sub.iloc[:10,1]

In [123]:
# LightGBM seed-1983 test predictions, read from a file written by run 20211005_205933.
# NOTE(review): this is not the run name shown in this notebook's other outputs
# (20211011_092728). Confirm the older file is the intended one.
lgb1983_preds = load(predpath/'stacking_manual_20211005_205933_lightgbm_5folds_rs1983_test_preds.joblib')

In [124]:
lgb1983_preds[:10]

array([0.71627935, 0.22722624, 0.90767775, 0.81255748, 0.28256192,
       0.19362319, 0.02520114, 0.32767655, 0.95997145, 0.83308257])

## 4th power

In [125]:
# Power average, exponent 4: raise each prediction vector to the 4th power and take the
# mean of the three. Terms are added in the order stack, cat1983, lgb1983.
power4_avg = sum(
    preds**4 for preds in (stack_preds, cat1983_preds, lgb1983_preds)
) / 3

In [126]:
power4_avg[:20]

0     0.276041
1     0.003593
2     0.708461
3     0.484591
4     0.007221
5     0.002113
6     0.000001
7     0.011919
8     0.867742
9     0.510930
10    0.670985
11    0.111903
12    0.000256
13    0.021074
14    0.026674
15    0.005284
16    0.007638
17    0.063980
18    0.000042
19    0.850564
Name: target, dtype: float64

In [127]:
# Reload a fresh submission template for the 4th-power blend.
sample_df = pd.read_csv(datapath/'sample_submission.csv.zip')

In [128]:
# Fill the template with the 4th-power average.
sample_df.loc[:, 'target'] = power4_avg

In [129]:
sample_df.head()

Unnamed: 0,id,target
0,1000000,0.276041
1,1000001,0.003593
2,1000002,0.708461
3,1000003,0.484591
4,1000004,0.007221


In [130]:
# submission_path = datapath/'submissions'
# submission_path.mkdir(exist_ok=True)

  and should_run_async(code)


In [131]:
# Write the 4th-power blend (stack + cat1983 + lgb1983) to subpath.
sample_df.to_csv(subpath/f"{wandb_config['name']}_3level-GBM-stack_cat1983_lgb1983_pow-avg4_ensemble_preds.csv", index=False)

## 4th power with DL

In [156]:
type(oof_lv1_tabmlp1983), type(cat1983_preds)

(list, numpy.ndarray)

In [157]:
cat1983_preds.shape

(500000,)

In [158]:
# NOTE(review): this array is built from the out-of-fold list (oof_lv1_tabmlp1983), not
# the test predictions. It is deleted in the next cell, and the blend below uses
# test_lv1_tabmlp1983 instead, so this looks like leftover scratch work.
tabmlp1983_preds = np.array(oof_lv1_tabmlp1983)

In [160]:
# Drop the array created in the previous cell; nothing later in this section reads it.
del tabmlp1983_preds

In [161]:
# 4th-power average extended with the TabMLP seed-1983 test predictions (a list, hence the
# array conversion). Terms are added in the order stack, cat1983, lgb1983, TabMLP, and the
# sum is divided by 4.
power4_avg_dl = sum(
    preds**4
    for preds in (
        stack_preds,
        cat1983_preds,
        lgb1983_preds,
        np.array(test_lv1_tabmlp1983),
    )
) / 4
power4_avg_dl[:20]

0     0.256697
1     0.003540
2     0.669486
3     0.480682
4     0.006784
5     0.002043
6     0.000002
7     0.011605
8     0.806730
9     0.539848
10    0.633282
11    0.115899
12    0.000384
13    0.018725
14    0.023493
15    0.004069
16    0.007619
17    0.059981
18    0.000093
19    0.813301
Name: target, dtype: float64

In [162]:
# Reload a fresh submission template for the 4th-power blend that includes TabMLP.
sample_df = pd.read_csv(datapath/'sample_submission.csv.zip')

In [163]:
# Fill the template with the 4th-power average that includes TabMLP.
sample_df.loc[:, 'target'] = power4_avg_dl

  and should_run_async(code)


In [164]:
sample_df.head()

Unnamed: 0,id,target
0,1000000,0.256697
1,1000001,0.00354
2,1000002,0.669486
3,1000003,0.480682
4,1000004,0.006784


In [130]:
# submission_path = datapath/'submissions'
# submission_path.mkdir(exist_ok=True)

  and should_run_async(code)


In [165]:
# Write the 4th-power blend with TabMLP (stack + cat1983 + lgb1983 + TabMLP1983) to subpath.
sample_df.to_csv(subpath/f"{wandb_config['name']}_3level-GBM-stack+TabMLP-at-lv1_cat1983_lgb1983_TabMLP1983_pow-avg4_ensemble_preds.csv", index=False)

  and should_run_async(code)


## 6th power

In [132]:
# Power average, exponent 6: mean of the three prediction vectors raised to the 6th power.
# Terms are added in the order stack, cat1983, lgb1983.
power6_avg = sum(
    preds**6 for preds in (stack_preds, cat1983_preds, lgb1983_preds)
) / 3

In [133]:
power6_avg[:20]

0     1.453223e-01
1     2.216024e-04
2     5.970323e-01
3     3.379802e-01
4     6.333571e-04
5     1.000517e-04
6     1.893294e-09
7     1.307097e-03
8     8.083981e-01
9     3.654518e-01
10    5.509207e-01
11    3.779948e-02
12    4.286530e-06
13    3.088500e-03
14    4.443441e-03
15    3.922523e-04
16    6.697587e-04
17    1.627795e-02
18    3.187391e-07
19    7.846978e-01
Name: target, dtype: float64

In [134]:
# Reload a fresh submission template for the 6th-power blend.
sample_df = pd.read_csv(datapath/'sample_submission.csv.zip')

In [135]:
# Fill the template with the 6th-power average.
sample_df.loc[:, 'target'] = power6_avg

In [136]:
sample_df.head()

Unnamed: 0,id,target
0,1000000,0.145322
1,1000001,0.000222
2,1000002,0.597032
3,1000003,0.33798
4,1000004,0.000633


In [137]:
# submission_path = datapath/'submissions'
# submission_path.mkdir(exist_ok=True)

  and should_run_async(code)


In [138]:
# Write the 6th-power blend (stack + cat1983 + lgb1983) to subpath.
sample_df.to_csv(subpath/f"{wandb_config['name']}_3level-GBM-stack_cat1983_lgb1983_pow-avg6_ensemble_preds.csv", index=False)

## 5th power

In [139]:
# Power average, exponent 5: mean of the three prediction vectors raised to the 5th power.
# Terms are added in the order stack, cat1983, lgb1983.
power5_avg = sum(
    preds**5 for preds in (stack_preds, cat1983_preds, lgb1983_preds)
) / 3

In [140]:
# power5_avg[:20]

In [141]:
# Reload a fresh submission template for the 5th-power blend.
sample_df = pd.read_csv(datapath/'sample_submission.csv.zip')

In [142]:
# Fill the template with the 5th-power average.
sample_df.loc[:, 'target'] = power5_avg

In [143]:
sample_df.head()

Unnamed: 0,id,target
0,1000000,0.200253
1,1000001,0.00089
2,1000002,0.650299
3,1000003,0.404638
4,1000004,0.002133


In [144]:
# submission_path = datapath/'submissions'
# submission_path.mkdir(exist_ok=True)

  and should_run_async(code)


In [145]:
# Write the 5th-power blend (stack + cat1983 + lgb1983) to subpath.
sample_df.to_csv(subpath/f"{wandb_config['name']}_3level-GBM-stack_cat1983_lgb1983_pow-avg5_ensemble_preds.csv", index=False)

## 3rd power

In [146]:
# Power average, exponent 3: mean of the three prediction vectors raised to the 3rd power.
# Terms are added in the order stack, cat1983, lgb1983.
power3_avg = sum(
    preds**3 for preds in (stack_preds, cat1983_preds, lgb1983_preds)
) / 3

In [147]:
# power3_avg[:20]

In [148]:
# Reload a fresh submission template for the 3rd-power blend.
sample_df = pd.read_csv(datapath/'sample_submission.csv.zip')

In [149]:
# Fill the template with the 3rd-power average.
sample_df.loc[:, 'target'] = power3_avg

In [150]:
sample_df.head()

Unnamed: 0,id,target
0,1000000,0.38064
1,1000001,0.014572
2,1000002,0.771982
3,1000003,0.580526
4,1000004,0.024578


In [151]:
# submission_path = datapath/'submissions'
# submission_path.mkdir(exist_ok=True)

  and should_run_async(code)


In [152]:
# Write the 3rd-power blend (stack + cat1983 + lgb1983) to subpath.
sample_df.to_csv(subpath/f"{wandb_config['name']}_3level-GBM-stack_cat1983_lgb1983_pow-avg3_ensemble_preds.csv", index=False)