# GBM Trainer

# Setup

20211202

In [1]:
# notebook configuration
COLAB = False # will trigger manual installation of packages
SAGE = False # if notebook will be used on Amazon SageMaker
USE_GPU = True 
%config Completer.use_jedi = False

## Imports

In [2]:
if SAGE:
    !pip install --upgrade sklearn
    !pip install --upgrade wandb
    !pip install --upgrade catboost
    !pip install --upgrade lightgbm
    !pip install --upgrade xgboost
    !pip install --upgrade optuna

In [3]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime
import random
import gc

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


import requests # for telegram notifications
from tqdm.notebook import tqdm

from joblib import dump, load

Now, non-stdlib imports

In [4]:
# model selection
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold

# metrics
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, log_loss, f1_score, fbeta_score

# eda
import missingno
import doubtlab 

# data cleaning
# from sklearn.impute import SimpleImputer #, KNNImputer
import cleanlab

# normalization
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, QuantileTransformer
# from gauss_rank_scaler import GaussRankScaler

# feature generation
from sklearn.preprocessing import PolynomialFeatures
import category_encoders as ce

# models
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import StackingClassifier, RandomForestClassifier

# feature reduction
from sklearn.decomposition import PCA
from umap import UMAP

# clustering
from sklearn.cluster import DBSCAN, KMeans
import hdbscan

# feature selection
# from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
# import featuretools as ft
# from BorutaShap import BorutaShap
# from boruta import BorutaPy

# tracking 
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
os.environ['WANDB_NOTEBOOK_NAME'] = f"gbms_{datetime.now().strftime('%Y%m%d')}.ipynb"

In [5]:
# deep learning
import torch
from torch.optim import Adam, AdamW, Adagrad, SGD, RMSprop, LBFGS
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts, CyclicLR, OneCycleLR, StepLR, CosineAnnealingLR

# widedeep
from pytorch_widedeep import Trainer
from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.models import Wide, TabMlp, WideDeep, SAINT#, TabTransformer, TabNet, TabFastFormer, TabResnet
from pytorch_widedeep.metrics import Accuracy
from pytorch_widedeep.callbacks import EarlyStopping, LRHistory, ModelCheckpoint

## Routing

Now, datapath setup

In [6]:
if COLAB:
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    
    # handling datapath
    # datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/dec2021/')
    
else:
    # if on local machine
    root = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/dec2021/')
    datapath = root/'datasets'
    # edapath = root/'EDA'
    # modelpath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/oct2021/models/')
    predpath = root/'preds'
    subpath = root/'submissions'
    studypath = root/'studies'
    
    for pth in [datapath, predpath, subpath, studypath]:
        pth.mkdir(exist_ok=True)

  and should_run_async(code)


## Helpers

In [7]:
SEED = 42

# Function to seed everything but the models
def seed_everything(seed, pytorch=True, reproducible=True):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    if pytorch:
        torch.manual_seed(seed) # set torch CPU seed
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed) # set torch GPU(s) seed(s)
        if reproducible and torch.backends.cudnn.is_available():
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False

seed_everything(seed=SEED)

In [8]:
def reduce_memory_usage(df, verbose=True):
    """
    Function to reduce memory usage by downcasting datatypes in a Pandas DataFrame when possible.
    
    h/t to Bryan Arnold (https://www.kaggle.com/puremath86/label-correction-experiments-tps-nov-21)
    """
    
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if (
                    c_min > np.finfo(np.float16).min
                    and c_max < np.finfo(np.float16).max
                ):
                    df[col] = df[col].astype(np.float16)
                elif (
                    c_min > np.finfo(np.float32).min
                    and c_max < np.finfo(np.float32).max
                ):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

In [9]:
tg_api_token = 'your_api_token' # for Galileo (jupyter_watcher_bot) on Telegram
tg_chat_id = 'your_chat_id'

import requests

def send_tg_message(text='Cell execution completed.'):  
    """
    h/t Ivan Dembicki Jr. for the base version 
    (https://medium.com/@ivan.dembicki.jr/notifications-in-jupyter-notebook-with-telegram-f2e892c55173)
    """
    requests.post('https://api.telegram.org/' +  'bot{}/sendMessage'.format(tg_api_token),
                  params=dict(chat_id=tg_chat_id, text=text))

## Dataset Setup

In [10]:
# dataset_params will initially include either trivial class instances or loaded, precomputed artifacts
dataset_params = {
    'train_source': str(datapath/'train.csv'),
    'target_source': str(datapath/'train.csv'),
    'test_source': str(datapath/'test.csv'),
    # 'scaler': str(RobustScaler()),
    # 'pca': str(load(datapath/'pca_mle-RobustScaled_orig_trainset.joblib')),
    # 'umap': str(load(datapath/'umap_reducer-20211107-n_comp10-n_neighbors15-rs42-pca_mle-RobustScaled_orig_trainset.joblib')),
}   

# referring back to the already-entered attributes, specify how the pipeline was sequenced
# dataset_params['preprocessing_pipeline'] = str([dataset_params['scaler'], dataset_params['pca'], dataset_params['umap']]) # ACTUALLY this is unwieldy
# dataset_params['preprocessing_pipeline'] = '[scaler, pca, umap]' # more fragile, but also more readable

# now, load the datasets and generate more metadata from them
# X = load(dataset_params['train_source'])
# X = pd.read_feather(dataset_params['train_source'])
# y = load(dataset_params['target_source'])
# X_test = load(dataset_params['test_source'])
# X_test = pd.read_feather(dataset_params['test_source'])

train = pd.read_csv(dataset_params['train_source'])
test = pd.read_csv(dataset_params['test_source'])

In [11]:
train.columns

Index(['Id', 'Elevation', 'Aspect', 'Slope',
       'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
       'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
       'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points',
       'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3',
       'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3',
       'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8',
       'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12',
       'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16',
       'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20',
       'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24',
       'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28',
       'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32',
       'Soil_Type33', 'Soil_Type34', 'Soil_Type35', 'Soil_Type36',
       'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_

In [12]:
test.columns

Index(['Id', 'Elevation', 'Aspect', 'Slope',
       'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
       'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
       'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points',
       'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3',
       'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3',
       'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8',
       'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12',
       'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16',
       'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20',
       'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24',
       'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28',
       'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32',
       'Soil_Type33', 'Soil_Type34', 'Soil_Type35', 'Soil_Type36',
       'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_

In [13]:
y = train['Cover_Type']
X = train.iloc[:,1:-1]
X_test = test.iloc[:,1:]

In [14]:
del train, test
gc.collect()

162

In [15]:
# reduce memory usage
# X = reduce_memory_usage(X)
# X_test = reduce_memory_usage(X)

# metadata logging
dataset_params['feature_count'] = X.shape[1]
dataset_params['instance_count'] = X.shape[0]
    

## Feature Engineering

First, going to try some of the basic tweaks suggested [here](https://www.kaggle.com/c/tabular-playground-series-dec-2021/discussion/293612).

In [16]:
# # remove unuseful features
# if 'Soil_Type7' in X.columns:
#     X = X.drop([ 'Soil_Type7', 'Soil_Type15'], axis=1)
#     X_test = X_test.drop(['Soil_Type7', 'Soil_Type15'], axis=1)

In [17]:
# extra feature engineering
def r(x):
    if x+180>360:
        return x-180
    else:
        return x+180

In [18]:
def fe(df):
    df['EHiElv'] = df['Horizontal_Distance_To_Roadways'] * df['Elevation']
    df['EViElv'] = df['Vertical_Distance_To_Hydrology'] * df['Elevation']
    df['Aspect2'] = df.Aspect.map(r)
    ### source: https://www.kaggle.com/c/tabular-playground-series-dec-2021/discussion/293373
    df["Aspect"][df["Aspect"] < 0] += 360
    df["Aspect"][df["Aspect"] > 359] -= 360
    df.loc[df["Hillshade_9am"] < 0, "Hillshade_9am"] = 0
    df.loc[df["Hillshade_Noon"] < 0, "Hillshade_Noon"] = 0
    df.loc[df["Hillshade_3pm"] < 0, "Hillshade_3pm"] = 0
    df.loc[df["Hillshade_9am"] > 255, "Hillshade_9am"] = 255
    df.loc[df["Hillshade_Noon"] > 255, "Hillshade_Noon"] = 255
    df.loc[df["Hillshade_3pm"] > 255, "Hillshade_3pm"] = 255
    ########
    df['Highwater'] = (df.Vertical_Distance_To_Hydrology < 0).astype(int)
    df['EVDtH'] = df.Elevation - df.Vertical_Distance_To_Hydrology
    df['EHDtH'] = df.Elevation - df.Horizontal_Distance_To_Hydrology * 0.2
    df['Euclidean_Distance_to_Hydrology'] = (df['Horizontal_Distance_To_Hydrology']**2 + df['Vertical_Distance_To_Hydrology']**2)**0.5
    df['Manhattan_Distance_to_Hydrology'] = df['Horizontal_Distance_To_Hydrology'] + df['Vertical_Distance_To_Hydrology']
    df['Hydro_Fire_1'] = df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Fire_Points']
    df['Hydro_Fire_2'] = abs(df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Fire_Points'])
    df['Hydro_Road_1'] = abs(df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Roadways'])
    df['Hydro_Road_2'] = abs(df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Roadways'])
    df['Fire_Road_1'] = abs(df['Horizontal_Distance_To_Fire_Points'] + df['Horizontal_Distance_To_Roadways'])
    df['Fire_Road_2'] = abs(df['Horizontal_Distance_To_Fire_Points'] - df['Horizontal_Distance_To_Roadways'])
    df['Hillshade_3pm_is_zero'] = (df.Hillshade_3pm == 0).astype(int)
    return df

In [19]:
X = fe(X)
X_test = fe(X_test)

In [20]:
# Summed features pointed out by @craigmthomas (https://www.kaggle.com/c/tabular-playground-series-dec-2021/discussion/292823)
soil_features = [x for x in X.columns if x.startswith("Soil_Type")]
wilderness_features = [x for x in X.columns if x.startswith("Wilderness_Area")]

X["soil_type_count"] = X[soil_features].sum(axis=1)
X_test["soil_type_count"] = X_test[soil_features].sum(axis=1)

X["wilderness_area_count"] = X[wilderness_features].sum(axis=1)
X_test["wilderness_area_count"] = X_test[wilderness_features].sum(axis=1)

  and should_run_async(code)


In [21]:
X.columns

Index(['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1',
       'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4',
       'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5',
       'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10',
       'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14',
       'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18',
       'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22',
       'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26',
       'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30',
       'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34',
       'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38',
       'Soil_Type39', 'Soil_Type40

### Encoding the `Soil_Type` Features
Going to try reducing the dimensionality (while minimizing information loss) by reducing the `Soil_Type*` features to five 8-bit integers -- concatenate the binaries together and then convert them to decimal integer representations.

h/t Craig Thomas (https://www.kaggle.com/craigmthomas/tps-dec-2021-40-bit-soil-type-conversion)

In [22]:
X["soiltype_label"] = 0
X_test["soiltype_label"] = 0

X["soiltype_label"] = X["soiltype_label"].astype(np.int64)
X_test["soiltype_label"] = X_test["soiltype_label"].astype(np.int64)

soil_columns = [x for x in X.columns if x.startswith("Soil_Type")]



In [23]:
# X[soil_columns] = X[soil_columns].astype(np.int64)

In [24]:
X['Soil_Type3'].dtype

dtype('int64')

In [25]:
X['Soil_Type1'].dtype

dtype('int64')

In [26]:
X.iloc[0,:]['Soil_Type1']

0.0

In [27]:
type(X.iloc[0,:]['Soil_Type1'])

numpy.float64

In [28]:
X['Soil_Type1'].apply(int)

0          0
1          0
2          0
3          0
4          0
          ..
3999995    0
3999996    0
3999997    0
3999998    0
3999999    0
Name: Soil_Type1, Length: 4000000, dtype: int64

In [29]:
type(X.iloc[0,:]['Soil_Type1'])

numpy.float64

In [30]:
def make_40_bit_int_from_soiltype(row):
    value = 0
    # value = np.int64(value)
    for column in soil_columns:
        # print(iif"column is {column}")
        # print(f"value type is {type(value)}")
        # print(f"row[column] is {row[column]} with type {type(row[column])}")
        value |= int(row[column])
        value = value << 1
    return value



X["soiltype_label"] = X.apply(make_40_bit_int_from_soiltype, axis=1)
print(": Number of unique labels: {:,d}".format(X["soiltype_label"].nunique()))
#: Number of unique labels: 24,438

: Number of unique labels: 24,438


In [31]:
X_test["soiltype_label"] = X_test.apply(make_40_bit_int_from_soiltype, axis=1)
print(": Number of unique labels: {:,d}".format(X_test["soiltype_label"].nunique()))


: Number of unique labels: 16,111


In [32]:
def make_5_8_bit_ints_from_soiltype(row):
    integer1 = (np.int64(row["soiltype_label"]) & 0xFF00000000) >> 30
    integer2 = (np.int64(row["soiltype_label"]) & 0x00FF000000) >> 24
    integer3 = (np.int64(row["soiltype_label"]) & 0x0000FF0000) >> 16
    integer4 = (np.int64(row["soiltype_label"]) & 0x000000FF00) >> 8
    integer5 = (np.int64(row["soiltype_label"]) & 0x00000000FF)
    return integer1, integer2, integer3, integer4, integer5

In [33]:
X[["soiltype_int1", "soiltype_int2", "soiltype_int3", "soiltype_int4", "soiltype_int5"]] = X.apply(make_5_8_bit_ints_from_soiltype, axis=1, result_type="expand")
X_test[["soiltype_int1", "soiltype_int2", "soiltype_int3", "soiltype_int4", "soiltype_int5"]] = X_test.apply(make_5_8_bit_ints_from_soiltype, axis=1, result_type="expand")

In [34]:
X.columns

Index(['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1',
       'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4',
       'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5',
       'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10',
       'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14',
       'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18',
       'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22',
       'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26',
       'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30',
       'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34',
       'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38',
       'Soil_Type39', 'Soil_Type40

In [35]:
X_test.columns

Index(['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1',
       'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4',
       'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5',
       'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10',
       'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14',
       'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18',
       'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22',
       'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26',
       'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30',
       'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34',
       'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38',
       'Soil_Type39', 'Soil_Type40

In [36]:
X = X.drop([f'Soil_Type{x}' for x in range(1,41)], axis=1)

In [37]:
X_test = X_test.drop([f'Soil_Type{x}' for x in range(1,41)], axis=1)

In [38]:
X.columns

Index(['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1',
       'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'EHiElv',
       'EViElv', 'Aspect2', 'Highwater', 'EVDtH', 'EHDtH',
       'Euclidean_Distance_to_Hydrology', 'Manhattan_Distance_to_Hydrology',
       'Hydro_Fire_1', 'Hydro_Fire_2', 'Hydro_Road_1', 'Hydro_Road_2',
       'Fire_Road_1', 'Fire_Road_2', 'Hillshade_3pm_is_zero',
       'soil_type_count', 'wilderness_area_count', 'soiltype_label',
       'soiltype_int1', 'soiltype_int2', 'soiltype_int3', 'soiltype_int4',
       'soiltype_int5'],
      dtype='object')

## Dataset Parameters

Initialized above, but now records of feature engineering efforts included.

In [39]:
dataset_params['feature_count'] = X.shape[1]
dataset_params['instance_count'] = X.shape[0]

# might eventually shift from dict to tuple

# simplest approach: k-v where key is new feature, v is string with the operation to get it
# sacrifices sortability, but could recover that through regexes, and it's much quicker to input
dataset_params['feature_combinations'] = {
    'EHiElv': "df['Horizontal_Distance_To_Roadways'] * df['Elevation']",
    'EViElv': "df['Vertical_Distance_To_Hydrology'] * df['Elevation']",
    'EVDtH': "df.Elevation - df.Vertical_Distance_To_Hydrology",
    'EHDtH': "df.Elevation - df.Horizontal_Distance_To_Hydrology * 0.2",
    'Euclidean_Distance_to_Hydrology': "(df['Horizontal_Distance_To_Hydrology']**2 + df['Vertical_Distance_To_Hydrology']**2)**0.5",
    'Manhattan_Distance_to_Hydrology': "df['Horizontal_Distance_To_Hydrology'] + df['Vertical_Distance_To_Hydrology']",
    'Hydro_Fire_1': "df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Fire_Points']",
    'Hydro_Fire_2': "abs(df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Fire_Points'])",
    'Hydro_Road_1': "abs(df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Roadways'])",
    'Hydro_Road_2': "abs(df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Roadways'])",
    'Fire_Road_1': "abs(df['Horizontal_Distance_To_Fire_Points'] + df['Horizontal_Distance_To_Roadways'])",
    'Fire_Road_2': "abs(df['Horizontal_Distance_To_Fire_Points'] - df['Horizontal_Distance_To_Roadways'])"
}

dataset_params['feature_clipping'] = [
    {
        'features': ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm'],
        'range': range(0,256)
    },
    {
        'features': ['Aspect'],
        'range': range(0,360)
    }
]

# the features that are just getting the one-hots counted
dataset_params['feature_counts'] = ['Soil_Type*', 'Wilderness_Area*']
dataset_params['feature_complements'] = [
    {
        'old': 'Aspect', 
        'new': 'Aspect2',
        'operation': 'If x < 180 return x-180, else return x + 180'
    },
]

dataset_params['feature_indicators'] = {
    'Hillshade_3pm_is_zero': "(df.Hillshade_3pm == 0).astype(int)",
}

dataset_params['feature_typecasting'] = {
    'Highwater': "(df.Vertical_Distance_To_Hydrology < 0).astype(int)"
}

dataset_params['feature_encodings'] = "Soil_Type* features concatenated into single 40-bit integers and then five 8-bit integers, and finally to five decimals; see gbms_20211223.ipynb and the section 'Encoding the `Soil_Type` Features'."
dataset_params['feature_removals'] = "Soil_Type* features removed after being encoded"


In [40]:
X.to_feather(datapath/'X_202112231058.feather')
X_test.to_feather(datapath/'X_test_202112231058.feather')

  and should_run_async(code)


In [41]:
dump(dataset_params, datapath/'meta_202112231058.joblib')

['/media/sf/easystore/kaggle_data/tabular_playgrounds/dec2021/datasets/meta_202112231058.joblib']

In [42]:
dataset_params

  and should_run_async(code)


{'train_source': '/media/sf/easystore/kaggle_data/tabular_playgrounds/dec2021/datasets/train.csv',
 'target_source': '/media/sf/easystore/kaggle_data/tabular_playgrounds/dec2021/datasets/train.csv',
 'test_source': '/media/sf/easystore/kaggle_data/tabular_playgrounds/dec2021/datasets/test.csv',
 'feature_count': 37,
 'instance_count': 4000000,
 'feature_combinations': {'EHiElv': "df['Horizontal_Distance_To_Roadways'] * df['Elevation']",
  'EViElv': "df['Vertical_Distance_To_Hydrology'] * df['Elevation']",
  'EVDtH': 'df.Elevation - df.Vertical_Distance_To_Hydrology',
  'EHDtH': 'df.Elevation - df.Horizontal_Distance_To_Hydrology * 0.2',
  'Euclidean_Distance_to_Hydrology': "(df['Horizontal_Distance_To_Hydrology']**2 + df['Vertical_Distance_To_Hydrology']**2)**0.5",
  'Manhattan_Distance_to_Hydrology': "df['Horizontal_Distance_To_Hydrology'] + df['Vertical_Distance_To_Hydrology']",
  'Hydro_Fire_1': "df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Fire_Points']",
  '

# Model Parameters

In [18]:
# # optuna 20211124, with corrected dataset and RobustScaler
# best_xgboost_params = {
#     'n_estimators': 9872,
#     'max_depth': 3,
#     'learning_rate': 0.12943882615104757,
#     'reg_alpha': 4.793236314677738,
#     'reg_lambda': 0.03427038053813167,
#     'subsample': 0.5026684329097286,
#     'min_child_weight': 3.2374430610042664,
#     'colsample_bytree': 0.9875504456465564,
#     'gamma': 4.691772640321729
# }

# # best as of 20211125, with corrected dataset and RobustScaler
# best_lightgbm_params = {
#     'n_estimators': 6986,
#     'max_depth': 3,
#     'learning_rate': 0.09080435106650955,
#     'reg_alpha': 19.060739534647425,
#     'reg_lambda': 0.12865332700612375,
#     'subsample': 0.5612404690403716,
#     'boosting_type': 'goss',
#     'min_child_samples': 17,
#     'num_leaves': 59,
#     'colsample_bytree': 0.5125554530181221
# }

# # best as of 20211126, with corrected dataset and RobustScaler
# best_catboost_params = {
#     'iterations': 17997,
#     'depth': 4,
#     'learning_rate': 0.05807421036756052,
#     'random_strength': 27,
#     'od_wait': 1664,
#     'reg_lambda': 57.67864249277457,
#     'border_count': 275,
#     'min_child_samples': 10,
#     'leaf_estimation_iterations': 2
# }

# # # 20211021 lv2 on the K-Means 8-cluster, synth dataset
# # lv2_xgboost_params = {
# #     'n_estimators': 1534,
# #     'max_depth': 4,
# #     'learning_rate': 0.0062941159127744535,
# #     'reg_alpha': 21.3946930650266,
# #     'reg_lambda': 0.021003786013817635,
# #     'subsample': 0.5726680367393964,
# #     'min_child_weight': 0.07566661785187714,
# #     'colsample_bytree': 0.7850419523745037,
# #     'gamma': 4.26660233356059
# # }

# # # 20211021 lv2 on the K-Means 8-cluster, synth dataset
# # lv2_lightgbm_params = {
# #     'n_estimators': 5776,
# #     'max_depth': 4,
# #     'learning_rate': 0.0010172282832994653,
# #     'reg_alpha': 0.013879765609402173,
# #     'reg_lambda': 0.002787031048344079,
# #     'subsample': 0.800000753298926,
# #     'boosting_type': 'gbdt',
# #     'min_child_samples': 11,
# #     'num_leaves': 190,
# #     'colsample_bytree': 0.9976443570341007
# # }

# # # 20211021 lv2 on the K-Means 8-cluster, synth dataset
# # lv2_catboost_params = {
# #     'iterations': 2000,
# #     'depth': 6,
# #     'learning_rate': 0.002984126581340097,
# #     'random_strength': 0,
# #     'od_wait': 334,
# #     'reg_lambda': 33.469738674488084,
# #     'border_count': 158,
# #     'min_child_samples': 8,
# #     'leaf_estimation_iterations': 4
# # }

# # # initial, non-default guess -- need to get optuna working (20211010)
# # # basic_widedeep_tabmlp_params = {
    
# # # }

# # # basic_widedeep_trainer_params = {
# # #     optimizers=AdamW()
# # # }

In [19]:
from lightgbm.basic import LightGBMError

## Training Parameters

In [43]:
training_params = {
    'general_random_state': SEED,
}

folds = 5
training_params['cross_val_strategy'] = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)

## Metadata

In [44]:
# baseline -- alter as needed later
exmodel_config = {
#     'general_random_state': SEED,
# #     'feature_generation': ['NaN_counts', 'SummaryStats', 'NaN_OneHots'],
#     'cross_val_strategy': KFold, 
#     'kfolds': 5, # if 1, that means just doing holdout
#     'test_size': 0.2,
    **training_params,
    **dataset_params
}

## Dataset Setup

## Weights and Biases Run Config

Below is the configuration for a Weights and Biases (`wandb`) run.

In [45]:
# wandb config:
wandb_config = {
    'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
    'tags': ['experiment'],
    'notes': "Trying the encoded Soil_Type* features with the other feature engineering measures (see 'meta_202112231058.joblib') on vanilla GBM models for comparison"
}

# Cross-Validator


In [46]:
training_params['cross_val_strategy']

StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

In [47]:
def cross_validate_model(arch:str, X=X, y=y, X_test=X_test, model_params:dict={}, training_params=training_params, dataset_params=dataset_params,
                         folds=list(range(folds)), exmodel_config=exmodel_config, wandb_config=wandb_config,  telegram=True, random_state=42, 
                         wandb_tracked=True, encode_cats=False):
    """
    Function to handle model training process in the context of cross-validation -- via hold-out or via k-fold.
    If exmodel_config['cross_val_strategy'] == None, then any kfolds= input is ignored; otherwise, the number specified is used.
    
    :param kfolds: int specifying number of k-folds to use in cross-validation
    :param exmodel_config: dict containing general config including for cross-validation -- `kfold=1` implies hold-out
    """
    # if exmodel_config['kfolds'] == 1: # holdout case
    #     print("Proceeding with holdout")
    #     X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, 
    #                                                           random_state=SEED)                 
    # else: # k-fold cross validation case
    #     # prepare for k-fold cross-validation; random-state here is notebook-wide, not per-model
    #     # shuffle on the initial sets, but not subsequently -- performing the same operation twice means a very different dataset
    #     if shuffle_kfolds:
    #         kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=True, random_state=SEED)
    #     else:
    #         kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=False)
    
    kfold = training_params['cross_val_strategy']
    
    if wandb_tracked:
        exmodel_config['arch'] = arch
        exmodel_config[f'{arch}_params'] = str(model_params)
        wandb.init(
            project="202112_Kaggle_tabular_playground",
            save_code=True,
            tags=wandb_config['tags'],
            name=wandb_config['name'],
            notes=wandb_config['notes'],
            config=exmodel_config
    )   
    
    # initialize lists for out-of-fold preds and ground truth
    oof_preds, oof_y = [], []
    
    # initialize a numpy.ndarray containing the fold-model's preds for test set
    
    test_preds = np.zeros((X_test.shape[0]))
    # test_probs = np.zeros((X_test.shape[0]))
    # preprocessing
    # if using a GBM, simply use the RobustScaler
        # scaler = RobustScaler()
        # X = scaler.fit_transform(X)
        # X_test = scaler.transform(X_test)
    
    for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
        if fold not in folds: # skip folds that are already trained, i.e. that haven't been specified
            continue
        else:
            print(f"FOLD {fold}")
            print("---------------------------------------------------")
            y_train, y_valid = y[train_ids], y[valid_ids] # y will be an np.ndarray already; handling will be same regardless of model
            if isinstance(X, np.ndarray):
                X_train, X_valid = X[train_ids], X[valid_ids]
            else:
                X_train, X_valid = X.iloc[train_ids,:], X.iloc[valid_ids,:] # bc need pandas.DataFrames for ce
                
                # scaling
                # category_encoding
                # if encode_cats:
                #     encoder = ce.WOEEncoder(cols=categoricals)
                #     encoder.fit(X_train,y_train)
                #     X_train = encoder.transform(X_train)
                #     X_valid = encoder.transform(X_valid)
                # # exmodel_config['feature_count'] = len(X.columns)
                #     wandb.log({
                #         'feature_count': X_train.shape[1],
                #         'instance_count': X_train.shape[0],
                #         'encoder': str(encoder)
                #     })
        
        # define models
        if arch == 'xgboost':
            model = XGBClassifier(
                booster='gbtree',
                tree_method='gpu_hist',
                random_state=random_state,
                n_jobs=-1, 
                verbosity=1, 
                objective='binary:logistic',
                **model_params)
            if wandb_tracked:
                model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()])
            else:
                model.fit(X_train, y_train)
            
            y_valid_preds = model.predict(X_valid)
            # y_valid_probs = model.predict_proba(X_valid)
            
            # add the fold-model's OOF preds and ground truths to the out-of-loop lists
            oof_preds.extend(y_valid_preds)
            # oof_probs.extend(y_valid_probs)
            oof_y.extend(y_valid)
            
            # add the fold's predictions to the model's test-set predictions (will divide later)
            test_preds += model.predict(X_test)
            # test_probs += model.predict_proba(X_test)


        elif arch == 'lightgbm':
            # try:
            model = LGBMClassifier(
                objective='binary',
                random_state=random_state,
#                     device_type='cpu',
#                     n_jobs=-1,
#                 eval_metric='auc',
                device_type='gpu',
                max_bin=63, # 15 might be even better for GPU perf, but depends on dataset -- see https://lightgbm.readthedocs.io/en/latest/GPU-Performance.html
                gpu_use_dp=False, # forces use of single precision rather than double for better perf, esp on consumer Nvidia chips
                **model_params)

            if wandb_tracked:
                model.fit(X_train, y_train, callbacks=[wandb.lightgbm.wandb_callback()],)
            else:
                model.fit(X_train, y_train)
#             except LightGBMError:
#                 model = LGBMClassifier(
#                     objective='binary',
#                     random_state=random_state,
#                     device_type='cpu',
#                     n_jobs=-1,
#     #                 eval_metric='auc',
#     #                 device_type='gpu',
#     #                 max_bin=63, # 15 might be even better for GPU perf, but depends on dataset -- see https://lightgbm.readthedocs.io/en/latest/GPU-Performance.html
#     #                 gpu_use_dp=False, # forces use of single precision rather than double for better perf, esp on consumer Nvidia chips
#                     **params)
                
#                 if wandb_tracked:
#                     model.fit(X_train, y_train, callbacks=[wandb.lightgbm.wandb_callback()],)
#                 else:
#                     model.fit(X_train, y_train)
            y_valid_preds = model.predict(X_valid)
            # y_valid_probs = model.predict_proba(X_valid)
            
            # add the fold-model's OOF preds and ground truths to the out-of-loop lists
            oof_preds.extend(y_valid_preds)
            # oof_probs.extend(y_valid_probs)
            oof_y.extend(y_valid)
            
            # add the fold's predictions to the model's test-set predictions (will divide later)
            test_preds += model.predict(X_test)
            # test_probs += model.predict_proba(X_test)[:,1]

            
        elif arch == 'catboost':
            model = CatBoostClassifier(
                task_type='GPU',
                silent=True,
                random_state=random_state,
                **model_params) 
        
            model.fit(X_train, y_train)
            
            y_valid_preds = model.predict(X_valid)
            # y_valid_probs = model.predict_proba(X_valid)[:,1] # this would only take one of 7 cols
            
            # add the fold-model's OOF preds and ground truths to the out-of-loop lists
            oof_preds.extend(y_valid_preds)
            # oof_probs.extend(y_valid_probs)
            oof_y.extend(y_valid)
            
            # add the fold's predictions to the model's test-set predictions (will divide later)
            test_preds += model.predict(X_test).flatten()
            # test_probs += model.predict_proba(X_test)[:,1]
            
#         valid_loss = log_loss(y_valid, y_pred)
        # give the valid AUC score, for edification

        fold_accuracy = accuracy_score(y_true=y_valid, y_pred=y_valid_preds) # or should be preds?
        # fold_confusion = confusion_matrix(y_true=y_valid, y_pred=y_valid_preds)# , labels=list(range(7)))
        # fold_log_loss = log_loss(y_pred=y_valid_preds, y_true=y_valid,) #labels=list(range(7)))
        # fold_roc_auc = roc_auc_score(y_true=y_valid, y_score=y_valid_probs)
        # fold_f1_score = f1_score(
        # fold_fbeta_score = fbeta_score(
        
        if wandb_tracked:
            wandb.log({f'fold{fold}_accuracy': fold_accuracy,
                       # f'fold{fold}_confusion': fold_confusion,
                       # f'fold{fold}_log_loss': fold_log_loss,
                       # f'fold{fold}_roc_auc': fold_roc_auc,
                      })
        fold_human_results = f"{os.environ['WANDB_NOTEBOOK_NAME']}\nMetrics for fold {fold} are: \nAccuracy: {fold_accuracy}"
        print(fold_human_results)
        if telegram:
            send_tg_message(text=f"{arch} model's fold {fold} complete.\n"+fold_human_results)
        # dump(model, Path(runpath/f"{arch}_fold{fold}_rs{random_state}_model.joblib"))

    model_accuracy = accuracy_score(y_true=oof_y, y_pred=oof_preds) 
    # model_confusion = confusion_matrix(y_true=oof_y, y_pred=oof_preds, labels=list(range(7)))
    # model_log_loss = log_loss(y_pred=oof_preds, y_true=oof_y, labels=list(range(7)))
    # model_valid_auc = roc_auc_score(oof_y, oof_preds)
    model_human_results = f"{os.environ['WANDB_NOTEBOOK_NAME']}\nMetrics for model {arch} are: \nAccuracy: {model_accuracy}"
    print(model_human_results)
    if telegram:
        send_tg_message(text=f"{arch} model run complete.\n"+model_human_results)
    if wandb_tracked:
        wandb.log({f'model_accuracy': fold_accuracy,
                   # f'model_confusion': fold_confusion,
                   # f'model_log_loss': fold_log_loss,
                   # f'model_roc_auc': fold_roc_auc,
                   'model_params': str(model.get_params()),
                   'model_seed': random_state,
                  })
        wandb.finish()
    
    # finalize test preds
    # test_probs /= exmodel_config['kfolds']
    # test_preds /= exmodel_config['kfolds']
    
    
    # save OOF preds and test-set preds
#     if 'widedeep' in arch:
#         dump(oof_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_{n_epochs}epochs-per-fold_rs{random_state}_oof_preds.joblib"))
#         dump(test_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_{n_epochs}epochs-per-fold_rs{random_state}_test_preds.joblib"))
    
#     else:
#         dump(oof_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_rs{random_state}_oof_preds.joblib"))
#         dump(test_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_rs{random_state}_test_preds.joblib"))
    
    # if not (datapath/f"{exmodel_config['kfolds']}folds_rs{SEED}_oof_y.joblib").is_file():
    #     dump(oof_y, predpath/f"{exmodel_config['kfolds']}folds_rs{SEED}_oof_y.joblib")
    
#     if wandb_tracked:
# #         if 'widedeep' in arch:
#         wandb.log({'model_valid_auc': model_valid_auc,
# #                    'oof_preds': oof_preds,
# #                    'test_preds': test_preds,
#                    'model_params': str(model.parameters()) if 'widedeep' in arch else str(model.get_params()), 
#         #                    'model_params': str(model.get_params()),
#         })
# #         wandb.log({'model_valid_auc': model_valid_auc,
# #                    'oof_preds': oof_preds,
# #                    'test_preds': test_preds,
# # #                    'model_params': str(model.get_params()),
# #                   })
#         wandb.finish()
    return oof_preds, test_preds#, model_confusion
        

  and should_run_async(code)


# Interface

In [48]:
xgboost_oof_preds, xgboost_test_preds = cross_validate_model('xgboost', telegram=True)

[34m[1mwandb[0m: Currently logged in as: [33mhushifang[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.9 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade




FOLD 0
---------------------------------------------------




gbms_20211223.ipynb
Metrics for fold 0 are: 
Accuracy: 0.96209875
FOLD 1
---------------------------------------------------




gbms_20211223.ipynb
Metrics for fold 1 are: 
Accuracy: 0.96262
FOLD 2
---------------------------------------------------




gbms_20211223.ipynb
Metrics for fold 2 are: 
Accuracy: 0.96191875
FOLD 3
---------------------------------------------------




gbms_20211223.ipynb
Metrics for fold 3 are: 
Accuracy: 0.96230375
FOLD 4
---------------------------------------------------




gbms_20211223.ipynb
Metrics for fold 4 are: 
Accuracy: 0.96194375
gbms_20211223.ipynb
Metrics for model xgboost are: 
Accuracy: 0.962177


VBox(children=(Label(value=' 0.34MB of 0.34MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
fold0_accuracy,▁
fold1_accuracy,▁
fold2_accuracy,▁
fold3_accuracy,▁
fold4_accuracy,▁
model_accuracy,▁
model_seed,▁

0,1
fold0_accuracy,0.9621
fold1_accuracy,0.96262
fold2_accuracy,0.96192
fold3_accuracy,0.9623
fold4_accuracy,0.96194
model_accuracy,0.96194
model_params,{'objective': 'multi...
model_seed,42


In [49]:
lightgbm_oof_preds, lightgbm_test_preds = cross_validate_model('lightgbm', telegram=True)


  and should_run_async(code)
[34m[1mwandb[0m: wandb version 0.12.9 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade




FOLD 0
---------------------------------------------------
gbms_20211223.ipynb
Metrics for fold 0 are: 
Accuracy: 0.93337875
FOLD 1
---------------------------------------------------
gbms_20211223.ipynb
Metrics for fold 1 are: 
Accuracy: 0.9322075
FOLD 2
---------------------------------------------------
gbms_20211223.ipynb
Metrics for fold 2 are: 
Accuracy: 0.94882125
FOLD 3
---------------------------------------------------
gbms_20211223.ipynb
Metrics for fold 3 are: 
Accuracy: 0.95228625
FOLD 4
---------------------------------------------------
gbms_20211223.ipynb
Metrics for fold 4 are: 
Accuracy: 0.94496
gbms_20211223.ipynb
Metrics for model lightgbm are: 
Accuracy: 0.94233075


VBox(children=(Label(value=' 0.35MB of 0.35MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
fold0_accuracy,▁
fold1_accuracy,▁
fold2_accuracy,▁
fold3_accuracy,▁
fold4_accuracy,▁
model_accuracy,▁
model_seed,▁

0,1
fold0_accuracy,0.93338
fold1_accuracy,0.93221
fold2_accuracy,0.94882
fold3_accuracy,0.95229
fold4_accuracy,0.94496
model_accuracy,0.94496
model_params,{'boosting_type': 'g...
model_seed,42


In [50]:
catboost_oof_preds, catboost_test_preds = cross_validate_model('catboost', telegram=True)
# except:
    # send_tg_message(text=f"{os.environ['WANDB_NOTEBOOK_NAME']}\n{arch} model training crashed")

  and should_run_async(code)
[34m[1mwandb[0m: wandb version 0.12.9 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade




FOLD 0
---------------------------------------------------
gbms_20211223.ipynb
Metrics for fold 0 are: 
Accuracy: 0.9621775
FOLD 1
---------------------------------------------------
gbms_20211223.ipynb
Metrics for fold 1 are: 
Accuracy: 0.96228875
FOLD 2
---------------------------------------------------
gbms_20211223.ipynb
Metrics for fold 2 are: 
Accuracy: 0.96179875
FOLD 3
---------------------------------------------------
gbms_20211223.ipynb
Metrics for fold 3 are: 
Accuracy: 0.9620025
FOLD 4
---------------------------------------------------
gbms_20211223.ipynb
Metrics for fold 4 are: 
Accuracy: 0.96153875
gbms_20211223.ipynb
Metrics for model catboost are: 
Accuracy: 0.96196125


VBox(children=(Label(value=' 0.35MB of 0.35MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
fold0_accuracy,▁
fold1_accuracy,▁
fold2_accuracy,▁
fold3_accuracy,▁
fold4_accuracy,▁
model_accuracy,▁
model_seed,▁

0,1
fold0_accuracy,0.96218
fold1_accuracy,0.96229
fold2_accuracy,0.9618
fold3_accuracy,0.962
fold4_accuracy,0.96154
model_accuracy,0.96154
model_params,"{'silent': True, 'ta..."
model_seed,42


## Serialization

In [27]:
wrapper = {
    'metadata': {
        'dataset_params': dataset_params,
        'training_params': training_params,
        'model_params': 'defaults, all on GPU'
    },
    'preds': {
        'oof_preds': {
            'xgb42': xgboost_oof_preds,
            'lgb42': lightgbm_oof_preds,
            'cat42': catboost_oof_preds,
        },
        'test_preds': {
            'xgb42': xgboost_test_preds,
            'lgb42': lightgbm_test_preds,
            'cat42': catboost_test_preds
        }
    }
}

  and should_run_async(code)


In [28]:
dump(wrapper, predpath/'gbms_20211209-default_model_params__manual_feature_engineering__strat5fold.joblib')

  and should_run_async(code)


['/media/sf/easystore/kaggle_data/tabular_playgrounds/dec2021/preds/gbms_20211209-default_model_params__manual_feature_engineering__strat5fold.joblib']

In [29]:
check = load(predpath/'gbms_20211209-default_model_params__manual_feature_engineering__strat5fold.joblib')

In [30]:
oof_preds = pd.DataFrame({key: check['preds']['oof_preds'][key] for key in check['preds']['oof_preds'].keys()})

In [31]:
oof_preds.head()

Unnamed: 0,xgb42,lgb42,cat42
0,1,1,[1]
1,2,2,[2]
2,2,2,[2]
3,1,1,[1]
4,1,1,[1]


# CatBoost Debugging
The `catboost` model is having some issues with inference. Specifically, this:

```
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-17-70ed9541a9b0> in <module>
----> 1 catboost_oof_preds, catboost_test_preds = cross_validate_model('catboost', telegram=False)
      2 # except:
      3     # send_tg_message(text=f"{os.environ['WANDB_NOTEBOOK_NAME']}\n{arch} model training crashed")

<ipython-input-14-1fec85dde944> in cross_validate_model(arch, X, y, X_test, params, folds, exmodel_config, wandb_config, telegram, random_state, shuffle_kfolds, wandb_tracked, encode_cats)
    164 
    165             # add the fold's predictions to the model's test-set predictions (will divide later)
--> 166             test_preds += model.predict(X_test)
    167             # test_probs += model.predict_proba(X_test)[:,1]
    168 

ValueError: non-broadcastable output operand with shape (1000000,) doesn't match the broadcast shape (1000000,1000000)
```

In [15]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=SEED)        

In [16]:
model = CatBoostClassifier(
                task_type='GPU',
                silent=True,
                random_state=SEED,
                # **params
) 
        
model.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x7f56d15aebb0>

In [17]:
y_valid_preds = model.predict(X_valid)
# y_valid_probs = model.predict_proba(X_valid)[:,1] # this would only take one of 7 cols

# # add the fold-model's OOF preds and ground truths to the out-of-loop lists
# oof_preds.extend(y_valid_preds)
# # oof_probs.extend(y_valid_probs)
# oof_y.extend(y_valid)

In [18]:
y_valid_preds

array([[1],
       [2],
       [2],
       ...,
       [2],
       [1],
       [2]])

In [19]:
y_valid_preds.shape

(800000, 1)

In [20]:
# add the fold's predictions to the model's test-set predictions (will divide later)
test_preds = np.zeros(X_test.shape[0])

test_preds += model.predict(X_test)

ValueError: non-broadcastable output operand with shape (1000000,) doesn't match the broadcast shape (1000000,1000000)

In [21]:
pred = model.predict(X_test)

In [22]:
pred.shape

(1000000, 1)

In [23]:
pred

array([[2],
       [2],
       [2],
       ...,
       [2],
       [1],
       [3]])

In [24]:
template_preds = np.zeros(X_test.shape[0])

In [25]:
template_preds.shape

(1000000,)

In [26]:
template_preds

array([0., 0., 0., ..., 0., 0., 0.])

Is the issue that CatBoost is outputting its predictions as single-item lists rather than as scalars? 

Let's see what XGBoost does (since it seems to work just fine):

In [28]:
xgb_model = XGBClassifier(
                booster='gbtree',
                tree_method='gpu_hist',
                random_state=SEED,
                n_jobs=-1, 
                verbosity=1, 
                objective='binary:logistic',
                # **params
)

In [29]:
xgb_model.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=0, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=-1,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='gpu_hist', validate_parameters=1,
              verbosity=1)

In [30]:
xgb_test_preds = xgb_model.predict(X_test)

In [31]:
xgb_test_preds

array([2, 2, 2, ..., 2, 1, 3], dtype=int8)

In [32]:
xgb_test_preds.shape

(1000000,)

I think that's the issue -- I just need to flatten out the CatBoost preds. $\blacksquare$

## Solution

In [34]:
cat_test_preds = pred
del pred

In [35]:
cat_test_preds.shape

(1000000, 1)

In [36]:
cat_flat = cat_test_preds.flatten()

  and should_run_async(code)


In [37]:
cat_flat.shape

(1000000,)

In [38]:
cat_flat

array([2, 2, 2, ..., 2, 1, 3])

Looks fine. $\blacksquare$

## Level One

# Serialization

In [None]:
baseline_oof_preds = pd.DataFrame({'xgb42': xgboost_oof_preds,
                                   'lgb42': lightgbm_oof_preds,
                                   'cat42': catboost_oof_preds
                                  })

In [None]:
baseline_oof_preds.to_feather

I'm going to try adding more models to this run -- a third seed for everything -- and also train the TabMLP models deeper -- 300 epochs/fold. I'm not confident that I've found a set of predictions that use the best model hyperparameters on the original dataset, so I'm going to just run this all over again. In the future, I **need** to do better at tracking artifacts.

In [16]:
architectures = ['xgboost', 'lightgbm', 'catboost']#, 'widedeep-tabmlp', 'widedeep-saint']

In [17]:
model_seeds = [42]#, 1983, 550, 1701, 2063]

In [19]:
lv1_params = {
    'xgboost': best_xgboost_params,
    'lightgbm': best_lightgbm_params,
    'catboost': best_catboost_params
}
#         'n_estimators': 8784,
#         'depth': 9,
#         'learning_rate': 0.004167178645277267,
#         'reg_alpha': 0.007249923752866805,
#         'reg_lambda': 0.08945255185214125,
#         'subsample': 0.7288417897178108,
#         'min_child_weight': 3.9187138542139577,
#         'colsample_bytree': 0.5284325948533055,
#         'gamma': 3.0265775282730822}
#     'lightgbm': { # thru trial 38 cross-validated on RobustScaled orig dataset, as of 202111031440
#         'n_estimators': 5108,
#         'max_depth': 4,
#         'learning_rate': 0.01253791570387513,
#         'reg_alpha': 0.015194423057424834,
#         'reg_lambda': 10.289397982794664,
#         'subsample': 0.996318668039871,
#         'boosting_type': 'goss',
#         'min_child_samples': 18,
#         'num_leaves': 218,
#         'colsample_bytree': 0.580388444330496 },
#     'catboost': { # thur trial 45 cross-validated on RobustScaled orig dataset, as of 202111041011
#         'iterations': 29222,
#         'depth': 7,
#         'learning_rate': 0.0067277390824230605,
#         'random_strength': 1,
#         'od_wait': 1989,
#         'reg_lambda': 51.436909447809484,
#         'border_count': 239,
#         'min_child_samples': 11,
#         'leaf_estimation_iterations': 2}
#     # 'widedeep-tabmlp': #todo,
#     # 'widedeep-saint': #todo,
# }

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [18]:
oof_lv1, test_lv1 = pd.DataFrame(), pd.DataFrame() # initialize dataframes

### Generating

In [38]:
for arch in architectures:
    for model_seed in model_seeds:
        # update exmodel_config here
        oof_pred, test_pred = cross_validate_model(arch=arch, X=X, y=y, X_test=X_test, 
                                         wandb_config=wandb_config,
                                         random_state=model_seed,
                                         params=lv1_params[arch],
                                         exmodel_config=exmodel_config, 
                                         wandb_tracked=True
                                        )
        oof_lv1[f'{arch}{model_seed}'] = oof_pred
        test_lv1[f'{arch}{model_seed}'] = test_pred

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

[34m[1mwandb[0m: wandb version 0.12.7 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


FOLD 0
---------------------------------------------------


Old style callback is deprecated.  See: https://xgboost.readthedocs.io/en/latest/python/callbacks.html


Valid AUC for fold 0 is 0.9977384987899559
FOLD 1
---------------------------------------------------


Old style callback is deprecated.  See: https://xgboost.readthedocs.io/en/latest/python/callbacks.html


Valid AUC for fold 1 is 0.9975907995539091
FOLD 2
---------------------------------------------------


Old style callback is deprecated.  See: https://xgboost.readthedocs.io/en/latest/python/callbacks.html


Valid AUC for fold 2 is 0.9977698301461745
FOLD 3
---------------------------------------------------


Old style callback is deprecated.  See: https://xgboost.readthedocs.io/en/latest/python/callbacks.html


Valid AUC for fold 3 is 0.997832811842116
FOLD 4
---------------------------------------------------


Old style callback is deprecated.  See: https://xgboost.readthedocs.io/en/latest/python/callbacks.html


Valid AUC for fold 4 is 0.9978775631669342
Valid AUC score for xgboost model is 0.9977616541196173


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
fold0_valid_roc_auc,▁
fold1_valid_roc_auc,▁
fold2_valid_roc_auc,▁
fold3_valid_roc_auc,▁
fold4_valid_roc_auc,▁
model_seed,▁
overall_valid_auc,▁

0,1
fold0_valid_roc_auc,0.99774
fold1_valid_roc_auc,0.99759
fold2_valid_roc_auc,0.99777
fold3_valid_roc_auc,0.99783
fold4_valid_roc_auc,0.99788
model_params,{'objective': 'binar...
model_seed,42
overall_valid_auc,0.99776


[34m[1mwandb[0m: wandb version 0.12.7 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.9942307089004218
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.9938512506755542
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.994493551634329
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.9943653102613532
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.9943556788100247
Valid AUC score for lightgbm model is 0.9942580478596814


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
fold0_valid_roc_auc,▁
fold1_valid_roc_auc,▁
fold2_valid_roc_auc,▁
fold3_valid_roc_auc,▁
fold4_valid_roc_auc,▁
model_seed,▁
overall_valid_auc,▁

0,1
fold0_valid_roc_auc,0.99423
fold1_valid_roc_auc,0.99385
fold2_valid_roc_auc,0.99449
fold3_valid_roc_auc,0.99437
fold4_valid_roc_auc,0.99436
model_params,{'boosting_type': 'g...
model_seed,42
overall_valid_auc,0.99426


[34m[1mwandb[0m: wandb version 0.12.7 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


FOLD 0
---------------------------------------------------
Valid AUC for fold 0 is 0.998392591369159
FOLD 1
---------------------------------------------------
Valid AUC for fold 1 is 0.9982998446127181
FOLD 2
---------------------------------------------------
Valid AUC for fold 2 is 0.9983677957877765
FOLD 3
---------------------------------------------------
Valid AUC for fold 3 is 0.9983324731459333
FOLD 4
---------------------------------------------------
Valid AUC for fold 4 is 0.9984893279267936
Valid AUC score for catboost model is 0.9983761649480589


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
fold0_valid_roc_auc,▁
fold1_valid_roc_auc,▁
fold2_valid_roc_auc,▁
fold3_valid_roc_auc,▁
fold4_valid_roc_auc,▁
model_seed,▁
overall_valid_auc,▁

0,1
fold0_valid_roc_auc,0.99839
fold1_valid_roc_auc,0.9983
fold2_valid_roc_auc,0.99837
fold3_valid_roc_auc,0.99833
fold4_valid_roc_auc,0.99849
model_params,{'iterations': 17997...
model_seed,42
overall_valid_auc,0.99838


In [41]:
dump(oof_lv1, predpath/'oof_gbms.joblib')
dump(test_lv1, predpath/'testpreds_gbms.joblib')

['/media/sf/easystore/kaggle_data/tabular_playgrounds/nov2021/preds/testpreds_gbms.joblib']

In [23]:
X_test.head()

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
0,600000,0.003229,4.83866,585.529,2.28291,0.71318,3.90783,0.480696,1.48227,4.89181,...,0.11291,1.07355,0.122149,0.08633,0.03601,0.010619,0.290343,1.8982,0.131533,0.012047
1,600001,0.008602,0.505536,-100.099,3.01267,0.027199,1.19461,5.03662,2.51744,4.55389,...,-0.020214,2.62234,0.123307,0.033063,0.123059,0.005771,-0.392923,3.68964,0.047418,0.120015
2,600002,1.461,2.43726,-112.964,3.54123,0.752338,4.33831,1.64808,4.69991,1.95025,...,-0.011036,2.03018,-0.000426,0.084091,0.123605,0.499554,4.05465,3.33067,0.108843,0.064687
3,600003,0.140556,3.08561,179.451,0.573945,0.057342,2.21679,1.62348,0.526174,1.54254,...,0.050117,0.221613,0.045298,0.129966,0.004015,0.018279,2.69658,-0.533491,0.052524,0.011058
4,600004,0.128876,5.19976,107.466,-0.497149,0.08022,0.458121,0.629839,5.24046,-0.232279,...,0.05886,2.66043,0.135425,0.036481,0.093912,0.056315,1.11071,3.58447,0.145319,-0.050393


In [24]:
X_test = X_test.drop('id', axis=1)

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [None]:
# oof_lv1.to_feather(predpath/f'stacking_oof.feather')
# test_lv1.to_feather(predpath/f'stacking_test.feather')

### Serialization

In [20]:
# oof_lv1 = pd.read_feather(predpath/f"stacking_manual_20211027_143755nb-20211028183233run-X_orig-oof-lv1.feather")#, columns=[str(x) for x in range()])
# test_lv1 = pd.read_feather(predpath/f"stacking_manual_20211027_143755nb-20211028183233run-X_orig-test-lv1.feather")
oof_y = load(predpath/f"{exmodel_config['kfolds']}folds_rs{SEED}_oof_y.joblib")

In [21]:
# oof_lv1_1701 = pd.read_feather(altdatapath/'oof_lv1_1701_models.feather')
# test_lv1_1701 = pd.read_feather(altdatapath/'test_lv1_1701_models.feather')
# oof_lv1_2063 = pd.read_feather(altdatapath/'oof_lv1_2063_models.feather')
# test_lv1_2063 = pd.read_feather(altdatapath/'test_lv1_2063_models.feather')

In [22]:
# oof_lv1.iloc[:10,:]

In [23]:
# oof_lv1_1701.head()

In [24]:
# oof_lv1_2063.head()

In [25]:
# oof_lv1.join(oof_lv1_1701).join(oof_lv1_2063)

In [26]:
# test_lv1.join(test_lv1_1701).join(test_lv1_2063)

In [89]:
oof_lv1.to_feather(altdatapath/'oof_lv1_5rs.feather')
# oof_lv1 = pd.read_feather(altdatapath/'oof_lv1_5rs.feather')

In [90]:
test_lv1.to_feather(altdatapath/'test_lv1_5rs.feather')
# test_lv1 = pd.read_feather(altdatapath/'test_lv1_5rs.feather')

In [29]:
oof_lv1.head()

Unnamed: 0,lgb42,lgb1983,lgb550,xgb42,xgb1983,xgb550,cat42,cat1983,cat550,tabmlp42,tabmlp1983,tabmlp550
0,0.643063,0.644761,0.651015,0.627152,0.666962,0.658337,0.633626,0.637863,0.618124,0.528318,0.55158,0.45573
1,0.979158,0.980269,0.979628,0.975591,0.976313,0.975725,0.978221,0.978872,0.979702,0.960835,0.958873,0.96527
2,0.6948,0.735218,0.685842,0.674095,0.670068,0.668585,0.75838,0.695742,0.687652,0.722547,0.600854,0.755067
3,0.315001,0.310552,0.31715,0.294775,0.321891,0.326354,0.277934,0.281502,0.280313,0.200837,0.214472,0.273882
4,0.086256,0.089822,0.081587,0.069561,0.071851,0.072671,0.060639,0.061077,0.068461,0.060948,0.034382,0.082108


In [30]:
oof_lv1 = oof_lv1.join(pd.read_feather(altdatapath/'oof_lv1_1701_models.feather')).join(pd.read_feather(altdatapath/'oof_lv1_2063_models.feather'))

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [31]:
oof_lv1.head()

Unnamed: 0,lgb42,lgb1983,lgb550,xgb42,xgb1983,xgb550,cat42,cat1983,cat550,tabmlp42,tabmlp1983,tabmlp550,lgb1701,xgb1701,cat1701,tablmp1701,lgb2063,xgb2063,cat2063,tablmp2063
0,0.643063,0.644761,0.651015,0.627152,0.666962,0.658337,0.633626,0.637863,0.618124,0.528318,0.55158,0.45573,0.647845,0.636991,0.604513,0.550344,0.636336,0.652631,0.630627,0.550438
1,0.979158,0.980269,0.979628,0.975591,0.976313,0.975725,0.978221,0.978872,0.979702,0.960835,0.958873,0.96527,0.979586,0.975426,0.977638,0.967344,0.979467,0.973305,0.977416,0.966318
2,0.6948,0.735218,0.685842,0.674095,0.670068,0.668585,0.75838,0.695742,0.687652,0.722547,0.600854,0.755067,0.71665,0.679206,0.788293,0.634997,0.642301,0.663404,0.696574,0.747513
3,0.315001,0.310552,0.31715,0.294775,0.321891,0.326354,0.277934,0.281502,0.280313,0.200837,0.214472,0.273882,0.30176,0.318009,0.272723,0.223079,0.311523,0.316786,0.280914,0.174859
4,0.086256,0.089822,0.081587,0.069561,0.071851,0.072671,0.060639,0.061077,0.068461,0.060948,0.034382,0.082108,0.094915,0.065146,0.06642,0.050387,0.084422,0.068258,0.064579,0.073367


In [32]:
test_lv1 = test_lv1.join(pd.read_feather(altdatapath/'test_lv1_1701_models.feather')).join(pd.read_feather(altdatapath/'test_lv1_2063_models.feather'))

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [33]:
oof_lv1

Unnamed: 0,lgb42,lgb1983,lgb550,xgb42,xgb1983,xgb550,cat42,cat1983,cat550,tabmlp42,tabmlp1983,tabmlp550,lgb1701,xgb1701,cat1701,tablmp1701,lgb2063,xgb2063,cat2063,tablmp2063
0,0.643063,0.644761,0.651015,0.627152,0.666962,0.658337,0.633626,0.637863,0.618124,0.528318,0.551580,0.455730,0.647845,0.636991,0.604513,0.550344,0.636336,0.652631,0.630627,0.550438
1,0.979158,0.980269,0.979628,0.975591,0.976313,0.975725,0.978221,0.978872,0.979702,0.960835,0.958873,0.965270,0.979586,0.975426,0.977638,0.967344,0.979467,0.973305,0.977416,0.966318
2,0.694800,0.735218,0.685842,0.674095,0.670068,0.668585,0.758380,0.695742,0.687652,0.722547,0.600854,0.755067,0.716650,0.679206,0.788293,0.634997,0.642301,0.663404,0.696574,0.747513
3,0.315001,0.310552,0.317150,0.294775,0.321891,0.326354,0.277934,0.281502,0.280313,0.200837,0.214472,0.273882,0.301760,0.318009,0.272723,0.223079,0.311523,0.316786,0.280914,0.174859
4,0.086256,0.089822,0.081587,0.069561,0.071851,0.072671,0.060639,0.061077,0.068461,0.060948,0.034382,0.082108,0.094915,0.065146,0.066420,0.050387,0.084422,0.068258,0.064579,0.073367
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0.877003,0.866812,0.859368,0.873962,0.876338,0.882213,0.876955,0.876530,0.873538,0.872165,0.898925,0.846161,0.876102,0.867594,0.875290,0.880338,0.863979,0.866435,0.880339,0.894021
999996,0.735823,0.732218,0.702689,0.777820,0.785770,0.789151,0.766000,0.766401,0.768164,0.751403,0.660141,0.760324,0.758805,0.749080,0.768249,0.743479,0.712077,0.768061,0.726242,0.661662
999997,0.151834,0.155176,0.146293,0.140220,0.146466,0.153901,0.164341,0.165027,0.164354,0.103621,0.129194,0.094390,0.151669,0.151052,0.155469,0.077828,0.143994,0.135747,0.161125,0.168268
999998,0.205738,0.182986,0.196857,0.216337,0.200384,0.197216,0.232795,0.227559,0.229330,0.277918,0.401489,0.343227,0.222530,0.217675,0.229789,0.335021,0.186798,0.224917,0.226820,0.178284


Now, I'll quickly do some Optuna for the above lv1 tables.

## Level Two

In [34]:
# oof_lv1 = pd.read_feather(predpath/'stacking_manual_20211020_104938_X_orig+KMeans8+synth_oof_lv1.feather')
# oof_lv1 = pd.read_feather(predpath/'stacking_manual_20211011_092728_oof_lv1.feather') # basis for best original dataset run as of 20211021
# test_lv1 = pd.read_feather(predpath/'stacking_manual_20211020_104938_X_orig+KMeans8+synth_test_lv1.feather')
# test_lv1 = pd.read_feather(predpath/'stacking_manual_20211011_092728_test_lv1.feather') # basis for best original dataset run as of 20211021
# oof_y = load(predpath/'5folds_rs42_oof_y.joblib')

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [91]:
lv2_xgboost_params = {
    'n_estimators': 8398,
    'max_depth': 7,
    'learning_rate': 0.005152495267590912,
    'reg_alpha': 0.003501877866082655,
    'reg_lambda': 0.5892650659577978,
    'subsample': 0.5321195352481216,
    'min_child_weight': 1.7950489828988663,
    'colsample_bytree': 0.6100580666401978,
    'gamma': 7.836182280294878}

# trial 27
lv2_lightgbm_params = {
    'n_estimators': 5978,
    'max_depth': 4,
    'learning_rate': 0.002972483637079397,
    'reg_alpha': 0.0066240595682091315,
    'reg_lambda': 0.023021277110080198,
    'subsample': 0.762378215816119,
    'boosting_type': 'goss',
    'min_child_samples': 14,
    'num_leaves': 108,
    'colsample_bytree': 0.6831809216468459}

# trial 62
lv2_catboost_params = {
    'iterations': 2633, 
    'depth': 10, 
    'learning_rate': 0.0023597486442471613, 
    'random_strength': 2, 
    'od_wait': 1597, 
    'reg_lambda': 60.949736178635824, 
    'border_count': 109, 
    'min_child_samples': 20, 
    'leaf_estimation_iterations': 3
}

In [92]:
oof_lv2, test_lv2 = pd.DataFrame(), pd.DataFrame()

In [93]:
oof_y = pd.Series(oof_y)

In [94]:
oof_lv2_xgb42, test_lv2_xgb42 = cross_validate_model(library='xgboost', X=oof_lv1, y=oof_y, X_test=test_lv1, 
                                         wandb_config=wandb_config,
                                         random_state=42,
                                         params=lv2_xgboost_params,
                                         exmodel_config=exmodel_config, 
                                         shuffle_kfolds=False,
                                         wandb_tracked=False,
                                         encode_cats=False
                                        )

dump(oof_lv2_xgb42, predpath/f"{wandb_config['name']}nb-{datetime.now().strftime('%Y%m%d%H%M%S')}run-X_orig_oof_lv2_xgboost42_preds.joblib")
dump(test_lv2_xgb42, predpath/f"{wandb_config['name']}nb-{datetime.now().strftime('%Y%m%d%H%M%S')}run-X_orig_test_lv2_xgboost42_preds.joblib")



FOLD 0
---------------------------------------------------




Valid AUC for fold 0 is 0.8578150664470746
FOLD 1
---------------------------------------------------




Valid AUC for fold 1 is 0.8566466212062318
FOLD 2
---------------------------------------------------




Valid AUC for fold 2 is 0.857639013843124
FOLD 3
---------------------------------------------------




Valid AUC for fold 3 is 0.8564849475361453
FOLD 4
---------------------------------------------------




Valid AUC for fold 4 is 0.8572123879070284
Valid AUC score for xgboost model is 0.8571320078395062


['/home/sf/code/kaggle/tabular_playgrounds/oct2021/preds/stacking_manual_20211031_160241nb-20211031165842run-X_orig_test_lv2_xgboost42_preds.joblib']

In [95]:
# oof_xgb_f0_rs1983 = load('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/models/stacking_manual_20210926_211701_xgboost_5folds/xgboost_fold0_model.joblib')
# oof_xgb_f0_rs42 = load('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/models/stacking_manual_20210925_212129_xgboost_5folds/xgboost_fold0_model.joblib')

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [96]:
oof_lv2_cat42, test_lv2_cat42 = cross_validate_model(library='catboost', X=oof_lv1, y=oof_y, X_test=test_lv1, 
                                         wandb_config=wandb_config,
                                         random_state=42,
                                         params=lv2_catboost_params,
                                         exmodel_config=exmodel_config, 
                                         shuffle_kfolds=False,
                                         wandb_tracked=False,
                                         encode_cats=False
                                        )

FOLD 0
---------------------------------------------------


KeyboardInterrupt: 

In [None]:
dump(oof_lv2_cat42, predpath/f"{wandb_config['name']}nb-{datetime.now().strftime('%Y%m%d%H%M%S')}run-X_orig_oof_lv2_catboost42_preds.joblib")
dump(test_lv2_cat42, predpath/f"{wandb_config['name']}nb-{datetime.now().strftime('%Y%m%d%H%M%S')}run-X_orig_test_lv2_catboost42_preds.joblib")

In [None]:

oof_lv2_lgb42, test_lv2_lgb42 = cross_validate_model(library='lightgbm', X=oof_lv1, y=oof_y, X_test=test_lv1, 
                                         wandb_config=wandb_config,
                                         random_state=42,
                                         params=lv2_lightgbm_params,
                                         exmodel_config=exmodel_config,
                                         shuffle_kfolds=False,
                                         wandb_tracked=False,
                                         encode_cats=False
                                        )

In [None]:
dump(oof_lv2_lgb42, predpath/f"{wandb_config['name']}nb-{datetime.now().strftime('%Y%m%d%H%M%S')}run-X_orig_oof_lv2_lightgbm42_preds.joblib")
dump(test_lv2_lgb42, predpath/f"{wandb_config['name']}nb-{datetime.now().strftime('%Y%m%d%H%M%S')}run-X_orig_test_lv2_lightgbm42_preds.joblib")

In [97]:
oof_lv2['xgboost'] = oof_lv2_xgb42
oof_lv2['catboost'] = oof_lv2_cat42
oof_lv2['lightgbm'] = oof_lv2_lgb42

test_lv2['xgboost'] = test_lv2_xgb42
test_lv2['catboost'] = test_lv2_cat42
test_lv2['lightgbm'] = test_lv2_lgb42

In [98]:
# oof_lv1_df = pd.read_feather(predpath/f"{wandb_config['name']}_oof_lv1.feather)

In [99]:
oof_lv2_full = oof_lv2.join(oof_lv1)

In [100]:
oof_lv2_full.head()

Unnamed: 0,xgboost,catboost,lightgbm,lgb42,lgb1983,lgb550,xgb42,xgb1983,xgb550,cat42,...,tabmlp1983,tabmlp550,lgb1701,xgb1701,cat1701,tablmp1701,lgb2063,xgb2063,cat2063,tablmp2063
0,0.603463,0.603793,0.609108,0.643063,0.644761,0.651015,0.627152,0.666962,0.658337,0.633626,...,0.55158,0.45573,0.647845,0.636991,0.604513,0.550344,0.636336,0.652631,0.630627,0.550438
1,0.999806,0.996125,0.988117,0.979158,0.980269,0.979628,0.975591,0.976313,0.975725,0.978221,...,0.958873,0.96527,0.979586,0.975426,0.977638,0.967344,0.979467,0.973305,0.977416,0.966318
2,0.678293,0.667067,0.679545,0.6948,0.735218,0.685842,0.674095,0.670068,0.668585,0.75838,...,0.600854,0.755067,0.71665,0.679206,0.788293,0.634997,0.642301,0.663404,0.696574,0.747513
3,0.312112,0.317473,0.317336,0.315001,0.310552,0.31715,0.294775,0.321891,0.326354,0.277934,...,0.214472,0.273882,0.30176,0.318009,0.272723,0.223079,0.311523,0.316786,0.280914,0.174859
4,0.028643,0.029519,0.038828,0.086256,0.089822,0.081587,0.069561,0.071851,0.072671,0.060639,...,0.034382,0.082108,0.094915,0.065146,0.06642,0.050387,0.084422,0.068258,0.064579,0.073367


In [101]:
# test_lv1 = pd.DataFrame(test_lv1)

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [102]:
test_lv1.head()

Unnamed: 0,lgb42,lgb1983,lgb550,xgb42,xgb1983,xgb550,cat42,cat1983,cat550,tabmlp42,tabmlp1983,tabmlp550,lgb1701,xgb1701,cat1701,tablmp1701,lgb2063,xgb2063,cat2063,tablmp2063
0,0.704171,0.719313,0.716673,0.742503,0.74467,0.743342,0.731599,0.743098,0.736512,0.702985,0.698789,0.703244,0.709136,0.745753,0.732615,0.641977,0.712405,0.741647,0.729088,0.659293
1,0.225994,0.234714,0.240386,0.263515,0.255078,0.254107,0.239633,0.236084,0.238824,0.300979,0.257073,0.312073,0.240015,0.256124,0.23638,0.248044,0.22945,0.256039,0.242159,0.31702
2,0.905121,0.908202,0.902443,0.90982,0.903481,0.903708,0.909699,0.908522,0.910965,0.849669,0.873373,0.836253,0.907301,0.903944,0.911547,0.855545,0.902819,0.905785,0.907301,0.858448
3,0.803894,0.808522,0.810213,0.8614,0.86036,0.86323,0.8472,0.844481,0.832912,0.857812,0.836521,0.854306,0.807577,0.858694,0.837866,0.869585,0.816616,0.866747,0.836606,0.805957
4,0.282883,0.289195,0.278769,0.261821,0.262067,0.265357,0.266116,0.267789,0.264228,0.209983,0.293265,0.309422,0.281437,0.275409,0.266691,0.231212,0.277253,0.260022,0.265275,0.268644


In [103]:
test_lv2_full = test_lv2.join(test_lv1)

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [104]:
test_lv2_full.head()

Unnamed: 0,xgboost,catboost,lightgbm,lgb42,lgb1983,lgb550,xgb42,xgb1983,xgb550,cat42,...,tabmlp1983,tabmlp550,lgb1701,xgb1701,cat1701,tablmp1701,lgb2063,xgb2063,cat2063,tablmp2063
0,0.696256,0.690326,0.692617,0.704171,0.719313,0.716673,0.742503,0.74467,0.743342,0.731599,...,0.698789,0.703244,0.709136,0.745753,0.732615,0.641977,0.712405,0.741647,0.729088,0.659293
1,0.269481,0.269859,0.275947,0.225994,0.234714,0.240386,0.263515,0.255078,0.254107,0.239633,...,0.257073,0.312073,0.240015,0.256124,0.23638,0.248044,0.22945,0.256039,0.242159,0.31702
2,0.915829,0.912697,0.903958,0.905121,0.908202,0.902443,0.90982,0.903481,0.903708,0.909699,...,0.873373,0.836253,0.907301,0.903944,0.911547,0.855545,0.902819,0.905785,0.907301,0.858448
3,0.80724,0.812514,0.791061,0.803894,0.808522,0.810213,0.8614,0.86036,0.86323,0.8472,...,0.836521,0.854306,0.807577,0.858694,0.837866,0.869585,0.816616,0.866747,0.836606,0.805957
4,0.2969,0.293652,0.302775,0.282883,0.289195,0.278769,0.261821,0.262067,0.265357,0.266116,...,0.293265,0.309422,0.281437,0.275409,0.266691,0.231212,0.277253,0.260022,0.265275,0.268644


In [105]:
oof_lv2_full.head()

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


Unnamed: 0,xgboost,catboost,lightgbm,lgb42,lgb1983,lgb550,xgb42,xgb1983,xgb550,cat42,...,tabmlp1983,tabmlp550,lgb1701,xgb1701,cat1701,tablmp1701,lgb2063,xgb2063,cat2063,tablmp2063
0,0.603463,0.603793,0.609108,0.643063,0.644761,0.651015,0.627152,0.666962,0.658337,0.633626,...,0.55158,0.45573,0.647845,0.636991,0.604513,0.550344,0.636336,0.652631,0.630627,0.550438
1,0.999806,0.996125,0.988117,0.979158,0.980269,0.979628,0.975591,0.976313,0.975725,0.978221,...,0.958873,0.96527,0.979586,0.975426,0.977638,0.967344,0.979467,0.973305,0.977416,0.966318
2,0.678293,0.667067,0.679545,0.6948,0.735218,0.685842,0.674095,0.670068,0.668585,0.75838,...,0.600854,0.755067,0.71665,0.679206,0.788293,0.634997,0.642301,0.663404,0.696574,0.747513
3,0.312112,0.317473,0.317336,0.315001,0.310552,0.31715,0.294775,0.321891,0.326354,0.277934,...,0.214472,0.273882,0.30176,0.318009,0.272723,0.223079,0.311523,0.316786,0.280914,0.174859
4,0.028643,0.029519,0.038828,0.086256,0.089822,0.081587,0.069561,0.071851,0.072671,0.060639,...,0.034382,0.082108,0.094915,0.065146,0.06642,0.050387,0.084422,0.068258,0.064579,0.073367


In [106]:
oof_lv2_np = oof_lv2_full.to_numpy()
test_lv2_np = test_lv2_full.to_numpy()

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [107]:
type(oof_y)

pandas.core.series.Series

In [108]:
oof_y_np = oof_y

## Level Three (Logistic Regression)

In [109]:
from sklearn import model_selection

In [110]:
# kfolds = model_selection.StratifiedKFold(n_splits=5, shuffle=False) # no random_state if shuffle == False

In [111]:
oof_preds, oof_y = [], []

In [112]:
test_preds = np.zeros((X_test.shape[0]))

In [113]:
X = oof_lv2_np
y = oof_y_np
X_test = test_lv2_np

In [114]:
X[:10]

array([[0.60346347, 0.60379302, 0.60910751, 0.6430626 , 0.64476147,
        0.65101507, 0.62715232, 0.66696191, 0.658337  , 0.63362641,
        0.63786275, 0.61812448, 0.52831805, 0.55158025, 0.45572963,
        0.64784501, 0.63699114, 0.60451306, 0.55034405, 0.63633613,
        0.65263098, 0.6306274 , 0.55043793],
       [0.99980623, 0.99612459, 0.98811665, 0.97915809, 0.98026903,
        0.97962814, 0.9755913 , 0.97631311, 0.97572541, 0.97822092,
        0.9788715 , 0.97970244, 0.96083504, 0.95887262, 0.96526986,
        0.97958613, 0.97542602, 0.97763838, 0.96734434, 0.97946677,
        0.97330451, 0.97741563, 0.96631795],
       [0.67829287, 0.66706652, 0.67954529, 0.69480043, 0.73521753,
        0.68584185, 0.6740948 , 0.67006797, 0.66858542, 0.75837971,
        0.69574226, 0.68765168, 0.72254694, 0.60085428, 0.75506681,
        0.71664953, 0.67920649, 0.78829268, 0.63499689, 0.64230075,
        0.66340375, 0.69657356, 0.74751294],
       [0.31211236, 0.31747341, 0.31733638, 0.315

In [115]:
library = 'sklearn (LogisticRegressor(max_iter=1000))'
# exmodel_config['library'] = library
# wandb.init(
#     project="202110_Kaggle_tabular_playground",
#     save_code=True,
#     tags=wandb_config['tags'],
#     name=wandb_config['name'],
#     notes=wandb_config['notes'],
#     config=exmodel_config
# )   

# prepare for k-fold cross-validation
# kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=True, random_state=SEED)
kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=False)

# setup for serialization
# model_path = Path(datapath/f"models/{wandb_config['name']}_{library}_{exmodel_config['kfolds']}folds/")
# (model_path).mkdir(exist_ok=True)

In [116]:
for fold, (train_idx, valid_idx) in enumerate(kfold.split(X,y)):
    X_train, X_valid = X[train_idx], X[valid_idx]
    y_train, y_valid = y[train_idx], y[valid_idx]
    
    print(f"FOLD {fold}")
    print("---------------------")
    
    model = LogisticRegression(max_iter=1000)
    
    model.fit(X_train, y_train)
    preds = model.predict_proba(X_valid)[:,1]
    
    oof_preds.extend(preds)
    oof_y.extend(y_valid)
    
    test_preds += model.predict_proba(X_test)[:,1]
    
    valid_auc = roc_auc_score(y_valid, preds)
    print(f"ROC AUC of fold {fold} is {valid_auc}")
    
#     dump(preds, /'lv_3)

valid_auc_total = roc_auc_score(oof_y, oof_preds)
print(f"Overall ROC_AUC is {valid_auc_total}")

dump(oof_preds, predpath/f"{wandb_config['name']}nb-{datetime.now().strftime('%Y%m%d%H%M%S')}run-X_orig_oof_lv3_preds.joblib")
dump(oof_y, predpath/'oof_lv3_y.joblib')

test_preds /= 5

dump(test_preds, predpath/f"{wandb_config['name']}nb-{datetime.now().strftime('%Y%m%d%H%M%S')}run-X_orig_test_lv3_preds.joblib")

FOLD 0
---------------------
ROC AUC of fold 0 is 0.8569595004826156
FOLD 1
---------------------
ROC AUC of fold 1 is 0.8558408474590921
FOLD 2
---------------------
ROC AUC of fold 2 is 0.8568110547897336
FOLD 3
---------------------
ROC AUC of fold 3 is 0.8553678789830179
FOLD 4
---------------------
ROC AUC of fold 4 is 0.8565676032821483
Overall ROC_AUC is 0.8562392566675165


['/home/sf/code/kaggle/tabular_playgrounds/oct2021/preds/stacking_manual_20211031_160241nb-20211031165953run-X_orig_test_lv3_preds.joblib']

In [117]:
# X_test_imputed_scaled = pd.read_feather(path=datapath/'X_test_NaNcounts_imputed-Median-wIndicators_StandardScaled.feather')
# X_test_imputed_scaled = pd.read_feather(path=datapath/'X_test_NaNcounts_SummaryStats_imputed-Median-wIndicators-StandardScaled.feather')

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


## Prediction Generation

In [118]:
# preds_path = Path(datapath/"preds/")

# blender_preds = blender.predict_proba(X_test_imputed_scaled)[:,1]
# dump(blender_preds, preds_path/f"{config_run['name']}_stack.joblib")

In [119]:
# widedeep_preds = load(predpath/'stacking_manual_20211012_194716_widedeep-TabMLP_5folds_rs1983_500epochs_test_preds.joblib')

In [120]:
# widedeep_preds = pd.Series(widedeep_preds, name='widedeep_tabmlp1983')

In [None]:
# test_lv1_xgb42 = load(predpath/'stacking_manual_20211020_104938_xgboost_5folds_rs42_test_preds.joblib')

In [None]:
# test_lv1_lgb42 = load(predpath/'stacking_manual_20211020_104938_lightgbm_5folds_rs42_test_preds.joblib')
# test_lv1_cat42 = load(predpath/'stacking_manual_20211020_104938_catboost_5folds_rs42_test_preds.joblib')

In [None]:
type(test_lv1_xgb42)

In [None]:
mod_mix_three = 0.5*test_preds + 0.3*test_lv1_xgb42 + 0.05*test_lv1_lgb42 + 0.05*test_lv1_cat42 + 0.1*test_lv1_tabmlp42

In [None]:
mod_mix_three_pow4 = 0.5*test_preds**4 + 0.3*test_lv1_xgb42**4 + 0.05*test_lv1_lgb42**4 + 0.05*test_lv1_cat42**4 + 0.1*test_lv1_tabmlp42**4

In [None]:
sample_df = pd.read_csv(datapath/'sample_submission.csv.zip')

In [None]:
sample_df.loc[:, 'target'] = mod_mix_three_pow4

In [None]:
sample_df.head()

In [None]:
# submission_path = datapath/'submissions'
# submission_path.mkdir(exist_ok=True)

In [None]:
sample_df.to_csv(subpath/f"{wandb_config['name']}_3level_3rs-X_orig-GBM-0.5stack_0.3xgb42_0.1tabmlp42_0.05lgb42_0.05cat42_pow4_ensemble_preds.csv", index=False)
# sample_df.to_csv(subpath/f"{wandb_config['name']}_3level-X_orig+KMeans8+synth-GBM-stack_ensemble_preds.csv", index=False)

## Level 4 (Passthrough)

### Experimentation

In [None]:
# oof_lv1.head()

Let's see what we can get if we effectively do a pass-through -- that is, we join the results from the first level with the results from the second, and even potentially the third, then feed the resulting table through a few different models. (Given that we'll have a greater number of features, we might want to try XGBoost rather than just a simple logistic regression.

In [121]:
oof_lv1_and_2 = oof_lv1.join(oof_lv2)
oof_lv1_and_2.head()

Unnamed: 0,lgb42,lgb1983,lgb550,xgb42,xgb1983,xgb550,cat42,cat1983,cat550,tabmlp42,...,xgb1701,cat1701,tablmp1701,lgb2063,xgb2063,cat2063,tablmp2063,xgboost,catboost,lightgbm
0,0.643063,0.644761,0.651015,0.627152,0.666962,0.658337,0.633626,0.637863,0.618124,0.528318,...,0.636991,0.604513,0.550344,0.636336,0.652631,0.630627,0.550438,0.603463,0.603793,0.609108
1,0.979158,0.980269,0.979628,0.975591,0.976313,0.975725,0.978221,0.978872,0.979702,0.960835,...,0.975426,0.977638,0.967344,0.979467,0.973305,0.977416,0.966318,0.999806,0.996125,0.988117
2,0.6948,0.735218,0.685842,0.674095,0.670068,0.668585,0.75838,0.695742,0.687652,0.722547,...,0.679206,0.788293,0.634997,0.642301,0.663404,0.696574,0.747513,0.678293,0.667067,0.679545
3,0.315001,0.310552,0.31715,0.294775,0.321891,0.326354,0.277934,0.281502,0.280313,0.200837,...,0.318009,0.272723,0.223079,0.311523,0.316786,0.280914,0.174859,0.312112,0.317473,0.317336
4,0.086256,0.089822,0.081587,0.069561,0.071851,0.072671,0.060639,0.061077,0.068461,0.060948,...,0.065146,0.06642,0.050387,0.084422,0.068258,0.064579,0.073367,0.028643,0.029519,0.038828


In [122]:
oof_all_lvs = oof_lv1_and_2.join(pd.Series(oof_preds, name='lv3_logistic_reg_preds'))
oof_all_lvs

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


Unnamed: 0,lgb42,lgb1983,lgb550,xgb42,xgb1983,xgb550,cat42,cat1983,cat550,tabmlp42,...,cat1701,tablmp1701,lgb2063,xgb2063,cat2063,tablmp2063,xgboost,catboost,lightgbm,lv3_logistic_reg_preds
0,0.643063,0.644761,0.651015,0.627152,0.666962,0.658337,0.633626,0.637863,0.618124,0.528318,...,0.604513,0.550344,0.636336,0.652631,0.630627,0.550438,0.603463,0.603793,0.609108,0.620127
1,0.979158,0.980269,0.979628,0.975591,0.976313,0.975725,0.978221,0.978872,0.979702,0.960835,...,0.977638,0.967344,0.979467,0.973305,0.977416,0.966318,0.999806,0.996125,0.988117,0.967607
2,0.694800,0.735218,0.685842,0.674095,0.670068,0.668585,0.758380,0.695742,0.687652,0.722547,...,0.788293,0.634997,0.642301,0.663404,0.696574,0.747513,0.678293,0.667067,0.679545,0.695375
3,0.315001,0.310552,0.317150,0.294775,0.321891,0.326354,0.277934,0.281502,0.280313,0.200837,...,0.272723,0.223079,0.311523,0.316786,0.280914,0.174859,0.312112,0.317473,0.317336,0.320700
4,0.086256,0.089822,0.081587,0.069561,0.071851,0.072671,0.060639,0.061077,0.068461,0.060948,...,0.066420,0.050387,0.084422,0.068258,0.064579,0.073367,0.028643,0.029519,0.038828,0.033848
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0.877003,0.866812,0.859368,0.873962,0.876338,0.882213,0.876955,0.876530,0.873538,0.872165,...,0.875290,0.880338,0.863979,0.866435,0.880339,0.894021,0.858877,0.856541,0.838445,0.872278
999996,0.735823,0.732218,0.702689,0.777820,0.785770,0.789151,0.766000,0.766401,0.768164,0.751403,...,0.768249,0.743479,0.712077,0.768061,0.726242,0.661662,0.721611,0.721759,0.722971,0.756245
999997,0.151834,0.155176,0.146293,0.140220,0.146466,0.153901,0.164341,0.165027,0.164354,0.103621,...,0.155469,0.077828,0.143994,0.135747,0.161125,0.168268,0.169669,0.166342,0.183475,0.159699
999998,0.205738,0.182986,0.196857,0.216337,0.200384,0.197216,0.232795,0.227559,0.229330,0.277918,...,0.229789,0.335021,0.186798,0.224917,0.226820,0.178284,0.251595,0.252851,0.254263,0.248749


In [123]:
test_all_lvs = test_lv1.join(test_lv2).join(pd.Series(test_preds, name='lv3_logistic_test_preds'))

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [124]:
test_all_lvs.head()

Unnamed: 0,lgb42,lgb1983,lgb550,xgb42,xgb1983,xgb550,cat42,cat1983,cat550,tabmlp42,...,cat1701,tablmp1701,lgb2063,xgb2063,cat2063,tablmp2063,xgboost,catboost,lightgbm,lv3_logistic_test_preds
0,0.704171,0.719313,0.716673,0.742503,0.74467,0.743342,0.731599,0.743098,0.736512,0.702985,...,0.732615,0.641977,0.712405,0.741647,0.729088,0.659293,0.696256,0.690326,0.692617,0.701891
1,0.225994,0.234714,0.240386,0.263515,0.255078,0.254107,0.239633,0.236084,0.238824,0.300979,...,0.23638,0.248044,0.22945,0.256039,0.242159,0.31702,0.269481,0.269859,0.275947,0.267346
2,0.905121,0.908202,0.902443,0.90982,0.903481,0.903708,0.909699,0.908522,0.910965,0.849669,...,0.911547,0.855545,0.902819,0.905785,0.907301,0.858448,0.915829,0.912697,0.903958,0.938763
3,0.803894,0.808522,0.810213,0.8614,0.86036,0.86323,0.8472,0.844481,0.832912,0.857812,...,0.837866,0.869585,0.816616,0.866747,0.836606,0.805957,0.80724,0.812514,0.791061,0.815654
4,0.282883,0.289195,0.278769,0.261821,0.262067,0.265357,0.266116,0.267789,0.264228,0.209983,...,0.266691,0.231212,0.277253,0.260022,0.265275,0.268644,0.2969,0.293652,0.302775,0.30834


In [None]:
oof_all_lvs.to_feather(altdatapath/'oof_all_lvs_passthru_20211031-final.feather')
test_all_lvs.to_feather(altdatapath/'test_all_lvs_passthru_20211031-final.feather')

In [None]:
xgb_lv4_params = {
    'n_estimators': 3878,
    'max_depth': 4,
    'learning_rate': 0.024785857161974977,
    'reg_alpha': 26.867682044658245,
    'reg_lambda': 10.839759074147148,
    'subsample': 0.8208581489835881,
    'min_child_weight': 8.829122644339664,
    'colsample_bytree': 0.906420714280384,
    'gamma': 1.472322916021486
}

In [72]:
xgb_lv4_model = XGBClassifier(
    objective='binary:logistic',
    verbosity=1,
    tree_method='gpu_hist',
    booster='gbtree', # not bothering with dart for time reasons
    random_state=SEED,
    **xgb_lv4_params
)

In [73]:
X_lv4_train, X_lv4_valid, y_lv4_train, y_lv4_valid = train_test_split(oof_all_lvs, oof_y, test_size=0.2, random_state=int(SEED), shuffle=True)

In [74]:
xgb_lv4_model.fit(X_lv4_train, y_lv4_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.906420714280384,
              gamma=1.472322916021486, gpu_id=0, importance_type='gain',
              interaction_constraints='', learning_rate=0.024785857161974977,
              max_delta_step=0, max_depth=4, min_child_weight=8.829122644339664,
              missing=nan, monotone_constraints='()', n_estimators=3878,
              n_jobs=16, num_parallel_tree=1, random_state=42,
              reg_alpha=26.867682044658245, reg_lambda=10.839759074147148,
              scale_pos_weight=1, subsample=0.8208581489835881,
              tree_method='gpu_hist', validate_parameters=1, verbosity=1)

In [75]:
lv4_train_preds = xgb_lv4_model.predict_proba(X_lv4_valid)[:,1]

In [76]:
roc_auc_score(y_score=lv4_train_preds, y_true=y_lv4_valid)

0.8568572581392072

Let's compare that number to what we'd gotten at lv 3, and then what we'd get at lv 4 with LogisticRegression

In [77]:
log_lv4_model = LogisticRegression(max_iter=1000)

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [78]:
log_lv4_model.fit(X_lv4_train, y_lv4_train)

LogisticRegression(max_iter=1000)

In [79]:
lv4_log_train_preds = log_lv4_model.predict_proba(X_lv4_valid)[:,1]

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [80]:
roc_auc_score(y_score=lv4_log_train_preds, y_true=y_lv4_valid)

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


0.8568131691662475

So XGBoost at this level does do a bit better than LogisticRegression would. But should it come to this point? Let's find out.

### Adding more models


### Run

In [None]:
oof_all_lvs = pd.read_feather(altdatapath/'oof_all_lvs_passthru_202110300859.feather')
test_all_lvs = pd.read_feather(altdatapath/'test_all_lvs_passthru_202110300859.feather')

In [126]:
y = oof_y_np
test_preds = np.zeros((X_test.shape[0]))

In [127]:
# y = oof_y
oof_preds = []
oof_y = []

for fold, (train_idx, valid_idx) in enumerate(kfold.split(oof_all_lvs, y)):
    X_train, X_valid = X[train_idx], X[valid_idx]
    y_train, y_valid = y[train_idx], y[valid_idx]
    
    print(f"FOLD {fold}")
    print("---------------------")
    
    model = XGBClassifier(
        objective='binary:logistic',
        verbosity=1,
        tree_method='gpu_hist',
        booster='gbtree', # not bothering with dart for time reasons
        random_state=SEED,
        **xgb_lv4_params
    )
    model.fit(X_train, y_train)
    preds = model.predict_proba(X_valid)[:,1]
    
    oof_preds.extend(preds)
    oof_y.extend(y_valid)
    
    test_preds += model.predict_proba(X_test)[:,1]
    
    valid_auc = roc_auc_score(y_valid, preds)
    print(f"ROC AUC of fold {fold} is {valid_auc}")
    
#     dump(preds, /'lv_3)

valid_auc_total = roc_auc_score(y_true=oof_y, y_score=oof_preds)
print(f"Overall ROC_AUC is {valid_auc_total}")

dump(oof_preds, predpath/f"{wandb_config['name']}nb-{datetime.now().strftime('%Y%m%d%H%M%S')}run-X_orig_oof_lv4xgb_passthru_preds.joblib")
dump(oof_y, predpath/'oof_lv4_y.joblib')

test_preds /= 5

dump(test_preds, predpath/f"{wandb_config['name']}nb-{datetime.now().strftime('%Y%m%d%H%M%S')}run-X_orig_test_lv4xgb_passthru_preds.joblib")

FOLD 0
---------------------
ROC AUC of fold 0 is 0.8576973814950857
FOLD 1
---------------------




ROC AUC of fold 1 is 0.8564821246032219
FOLD 2
---------------------




ROC AUC of fold 2 is 0.8574951345643358
FOLD 3
---------------------




ROC AUC of fold 3 is 0.8563175511608968
FOLD 4
---------------------




ROC AUC of fold 4 is 0.8570563164588454
Overall ROC_AUC is 0.8569724079813386


['/home/sf/code/kaggle/tabular_playgrounds/oct2021/preds/stacking_manual_20211031_160241nb-20211031170152run-X_orig_test_lv4xgb_passthru_preds.joblib']

Notionally better.

In [None]:
test_preds[:10]

--------------

In [128]:
sample_df = pd.read_csv(datapath/'sample_submission.csv.zip')

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [129]:
sample_df.loc[:, 'target'] = test_preds

In [130]:
sample_df.head()

Unnamed: 0,id,target
0,1000000,0.708235
1,1000001,0.262932
2,1000002,0.916396
3,1000003,0.808739
4,1000004,0.284263


In [131]:
# submission_path = datapath/'submissions'
# submission_path.mkdir(exist_ok=True)

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [132]:
sample_df.to_csv(subpath/f"{wandb_config['name']}_4level-GBM+TabMLP-ensemble_X-orig_3rs_{exmodel_config['kfolds']}folds_finalrs{42}_preds-FINAL.csv", index=False)

LB 0.85628 -- an improvement, at long last.

In [None]:
# wandb.log({'leaderboard_auc': ,
# #            'catboost_params': str(best_catboost_params),
#           })

In [None]:
# wandb.finish()

In [None]:
stack_sub = sample_df.copy()

In [None]:
stack_preds = stack_sub.iloc[:,1]

In [None]:
# cat1983_preds = load(predpath/'stacking_manual_20211005_205933_catboost_5folds_rs1983_test_preds.joblib')

In [None]:
# cat_preds[:10]

In [None]:
# stack_sub.iloc[:10,1]

In [None]:
# lgb1983_preds = load(predpath/'stacking_manual_20211005_205933_lightgbm_5folds_rs1983_test_preds.joblib')

In [None]:
# lgb1983_preds[:10]

## Correlations
Before I make these predictions, let's try some rigor.

In [None]:
type(test_lv1)

In [None]:
test_lv1.shape

In [None]:
sns.heatmap(test_lv1.iloc[:,:].corr(), annot=True)
plt.show()

- So what this is saying is,XGBoost (models 0 and 1) is a bit closer to LightGBM (models 2 and 3) than to CatBoost (models 4 and 5), and LightGBM is closer to Catboost than to XGBoost.

What if we bring in the full stack's predictions?

In [None]:
stack_feature = pd.Series(test_preds, name='full_stack')
corr_compare = test_lv1.join(stack_feature)
corr_compare.head()

In [None]:
sns.heatmap(corr_compare.corr(), annot=True)
plt.show()

- So this is indicating that the stack is further apart from each of the GBMs than they are from one another -- but, it's marginally closer to CatBoost than to LightGBM, and it's definitely closer to both of those than to XGBoost alone. **So, wrt power averaging, it actually may not make sense to power average with CatBoost and LightGBM alongside the stack.** 

What if we bring in some `widedeep` predictions too?

In [None]:
widedeep_preds = load(predpath/'stacking_manual_20211012_194716_widedeep-TabMLP_5folds_rs1983_500epochs_test_preds.joblib')

In [None]:
type(widedeep_preds)

In [None]:
widedeep_preds = pd.Series(widedeep_preds, name='widedeep_tabmlp1983')

In [None]:
corr_compare_deep = corr_compare.join(widedeep_preds)

In [None]:
corr_compare_deep.head()

In [None]:
sns.heatmap(corr_compare_deep.corr(), annot=True)
plt.show()

## Modified Mix

In [None]:
mod_mix = 0.5*stack_preds + 0.4*test_lv1_xgb42 + 0.1*widedeep_preds

In [None]:
mod_mix[:10]

In [None]:
sample_df = pd.read_csv(datapath/'sample_submission.csv.zip')

In [None]:
sample_df.loc[:, 'target'] = mod_mix

In [None]:
sample_df.head()

In [None]:
# submission_path = datapath/'submissions'
# submission_path.mkdir(exist_ok=True)

In [None]:
sample_df.to_csv(subpath/f"{wandb_config['name']}_3level-GBM-X_orig+KMeans8+synth-0.5stack_0.4xgb42_0.1tabmlp1983_ensemble_preds.csv", index=False)

## Mod Mix Two

In [None]:
mod_mix_two = 0.6*stack_preds + 0.2*test_lv1_xgb42 + 0.05*test_lv1_lgb42 + 0.05*test_lv1_cat42 + 0.1*widedeep_preds

In [None]:
mod_mix_two[:10]

In [None]:
sample_df = pd.read_csv(datapath/'sample_submission.csv.zip')

In [None]:
sample_df.loc[:, 'target'] = mod_mix_two

In [None]:
sample_df.head()

In [None]:
# submission_path = datapath/'submissions'
# submission_path.mkdir(exist_ok=True)

In [None]:
sample_df.to_csv(subpath/f"{wandb_config['name']}_3level-X_orig+KMeans8+synth-GBM-0.6stack_0.2xgb42_0.1tabmlp1983_0.05lgb42_0.05cat42_ensemble_preds.csv", index=False)

In [None]:
mod_mix_three = 0.5*stack_preds + 0.3*test_lv1_xgb42 + 0.05*test_lv1_lgb42 + 0.05*test_lv1_cat42 + 0.1*widedeep_preds

In [None]:
sample_df = pd.read_csv(datapath/'sample_submission.csv.zip')

In [None]:
sample_df.loc[:, 'target'] = mod_mix_three

In [None]:
sample_df.head()

In [None]:
# submission_path = datapath/'submissions'
# submission_path.mkdir(exist_ok=True)

In [None]:
sample_df.to_csv(subpath/f"{wandb_config['name']}_3level-X_orig+KMeans8+synth-GBM-0.5stack_0.3xgb42_0.1tabmlp1983_0.05lgb42_0.05cat42_ensemble_preds.csv", index=False)

## 4th power

In [None]:
mod_mix_three_pow4 = 0.5*stack_preds**4 + 0.3*test_lv1_xgb42**4 + 0.05*test_lv1_lgb42**4 + 0.05*test_lv1_cat42**4 + 0.1*widedeep_preds**4

In [None]:
sample_df = pd.read_csv(datapath/'sample_submission.csv.zip')

In [None]:
sample_df.loc[:, 'target'] = mod_mix_three_pow4

In [None]:
sample_df.head()

In [None]:
# submission_path = datapath/'submissions'
# submission_path.mkdir(exist_ok=True)

In [None]:
sample_df.to_csv(subpath/f"{wandb_config['name']}_3level-X_orig+KMeans8+synth-GBM-0.5stack_0.3xgb42_0.1tabmlp1983_0.05lgb42_0.05cat42_pow4_ensemble_preds.csv", index=False)

In [None]:
power4_avg_alt = 0.5*stack_preds**4 + 0.4*test_lv1_xgb42 + 0.1*widedeep_preds** 4

In [None]:
power4_avg_alt[:20]

In [None]:
sample_df = pd.read_csv(datapath/'sample_submission.csv.zip')

In [None]:
sample_df.loc[:, 'target'] = power4_avg_alt

In [None]:
sample_df.head()

In [None]:
# submission_path = datapath/'submissions'
# submission_path.mkdir(exist_ok=True)

In [None]:
sample_df.to_csv(subpath/f"{wandb_config['name']}_3level-GBM-0.5stack_0.4xgb42_0.1tabmlp1983_pow-avg4_ensemble_preds.csv", index=False)

## 6th power

In [None]:
power6_avg = (stack_preds**6 + cat1983_preds**6 + lgb1983_preds**6) / 3

In [None]:
power6_avg[:20]

In [None]:
sample_df = pd.read_csv(datapath/'sample_submission.csv.zip')

In [None]:
sample_df.loc[:, 'target'] = power6_avg

In [None]:
sample_df.head()

In [None]:
# submission_path = datapath/'submissions'
# submission_path.mkdir(exist_ok=True)

In [None]:
sample_df.to_csv(subpath/f"{wandb_config['name']}_3level-GBM-stack_cat1983_lgb1983_pow-avg6_ensemble_preds.csv", index=False)

## 5th power

In [None]:
power5_avg = (stack_preds**5 + cat1983_preds**5 + lgb1983_preds**5) / 3

In [None]:
# power5_avg[:20]

In [None]:
sample_df = pd.read_csv(datapath/'sample_submission.csv.zip')

In [None]:
sample_df.loc[:, 'target'] = power5_avg

In [None]:
sample_df.head()

In [None]:
# submission_path = datapath/'submissions'
# submission_path.mkdir(exist_ok=True)

In [None]:
sample_df.to_csv(subpath/f"{wandb_config['name']}_3level-GBM-stack_cat1983_lgb1983_pow-avg5_ensemble_preds.csv", index=False)

## 3rd power

In [None]:
power3_avg = (stack_preds**3 + cat1983_preds**3 + lgb1983_preds**3) / 3

In [None]:
# power5_avg[:20]

In [None]:
sample_df = pd.read_csv(datapath/'sample_submission.csv.zip')

In [None]:
sample_df.loc[:, 'target'] = power3_avg

In [None]:
sample_df.head()

In [None]:
# submission_path = datapath/'submissions'
# submission_path.mkdir(exist_ok=True)

In [None]:
sample_df.to_csv(subpath/f"{wandb_config['name']}_3level-GBM-stack_cat1983_lgb1983_pow-avg3_ensemble_preds.csv", index=False)