# GBM Trainer

# Setup

20211202

In [1]:
# notebook configuration
COLAB = False # will trigger manual installation of packages
SAGE = False # if notebook will be used on Amazon SageMaker
USE_GPU = True 
%config Completer.use_jedi = False

## Imports

In [2]:
if SAGE:
    !pip install --upgrade sklearn
    !pip install --upgrade wandb
    !pip install --upgrade catboost
    !pip install --upgrade lightgbm
    !pip install --upgrade xgboost
    !pip install optuna
    !pip install pyarrow

In [3]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime
import random

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
# import seaborn as sns
%matplotlib inline


import requests # for telegram notifications
# from tqdm.notebook import tqdm

from joblib import dump, load

Now, non-stdlib imports

In [4]:
# model selection
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold

# metrics
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, log_loss, f1_score, fbeta_score

# eda
# import missingno
# import doubtlab 

# data cleaning
# from sklearn.impute import SimpleImputer #, KNNImputer
# import cleanlab

# normalization
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, QuantileTransformer
# from gauss_rank_scaler import GaussRankScaler

# feature generation
from sklearn.preprocessing import PolynomialFeatures
# import category_encoders as ce

# models
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import StackingClassifier, RandomForestClassifier

# feature reduction
# from sklearn.decomposition import PCA
# from umap import UMAP

# clustering
# from sklearn.cluster import DBSCAN, KMeans
# import hdbscan

# feature selection
# from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
# import featuretools as ft
# from BorutaShap import BorutaShap
# from boruta import BorutaPy

# tracking 
import wandb
from wandb.xgboost import wandb_callback
from wandb.lightgbm import wandb_callback
os.environ['WANDB_NOTEBOOK_NAME'] = f"optuna_gbms_{datetime.now().strftime('%Y%m%d')}_sage.ipynb"

# hyperparameter tuning
import optuna
from optuna.integration.wandb import WeightsAndBiasesCallback
from optuna.samplers import TPESampler


In [5]:
# # deep learning
# import torch
# from torch.optim import Adam, AdamW, Adagrad, SGD, RMSprop, LBFGS
# from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts, CyclicLR, OneCycleLR, StepLR, CosineAnnealingLR

# # widedeep
# from pytorch_widedeep import Trainer
# from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
# from pytorch_widedeep.models import Wide, TabMlp, WideDeep, SAINT#, TabTransformer, TabNet, TabFastFormer, TabResnet
# from pytorch_widedeep.metrics import Accuracy
# from pytorch_widedeep.callbacks import EarlyStopping, LRHistory, ModelCheckpoint

## Routing

Now, datapath setup

In [6]:
if COLAB:
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')

    # handling datapath
    # datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/dec2021/')
    
else:
    # if on local machine
    if SAGE:
        root = Path('/home/studio-lab-user/sagemaker-studiolab-notebooks')
    else:
        root = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/dec2021/')
    datapath = root/'datasets'
    # edapath = root/'EDA'
    # modelpath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/oct2021/models/')
    predpath = root/'preds'
    subpath = root/'submissions'
    studypath = root/'studies'
    
    for pth in [datapath, predpath, subpath, studypath]:
        pth.mkdir(exist_ok=True)

## Helpers

In [7]:
SEED = 42

# Function to seed everything but the models
def seed_everything(seed, pytorch=True, reproducible=True):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    # if pytorch:
    #     torch.manual_seed(seed) # set torch CPU seed
    #     if torch.cuda.is_available():
    #         torch.cuda.manual_seed_all(seed) # set torch GPU(s) seed(s)
    #     if reproducible and torch.backends.cudnn.is_available():
    #         torch.backends.cudnn.deterministic = True
    #         torch.backends.cudnn.benchmark = False

seed_everything(seed=SEED)

In [8]:
def reduce_memory_usage(df, verbose=True):
    """
    Function to reduce memory usage by downcasting datatypes in a Pandas DataFrame when possible.
    
    h/t to Bryan Arnold (https://www.kaggle.com/puremath86/label-correction-experiments-tps-nov-21)
    """
    
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if (
                    c_min > np.finfo(np.float16).min
                    and c_max < np.finfo(np.float16).max
                ):
                    df[col] = df[col].astype(np.float16)
                elif (
                    c_min > np.finfo(np.float32).min
                    and c_max < np.finfo(np.float32).max
                ):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

In [9]:
tg_api_token = 'your_api_token' # for Galileo (jupyter_watcher_bot) on Telegram
tg_chat_id = 'your_chat_id'

import requests

def send_tg_message(text='Cell execution completed.'):  
    """
    h/t Ivan Dembicki Jr. for the base version 
    (https://medium.com/@ivan.dembicki.jr/notifications-in-jupyter-notebook-with-telegram-f2e892c55173)
    """
    requests.post('https://api.telegram.org/' +  'bot{}/sendMessage'.format(tg_api_token),
                  params=dict(chat_id=tg_chat_id, text=text))

## Dataset Setup

In [10]:
# dataset_params will initially include either trivial class instances or loaded, precomputed artifacts
dataset_params = {
    'train_source': str(datapath/'X_orig.feather'),
    'target_source': str(datapath/'y_orig.joblib'),
    'test_source': str(datapath/'X_test_orig.feather'),
    # 'scaler': str(RobustScaler()),
    # 'pca': str(load(datapath/'pca_mle-RobustScaled_orig_trainset.joblib')),
    # 'umap': str(load(datapath/'umap_reducer-20211107-n_comp10-n_neighbors15-rs42-pca_mle-RobustScaled_orig_trainset.joblib')),
}   

# referring back to the already-entered attributes, specify how the pipeline was sequenced
# dataset_params['preprocessing_pipeline'] = str([dataset_params['scaler'], dataset_params['pca'], dataset_params['umap']]) # ACTUALLY this is unwieldy
# dataset_params['preprocessing_pipeline'] = '[scaler, pca, umap]' # more fragile, but also more readable

# now, load the datasets and generate more metadata from them
# X = load(dataset_params['train_source'])
# if SAGE:
#     X = load(datapath/'X_orig.joblib')
# else:
X = pd.read_feather(dataset_params['train_source'])
y = load(dataset_params['target_source'])
# X_test = load(dataset_params['test_source'])
# X_test = pd.read_feather(dataset_params['test_source'])

# reduce memory usage
# X = reduce_memory_usage(X)
# X_test = reduce_memory_usage(X)

# metadata logging
dataset_params['feature_count'] = X.shape[1]
dataset_params['instance_count'] = X.shape[0]
    

## Feature Engineering

First, going to try some of the basic tweaks suggested [here](https://www.kaggle.com/c/tabular-playground-series-dec-2021/discussion/293612).

In [11]:
# remove unuseful features
# X = X.drop([ 'Soil_Type7', 'Soil_Type15'], axis=1)
# X_test = X_test.drop(['Soil_Type7', 'Soil_Type15'], axis=1)

In [12]:
X.shape

(4000000, 54)

In [13]:
# extra feature engineering
def r(x):
    if x+180>360:
        return x-180
    else:
        return x+180

In [14]:
def fe(df):
    df['EHiElv'] = df['Horizontal_Distance_To_Roadways'] * df['Elevation']
    df['EViElv'] = df['Vertical_Distance_To_Hydrology'] * df['Elevation']
    df['Aspect2'] = df.Aspect.map(r)
    ### source: https://www.kaggle.com/c/tabular-playground-series-dec-2021/discussion/293373
    df["Aspect"][df["Aspect"] < 0] += 360
    df["Aspect"][df["Aspect"] > 359] -= 360
    df.loc[df["Hillshade_9am"] < 0, "Hillshade_9am"] = 0
    df.loc[df["Hillshade_Noon"] < 0, "Hillshade_Noon"] = 0
    df.loc[df["Hillshade_3pm"] < 0, "Hillshade_3pm"] = 0
    df.loc[df["Hillshade_9am"] > 255, "Hillshade_9am"] = 255
    df.loc[df["Hillshade_Noon"] > 255, "Hillshade_Noon"] = 255
    df.loc[df["Hillshade_3pm"] > 255, "Hillshade_3pm"] = 255
    ########
    df['Highwater'] = (df.Vertical_Distance_To_Hydrology < 0).astype(int)
    df['EVDtH'] = df.Elevation - df.Vertical_Distance_To_Hydrology
    df['EHDtH'] = df.Elevation - df.Horizontal_Distance_To_Hydrology * 0.2
    df['Euclidean_Distance_to_Hydrology'] = (df['Horizontal_Distance_To_Hydrology']**2 + df['Vertical_Distance_To_Hydrology']**2)**0.5
    df['Manhattan_Distance_to_Hydrology'] = df['Horizontal_Distance_To_Hydrology'] + df['Vertical_Distance_To_Hydrology']
    df['Hydro_Fire_1'] = df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Fire_Points']
    df['Hydro_Fire_2'] = abs(df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Fire_Points'])
    df['Hydro_Road_1'] = abs(df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Roadways'])
    df['Hydro_Road_2'] = abs(df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Roadways'])
    df['Fire_Road_1'] = abs(df['Horizontal_Distance_To_Fire_Points'] + df['Horizontal_Distance_To_Roadways'])
    df['Fire_Road_2'] = abs(df['Horizontal_Distance_To_Fire_Points'] - df['Horizontal_Distance_To_Roadways'])
    df['Hillshade_3pm_is_zero'] = (df.Hillshade_3pm == 0).astype(int)
    return df

In [15]:
X = fe(X)
# X_test = fe(X_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Aspect"][df["Aspect"] < 0] += 360
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Aspect"][df["Aspect"] > 359] -= 360


In [16]:
# Summed features pointed out by @craigmthomas (https://www.kaggle.com/c/tabular-playground-series-dec-2021/discussion/292823)
soil_features = [x for x in X.columns if x.startswith("Soil_Type")]
wilderness_features = [x for x in X.columns if x.startswith("Wilderness_Area")]

X["soil_type_count"] = X[soil_features].sum(axis=1)
# X_test["soil_type_count"] = X_test[soil_features].sum(axis=1)

X["wilderness_area_count"] = X[wilderness_features].sum(axis=1)
# X_test["wilderness_area_count"] = X_test[wilderness_features].sum(axis=1)

In [17]:
X.columns

Index(['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1',
       'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4',
       'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5',
       'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10',
       'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14',
       'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18',
       'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22',
       'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26',
       'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30',
       'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34',
       'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38',
       'Soil_Type39', 'Soil_Type40

## Dataset Parameters

Initialized above, but now records of feature engineering efforts included.

In [18]:
dataset_params['feature_count'] = X.shape[1]
dataset_params['instance_count'] = X.shape[0]

# might eventually shift from dict to tuple

# simplest approach: k-v where key is new feature, v is string with the operation to get it
# sacrifices sortability, but could recover that through regexes, and it's much quicker to input
dataset_params['feature_combinations'] = {
    'EHiElv': "df['Horizontal_Distance_To_Roadways'] * df['Elevation']",
    'EViElv': "df['Vertical_Distance_To_Hydrology'] * df['Elevation']",
    'EVDtH': "df.Elevation - df.Vertical_Distance_To_Hydrology",
    'EHDtH': "df.Elevation - df.Horizontal_Distance_To_Hydrology * 0.2",
    'Euclidean_Distance_to_Hydrology': "(df['Horizontal_Distance_To_Hydrology']**2 + df['Vertical_Distance_To_Hydrology']**2)**0.5",
    'Manhattan_Distance_to_Hydrology': "df['Horizontal_Distance_To_Hydrology'] + df['Vertical_Distance_To_Hydrology']",
    'Hydro_Fire_1': "df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Fire_Points']",
    'Hydro_Fire_2': "abs(df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Fire_Points'])",
    'Hydro_Road_1': "abs(df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Roadways'])",
    'Hydro_Road_2': "abs(df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Roadways'])",
    'Fire_Road_1': "abs(df['Horizontal_Distance_To_Fire_Points'] + df['Horizontal_Distance_To_Roadways'])",
    'Fire_Road_2': "abs(df['Horizontal_Distance_To_Fire_Points'] - df['Horizontal_Distance_To_Roadways'])"
}

dataset_params['feature_clipping'] = [
    {
        'features': ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm'],
        'range': range(0,256)
    },
    {
        'features': ['Aspect'],
        'range': range(0,360)
    }
]

# the features that are just getting the one-hots counted
dataset_params['feature_counts'] = ['Soil_Type*', 'Wilderness_Area*']
dataset_params['feature_complements'] = [
    {
        'old': 'Aspect', 
        'new': 'Aspect2',
        'operation': 'If x < 180 return x-180, else return x + 180'
    },
]

dataset_params['feature_indicators'] = {
    'Hillshade_3pm_is_zero': "(df.Hillshade_3pm == 0).astype(int)",
}

dataset_params['feature_typecasting'] = {
    'Highwater': "(df.Vertical_Distance_To_Hydrology < 0).astype(int)"
}
# dataset_params['feature_combinations'] = [
#     {
#         'old': ['Horizontal_Distance_To_Roadways', 'Elevation'], 
#         'new': 'EHiElv',
#         'operation': '*'
#     },
#     {
#         'old': ('Vertical_Distance_To_Hydrology', 'Elevation'), 
#         'new': 'EViElv',
#         'operation': '*'
#     },
#     {
#         'old': ['Elevation', 'Vertical_Distance_To_Hydrology'],
#         'new': 'EVDtH',
#         'operation': '-'
#     },
#     {
#         'old': ['Elevation', 'Horizontal_Distance_To_Hydrology'],
#         'new': 'EHDtH',
#         'operation': 'df.Elevation - df.Horizontal_Distance_To_Hydrology * 0.2'
#     },
    # {
    #     'old': ['Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology'],
    #     'new': 'Euclidean_Distance_to_Hydrology',
    #     'operation': "(df['Horizontal_Distance_To_Hydrology']**2 + df['Vertical_Distance_To_Hydrology']**2)**0.5"
    # },
    # {
    #     'old': ['Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology'],
    #     'new': 'Manhattan_Distance_to_Hydrology',
    #     'operation': '+'
    # },
    # {
    #     'old': ['Horizontal_Distance_To_Hydrology', 'Horizontal_Distance_To_Fire_Points'],
    #     'new': 
    
# dataset_params['feature_crosses'] = [
#     {
#         'old': ['Horizontal_Distance_To_Roadways', 'Elevation'], 
#         'new': 'EHiElv',
#         'operation': '*'
#     },
#     {
#         'old': ('Vertical_Distance_To_Hydrology', 'Elevation'), 
#         'new': 'EViElv',
#         'operation': '*'
#     }
# ]

# dataset_params['feature_additions'] = [
#     {
#         'old': ['Elevation', 'Vertical_Distance_To_Hydrology'],
#         'new': 'EVDtH',
#         'operation': '-'
#     }
# ]



# Model Parameters

In [19]:
# # optuna 20211124, with corrected dataset and RobustScaler
# best_xgboost_params = {
#     'n_estimators': 9872,
#     'max_depth': 3,
#     'learning_rate': 0.12943882615104757,
#     'reg_alpha': 4.793236314677738,
#     'reg_lambda': 0.03427038053813167,
#     'subsample': 0.5026684329097286,
#     'min_child_weight': 3.2374430610042664,
#     'colsample_bytree': 0.9875504456465564,
#     'gamma': 4.691772640321729
# }

# # best as of 20211125, with corrected dataset and RobustScaler
# best_lightgbm_params = {
#     'n_estimators': 6986,
#     'max_depth': 3,
#     'learning_rate': 0.09080435106650955,
#     'reg_alpha': 19.060739534647425,
#     'reg_lambda': 0.12865332700612375,
#     'subsample': 0.5612404690403716,
#     'boosting_type': 'goss',
#     'min_child_samples': 17,
#     'num_leaves': 59,
#     'colsample_bytree': 0.5125554530181221
# }

# # best as of 20211126, with corrected dataset and RobustScaler
# best_catboost_params = {
#     'iterations': 17997,
#     'depth': 4,
#     'learning_rate': 0.05807421036756052,
#     'random_strength': 27,
#     'od_wait': 1664,
#     'reg_lambda': 57.67864249277457,
#     'border_count': 275,
#     'min_child_samples': 10,
#     'leaf_estimation_iterations': 2
# }

# # # 20211021 lv2 on the K-Means 8-cluster, synth dataset
# # lv2_xgboost_params = {
# #     'n_estimators': 1534,
# #     'max_depth': 4,
# #     'learning_rate': 0.0062941159127744535,
# #     'reg_alpha': 21.3946930650266,
# #     'reg_lambda': 0.021003786013817635,
# #     'subsample': 0.5726680367393964,
# #     'min_child_weight': 0.07566661785187714,
# #     'colsample_bytree': 0.7850419523745037,
# #     'gamma': 4.26660233356059
# # }

# # # 20211021 lv2 on the K-Means 8-cluster, synth dataset
# # lv2_lightgbm_params = {
# #     'n_estimators': 5776,
# #     'max_depth': 4,
# #     'learning_rate': 0.0010172282832994653,
# #     'reg_alpha': 0.013879765609402173,
# #     'reg_lambda': 0.002787031048344079,
# #     'subsample': 0.800000753298926,
# #     'boosting_type': 'gbdt',
# #     'min_child_samples': 11,
# #     'num_leaves': 190,
# #     'colsample_bytree': 0.9976443570341007
# # }

# # # 20211021 lv2 on the K-Means 8-cluster, synth dataset
# # lv2_catboost_params = {
# #     'iterations': 2000,
# #     'depth': 6,
# #     'learning_rate': 0.002984126581340097,
# #     'random_strength': 0,
# #     'od_wait': 334,
# #     'reg_lambda': 33.469738674488084,
# #     'border_count': 158,
# #     'min_child_samples': 8,
# #     'leaf_estimation_iterations': 4
# # }

# # # initial, non-default guess -- need to get optuna working (20211010)
# # # basic_widedeep_tabmlp_params = {
    
# # # }

# # # basic_widedeep_trainer_params = {
# # #     optimizers=AdamW()
# # # }

In [20]:
from lightgbm.basic import LightGBMError

## Training Parameters

In [21]:
training_params = {
    'general_random_state': SEED,
}

folds = 5
training_params['cross_val_strategy'] = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)

## Metadata

In [22]:
# baseline -- alter as needed later
exmodel_config = {
#     'general_random_state': SEED,
# #     'feature_generation': ['NaN_counts', 'SummaryStats', 'NaN_OneHots'],
#     'cross_val_strategy': KFold, 
#     'kfolds': 5, # if 1, that means just doing holdout
#     'test_size': 0.2,
    **training_params,
    **dataset_params
}

## Dataset Setup

## Weights and Biases Run Config

Below is the configuration for a Weights and Biases (`wandb`) run.

In [23]:
# wandb config:
wandb_config = {
    'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
    'tags': ['study'],
    'notes': "Optuna study of feature-engineered notebook, using new (more model-, infrastructure-agnostic) code"
}

# Cross-Validator


In [24]:
training_params['cross_val_strategy']

StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

In [25]:
def cross_validate_model(arch:str, X=X, y=y, X_test=None, model_params:dict={}, training_params=training_params, dataset_params=dataset_params,
                         folds=list(range(folds)), exmodel_config=exmodel_config, wandb_config=wandb_config,  telegram=True, random_state=42, 
                         wandb_tracked=True, encode_cats=False):
    """
    Function to handle model training process in the context of cross-validation -- via hold-out or via k-fold.
    If exmodel_config['cross_val_strategy'] == None, then any kfolds= input is ignored; otherwise, the number specified is used.
    
    :param kfolds: int specifying number of k-folds to use in cross-validation
    :param exmodel_config: dict containing general config including for cross-validation -- `kfold=1` implies hold-out
    """
    # if exmodel_config['kfolds'] == 1: # holdout case
    #     print("Proceeding with holdout")
    #     X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, 
    #                                                           random_state=SEED)                 
    # else: # k-fold cross validation case
    #     # prepare for k-fold cross-validation; random-state here is notebook-wide, not per-model
    #     # shuffle on the initial sets, but not subsequently -- performing the same operation twice means a very different dataset
    #     if shuffle_kfolds:
    #         kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=True, random_state=SEED)
    #     else:
    #         kfold = exmodel_config['cross_val_strategy'](n_splits=exmodel_config['kfolds'], shuffle=False)
    
    kfold = training_params['cross_val_strategy']
    
    if wandb_tracked:
        exmodel_config['arch'] = arch
        exmodel_config[f'{arch}_params'] = str(model_params)
        wandb.init(
            project="202112_Kaggle_tabular_playground",
            save_code=True,
            tags=wandb_config['tags'],
            name=wandb_config['name'],
            notes=wandb_config['notes'],
            config=exmodel_config
    )   
    
    # initialize lists for out-of-fold preds and ground truth
    oof_preds, oof_y = [], []
    
    # initialize a numpy.ndarray containing the fold-model's preds for test set
    
    # test_preds = np.zeros((X_test.shape[0]))
    # test_probs = np.zeros((X_test.shape[0]))
    # preprocessing
    # if using a GBM, simply use the RobustScaler
        # scaler = RobustScaler()
        # X = scaler.fit_transform(X)
        # X_test = scaler.transform(X_test)
    
    for fold, (train_ids, valid_ids) in enumerate(kfold.split(X,y)):
        if fold not in folds: # skip folds that are already trained, i.e. that haven't been specified
            continue
        else:
            print(f"FOLD {fold}")
            print("---------------------------------------------------")
            y_train, y_valid = y[train_ids], y[valid_ids] # y will be an np.ndarray already; handling will be same regardless of model
            if isinstance(X, np.ndarray):
                X_train, X_valid = X[train_ids], X[valid_ids]
            else:
                X_train, X_valid = X.iloc[train_ids,:], X.iloc[valid_ids,:] # bc need pandas.DataFrames for ce
                
                # scaling
                # category_encoding
                # if encode_cats:
                #     encoder = ce.WOEEncoder(cols=categoricals)
                #     encoder.fit(X_train,y_train)
                #     X_train = encoder.transform(X_train)
                #     X_valid = encoder.transform(X_valid)
                # # exmodel_config['feature_count'] = len(X.columns)
                #     wandb.log({
                #         'feature_count': X_train.shape[1],
                #         'instance_count': X_train.shape[0],
                #         'encoder': str(encoder)
                #     })
        
        # define models
        if arch == 'xgboost':
            if USE_GPU:
                model = XGBClassifier(
                    booster='gbtree',
                    tree_method='gpu_hist',
                    random_state=random_state,
                    n_jobs=-1, 
                    verbosity=1, 
                    # objective='binary:logistic',
                    objective='multi:softmax',
                    **model_params)
            else:
                model = XGBClassifier(
                    booster='gbtree',
                    tree_method='hist',
                    random_state=random_state,
                    n_jobs=-1,
                    verbosity=1,
                    objective='multi:softmax',
                    **model_params)
            if wandb_tracked:
                model.fit(X_train, y_train, callbacks=[wandb.xgboost.wandb_callback()])
            else:
                model.fit(X_train, y_train)
            
            y_valid_preds = model.predict(X_valid)
            # y_valid_probs = model.predict_proba(X_valid)
            
            # add the fold-model's OOF preds and ground truths to the out-of-loop lists
            oof_preds.extend(y_valid_preds)
            # oof_probs.extend(y_valid_probs)
            oof_y.extend(y_valid)
            
            # add the fold's predictions to the model's test-set predictions (will divide later)
            # test_preds += model.predict(X_test)
            # test_probs += model.predict_proba(X_test)


        elif arch == 'lightgbm':
            if USE_GPU:
                model = LGBMClassifier(
                    objective='binary',
                    random_state=random_state,
                    device_type='gpu',
                    max_bin=63, # 15 might be even better for GPU perf, but depends on dataset -- see https://lightgbm.readthedocs.io/en/latest/GPU-Performance.html
                    gpu_use_dp=False, # forces use of single precision rather than double for better perf, esp on consumer Nvidia chips
                    **model_params)
            else:
                model = LGBMClassifier(
                    objective='binary',
                    random_state=random_state,
                    device_type='cpu',
                    n_jobs=-1,
                    **model_params)

            if wandb_tracked:
                model.fit(X_train, y_train, callbacks=[wandb.lightgbm.wandb_callback()],)
            else:
                model.fit(X_train, y_train)
#             except LightGBMError:
#                 model = LGBMClassifier(
#                     objective='binary',
#                     random_state=random_state,
#                     device_type='cpu',
#                     n_jobs=-1,
#     #                 eval_metric='auc',
#     #                 device_type='gpu',
#     #                 max_bin=63, # 15 might be even better for GPU perf, but depends on dataset -- see https://lightgbm.readthedocs.io/en/latest/GPU-Performance.html
#     #                 gpu_use_dp=False, # forces use of single precision rather than double for better perf, esp on consumer Nvidia chips
#                     **params)
                
#                 if wandb_tracked:
#                     model.fit(X_train, y_train, callbacks=[wandb.lightgbm.wandb_callback()],)
#                 else:
#                     model.fit(X_train, y_train)
            y_valid_preds = model.predict(X_valid)
            # y_valid_probs = model.predict_proba(X_valid)
            
            # add the fold-model's OOF preds and ground truths to the out-of-loop lists
            oof_preds.extend(y_valid_preds)
            # oof_probs.extend(y_valid_probs)
            oof_y.extend(y_valid)
            
            # add the fold's predictions to the model's test-set predictions (will divide later)
            # test_preds += model.predict(X_test)
            # test_probs += model.predict_proba(X_test)[:,1]

            
        elif arch == 'catboost':
            if USE_GPU:
                model = CatBoostClassifier(
                    task_type='GPU',
                    silent=True,
                    random_state=random_state,
                    **model_params) 
            else:
                model = CatBoostClassifier(
                    task_type='CPU',
                    silent=True,
                    random_state=random_state,
                    **model_params)
        
            model.fit(X_train, y_train)
            
            y_valid_preds = model.predict(X_valid)
            # y_valid_probs = model.predict_proba(X_valid)[:,1] # this would only take one of 7 cols
            
            # add the fold-model's OOF preds and ground truths to the out-of-loop lists
            oof_preds.extend(y_valid_preds)
            # oof_probs.extend(y_valid_probs)
            oof_y.extend(y_valid)
            
            # add the fold's predictions to the model's test-set predictions (will divide later)
            # test_preds += model.predict(X_test).flatten()
            # test_probs += model.predict_proba(X_test)[:,1]
            
#         valid_loss = log_loss(y_valid, y_pred)
        # give the valid AUC score, for edification

        fold_accuracy = accuracy_score(y_true=y_valid, y_pred=y_valid_preds) # or should be preds?
        # fold_confusion = confusion_matrix(y_true=y_valid, y_pred=y_valid_preds)# , labels=list(range(7)))
        # fold_log_loss = log_loss(y_pred=y_valid_preds, y_true=y_valid,) #labels=list(range(7)))
        # fold_roc_auc = roc_auc_score(y_true=y_valid, y_score=y_valid_probs)
        # fold_f1_score = f1_score(
        # fold_fbeta_score = fbeta_score(
        
        if wandb_tracked:
            wandb.log({f'fold{fold}_accuracy': fold_accuracy,
                       # f'fold{fold}_confusion': fold_confusion,
                       # f'fold{fold}_log_loss': fold_log_loss,
                       # f'fold{fold}_roc_auc': fold_roc_auc,
                      })
        fold_human_results = f"{os.environ['WANDB_NOTEBOOK_NAME']}\nMetrics for fold {fold} are: \nAccuracy: {fold_accuracy}"
        print(fold_human_results)
        if telegram:
            send_tg_message(text=f"{arch} model's fold {fold} complete.\n"+fold_human_results)
        # dump(model, Path(runpath/f"{arch}_fold{fold}_rs{random_state}_model.joblib"))

    model_accuracy = accuracy_score(y_true=oof_y, y_pred=oof_preds) 
    # model_confusion = confusion_matrix(y_true=oof_y, y_pred=oof_preds, labels=list(range(7)))
    # model_log_loss = log_loss(y_pred=oof_preds, y_true=oof_y, labels=list(range(7)))
    # model_valid_auc = roc_auc_score(oof_y, oof_preds)
    model_human_results = f"{os.environ['WANDB_NOTEBOOK_NAME']}\nMetrics for model {arch} are: \nAccuracy: {model_accuracy}"
    print(model_human_results)
    if telegram:
        send_tg_message(text=f"{arch} model run complete.\n"+model_human_results)
    if wandb_tracked:
        wandb.log({f'model_accuracy': fold_accuracy,
                   # f'model_confusion': fold_confusion,
                   # f'model_log_loss': fold_log_loss,
                   # f'model_roc_auc': fold_roc_auc,
                   'model_params': str(model.get_params()),
                   'model_seed': random_state,
                  })
        wandb.finish()
    
    # finalize test preds
    # test_probs /= exmodel_config['kfolds']
    # test_preds /= exmodel_config['kfolds']
    
    
    # save OOF preds and test-set preds
#     if 'widedeep' in arch:
#         dump(oof_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_{n_epochs}epochs-per-fold_rs{random_state}_oof_preds.joblib"))
#         dump(test_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_{n_epochs}epochs-per-fold_rs{random_state}_test_preds.joblib"))
    
#     else:
#         dump(oof_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_rs{random_state}_oof_preds.joblib"))
#         dump(test_preds, Path(predpath/f"{wandb_config['name']}_{arch}_{exmodel_config['kfolds']}folds_rs{random_state}_test_preds.joblib"))
    
    # if not (datapath/f"{exmodel_config['kfolds']}folds_rs{SEED}_oof_y.joblib").is_file():
    #     dump(oof_y, predpath/f"{exmodel_config['kfolds']}folds_rs{SEED}_oof_y.joblib")
    
#     if wandb_tracked:
# #         if 'widedeep' in arch:
#         wandb.log({'model_valid_auc': model_valid_auc,
# #                    'oof_preds': oof_preds,
# #                    'test_preds': test_preds,
#                    'model_params': str(model.parameters()) if 'widedeep' in arch else str(model.get_params()), 
#         #                    'model_params': str(model.get_params()),
#         })
# #         wandb.log({'model_valid_auc': model_valid_auc,
# #                    'oof_preds': oof_preds,
# #                    'test_preds': test_preds,
# # #                    'model_params': str(model.get_params()),
# #                   })
#         wandb.finish()
    # return oof_preds, test_preds#, model_confusion
    return model_accuracy
        

## Objective Function

In [26]:
arch = 'xgboost'

In [27]:
# originally from https://www.kaggle.com/satorushibata/optimize-catboost-hyperparameter-with-optuna-gpu
def objective(trial, arch=arch):
    """
    Wrapper around cross_validation_trainer to test different model hyperparameters
    """
    
    if arch == 'catboost':
        model_params = {
            'iterations' : trial.suggest_int('iterations', 2000, 30000),                         
            'depth' : trial.suggest_int('depth', 3, 10),                                       
            'learning_rate' : trial.suggest_loguniform('learning_rate', 0.001, 0.5),               
            'random_strength': trial.suggest_int('random_strength', 0, 100), 
    #         'objective': trial.suggest_categorical('objective', ['Logloss', 'CrossEntropy']),
    #         'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['MVC', 'Bernoulli']),#, 'Poisson']),
            'od_wait': trial.suggest_int('od_wait', 20, 2000),
            'reg_lambda': trial.suggest_uniform('reg_lambda', 2, 70), # aka l2_leaf_reg
            'border_count': trial.suggest_int('border_count', 50, 275),
            'min_child_samples': trial.suggest_int('min_child_samples', 1, 20), # aka min_data_in_leaf
            'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 5),
            # 'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
    #         'subsample': trial.suggest_uniform('subsample', 0.5, 1),
    #         'learning_rate' :trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
            # 'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
    #         'max_leaves': trial.suggest_int('max_leaves', 32, 128)
        }
        
    elif arch == 'xgboost':
        model_params = {
            'n_estimators': trial.suggest_int('n_estimators', 500, 10000), # was 900-4500 for CPU
            'max_depth' : trial.suggest_int('depth', 3, 10),                                       
            'learning_rate' : trial.suggest_loguniform('learning_rate', 0.001, 0.3),               
            'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 50),
            'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 30),
            'subsample': trial.suggest_uniform('subsample', 0.5, 1),
    #         'booster': trial.suggest_categorical('boosting_type', ['gbtree', 'dart']),
            'min_child_weight': trial.suggest_uniform('min_child_weight', 0.001, 12),
            'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1),
            'gamma': trial.suggest_uniform('gamma', 0.1, 10)
        } 
    
    return cross_validate_model(arch, model_params=model_params, wandb_tracked=True, telegram=False)

In [28]:
wandbc = WeightsAndBiasesCallback(wandb_kwargs=wandb_config)

  wandbc = WeightsAndBiasesCallback(wandb_kwargs=wandb_config)
[34m[1mwandb[0m: Currently logged in as: [33mhushifang[0m (use `wandb login --relogin` to force relogin)


In [29]:
start_time = datetime.now().strftime('%Y%m%d%H%M%S')
study = optuna.create_study(direction = "maximize", 
                            sampler = TPESampler(seed=int(SEED)), 
                            study_name=f"{arch}_study-{start_time}")

# study = load(studypath/f"optuna_xgboost_study_106trials_20211004.joblib")


[32m[I 2021-12-10 20:06:45,373][0m A new study created in memory with name: xgboost_study-20211210200645[0m


In [30]:
# import torch

In [31]:
# xgboost.core.XGBoostError?

In [32]:
# study.optimize?

In [33]:
for x in range(1, 500):
    study.optimize(objective, n_trials = 1, callbacks = [wandbc], show_progress_bar=False)#, catch=(xgboost.core.XGBoostError,)) 
    dump(study, filename=studypath/f"optuna_{arch}_study-{start_time}.joblib")
#     dump(study.best_trial.params, filename=datapath/f'optuna_lightgbm_study_best-thru-{x*5}trials_20210927.joblib')

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…



FOLD 0
---------------------------------------------------




optuna_gbms_20211210_sage.ipynb
Metrics for fold 0 are: 
Accuracy: 0.96242375
FOLD 1
---------------------------------------------------




optuna_gbms_20211210_sage.ipynb
Metrics for fold 1 are: 
Accuracy: 0.9626225
FOLD 2
---------------------------------------------------




optuna_gbms_20211210_sage.ipynb
Metrics for fold 2 are: 
Accuracy: 0.96205125
FOLD 3
---------------------------------------------------




optuna_gbms_20211210_sage.ipynb
Metrics for fold 3 are: 
Accuracy: 0.96216625
FOLD 4
---------------------------------------------------




optuna_gbms_20211210_sage.ipynb
Metrics for fold 4 are: 
Accuracy: 0.962005
optuna_gbms_20211210_sage.ipynb
Metrics for model xgboost are: 
Accuracy: 0.96225375


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
fold0_accuracy,▁
fold1_accuracy,▁
fold2_accuracy,▁
fold3_accuracy,▁
fold4_accuracy,▁
model_accuracy,▁
model_seed,▁

0,1
fold0_accuracy,0.96242
fold1_accuracy,0.96262
fold2_accuracy,0.96205
fold3_accuracy,0.96217
fold4_accuracy,0.962
model_accuracy,0.962
model_params,{'objective': 'multi...
model_seed,42


[32m[I 2021-12-10 23:45:17,079][0m Trial 0 finished with value: 0.96225375 and parameters: {'n_estimators': 4058, 'depth': 10, 'learning_rate': 0.06504856968981275, 'reg_alpha': 0.6502468545951017, 'reg_lambda': 0.004994757081068292, 'subsample': 0.5779972601681014, 'min_child_weight': 0.6979452624062253, 'colsample_bytree': 0.9330880728874675, 'gamma': 6.051038616257767}. Best is trial 0 with value: 0.96225375.[0m


Error: You must call wandb.init() before wandb.config.update

In [None]:
wandb.log({'best_params': str(study.best_trial.params)})
wandb.finish()

In [None]:
study.best_trial.params

In [None]:
optuna.visualization.plot_parallel_coordinate(study)