# LightGBM Hyperparameter Sweep 20210922
Integrating some enhancements introduced in the XGBoost version, and implementing the frequent-serialization approach.

# Setup

In [1]:
# two manual flags (ex-config)
colab = False
gpu_available = True

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime
import random
import multiprocessing
import pickle

In [3]:
%matplotlib inline
%config Completer.use_jedi = False
os.environ['WANDB_NOTEBOOK_NAME'] = f"sweep_lightgbm_{datetime.now().strftime('%Y%m%d')}.ipynb"

In [4]:
# handle Google Colab-specific library installation/updating
if colab:
    # much of the below inspired by or cribbed from the May 2021 Kaggle Tabular Playground winner, at 
    # https://colab.research.google.com/gist/academicsuspect/0aac7bd6e506f5f70295bfc9a3dc2250/tabular-may-baseline.ipynb?authuser=1#scrollTo=LJoVKJb5wN0L
    
    # Kaggle API for downloading the datasets
    !pip install --upgrade -q kaggle

    # weights and biases
    !pip install -qqqU wandb
    
    # Optuna for parameter search
    !pip install -q optuna

    # !pip install --upgrade xgboost

    # upgrade sklearn
    !pip install --upgrade scikit-learn

    # !pip install category_encoders
    # !pip install catboost
    !pip install --upgrade -q lightgbm

    # lighgbm gpu compatible
    # !git clone --recursive https://github.com/Microsoft/LightGBM
    # ! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;
    
    # # this part is from https://github.com/rapidsai/gputreeshap/issues/24
    # !pip install cmake --upgrade
    # # !pip install sklearn --upgrade
    # !git clone --recursive https://github.com/dmlc/xgboost
    # %cd /content/xgboost
    # !mkdir build
    # %cd build
    # !cmake .. -DUSE_CUDA=ON
    # !make -j4
    # %cd /content/xgboost/python-package
    # !python setup.py install --use-cuda --use-nccl
    # !/opt/bin/nvidia-smi
    # !pip install shap
    

Now, non-stdlib imports

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
import wandb
from optuna.integration.wandb import WeightsAndBiasesCallback
# from wandb.xgboost import wandb_callback
# from wandb.lightgbm import wandb_callback
# from sklearn.impute import KNNImputer, StandardImputer
# import timm

import seaborn as sns

# from catboost import CatBoostClassifier
# from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MaxAbsScaler, StandardScaler, MinMaxScaler, RobustScaler, PolynomialFeatures
# from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft


from optuna.samplers import TPESampler
import optuna
# import catboost
from sklearn.utils import resample
import sklearn.metrics

Now, datapath setup

In [6]:
# # This is the code for reading the train.csv and converting it to a .feather file
# df = pd.read_csv(datapath/'train.csv', index_col='id', low_memory=False)
# df.index.name = None
# df.to_feather(path='./dataset_df.feather')

In [7]:
if colab:
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    
    # handling datapath
    datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/sep2021/')
    
else:
    # if on local machine
    datapath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/')    
    


In [8]:

# n_trials = int(1000)
SEED = 42

In [9]:
# Function to seed everything
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
seed_everything(SEED)

## Ex-Model Config

In [10]:
# meta-config for preprocessing and cross-validation, but NOT for model parameters
# in the sweep version, this includes both ex-model parameters and defaults for model parameters
exmodel_config = {
    # model config
    "library": 'lightgbm',
#     "model": XGBClassifier,
#     "n_estimators": 100, 
#     "max_depth": 3,
#     "learning_rate": 0.1,
#     "test_size": 0.2,
#     "reg_lambda": None, 
    "scaler": "sklearn.preprocessing.StandardScaler()", # TODO: experiment with others (but imputation may be slow)
    "scale_b4_impute": False,
    "imputer": "sklearn.impute.SimpleImputer(strategy='median', add_indicator=True)",
    "knn_imputer_n_neighbors": None, # None if a different imputer is used
#     "feature_selector": SelectKBest,
#     "k_best": 80,
#     "feature_selection_scoring": f_regression,
    'random_state': SEED,
    'optuna': True,
    'optuna_trials': 500,
#     'subsample': 1,
#     'cross_val_strategy': None, # None for holdout, or the relevant sklearn class
#     'kfolds': 1, # if 1, that means just doing holdout
#     'test_size': 0.2,
    # these are XGBoost default (my choice) params 
#     "tree_method": "auto", # set to 'gpu_hist' to try GPU if available
#     "booster": 'gbtree', # dart may be marginally better, but will opt for this quicker approach as a default
#     "n_estimators": 200, 
#     "max_depth": 3,
#     "learning_rate": 0.1,
#     "n_jobs": -1,
#     "verbosity": 1,
#     "subsample": 1,
#     'features_created': False,
#     'feature_creator': None,
}

wandb_kwargs = {
    # wandb config
    'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
    'project': '202109_Kaggle_tabular_playground',
    'tags': ['sweep'],
    'notes': "Trying LightGBM on GPU",
    'config': exmodel_config,
}

## Data Setup

**TODO** Write some conditional logic here to automate it -- possibly as part of a sklearn.*pipeline

In [11]:
X_source = 'X_NaNcounts_SummaryStats_imputed-Median-wIndicators-StandardScaled.feather'
X_train = pd.read_feather(datapath/X_source) 
y_train = load(datapath/'y.joblib')    
# X.index.name = 'id'
# y.index.name = 'id'
X = np.array(X_train)
y = np.array(y_train)

del X_train, y_train

In [12]:
exmodel_config['feature_count'] = X.shape[1]
exmodel_config['feature_generator'] = "Summary statistics"
exmodel_config['X_source'] = X_source

# Experiment setup

In [13]:
# wandb_kwargs = {
#     # wandb config:
#     'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
#     'project': '202109_Kaggle_tabular_playground',
#     'tags': ['sweep'],
#     'notes': "Sweep for CatBoost using Optuna",
#     'config': exmodel_config,
# }

In [14]:
# originally from https://www.kaggle.com/satorushibata/optimize-catboost-hyperparameter-with-optuna-gpu
def objective(trial):
    # split the (original Kaggle training) data into partitions
    # if study.best_trial:
    #     print("Dumping best params, which are:")
    #     print(str(study.best_trial.params))
    #     dump(study.best_trial.params, filename=datapath/'optuna_catboost_best_20210920.joblib')
       
    # else:
    #     print("No best study yet")
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, random_state=int(SEED), shuffle=True)
    # create wrappers for the training and validation partitions
    # train_pool = catboost.Pool(X_train, y_train)
    # valid_pool = catboost.Pool(X_valid, y_valid)
    
    # experimental parameters
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 900, 7000),
        'max_depth' : trial.suggest_int('depth', 3, 10),                                       
        'learning_rate' : trial.suggest_loguniform('learning_rate', 0.001, 0.4),               
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.001, 30),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.001, 30),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1), # aka bagging_fraction
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart', 'goss']),
        'min_child_samples': trial.suggest_int('min_child_samples', 4, 75),
        'num_leaves': trial.suggest_int('num_leaves', 50, 250),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1), # aka feature_fraction
    }  

    # instantiate the model, with some parameters locked in, and experimnental ones passed via splat 
    model = LGBMClassifier(
        objective='binary',
        random_state=SEED,
        n_jobs=-1,
        eval_metric='auc',
        device_type='gpu',
#         max_bin=63,
        **params
    )       

    model.fit(X_train, y_train)
    # generate predictions
    preds = model.predict_proba(X_valid)[:,1]
    # rounds to the nearest integer, and the nearest even in case of _.5s

    # Evaluation
    valid_auc = roc_auc_score(y_valid, preds)
    print('ROC AUC Score of XGBoost =', valid_auc)
    wandb.log({'valid_auc': valid_auc,
              })

    return valid_auc

In [15]:
wandbc = WeightsAndBiasesCallback(wandb_kwargs=wandb_kwargs)

  wandbc = WeightsAndBiasesCallback(wandb_kwargs=wandb_kwargs)
[34m[1mwandb[0m: Currently logged in as: [33mhushifang[0m (use `wandb login --relogin` to force relogin)


In [16]:
study = optuna.create_study(direction = "maximize", 
                            sampler = TPESampler(seed=int(SEED)), 
                            study_name='lightgbm_20210927')

# study = load(datapath/f'optuna_lightgbm_study_5trials_20210922.joblib')


[32m[I 2021-09-27 13:01:06,186][0m A new study created in memory with name: lightgbm_20210927[0m


In [17]:

for x in range(1,101):
    study.optimize(objective, n_trials = 5, callbacks = [wandbc]) #n_jobs = multiprocessing.cpu_count())
    print(f"{x*5} trials complete")
    dump(study, filename=datapath/f'optuna_lightgbm_study_{x*5}trials_20210927.joblib')k
    dump(study.best_trial.params, filename=datapath/f'optuna_lightgbm_study_best-thru-{x*5}trials_20210927.joblib')



[32m[I 2021-09-27 13:09:31,700][0m Trial 0 finished with value: 0.810462615617223 and parameters: {'n_estimators': 3185, 'depth': 10, 'learning_rate': 0.0802956743641955, 'reg_alpha': 0.4789240251631179, 'reg_lambda': 0.004994757081068292, 'subsample': 0.5779972601681014, 'boosting_type': 'dart', 'min_child_samples': 54, 'num_leaves': 54, 'colsample_bytree': 0.9849549260809971}. Best is trial 0 with value: 0.810462615617223.[0m


ROC AUC Score of XGBoost = 0.810462615617223


[32m[I 2021-09-27 13:13:02,696][0m Trial 1 finished with value: 0.8137821186092775 and parameters: {'n_estimators': 5978, 'depth': 4, 'learning_rate': 0.002972483637079397, 'reg_alpha': 0.0066240595682091315, 'reg_lambda': 0.023021277110080198, 'subsample': 0.762378215816119, 'boosting_type': 'goss', 'min_child_samples': 14, 'num_leaves': 108, 'colsample_bytree': 0.6831809216468459}. Best is trial 1 with value: 0.8137821186092775.[0m


ROC AUC Score of XGBoost = 0.8137821186092775


[32m[I 2021-09-27 13:17:01,041][0m Trial 2 finished with value: 0.8135205911702023 and parameters: {'n_estimators': 3682, 'depth': 9, 'learning_rate': 0.003307982168695265, 'reg_alpha': 0.20058106556780586, 'reg_lambda': 0.4490677404109781, 'subsample': 0.5232252063599989, 'boosting_type': 'gbdt', 'min_child_samples': 72, 'num_leaves': 244, 'colsample_bytree': 0.9041986740582306}. Best is trial 1 with value: 0.8137821186092775.[0m


ROC AUC Score of XGBoost = 0.8135205911702023


[32m[I 2021-09-27 13:20:56,722][0m Trial 3 finished with value: 0.8129082137150812 and parameters: {'n_estimators': 2758, 'depth': 3, 'learning_rate': 0.06031361827702156, 'reg_alpha': 0.09345791438428568, 'reg_lambda': 0.0035186816415472676, 'subsample': 0.7475884550556351, 'boosting_type': 'dart', 'min_child_samples': 51, 'num_leaves': 112, 'colsample_bytree': 0.7600340105889054}. Best is trial 1 with value: 0.8137821186092775.[0m


ROC AUC Score of XGBoost = 0.8129082137150812


[32m[I 2021-09-27 13:28:46,677][0m Trial 4 finished with value: 0.8085273314781937 and parameters: {'n_estimators': 4235, 'depth': 4, 'learning_rate': 0.3333629787709382, 'reg_alpha': 2.953681335681259, 'reg_lambda': 16.078690668199, 'subsample': 0.9474136752138245, 'boosting_type': 'dart', 'min_child_samples': 18, 'num_leaves': 59, 'colsample_bytree': 0.6626651653816322}. Best is trial 1 with value: 0.8137821186092775.[0m


ROC AUC Score of XGBoost = 0.8085273314781937
5 trials complete


[32m[I 2021-09-27 13:40:30,889][0m Trial 5 finished with value: 0.8121872668628125 and parameters: {'n_estimators': 3271, 'depth': 5, 'learning_rate': 0.14335891845548843, 'reg_alpha': 0.039557414824905594, 'reg_lambda': 0.018104138546410155, 'subsample': 0.7713480415791243, 'boosting_type': 'dart', 'min_child_samples': 75, 'num_leaves': 205, 'colsample_bytree': 0.5993578407670862}. Best is trial 1 with value: 0.8137821186092775.[0m


ROC AUC Score of XGBoost = 0.8121872668628125


[32m[I 2021-09-27 13:41:27,743][0m Trial 6 finished with value: 0.8044587071834538 and parameters: {'n_estimators': 933, 'depth': 9, 'learning_rate': 0.06906932535689181, 'reg_alpha': 1.8359188752396374, 'reg_lambda': 2.838382119353614, 'subsample': 0.5370223258670452, 'boosting_type': 'goss', 'min_child_samples': 48, 'num_leaves': 116, 'colsample_bytree': 0.5317791751430119}. Best is trial 1 with value: 0.8137821186092775.[0m


ROC AUC Score of XGBoost = 0.8044587071834538


[32m[I 2021-09-27 13:43:27,727][0m Trial 7 finished with value: 0.8058125050275564 and parameters: {'n_estimators': 2797, 'depth': 5, 'learning_rate': 0.07915512627905745, 'reg_alpha': 0.715191107817528, 'reg_lambda': 9.37905380401463, 'subsample': 0.7361074625809747, 'boosting_type': 'goss', 'min_child_samples': 44, 'num_leaves': 204, 'colsample_bytree': 0.7468977981821954}. Best is trial 1 with value: 0.8137821186092775.[0m


ROC AUC Score of XGBoost = 0.8058125050275564


[32m[I 2021-09-27 13:46:37,930][0m Trial 8 finished with value: 0.8110870387121909 and parameters: {'n_estimators': 4089, 'depth': 6, 'learning_rate': 0.0011645069711410827, 'reg_alpha': 0.0030411861290916427, 'reg_lambda': 0.0013826500550053445, 'subsample': 0.8182052056318903, 'boosting_type': 'goss', 'min_child_samples': 21, 'num_leaves': 132, 'colsample_bytree': 0.8777755692715243}. Best is trial 1 with value: 0.8137821186092775.[0m


ROC AUC Score of XGBoost = 0.8110870387121909


[32m[I 2021-09-27 13:49:57,943][0m Trial 9 finished with value: 0.805242262053027 and parameters: {'n_estimators': 2295, 'depth': 3, 'learning_rate': 0.005674801345779974, 'reg_alpha': 0.005269959187361842, 'reg_lambda': 14.533463349735559, 'subsample': 0.9040601897822085, 'boosting_type': 'dart', 'min_child_samples': 17, 'num_leaves': 229, 'colsample_bytree': 0.7696711209578253}. Best is trial 1 with value: 0.8137821186092775.[0m


ROC AUC Score of XGBoost = 0.805242262053027
10 trials complete


[33m[W 2021-09-27 13:50:02,359][0m Trial 10 failed because of the following error: LightGBMError('Check failed: (best_split_info.right_count) > (0) at /home/sf/Software/LightGBM/src/treelearner/serial_tree_learner.cpp, line 663 .\n')[0m
Traceback (most recent call last):
  File "/home/sf/anaconda3/envs/tabular-gpu/lib/python3.8/site-packages/optuna/study/_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-14-5f93570f4ca9>", line 41, in objective
    model.fit(X_train, y_train)
  File "/home/sf/Software/LightGBM/python-package/lightgbm/sklearn.py", line 964, in fit
    super().fit(X, _y, sample_weight=sample_weight, init_score=init_score, eval_set=valid_sets,
  File "/home/sf/Software/LightGBM/python-package/lightgbm/sklearn.py", line 745, in fit
    self._Booster = train(
  File "/home/sf/Software/LightGBM/python-package/lightgbm/engine.py", line 293, in train
    booster.update(fobj=fobj)
  File "/home/sf/Software/LightGBM/python-package/

LightGBMError: Check failed: (best_split_info.right_count) > (0) at /home/sf/Software/LightGBM/src/treelearner/serial_tree_learner.cpp, line 663 .


In [None]:
dump(study, filename=datapath/'optuna_lightgbm_500trials-complete_20210927.joblib')
dump(study.best_trial.params, filename=datapath/'optuna_lightgbm_all-500trials-best_20210927.joblib')
# pickle.dump(study.best_trial.params, open('CatBoost_Hyperparameter.pickle', 'wb'))
# print('CatBoost Hyperparameter:', study.best_trial.params)

In [18]:
study.best_trial.params

{'n_estimators': 5978,
 'depth': 4,
 'learning_rate': 0.002972483637079397,
 'reg_alpha': 0.0066240595682091315,
 'reg_lambda': 0.023021277110080198,
 'subsample': 0.762378215816119,
 'boosting_type': 'goss',
 'min_child_samples': 14,
 'num_leaves': 108,
 'colsample_bytree': 0.6831809216468459}

In [None]:
wandb.log({'lightgbm_params': study.best_trial.params})
wandb.finish()