# Dataset Sweep using XGBoost on GPU
Trying different variations on the dataset, using PCA and other techniques

# Setup

In [1]:
# two manual flags (ex-config)
# colab: gates the pip-install cell and the Google Drive mount further down
colab = False
# gpu_available: manual switch describing the machine; set by hand, not auto-detected
gpu_available = True

In [2]:
# basic imports
from pathlib import Path
import os
import math
from datetime import datetime
import random
import multiprocessing
import pickle

In [3]:
# render matplotlib figures inline in the notebook
%matplotlib inline
# turn off jedi in the IPython completer
%config Completer.use_jedi = False
# notebook name exposed through the environment; its stem (without '.ipynb') is reused below
# to build the W&B run name, so the date stamp here ends up in every run name
os.environ['WANDB_NOTEBOOK_NAME'] = f"sweep_xgboost_{datetime.now().strftime('%Y%m%d')}.ipynb"

In [4]:
# handle Google Colab-specific library installation/updating
# (only runs when the manual `colab` flag is True; none of the installs below pin a version)
if colab:
    # much of the below inspired by or cribbed from the May 2021 Kaggle Tabular Playground winner, at
    # https://colab.research.google.com/gist/academicsuspect/0aac7bd6e506f5f70295bfc9a3dc2250/tabular-may-baseline.ipynb?authuser=1#scrollTo=LJoVKJb5wN0L
    
    # Kaggle API for downloading the datasets
    !pip install --upgrade -q kaggle

    # weights and biases
    !pip install -qqqU wandb
    
    # Optuna for parameter search
    !pip install -q optuna

    # !pip install --upgrade xgboost

    # upgrade sklearn
    !pip install --upgrade scikit-learn

    # !pip install category_encoders
    # !pip install catboost
#     !pip install --upgrade -q lightgbm

    # lighgbm gpu compatible
    # !git clone --recursive https://github.com/Microsoft/LightGBM
    # ! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;
    
    # # this part is from https://github.com/rapidsai/gputreeshap/issues/24
    # !pip install cmake --upgrade
    # # !pip install sklearn --upgrade
    # !git clone --recursive https://github.com/dmlc/xgboost
    # %cd /content/xgboost
    # !mkdir build
    # %cd build
    # !cmake .. -DUSE_CUDA=ON
    # !make -j4
    # %cd /content/xgboost/python-package
    # !python setup.py install --use-cuda --use-nccl
    # !/opt/bin/nvidia-smi
    # !pip install shap
    

Now, non-stdlib imports

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# general ML tooling
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score
import wandb
from optuna.integration.wandb import WeightsAndBiasesCallback
# from wandb.xgboost import wandb_callback
# from wandb.lightgbm import wandb_callback
# from sklearn.impute import KNNImputer, StandardImputer
# import timm

import seaborn as sns

# from catboost import CatBoostClassifier
from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MaxAbsScaler, StandardScaler, MinMaxScaler, RobustScaler, PolynomialFeatures
# from sklearn.feature_selection import SelectKBest, f_regression
from joblib import dump, load
# feature engineering tools
# from sklearn.feature_selection import mutual_info_regression
# import featuretools as ft


from optuna.samplers import TPESampler
import optuna
# import catboost
from sklearn.utils import resample
import sklearn.metrics

In [6]:
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression


Now, datapath setup

In [7]:
# # This is the code for reading the train.csv and converting it to a .feather file
# df = pd.read_csv(datapath/'train.csv', index_col='id', low_memory=False)
# df.index.name = None
# df.to_feather(path='./dataset_df.feather')

In [8]:
if colab:
    # mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')

    # NOTE(review): this branch defines none of the path variables (datapath, modelpath, ...),
    # so the data-loading cells below raise NameError when colab=True -- define `root` and the
    # derived paths here before running on Colab.
    # handling datapath
#     datapath = Path('/content/drive/MyDrive/kaggle/tabular_playgrounds/sep2021/')

else:
    # if on local machine
#     datapath = Path('/media/sf/easystore/kaggle_data/tabular_playgrounds/sep2021/')
    root = Path('/home/sf/code/kaggle/tabular_playgrounds/oct2021/')
    datapath = root/'datasets'
    edapath = root/'EDA'
    modelpath = root/'models'
    predpath = root/'preds'
    subpath = root/'submissions'
    studypath = root/'optuna_studies'

    # create every working directory up front; `studypath` is included so that saving/loading
    # Optuna studies does not fail on a fresh checkout, and parents=True lets this work even
    # when the ancestors of `root` do not exist yet
    for pth in [root, datapath, edapath, modelpath, predpath, subpath, studypath]:
        pth.mkdir(parents=True, exist_ok=True)

In [9]:

# n_trials = int(1000)
SEED = 42

In [10]:
# Function to seed everything
def seed_everything(seed):
    """Seed the global RNGs used by this notebook.

    Seeds Python's `random` module and NumPy's legacy global generator with `seed`,
    and stores `str(seed)` in the PYTHONHASHSEED environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
seed_everything(SEED)

## Ex-Model Config

In [11]:
# Meta-config for everything *around* the model (preprocessing, cross-validation, bookkeeping),
# NOT the model hyperparameters themselves. It is attached to the W&B run as its `config`.
exmodel_config = dict(
    library='xgboost',
    random_state=SEED,
    optuna=True,
)

# Keyword arguments handed to the Optuna <-> Weights & Biases callback created further down.
wandb_kwargs = dict(
    # run name = notebook filename with the 6-character '.ipynb' suffix sliced off, plus HHMMSS
    name=f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}",
    project='202110_Kaggle_tabular_playground',
    tags=['sweep'],
    notes="Sweep for preprocessing techniques on dataset",
    config=exmodel_config,
)

## Data Setup

**TODO** Write some conditional logic here to automate it -- possibly as part of a sklearn.*pipeline

In [29]:
# --- training data: read the feather file and separate the label from the features ---
train_source = datapath / 'train.feather'
df = pd.read_feather(train_source)
df.index.name = 'id'

features = [col for col in df.columns if col != 'target']
X = df[features]
y = df.target

# --- test data: everything except the first column is kept ---
# (presumably the first column is the row id; the head() output below starts at f0 -- confirm)
test_source = datapath / 'test.feather'
X_test = pd.read_feather(test_source).iloc[:, 1:]
# X_test = np.array(X_test)

# Experiment setup

In [13]:
# wandb_kwargs = {
#     # wandb config:
#     'name': f"{os.environ['WANDB_NOTEBOOK_NAME'][:-6]}_{datetime.now().strftime('%H%M%S')}", # just removes the .ipynb extension, leaving the notebook filename's stem
#     'project': '202109_Kaggle_tabular_playground',
#     'tags': ['sweep'],
#     'notes': "Sweep for CatBoost using Optuna",
#     'config': exmodel_config,
# }

In [14]:
X.shape

(1000000, 285)

In [15]:
X_test.shape

(500000, 285)

In [16]:
X_test.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f275,f276,f277,f278,f279,f280,f281,f282,f283,f284
0,0.178216,0.435617,0.01023,0.202074,0.39017,0.324221,0.221722,0.738894,0.582588,0.34377,...,1,0,0,0,0,0,1,1,1,0
1,0.18125,0.476455,0.022413,0.283146,0.59802,0.349508,0.283467,0.721575,0.26899,0.208373,...,0,0,0,0,0,0,0,0,0,0
2,0.159721,0.451202,0.259649,0.365274,0.594634,0.413502,0.249318,0.642339,0.411104,0.246891,...,0,0,0,0,0,0,1,0,0,0
3,0.182424,0.520976,0.095344,0.327742,0.74183,0.358711,0.270077,0.601662,0.297742,0.252829,...,0,0,0,0,0,1,1,0,0,0
4,0.229329,0.336513,0.023511,0.300913,0.668738,0.481586,0.54566,0.667849,0.546045,0.202731,...,0,0,0,0,1,0,0,1,0,0


In [17]:
# Project the raw features onto 55 principal components.
# Seeded from the notebook-wide SEED (instead of a second hard-coded 42) so that changing the
# seed in one place affects every stochastic step.
pca = PCA(n_components=55, random_state=SEED)
X_pca = pca.fit_transform(X)

In [18]:
# Interaction-only expansion without a bias column; `degree` is left at the sklearn default.
# Applied to the 55 PCA components this yields 1540 columns (see the shape printed below).
poly = PolynomialFeatures(include_bias=False, interaction_only=True)

In [19]:
X_pca_poly = poly.fit_transform(X_pca)

In [20]:
X_pca_poly.shape

(1000000, 1540)

In [21]:
del X_pca, X

In [25]:
# sns.scatterplot(X_embedded, index=[i for i in range(X.shape[0])])

In [22]:
# Fixed XGBoost hyperparameters used for every dataset variant in this notebook
# (presumably the best parameters from an earlier tuning study -- confirm provenance).
best_xgboost_params = {
    'n_estimators': 3878,
    'max_depth': 4,
    'learning_rate': 0.024785857161974977,
    'reg_alpha': 26.867682044658245,
    'reg_lambda': 10.839759074147148,
    'subsample': 0.8208581489835881,
    'min_child_weight': 8.829122644339664,
    'colsample_bytree': 0.906420714280384,
    'gamma': 1.472322916021486
}

# instantiate the model, with some parameters locked in, and experimental ones passed via splat
model = XGBClassifier(
    objective='binary:logistic',
    verbosity=1,
    # honour the manual `gpu_available` flag instead of always requesting the GPU:
    # with the flag at its current value (True) this is still 'gpu_hist'
    tree_method='gpu_hist' if gpu_available else 'hist',
    booster='gbtree', # not bothering with dart for time reasons
    random_state=SEED,
    **best_xgboost_params
)

In [23]:
y_np = np.array(y)

In [24]:
del y

In [25]:
type(X_pca_poly), type(y_np)

(numpy.ndarray, numpy.ndarray)

In [27]:
# 80/20 shuffled hold-out split of the PCA + interaction features, then fit the model on the
# training partition
X_train, X_valid, y_train, y_valid = train_test_split(
    X_pca_poly,
    y_np,
    test_size=0.2,
    shuffle=True,
    random_state=int(SEED),
)

model.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.906420714280384,
              gamma=1.472322916021486, gpu_id=0, importance_type='gain',
              interaction_constraints='', learning_rate=0.024785857161974977,
              max_delta_step=0, max_depth=4, min_child_weight=8.829122644339664,
              missing=nan, monotone_constraints='()', n_estimators=3878,
              n_jobs=16, num_parallel_tree=1, random_state=42,
              reg_alpha=26.867682044658245, reg_lambda=10.839759074147148,
              scale_pos_weight=1, subsample=0.8208581489835881,
              tree_method='gpu_hist', validate_parameters=1, verbosity=1)

In [30]:
# Persist the transformed training matrix with joblib.
# NOTE(review): the destination is a hard-coded absolute path that is independent of `root`/`datapath`.
dump(X_pca_poly, '/media/sf/easystore/kaggle_data/tabular_playgrounds/oct2021/alt_datasets/X_pca_poly.joblib')

['/media/sf/easystore/kaggle_data/tabular_playgrounds/oct2021/alt_datasets/X_pca_poly.joblib']

In [31]:
# Persist the fitted transformers (interaction expander and PCA) next to the transformed data,
# so the same transformation can be re-applied later without refitting.
dump(poly, '/media/sf/easystore/kaggle_data/tabular_playgrounds/oct2021/alt_datasets/poly.joblib')
dump(pca, '/media/sf/easystore/kaggle_data/tabular_playgrounds/oct2021/alt_datasets/pca.joblib')

['/media/sf/easystore/kaggle_data/tabular_playgrounds/oct2021/alt_datasets/pca.joblib']

In [28]:
# score the hold-out partition, keeping only column 1 of predict_proba
preds = model.predict_proba(X_valid)[:, 1]

# Evaluation: ROC AUC is computed directly on the predicted probabilities (nothing is rounded)
valid_auc = roc_auc_score(y_true=y_valid, y_score=preds)
print('ROC AUC Score of XGBoost =', valid_auc)

ROC AUC Score of XGBoost = 0.7783978025549229


**Not great**. Let's compare with the original:

In [32]:
del X_pca_poly

In [33]:
# Reload the untransformed data for the baseline comparison (X and y were deleted earlier to
# free memory).
train_source = datapath / 'train.feather'
df = pd.read_feather(train_source)
df.index.name = 'id'

features = [col for col in df.columns if col != 'target']
X = df[features]
y = df.target

# test data: everything except the first column is kept
# (presumably the first column is the row id -- confirm)
test_source = datapath / 'test.feather'
X_test = pd.read_feather(test_source).iloc[:, 1:]
# X_test = np.array(X_test)

In [34]:
# same 80/20 shuffled hold-out split as before, this time on the raw (untransformed) features
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y_np,
    test_size=0.2,
    shuffle=True,
    random_state=int(SEED),
)

model.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.906420714280384,
              gamma=1.472322916021486, gpu_id=0, importance_type='gain',
              interaction_constraints='', learning_rate=0.024785857161974977,
              max_delta_step=0, max_depth=4, min_child_weight=8.829122644339664,
              missing=nan, monotone_constraints='()', n_estimators=3878,
              n_jobs=16, num_parallel_tree=1, random_state=42,
              reg_alpha=26.867682044658245, reg_lambda=10.839759074147148,
              scale_pos_weight=1, subsample=0.8208581489835881,
              tree_method='gpu_hist', validate_parameters=1, verbosity=1)

In [35]:
# score the hold-out partition, keeping only column 1 of predict_proba
preds = model.predict_proba(X_valid)[:, 1]

# Evaluation: ROC AUC is computed directly on the predicted probabilities (nothing is rounded)
valid_auc = roc_auc_score(y_true=y_valid, y_score=preds)
print('ROC AUC Score of XGBoost =', valid_auc)

ROC AUC Score of XGBoost = 0.8572984856383443


In [14]:
# originally from https://www.kaggle.com/satorushibata/optimize-catboost-hyperparameter-with-optuna-gpu
def objective(trial):
    """Optuna objective: validation ROC AUC of a fixed XGBoost model on a PCA-reduced dataset.

    The only parameter being searched is the number of PCA components; the XGBoost
    hyperparameters are held constant so trials differ only in the dataset variant.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample ``pca_components``.

    Returns
    -------
    float
        ROC AUC on the held-out 33% validation partition (the study maximizes this).

    Notes
    -----
    Reads the notebook-level globals ``X``, ``y``, ``SEED`` and ``gpu_available``, and logs
    ``valid_auc`` to the active Weights & Biases run.
    """
    # Sample the swept parameter. Without this line `pca_components` is undefined and every
    # trial fails with NameError.
    pca_components = trial.suggest_int('pca_components', 50, 285)

    # NOTE(review): the PCA is fitted on all rows of X before the split, so the validation rows
    # influence the projection -- consider fitting on the training partition only.
    pca = PCA(n_components=pca_components, random_state=SEED)
    X_pca = pca.fit_transform(X)

    # split the (original Kaggle training) data into partitions
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_pca, y, test_size=0.33, random_state=int(SEED), shuffle=True
    )

    # fixed model hyperparameters (identical for every trial)
    best_xgboost_params = {
        'n_estimators': 3878,
        'max_depth': 4,
        'learning_rate': 0.024785857161974977,
        'reg_alpha': 26.867682044658245,
        'reg_lambda': 10.839759074147148,
        'subsample': 0.8208581489835881,
        'min_child_weight': 8.829122644339664,
        'colsample_bytree': 0.906420714280384,
        'gamma': 1.472322916021486
    }

    # instantiate the model, with some parameters locked in and the fixed ones passed via splat
    model = XGBClassifier(
        objective='binary:logistic',
        verbosity=1,
        # honour the manual `gpu_available` flag; with the flag set to True this is 'gpu_hist'
        tree_method='gpu_hist' if gpu_available else 'hist',
        booster='gbtree', # not bothering with dart for time reasons
        random_state=SEED,
        **best_xgboost_params
    )

    model.fit(X_train, y_train)

    # generate predictions: column 1 of predict_proba, used un-rounded for the AUC
    preds = model.predict_proba(X_valid)[:, 1]

    # Evaluation
    valid_auc = roc_auc_score(y_valid, preds)
    print('ROC AUC Score of XGBoost =', valid_auc)
    wandb.log({'valid_auc': valid_auc,
              })

    return valid_auc

In [15]:
wandbc = WeightsAndBiasesCallback(wandb_kwargs=wandb_kwargs)

  wandbc = WeightsAndBiasesCallback(wandb_kwargs=wandb_kwargs)
[34m[1mwandb[0m: Currently logged in as: [33mhushifang[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [16]:
# fresh in-memory study that maximizes the objective (validation AUC), with a seeded TPE
# sampler; the study name carries today's date
study = optuna.create_study(
    study_name=f"pca_{datetime.now().strftime('%Y%m%d')}",
    sampler=TPESampler(seed=int(SEED)),
    direction="maximize",
)

# study = load(studypath/f"optuna_xgboost_study_106trials_20211004.joblib")


[32m[I 2021-10-10 11:57:06,723][0m A new study created in memory with name: pca_20211010[0m


In [17]:
import torch

In [18]:
import xgboost

In [19]:
xgboost.core.XGBoostError?

[0;31mInit signature:[0m [0mxgboost[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mXGBoostError[0m[0;34m([0m[0mself[0m[0;34m,[0m [0;34m/[0m[0;34m,[0m [0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m      Error thrown by xgboost trainer.
[0;31mFile:[0m           ~/anaconda3/envs/tabular-x/lib/python3.8/site-packages/xgboost/core.py
[0;31mType:[0m           type
[0;31mSubclasses:[0m     


In [20]:
# Run the sweep one trial at a time so the whole study can be checkpointed after every trial
# (an interrupted run then loses at most the trial in flight).
# Checkpoints go to `studypath`, the directory set aside for Optuna studies, rather than being
# mixed in with the datasets; make sure it exists before the first dump.
studypath.mkdir(parents=True, exist_ok=True)
for x in range(1,200):
    # XGBoostError is caught so a single failing trial is marked failed instead of aborting the sweep
    study.optimize(objective, n_trials = 1, callbacks = [wandbc], show_progress_bar=True, catch=(xgboost.core.XGBoostError,))
    # `x` equals the number of trials attempted so far, which is what the filename records
    dump(study, filename=studypath/f"optuna_dataset-pca_study_{x}trials_{datetime.now().strftime('%Y%m%d')}.joblib")
#     dump(study.best_trial.params, filename=datapath/f'optuna_lightgbm_study_best-thru-{x*5}trials_20210927.joblib')

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]







ROC AUC Score of XGBoost = 0.8283967087232865
[32m[I 2021-10-10 11:59:12,446][0m Trial 0 finished with value: 0.8283967087232865 and parameters: {'pca_components': 138}. Best is trial 0 with value: 0.8283967087232865.[0m


  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]







ROC AUC Score of XGBoost = 0.8472107426301871
[32m[I 2021-10-10 12:02:45,979][0m Trial 1 finished with value: 0.8472107426301871 and parameters: {'pca_components': 274}. Best is trial 1 with value: 0.8472107426301871.[0m


  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]







ROC AUC Score of XGBoost = 0.8411313966895685
[32m[I 2021-10-10 12:06:07,082][0m Trial 2 finished with value: 0.8411313966895685 and parameters: {'pca_components': 222}. Best is trial 1 with value: 0.8472107426301871.[0m


  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]







ROC AUC Score of XGBoost = 0.8342598041011918
[32m[I 2021-10-10 12:09:03,422][0m Trial 3 finished with value: 0.8342598041011918 and parameters: {'pca_components': 191}. Best is trial 1 with value: 0.8472107426301871.[0m


  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]







ROC AUC Score of XGBoost = 0.806690388517508
[32m[I 2021-10-10 12:10:25,360][0m Trial 4 finished with value: 0.806690388517508 and parameters: {'pca_components': 86}. Best is trial 1 with value: 0.8472107426301871.[0m


  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]







ROC AUC Score of XGBoost = 0.806690388517508
[32m[I 2021-10-10 12:11:46,686][0m Trial 5 finished with value: 0.806690388517508 and parameters: {'pca_components': 86}. Best is trial 1 with value: 0.8472107426301871.[0m


  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]







ROC AUC Score of XGBoost = 0.7904181686815366
[32m[I 2021-10-10 12:12:47,694][0m Trial 6 finished with value: 0.7904181686815366 and parameters: {'pca_components': 63}. Best is trial 1 with value: 0.8472107426301871.[0m


  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]







ROC AUC Score of XGBoost = 0.8452885185751505
[32m[I 2021-10-10 12:15:43,607][0m Trial 7 finished with value: 0.8452885185751505 and parameters: {'pca_components': 254}. Best is trial 1 with value: 0.8472107426301871.[0m


  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]







ROC AUC Score of XGBoost = 0.8342598041011918
[32m[I 2021-10-10 12:18:38,689][0m Trial 8 finished with value: 0.8342598041011918 and parameters: {'pca_components': 191}. Best is trial 1 with value: 0.8472107426301871.[0m


  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]







ROC AUC Score of XGBoost = 0.8409412517853502
[32m[I 2021-10-10 12:21:58,921][0m Trial 9 finished with value: 0.8409412517853502 and parameters: {'pca_components': 217}. Best is trial 1 with value: 0.8472107426301871.[0m


  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]







ROC AUC Score of XGBoost = 0.8474176983005992
[32m[I 2021-10-10 12:25:32,672][0m Trial 10 finished with value: 0.8474176983005992 and parameters: {'pca_components': 281}. Best is trial 10 with value: 0.8474176983005992.[0m


  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]







ROC AUC Score of XGBoost = 0.8462401528718595
[32m[I 2021-10-10 12:29:05,642][0m Trial 11 finished with value: 0.8462401528718595 and parameters: {'pca_components': 270}. Best is trial 10 with value: 0.8474176983005992.[0m


  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]







ROC AUC Score of XGBoost = 0.8472224222376469
[32m[I 2021-10-10 12:32:39,229][0m Trial 12 finished with value: 0.8472224222376469 and parameters: {'pca_components': 278}. Best is trial 10 with value: 0.8474176983005992.[0m


  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]







ROC AUC Score of XGBoost = 0.8446363408619656
[32m[I 2021-10-10 12:35:35,341][0m Trial 13 finished with value: 0.8446363408619656 and parameters: {'pca_components': 239}. Best is trial 10 with value: 0.8474176983005992.[0m


  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]







ROC AUC Score of XGBoost = 0.8474194357944366
[32m[I 2021-10-10 12:39:09,826][0m Trial 14 finished with value: 0.8474194357944366 and parameters: {'pca_components': 283}. Best is trial 14 with value: 0.8474194357944366.[0m


  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# dump(study, filename=datapath/f"optuna_xgboost_100trials-complete_{datetime.now().strftime('%Y%m%d')}.joblib")
# dump(study.best_trial.params, filename=datapath/f"optuna_lightgbm_all-500trials-best_{datetime.now().strftime('%Y%m%d')}.joblib")
# pickle.dump(study.best_trial.params, open('CatBoost_Hyperparameter.pickle', 'wb'))
# print('CatBoost Hyperparameter:', study.best_trial.params)

In [None]:
study.best_trial.params

In [None]:
wandb.log({'xgboost_params': study.best_trial.params})
wandb.finish()

In [None]:
optuna.visualization.plot_parallel_coordinate(study)