In [1]:
%%time
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import os
import random
from optuna.samplers import TPESampler
import multiprocessing
import catboost as cat
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import pickle
from sklearn.utils import resample
from catboost import Pool
import sklearn.metrics

Wall time: 1.14 s


# Initial conditions

In [2]:
%%time
n_trials = int(1)
SEED = 123

Wall time: 0 ns


In [3]:
%%time
# Function to seed everything
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
seed_everything(SEED)

Wall time: 0 ns


# Preprocess

In [4]:
%%time
csv = pd.read_csv('preprocessed_train.csv', encoding='euc-kr')
csv = csv.drop(columns=['분석데이터'], axis=1)
label = csv['label']

X_train = csv[:8000]
y_train = label[:8000]
X = np.array(X_train)
y = np.array(y_train)

del X_train, y_train

Wall time: 477 ms


# Functions

In [5]:
%%time
def objective(trial):
    train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.33, random_state=int(SEED), shuffle=True)
    train_pool = Pool(train_x, train_y)
    test_pool = Pool(test_x, test_y)
    
    # Parameters
    params = {
        'iterations' : trial.suggest_int('iterations', 50, 300),                         
        'depth' : trial.suggest_int('depth', 4, 10),                                       
        'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 0.3),               
        'random_strength' :trial.suggest_int('random_strength', 0, 100),                       
        'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
        'learning_rate' :trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter'])
    }
    # Learning
    model = cat.CatBoostClassifier(
        loss_function="Logloss",
        eval_metric="AUC",
        task_type="GPU",
        l2_leaf_reg=50,
        random_seed=SEED,
        border_count=64,
        **params
    )        
    model.fit(train_pool)
    # Predict
    preds = model.predict(test_pool)
    pred_labels = np.rint(preds)
    y_pred_boot = resample(pred_labels, n_samples = len(train_y))
    # Evaluation
    ROC_AUC_Score = roc_auc_score(train_y, y_pred_boot)
    print('ROC AUC Score of CatBoost =', ROC_AUC_Score)
    return ROC_AUC_Score

Wall time: 0 ns


# Optimisation

In [6]:
%%time
study = optuna.create_study(direction = "maximize", sampler = TPESampler(seed=int(SEED)))
study.optimize(objective, n_trials = n_trials, n_jobs = multiprocessing.cpu_count())

[32m[I 2021-10-14 00:53:40,823][0m A new study created in memory with name: no-name-8cab7a58-23c9-4187-9d9d-9f7f6e02c272[0m


0:	learn: 1.0000000	total: 26.7ms	remaining: 4.99s
1:	learn: 1.0000000	total: 44.5ms	remaining: 4.14s
2:	learn: 1.0000000	total: 61.9ms	remaining: 3.82s
3:	learn: 1.0000000	total: 81.1ms	remaining: 3.73s
4:	learn: 1.0000000	total: 98.6ms	remaining: 3.61s
5:	learn: 1.0000000	total: 117ms	remaining: 3.54s
6:	learn: 1.0000000	total: 135ms	remaining: 3.5s
7:	learn: 1.0000000	total: 154ms	remaining: 3.46s
8:	learn: 1.0000000	total: 174ms	remaining: 3.46s
9:	learn: 1.0000000	total: 193ms	remaining: 3.43s
10:	learn: 1.0000000	total: 213ms	remaining: 3.43s
11:	learn: 1.0000000	total: 233ms	remaining: 3.42s
12:	learn: 1.0000000	total: 252ms	remaining: 3.39s
13:	learn: 1.0000000	total: 270ms	remaining: 3.36s
14:	learn: 1.0000000	total: 292ms	remaining: 3.36s
15:	learn: 1.0000000	total: 311ms	remaining: 3.35s
16:	learn: 1.0000000	total: 332ms	remaining: 3.34s
17:	learn: 1.0000000	total: 354ms	remaining: 3.34s
18:	learn: 1.0000000	total: 374ms	remaining: 3.33s
19:	learn: 1.0000000	total: 394ms	rem

161:	learn: 1.0000000	total: 3.31s	remaining: 531ms
162:	learn: 1.0000000	total: 3.33s	remaining: 510ms
163:	learn: 1.0000000	total: 3.35s	remaining: 490ms
164:	learn: 1.0000000	total: 3.37s	remaining: 469ms
165:	learn: 1.0000000	total: 3.39s	remaining: 449ms
166:	learn: 1.0000000	total: 3.41s	remaining: 429ms
167:	learn: 1.0000000	total: 3.43s	remaining: 408ms
168:	learn: 1.0000000	total: 3.45s	remaining: 388ms
169:	learn: 1.0000000	total: 3.47s	remaining: 368ms
170:	learn: 1.0000000	total: 3.49s	remaining: 347ms
171:	learn: 1.0000000	total: 3.51s	remaining: 327ms
172:	learn: 1.0000000	total: 3.54s	remaining: 307ms
173:	learn: 1.0000000	total: 3.56s	remaining: 286ms
174:	learn: 1.0000000	total: 3.58s	remaining: 266ms
175:	learn: 1.0000000	total: 3.6s	remaining: 245ms
176:	learn: 1.0000000	total: 3.62s	remaining: 225ms
177:	learn: 1.0000000	total: 3.64s	remaining: 204ms
178:	learn: 1.0000000	total: 3.66s	remaining: 184ms
179:	learn: 1.0000000	total: 3.68s	remaining: 164ms
180:	learn: 1

[32m[I 2021-10-14 00:53:46,451][0m Trial 0 finished with value: 0.49331596988541015 and parameters: {'iterations': 188, 'depth': 7, 'learning_rate': 0.061349908126964024, 'random_strength': 14, 'bagging_temperature': 3.597955607541437, 'od_type': 'Iter'}. Best is trial 0 with value: 0.49331596988541015.[0m


Wall time: 5.63 s


In [7]:
%%time
# Save
pickle.dump(study.best_trial.params, open('CatBoost_Hyperparameter.pickle', 'wb'))
print('CatBoost Hyperparameter:', study.best_trial.params)

CatBoost Hyperparameter: {'iterations': 188, 'depth': 7, 'learning_rate': 0.061349908126964024, 'random_strength': 14, 'bagging_temperature': 3.597955607541437, 'od_type': 'Iter'}
Wall time: 1 ms
