In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
# Read the preprocessed training table (the CSV is decoded as EUC-KR) and
# discard the '분석데이터' column in the same expression.
full_train = (
    pd.read_csv("preprocessed_train.csv", encoding='euc-kr')
    .drop(columns=['분석데이터'])
)

In [3]:
# Keep only fully populated rows: a row is removed as soon as any of its
# columns is missing (these are the defaults of DataFrame.dropna, spelled out).
full_train = full_train.dropna(axis=0, how="any")

In [4]:
# Separate the target column ('label') from the remaining feature columns.
y = full_train['label']
X = full_train.drop(columns=['label'])

In [5]:
# Convert both the feature table and the target into NumPy arrays
# (np.array makes a copy, so the DataFrame data is not shared).
X, y = np.array(X), np.array(y)

In [6]:
import catboost as cb
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [7]:
def objective(trial):
    """Optuna objective: fit one CatBoost classifier and return its accuracy.

    The module-level ``X`` / ``y`` arrays are split 70/30 into a training and
    a validation part, a CatBoost configuration is sampled from ``trial``,
    the model is fitted on the training part and scored on the validation
    part.

    Args:
        trial: The ``optuna`` trial object used to sample hyperparameters.

    Returns:
        Accuracy of the fitted model on the validation split (to be
        maximised by the study).
    """
    # A fixed seed makes every trial use the same split, so differences in the
    # returned accuracy come from the hyperparameters and not from split noise.
    train_x, valid_x, train_y, valid_y = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    param = {
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "iterations": trial.suggest_int("iterations", 50, 300),
        # Sampled exactly once. A parameter name must only be suggested with a
        # single range per trial: Optuna reuses the first sampled value and
        # ignores the range of any later suggestion under the same name.
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "random_strength": trial.suggest_int("random_strength", 0, 100),
        # NOTE(review): fit() below passes early_stopping_rounds, which
        # presumably forces CatBoost's overfitting detector to 'Iter' and so
        # overrides this choice -- confirm, and drop od_type from the search
        # space if so.
        "od_type": trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
    }

    # These options are only sampled for the bootstrap type they are paired
    # with here; 'MVS' gets no extra parameter.
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    gbm = cb.CatBoostClassifier(**param)

    # NOTE(review): the validation split drives early stopping *and* is used
    # for the score returned below, so the reported accuracy is likely
    # optimistic -- consider a separate hold-out set for the final estimate.
    gbm.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], verbose=0, early_stopping_rounds=100)

    preds = gbm.predict(valid_x)
    # Round the predictions to the nearest integer before comparing them with
    # the true labels.
    pred_labels = np.rint(preds)
    accuracy = accuracy_score(valid_y, pred_labels)
    return accuracy

In [8]:
# In-memory study that searches for the configuration with the highest
# value returned by `objective`.
study = optuna.create_study(direction="maximize")

# Stop after 50 trials or once 600 seconds have elapsed, whichever comes first.
study.optimize(func=objective, n_trials=50, timeout=600)

[32m[I 2021-10-15 15:31:25,311][0m A new study created in memory with name: no-name-9bc6965a-b2be-4b27-ab08-72b040c54523[0m
[32m[I 2021-10-15 15:33:02,331][0m Trial 0 finished with value: 0.8156666666666667 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.04537619602852152, 'depth': 10, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'iterations': 230, 'learning_rate': 0.016730002903039488, 'random_strength': 82, 'od_type': 'IncToDec', 'bagging_temperature': 5.000139893019259}. Best is trial 0 with value: 0.8156666666666667.[0m
[32m[I 2021-10-15 15:33:11,810][0m Trial 1 finished with value: 0.853 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.021642405093141186, 'depth': 10, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'iterations': 237, 'learning_rate': 0.04795722676766824, 'random_strength': 71, 'od_type': 'Iter', 'bagging_temperature': 8.872996439991997}. Best is trial 1 with value: 0.853.[0m
[32m[I 2021-10-15 15:33

[32m[I 2021-10-15 15:35:35,863][0m Trial 9 finished with value: 0.8076666666666666 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.09883824874131143, 'depth': 5, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS', 'iterations': 187, 'learning_rate': 0.011285283076061591, 'random_strength': 2, 'od_type': 'Iter'}. Best is trial 5 with value: 0.8926666666666667.[0m
[32m[I 2021-10-15 15:35:46,899][0m Trial 10 finished with value: 0.8696666666666667 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.013741256963370377, 'depth': 12, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS', 'iterations': 118, 'learning_rate': 0.12517398850031497, 'random_strength': 60, 'od_type': 'IncToDec'}. Best is trial 5 with value: 0.8926666666666667.[0m
[32m[I 2021-10-15 15:35:51,334][0m Trial 11 finished with value: 0.886 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.033973233466983443, 'depth': 7, 'boosting_type': 'Plain', 'bootstrap_type': 'Ber

[32m[I 2021-10-15 15:36:40,412][0m Trial 19 finished with value: 0.8923333333333333 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.010373709691950679, 'depth': 11, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS', 'iterations': 206, 'learning_rate': 0.17152725316860545, 'random_strength': 14, 'od_type': 'IncToDec'}. Best is trial 12 with value: 0.9013333333333333.[0m
[32m[I 2021-10-15 15:36:46,518][0m Trial 20 finished with value: 0.8403333333333334 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.034051553566545795, 'depth': 8, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS', 'iterations': 280, 'learning_rate': 0.03223663606791479, 'random_strength': 90, 'od_type': 'IncToDec'}. Best is trial 12 with value: 0.9013333333333333.[0m
[32m[I 2021-10-15 15:36:55,583][0m Trial 21 finished with value: 0.9036666666666666 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.01038783494160387, 'depth': 11, 'boosting_type': 'Plain', '

[32m[I 2021-10-15 15:44:57,830][0m Trial 29 finished with value: 0.9076666666666666 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.043529438827711514, 'depth': 12, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'iterations': 204, 'learning_rate': 0.19719860541901787, 'random_strength': 34, 'od_type': 'IncToDec', 'subsample': 0.9558805603499683}. Best is trial 29 with value: 0.9076666666666666.[0m


In [9]:
# Summarise the finished study: trial count, best score and its parameters.
print(f"Number of finished trials: {len(study.trials)}")

trial = study.best_trial
print("Best trial:")
print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

Number of finished trials: 30
Best trial:
  Value: 0.9076666666666666
  Params: 
    objective: CrossEntropy
    colsample_bylevel: 0.043529438827711514
    depth: 12
    boosting_type: Ordered
    bootstrap_type: Bernoulli
    iterations: 204
    learning_rate: 0.19719860541901787
    random_strength: 34
    od_type: IncToDec
    subsample: 0.9558805603499683
