In [2]:
import pandas as pd
import numpy as np
import optuna

import catboost as cb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [3]:
train = pd.read_csv('preprocessed_train.csv', encoding='euc-kr')

In [4]:
train.head()

Unnamed: 0,분석데이터,label,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
0,1,1,144,12.298611,1771,5.356616,0,0,0,1,...,10,4,10,9,4,0,1,0,0,0
1,2,1,804,9.580846,7703,6.063542,0,0,0,6,...,43,121,84,78,47,36,40,45,27,36
2,3,0,2205,12.736054,28083,6.10705,9,0,0,6,...,326,268,239,286,199,148,154,37,48,36
3,4,0,2602,10.28824,26770,5.373013,8,0,0,1,...,336,230,206,245,76,0,26,702,1,5
4,5,1,8980,23.252339,208806,5.775223,0,28,16,3,...,731,882,1171,1010,322,64,327,84,75,244


In [5]:
data = train.drop(columns=['분석데이터', 'label'], axis=1)

In [6]:
target = train['label']

In [11]:
def objective(trial):
    train_x, valid_x, train_y, valid_y = train_test_split(data, target, test_size=0.2)

    param = {
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        
        "used_ram_limit": "3gb",
        'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'iterations' : trial.suggest_int('iterations', 50, 300),
    }

    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    gbm = cb.CatBoostClassifier(**param)

    gbm.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], verbose=0, early_stopping_rounds=100)

    preds = gbm.predict(valid_x)
    pred_labels = np.rint(preds)
    accuracy = accuracy_score(valid_y, pred_labels)
    return accuracy

In [12]:
study = optuna.create_study(direction="maximize")

[32m[I 2021-10-15 14:02:50,142][0m A new study created in memory with name: no-name-45795ad2-0e99-4534-b726-423631f29e29[0m


In [13]:
study.optimize(objective, n_trials=100, timeout=600)

[32m[I 2021-10-15 14:02:51,646][0m Trial 0 finished with value: 0.79 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.0388444171227008, 'depth': 2, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'learning_rate': 0.017746892780465602, 'iterations': 50, 'subsample': 0.2522072614284975}. Best is trial 0 with value: 0.79.[0m
[32m[I 2021-10-15 14:03:00,847][0m Trial 1 finished with value: 0.872 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.05330382465196504, 'depth': 8, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'learning_rate': 0.037227458289772114, 'iterations': 210, 'subsample': 0.9854035108173256}. Best is trial 1 with value: 0.872.[0m
[32m[I 2021-10-15 14:03:08,715][0m Trial 2 finished with value: 0.87 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.09659159377871517, 'depth': 6, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'learning_rate': 0.24342665092005517, 'iterations': 82, 'bagging_tempe

In [14]:
print("Number of finished trials: {}".format(len(study.trials)))

Number of finished trials: 12


In [15]:
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials: 12
Best trial:
  Value: 0.9095
  Params: 
    objective: CrossEntropy
    colsample_bylevel: 0.09901213513123053
    depth: 11
    boosting_type: Ordered
    bootstrap_type: MVS
    learning_rate: 0.11016052222039235
    iterations: 175
