In [1]:
import catboost as cb
import optuna
import pandas
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load the single preprocessed CSV (EUC-KR encoded) and discard the '분석데이터'
# column. The frame is then partitioned by row label: labels up to and
# including 7999 become the training portion, labels from 8000 onward become
# the held-out portion.
train_csv = pandas.read_csv("preprocessed_train.csv", encoding='euc-kr').drop(columns='분석데이터')
train_set, test_set = train_csv.loc[:7999], train_csv.loc[8000:]

In [3]:
# Preview the training partition (row labels up to 7999) as a visual sanity
# check; the label column is still present at this point.
train_set

Unnamed: 0,label,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,a_0,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
0,1,144,12.298611,1771,5.356616,0,0,0,1,2399,...,10,4,10,9,4,0,1,0,0,0
1,1,804,9.580846,7703,6.063542,0,0,0,6,183376,...,43,121,84,78,47,36,40,45,27,36
2,0,2205,12.736054,28083,6.107050,9,0,0,6,1178,...,326,268,239,286,199,148,154,37,48,36
3,0,2602,10.288240,26770,5.373013,8,0,0,1,56851,...,336,230,206,245,76,0,26,702,1,5
4,1,8980,23.252339,208806,5.775223,0,28,16,3,124274,...,731,882,1171,1010,322,64,327,84,75,244
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,0,8736,161.916094,1414499,5.045352,0,9,0,11,20409,...,20733,19933,27071,31742,75164,67,194,101,76,66
7996,1,6592,5.987561,39470,6.573862,0,0,0,21,183376,...,405,406,385,436,340,366,343,393,416,377
7997,1,5255,6.607041,34720,6.503973,0,7,0,15,147733,...,231,255,334,395,185,235,241,180,181,193
7998,1,11618,5.882940,68348,6.577309,1,5,0,41,96507,...,667,676,685,746,663,624,634,602,625,598


In [4]:
# Preview the held-out partition (row labels from 8000 onward) as a visual
# sanity check; the label column is still present at this point.
test_set

Unnamed: 0,label,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,a_0,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
8000,1,3522,14.883589,52420,5.729609,103,0,0,6,462397,...,320,326,272,334,70,23,123,38,43,294
8001,1,846,14.718676,12452,5.646406,0,0,0,2,20251821,...,96,39,102,135,26,13,49,9,11,16
8002,0,16716,5.786552,96728,6.537585,1,0,0,37,6454,...,1177,829,1451,688,1387,822,890,648,949,705
8003,0,3013,26.163956,78832,5.457340,1,0,0,6,902116,...,712,470,580,489,159,13,23,12,16,11
8004,1,658,14.518237,9553,5.170602,0,0,0,2,42438,...,96,16,56,229,10,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1,2018,13.938057,28127,5.940442,0,70,0,11,255044,...,246,186,206,235,88,33,81,58,61,72
9996,0,1105,16.437104,18163,5.766962,0,11,0,3,181296,...,199,57,134,123,20,25,28,25,41,13
9997,0,4,58.500000,234,3.811827,0,0,0,1,68736,...,0,0,0,0,0,0,0,0,0,0
9998,1,3312,24.939312,82599,5.834730,0,39,0,8,90648,...,438,985,806,851,113,123,181,100,75,86


In [5]:
# Pull the target column out of each partition first, then rebind each
# partition name to a copy that contains only the feature columns.
train_label, test_label = train_set['label'], test_set['label']
train_set = train_set.drop(columns=['label'])
test_set = test_set.drop(columns=['label'])

In [6]:
# Keep only the training rows that have no missing value in any feature column.
# The surviving rows keep their original index labels.
full_train = train_set.dropna(axis=0, how='any')

In [7]:
# Model inputs: the NaN-free feature rows and their matching labels.
X = full_train
# full_train comes from train_set.dropna(), which may have removed rows, while
# train_label still has one entry per original training row. Select labels by
# the surviving row index so X and y always have the same length and stay
# row-aligned (this is a no-op when nothing was dropped).
y = train_label.loc[full_train.index]

In [8]:
# Rebind the feature table and label vector as plain NumPy arrays, which is
# the form handed to train_test_split inside the objective function.
X, y = np.array(X), np.array(y)

In [9]:
def objective(trial, seed=42):
    """Optuna objective: fit one CatBoost classifier and return its hold-out accuracy.

    Reads the module-level arrays ``X`` (features) and ``y`` (labels), holds out
    30% of the rows for validation, trains a ``CatBoostClassifier`` with the
    hyperparameters suggested by ``trial`` and scores it on the held-out rows.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    seed : int, default 42
        Seed for both the train/validation split and CatBoost's own random
        number generator. Optuna calls the objective with ``trial`` only, so
        the default is what a normal study uses.

    Returns
    -------
    float
        Accuracy on the validation split (the study should maximise it).
    """
    # Seed the split so every trial is evaluated on the same validation rows.
    # Without it each trial gets a different random hold-out and score
    # differences mix hyperparameter effects with split noise.
    train_x, valid_x, train_y, valid_y = train_test_split(
        X, y, test_size=0.3, random_state=seed
    )

    param = {
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "used_ram_limit": "3gb",
        # Fixed (not tuned): makes a given hyperparameter set train the same
        # model on every run.
        "random_seed": seed,
    }

    # These two hyperparameters are only sampled for the bootstrap type they
    # belong to; "MVS" gets no extra parameter here.
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    gbm = cb.CatBoostClassifier(**param)

    # The validation split doubles as the early-stopping set, so the returned
    # accuracy is measured on data that also influenced when training stopped.
    gbm.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], verbose=0, early_stopping_rounds=100)

    preds = gbm.predict(valid_x)
    # Round to the nearest integer before comparing with the integer labels.
    pred_labels = np.rint(preds)
    accuracy = accuracy_score(valid_y, pred_labels)
    return accuracy

In [10]:
# Create an in-memory study that maximises the accuracy returned by `objective`.
# TPESampler is the sampler Optuna uses by default; it is passed explicitly only
# so that it can be seeded and the search does not add run-to-run randomness of
# its own.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Stop after 50 trials or 600 seconds, whichever comes first. Because of the
# time budget, the number of completed trials still depends on machine speed.
study.optimize(objective, n_trials=50, timeout=600)

[32m[I 2021-10-12 17:15:09,411][0m A new study created in memory with name: no-name-100df6c3-9762-44f3-8853-4e6641a5a30c[0m
[32m[I 2021-10-12 17:20:44,371][0m Trial 0 finished with value: 0.89125 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.08977304513427485, 'depth': 9, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.8655683777910907}. Best is trial 0 with value: 0.89125.[0m
[32m[I 2021-10-12 17:21:56,075][0m Trial 1 finished with value: 0.8670833333333333 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.017282743222892975, 'depth': 9, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 6.073197453675245}. Best is trial 0 with value: 0.89125.[0m
[32m[I 2021-10-12 17:21:58,646][0m Trial 2 finished with value: 0.7983333333333333 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.0397147954905516, 'depth': 1, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian

In [11]:
# Summarise the study: how many trials were recorded, followed by the score and
# the sampled hyperparameters of the best one.
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
best_params = trial.params
for name in best_params:
    # One indented "name: value" line per hyperparameter.
    print("    {}: {}".format(name, best_params[name]))

Number of finished trials: 5
Best trial:
  Value: 0.8941666666666667
  Params: 
    objective: CrossEntropy
    colsample_bylevel: 0.07916682531951634
    depth: 10
    boosting_type: Plain
    bootstrap_type: Bernoulli
    subsample: 0.3090440812490579
