In [8]:
import pandas as pd
import numpy as np

from sklearn.impute import KNNImputer

from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import ExtraTreesClassifier

from bayes_opt import BayesianOptimization

from sklearn.metrics import f1_score, confusion_matrix, classification_report

import warnings
warnings.filterwarnings("ignore")

In [9]:
train = pd.read_csv('train_dataset.csv', encoding='cp949')

# 필요없는 열 제거
drop_cols = ["Unnamed: 0", "url_chinese_present", "html_num_tags('applet')"]
train.drop(drop_cols, axis=1, inplace=True)

# benign(정상) -> 1, malicious(악성) -> -1 
train['Result_v1'] = train['Result_v1'].map({'benign': 1, 'malicious': -1})

## <span style="color: hotpink"> **1. 데이터 전처리** </span>

### <span style="color: skyblue"> **결측치 처리(KNN Imputer)** </span>

In [10]:
imputer = KNNImputer(n_neighbors=9)

imputer.fit(train)
x = imputer.transform(train)

train = pd.DataFrame(x, columns=train.columns, index=train.index)

### <span style="color: skyblue"> **학습 데이터 분할** </span>

In [11]:
# Target 설정
target = 'Result_v1'

# 데이터 분리
X = train.drop(target, axis=1)
y = train.loc[:, target]

# 학습 데이터 분할
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=2022)

## <span style="color: hotpink"> **2. 모델 하이퍼파라미터 튜닝(Bayesian optimization)** </span>

### <span style="color: skyblue"> **탐색 대상 함수 정의** </span>
- Catboost
- RandomForest
- XGboost
- LGBM
- Extratrees

In [12]:
# Catboost
def catboost_cv(subsample, depth, learning_rate, bagging_temperature, l2_leaf_reg):

    model = CatBoostClassifier(subsample=(subsample),
                               depth=int(depth),
                               learning_rate=(learning_rate),
                               bagging_temperature=(bagging_temperature),
                               l2_leaf_reg=(l2_leaf_reg),
                               random_state=2022,
                               silent=True
                               )

    model.fit(X_train, y_train)
    y_pred= model.predict(X_valid)
    f1 = f1_score(y_valid, y_pred)

    return f1

# Randomforest
def rf_cv(n_estimators, max_depth, min_samples_split, min_samples_leaf):

    model = RandomForestClassifier(n_estimators=int(n_estimators),
                                   max_depth=int(max_depth),
                                   min_samples_split=int(min_samples_split),
                                   min_samples_leaf=int(min_samples_leaf),
                                   random_state=2022
                                  )

    model.fit(X_train, y_train)
    y_pred= model.predict(X_valid)
    f1 = f1_score(y_valid, y_pred)

    return f1

# Xgboost
def xg_cv(learning_rate, max_depth, n_estimators, subsample):

    model = XGBClassifier(learning_rate=(learning_rate),
                          max_depth=int(max_depth),
                          n_estimators=int(n_estimators),
                          subsample=(subsample),
                          random_state=2022
                         )
                                      
    y_train_xg, y_valid_xg = y_train.copy(), y_valid.copy()
    y_train_xg = y_train_xg.map({1:1, -1:0})
    y_valid_xg = y_valid_xg.map({1:1, -1:0})                         

    model.fit(X_train, y_train_xg)
    y_pred= model.predict(X_valid)
    f1 = f1_score(y_valid_xg, y_pred)

    return f1

# LightGBM
def lgbm_cv(learning_rate, max_depth, n_estimators, num_leaves, subsample):

    model = LGBMClassifier(learning_rate=(learning_rate),
                            max_depth=int(max_depth),
                            n_estimators=int(n_estimators),
                            num_leaves=int(num_leaves),
                            subsample=(subsample),
                            random_state=2022
                                  )

    model.fit(X_train, y_train)
    y_pred= model.predict(X_valid)
    f1 = f1_score(y_valid, y_pred)

    return f1

# ExtraTrees 
def xtree_cv(n_estimators, max_depth, min_samples_split, min_samples_leaf,):

    model = ExtraTreesClassifier(n_estimators=int(n_estimators),
                                 max_depth=int(max_depth),
                                 min_samples_split=int(min_samples_split),
                                 min_samples_leaf=int(min_samples_leaf),
                                )

    model.fit(X_train, y_train)
    y_pred= model.predict(X_valid)
    f1 = f1_score(y_valid, y_pred)

    return f1

### <span style="color: skyblue"> **Catboost 하이퍼파라미터 튜닝** </span>

In [13]:
pds = {'subsample': (0.01, 1.0),
       'depth': (4, 16),
       'learning_rate': (0.001, 1.0),
       'bagging_temperature': (1.0, 10.0),
       'l2_leaf_reg': (1, 10)
      }

optimizer=BayesianOptimization(catboost_cv, pds, verbose=1, random_state=2022)    

optimizer.maximize(init_points=5,   
                   n_iter=100, 
                   acq='ei', 
                   xi=0.01)

print(optimizer.max)

|   iter    |  target   | baggin... |   depth   | l2_lea... | learni... | subsample |
-------------------------------------------------------------------------------------
| [95m 3       [0m | [95m 0.9723  [0m | [95m 1.185   [0m | [95m 15.64   [0m | [95m 8.492   [0m | [95m 0.2131  [0m | [95m 0.19    [0m |
| [95m 6       [0m | [95m 0.9723  [0m | [95m 2.181   [0m | [95m 14.59   [0m | [95m 8.457   [0m | [95m 0.6524  [0m | [95m 0.4638  [0m |
| [95m 11      [0m | [95m 0.9736  [0m | [95m 5.639   [0m | [95m 11.71   [0m | [95m 9.958   [0m | [95m 0.09335 [0m | [95m 0.7156  [0m |
| [95m 16      [0m | [95m 0.9737  [0m | [95m 4.141   [0m | [95m 13.7    [0m | [95m 9.768   [0m | [95m 0.0253  [0m | [95m 0.8288  [0m |
| [95m 32      [0m | [95m 0.975   [0m | [95m 7.367   [0m | [95m 13.3    [0m | [95m 3.97    [0m | [95m 0.06901 [0m | [95m 0.924   [0m |
| [95m 104     [0m | [95m 0.9763  [0m | [95m 8.675   [0m | [95m 15.32   [0m

### <span style="color: skyblue"> **RandomForest 하이퍼파라미터 튜닝** </span>

In [23]:
pds = {'n_estimators': (10, 2000),
       'max_depth': (1, 300),
       'min_samples_split': (2, 100),
       'min_samples_leaf': (1, 100)
      }

optimizer=BayesianOptimization(rf_cv, pds, verbose=1, random_state=2022)    

optimizer.maximize(init_points=10,  
                   n_iter=500, 
                   acq='ei')   

print(optimizer.max)

|   iter    |  target   | max_depth | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [95m 2       [0m | [95m 0.918   [0m | [95m 205.9   [0m | [95m 49.21   [0m | [95m 89.97   [0m | [95m 1.298e+0[0m |
| [95m 6       [0m | [95m 0.9349  [0m | [95m 223.5   [0m | [95m 29.96   [0m | [95m 31.27   [0m | [95m 1.507e+0[0m |
| [95m 9       [0m | [95m 0.9429  [0m | [95m 292.8   [0m | [95m 4.744   [0m | [95m 79.84   [0m | [95m 722.2   [0m |
| [95m 12      [0m | [95m 0.9468  [0m | [95m 291.0   [0m | [95m 2.6     [0m | [95m 53.29   [0m | [95m 748.0   [0m |
| [95m 13      [0m | [95m 0.9562  [0m | [95m 298.0   [0m | [95m 4.261   [0m | [95m 3.721   [0m | [95m 726.5   [0m |
| [95m 17      [0m | [95m 0.9588  [0m | [95m 275.6   [0m | [95m 3.657   [0m | [95m 8.383   [0m | [95m 685.5   [0m |
| [95m 20      [0m | [95m 0.9654  [0m | [95m 254.0   [0m | [95m 1.651   [0m 

### <span style="color: skyblue"> **XGboost 하이퍼파라미터 튜닝** </span>

In [24]:
pds = {'learning_rate': (0.01, 1.0),
       'max_depth': (1, 300),
       'n_estimators': (10, 2000),
       'subsample': (0.1, 1.0)
      }

optimizer=BayesianOptimization(xg_cv, pds, verbose=1, random_state=2022)    

optimizer.maximize(init_points=10,  
                   n_iter=300, 
                   acq='ei')   

print(optimizer.max)

|   iter    |  target   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------
| [95m 2       [0m | [95m 0.9643  [0m | [95m 0.6886  [0m | [95m 146.6   [0m | [95m 1.796e+0[0m | [95m 0.6827  [0m |
| [95m 3       [0m | [95m 0.9658  [0m | [95m 0.898   [0m | [95m 216.6   [0m | [95m 1.664e+0[0m | [95m 0.8448  [0m |
| [95m 7       [0m | [95m 0.9683  [0m | [95m 0.02848 [0m | [95m 157.6   [0m | [95m 1.73e+03[0m | [95m 0.45    [0m |
| [95m 13      [0m | [95m 0.9723  [0m | [95m 0.01079 [0m | [95m 122.4   [0m | [95m 1.739e+0[0m | [95m 0.698   [0m |
| [95m 95      [0m | [95m 0.9735  [0m | [95m 0.03136 [0m | [95m 109.3   [0m | [95m 675.6   [0m | [95m 0.8344  [0m |
| [95m 123     [0m | [95m 0.9736  [0m | [95m 0.1319  [0m | [95m 191.4   [0m | [95m 1.277e+0[0m | [95m 0.562   [0m |
| [95m 178     [0m | [95m 0.975   [0m | [95m 0.4811  [0m | [95m 213.4   [0m 

### <span style="color: skyblue"> **LGBM 하이퍼파라미터 튜닝** </span>

In [25]:
pds = {'learning_rate': (0.01, 1.0),
       'max_depth': (1, 300),
       'n_estimators': (10, 2000),
       'num_leaves': (2, 600),
       'subsample': (0.01, 1.0)
      }

optimizer=BayesianOptimization(lgbm_cv, pds, verbose=1, random_state=2022)    

optimizer.maximize(init_points=10,  
                   n_iter=300, 
                   acq='ei')   

print(optimizer.max)

|   iter    |  target   | learni... | max_depth | n_esti... | num_le... | subsample |
-------------------------------------------------------------------------------------
| [95m 2       [0m | [95m 0.9696  [0m | [95m 0.4921  [0m | [95m 269.4   [0m | [95m 1.298e+0[0m | [95m 538.4   [0m | [95m 0.7239  [0m |
| [95m 4       [0m | [95m 0.9697  [0m | [95m 0.4999  [0m | [95m 102.5   [0m | [95m 1.243e+0[0m | [95m 586.6   [0m | [95m 0.1055  [0m |
| [95m 5       [0m | [95m 0.9723  [0m | [95m 0.7468  [0m | [95m 88.46   [0m | [95m 604.4   [0m | [95m 452.0   [0m | [95m 0.02848 [0m |
| [95m 9       [0m | [95m 0.9735  [0m | [95m 0.09057 [0m | [95m 13.69   [0m | [95m 600.6   [0m | [95m 219.5   [0m | [95m 0.4954  [0m |
| [95m 26      [0m | [95m 0.9736  [0m | [95m 0.2574  [0m | [95m 80.89   [0m | [95m 1.833e+0[0m | [95m 369.7   [0m | [95m 0.5243  [0m |
| [95m 119     [0m | [95m 0.9737  [0m | [95m 0.454   [0m | [95m 163.3   [0m

### <span style="color: skyblue"> **ExtraTrees 하이퍼파라미터 튜닝** </span>

In [26]:
pds = {'n_estimators': (10, 2000),
       'max_depth': (1, 300),
       'min_samples_split': (2, 100),
       'min_samples_leaf': (1, 100)
      }

optimizer=BayesianOptimization(xtree_cv, pds, verbose=1, random_state=2022)    

optimizer.maximize(init_points=10,  
                   n_iter=300, 
                   acq='ei')   

print(optimizer.max)

|   iter    |  target   | max_depth | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [95m 2       [0m | [95m 0.8235  [0m | [95m 205.9   [0m | [95m 49.21   [0m | [95m 89.97   [0m | [95m 1.298e+0[0m |
| [95m 6       [0m | [95m 0.8241  [0m | [95m 223.5   [0m | [95m 29.96   [0m | [95m 31.27   [0m | [95m 1.507e+0[0m |
| [95m 9       [0m | [95m 0.8886  [0m | [95m 292.8   [0m | [95m 4.744   [0m | [95m 79.84   [0m | [95m 722.2   [0m |
| [95m 11      [0m | [95m 0.9195  [0m | [95m 296.2   [0m | [95m 2.823   [0m | [95m 86.48   [0m | [95m 680.8   [0m |
| [95m 14      [0m | [95m 0.9253  [0m | [95m 279.9   [0m | [95m 1.914   [0m | [95m 74.01   [0m | [95m 690.2   [0m |
| [95m 33      [0m | [95m 0.9319  [0m | [95m 266.6   [0m | [95m 2.576   [0m | [95m 42.04   [0m | [95m 711.8   [0m |
| [95m 34      [0m | [95m 0.937   [0m | [95m 252.3   [0m | [95m 1.919   [0m 