In [50]:
import pandas as pd

from sklearn.impute import KNNImputer

from pycaret.classification import *
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier

import warnings
warnings.filterwarnings("ignore")

In [51]:
train = pd.read_csv('train_dataset.csv')
test = pd.read_csv('test_dataset_v01.csv')

# 필요없는 열 제거
drop_cols = ["Unnamed: 0", "url_chinese_present", "html_num_tags('applet')"]

train.drop(drop_cols, axis=1, inplace=True)
test.drop(drop_cols, axis=1, inplace = True)

# benign(정상) -> 1, malicious(악성) 0 변경
# XGboost 학습을 위해 -1이 아닌 0으로 변경
train['Result_v1'] = train['Result_v1'].map({'benign': 1, 'malicious': 0})

In [52]:
# train, test 데이터셋 결측치 처리
imputer = KNNImputer(n_neighbors=9)
imputer.fit(train)
x = imputer.transform(train)
train = pd.DataFrame(x, columns=train.columns, index=train.index)

train_impute = train.drop('Result_v1', axis=1)
imputer.fit(train_impute)
x = imputer.transform(test)
test = pd.DataFrame(x, columns=test.columns, index=test.index)

In [53]:
# Target 설정
target = 'Result_v1'

# 데이터 분리
X = train.drop(target, axis=1)
y = train.loc[:, target]

## <span style="color: hotpink"> **1. 모델링(Modeling)** </span>
- Pycaret
- CatBoost
- RandomForest
- XGBoost
- LGBM
- ExtraTrees

### <span style="color: skyblue"> **Pycaret** </span>

In [21]:
setup_model = setup(train, 
                    target='Result_v1',
                    preprocess=False,   # True로 설정되면, 자체적인 Feature Engineering을 추가로 진행해 Predict가 불가능함
                    train_size=0.999,  
                    silent=True)   

Unnamed: 0,Description,Value
0,session_id,8416
1,Target,Result_v1
2,Target Type,Binary
3,Label Encoded,"0.0: 0, 1.0: 1"
4,Original Data,"(3664, 22)"
5,Missing Values,False
6,Numeric Features,19
7,Categorical Features,2
8,Transformed Train Set,"(3660, 21)"
9,Transformed Test Set,"(4, 21)"


In [22]:
top5 = compare_models(n_select=5, sort='Accuracy')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9653,0.9949,0.9644,0.9672,0.9657,0.9306,0.9307,0.054
catboost,CatBoost Classifier,0.9634,0.9944,0.9623,0.9655,0.9638,0.9268,0.9269,0.497
et,Extra Trees Classifier,0.9566,0.9934,0.9596,0.9553,0.9573,0.9131,0.9134,0.05
gbc,Gradient Boosting Classifier,0.9484,0.9886,0.945,0.9529,0.9489,0.8967,0.8969,0.08
dt,Decision Tree Classifier,0.9423,0.9425,0.9343,0.9514,0.9425,0.8847,0.8852,0.005
ada,Ada Boost Classifier,0.9292,0.9791,0.9267,0.9337,0.93,0.8585,0.8588,0.054
knn,K Neighbors Classifier,0.8842,0.9474,0.9008,0.8751,0.8875,0.7681,0.7691,0.007
ridge,Ridge Classifier,0.7943,0.0,0.8917,0.7501,0.8145,0.5873,0.5991,0.003
lda,Linear Discriminant Analysis,0.7937,0.8728,0.8906,0.7499,0.814,0.5862,0.5979,0.004
svm,SVM - Linear Kernel,0.7336,0.0,0.78,0.7508,0.7307,0.466,0.4879,0.004


In [23]:
models = []
for i in top5:
    models.append(tune_model(i,
                             optimize='Accuracy',
                             choose_better=True,
                             n_iter=100))    # 100회

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9563,0.9698,0.9462,0.967,0.9565,0.9126,0.9128
1,0.9372,0.951,0.9677,0.9137,0.9399,0.8742,0.8757
2,0.9372,0.95,0.9409,0.9358,0.9383,0.8743,0.8743
3,0.9399,0.9615,0.9086,0.9713,0.9389,0.8799,0.8818
4,0.9399,0.9676,0.9355,0.9457,0.9405,0.8798,0.8798
5,0.929,0.9574,0.9301,0.9301,0.9301,0.8579,0.8579
6,0.9262,0.9449,0.9135,0.9389,0.926,0.8525,0.8528
7,0.9153,0.9387,0.9189,0.914,0.9164,0.8306,0.8306
8,0.9317,0.9528,0.9243,0.9396,0.9319,0.8634,0.8635
9,0.9153,0.9472,0.9027,0.9278,0.9151,0.8306,0.8309


In [None]:
model_pyc = blend_models(models, optimize = 'Accuracy')
model_pyc = tune_model(model_pyc, 
                    optimize = 'Accuracy', 
                    choose_better = True,
                    n_iter = 100)
model_pyc = finalize_model(model_pyc)

In [56]:
# 예측 및 내보내기
predict = model_pyc.predict(test)
result = pd.read_csv('sample_submission.csv')
result['expected'] = predict
result['expected'] = result['expected'].map({1: 1, 0: -1})
result.to_csv('result/pycaret.csv', index = False)

### <span style="color: skyblue"> **CatBoost** </span>

<img src='img/catboost_tuning.png'>

In [58]:
best_params_cat_1={'bagging_temperature': 5.145,
                 'depth': 15,
                 'l2_leaf_reg': 15.66,
                 'learning_rate': 0.6398,
                 'subsample': 0.7756
                }

In [59]:
model_cat_1 = CatBoostClassifier(**best_params_cat_1, verbose = 100, 
                               eval_metric='F1', random_state=42)

model_cat_1.fit(X, y)

0:	learn: 0.9060716	total: 34.1ms	remaining: 34.1s
100:	learn: 1.0000000	total: 13.7s	remaining: 2m 1s
200:	learn: 1.0000000	total: 26.7s	remaining: 1m 46s
300:	learn: 1.0000000	total: 39.6s	remaining: 1m 31s
400:	learn: 1.0000000	total: 52.1s	remaining: 1m 17s
500:	learn: 1.0000000	total: 1m 4s	remaining: 1m 4s
600:	learn: 1.0000000	total: 1m 17s	remaining: 51.1s
700:	learn: 1.0000000	total: 1m 29s	remaining: 38.1s
800:	learn: 1.0000000	total: 1m 41s	remaining: 25.3s
900:	learn: 1.0000000	total: 1m 53s	remaining: 12.5s
999:	learn: 1.0000000	total: 2m 6s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x164251850>

In [55]:
best_params_cat={'bagging_temperature': 7.367,
                 'depth': 13,
                 'l2_leaf_reg': 3.97,
                 'learning_rate': 0.06901,
                 'subsample': 0.924
                }

In [56]:
model_cat = CatBoostClassifier(**best_params_cat, verbose = 100, 
                               eval_metric='F1', random_state=42)

model_cat.fit(X, y)

0:	learn: 0.9181626	total: 89.9ms	remaining: 1m 29s
100:	learn: 0.9970374	total: 3.17s	remaining: 28.2s
200:	learn: 1.0000000	total: 6.34s	remaining: 25.2s
300:	learn: 1.0000000	total: 9.49s	remaining: 22.1s
400:	learn: 1.0000000	total: 12.3s	remaining: 18.4s
500:	learn: 1.0000000	total: 15s	remaining: 14.9s
600:	learn: 1.0000000	total: 17.5s	remaining: 11.6s
700:	learn: 1.0000000	total: 20.2s	remaining: 8.62s
800:	learn: 1.0000000	total: 22.9s	remaining: 5.69s
900:	learn: 1.0000000	total: 25.4s	remaining: 2.79s
999:	learn: 1.0000000	total: 27.9s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1641f7b50>

In [57]:
# 예측 및 내보내기
predict = model_cat.predict(test)
result = pd.read_csv('sample_submission.csv')
result['expected'] = predict
result['expected'] = result['expected'].map({1: 1, 0: -1})
result.to_csv('result/catboost.csv', index = False)

### <span style="color: skyblue"> **RandomForest** </span>

In [40]:
best_params_rf={'max_depth': 15,
                'n_estimators': 1380,
               }

In [41]:
model_rf = RandomForestClassifier(**best_params_rf, random_state=2022)

model_rf.fit(X, y)

RandomForestClassifier(max_depth=15, n_estimators=1380, random_state=2022)

In [58]:
# 예측 및 내보내기
predict = model_rf.predict(test)
result = pd.read_csv('sample_submission.csv')
result['expected'] = predict
result['expected'] = result['expected'].map({1: 1, 0: -1})
result.to_csv('result/Randomforest.csv', index = False)

### <span style="color: skyblue"> **XGBoost** </span>

In [32]:
best_params_xg={'learning_rate': 0.4811,
                'max_depth': 213,
                'n_estimators': 903,
                'subsample': 0.952,
               }

In [33]:
model_xg = XGBClassifier(**best_params_xg, random_state=2022)

model_xg.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.4811, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=213, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=903,
              n_jobs=0, num_parallel_tree=1, objective='binary:logistic',
              predictor='auto', random_state=2022, reg_alpha=0, ...)

In [59]:
# 예측 및 내보내기
predict = model_xg.predict(test)
result = pd.read_csv('sample_submission.csv')
result['expected'] = predict
result['expected'] = result['expected'].map({1: 1, 0: -1})
result.to_csv('result/xgboost.csv', index = False)

### <span style="color: skyblue"> **LGBM** </span>

In [35]:
best_params_lgbm={'learning_rate': 0.7013,
                  'max_depth': 233,
                  'n_estimators': 2,
                  'num_leaves': 357,
                  'subsample': 0.4237
                 }

In [36]:
model_lgbm = LGBMClassifier(**best_params_lgbm, random_state=2022)

model_lgbm.fit(X, y)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.7013, max_depth=233,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=2, n_jobs=-1, num_leaves=357, objective=None,
               random_state=2022, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
               subsample=0.4237, subsample_for_bin=200000, subsample_freq=0)

In [60]:
# 예측 및 내보내기
predict = model_lgbm.predict(test)
result = pd.read_csv('sample_submission.csv')
result['expected'] = predict
result['expected'] = result['expected'].map({1: 1, 0: -1})
result.to_csv('result/lightgbm.csv', index = False)

### <span style="color: skyblue"> **ExtraTrees** </span>

In [38]:
best_params_xtree={'max_depth': 252,
                  'min_samples_leaf': 2,
                  'min_samples_split':3,
                  'n_estimators': 670
                  }

In [39]:
model_xtree = ExtraTreesClassifier(**best_params_xtree, random_state=2022)

model_xtree.fit(X, y)

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=252, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=2, min_samples_split=3,
                     min_weight_fraction_leaf=0.0, n_estimators=670,
                     n_jobs=None, oob_score=False, random_state=2022, verbose=0,
                     warm_start=False)

In [61]:
# 예측 및 내보내기
predict = model_xtree.predict(test)
result = pd.read_csv('sample_submission.csv')
result['expected'] = predict
result['expected'] = result['expected'].map({1: 1, 0: -1})
result.to_csv('result/ExtraTrees.csv', index = False)

## <span style="color: hotpink"> **2. 앙상블(Ensemble)** </span>
- 스태킹
- 보팅

### <span style="color: skyblue"> **스태킹(Stacking)** </span>

In [41]:
estimators = [('randomforest', RandomForestClassifier(**best_params_rf, random_state=2022)),
              ('xgboost', XGBClassifier(**best_params_xg, random_state=2022)),
              ('lgbm', LGBMClassifier(**best_params_lgbm, random_state=2022)),
              ('xtree', ExtraTreesClassifier(**best_params_xtree, random_state=2022))
             ]

model_stc = StackingClassifier(estimators=estimators,
                           final_estimator= CatBoostClassifier(**best_params_cat, verbose = 100, 
                                                               eval_metric='F1', random_state=2022))

model_stc.fit(X, y)

0:	learn: 0.9650573	total: 212ms	remaining: 3m 31s
100:	learn: 0.9994618	total: 11.2s	remaining: 1m 39s
200:	learn: 1.0000000	total: 21.6s	remaining: 1m 25s
300:	learn: 1.0000000	total: 32.2s	remaining: 1m 14s
400:	learn: 1.0000000	total: 42.4s	remaining: 1m 3s
500:	learn: 1.0000000	total: 53.4s	remaining: 53.2s
600:	learn: 1.0000000	total: 1m 4s	remaining: 42.7s
700:	learn: 1.0000000	total: 1m 14s	remaining: 31.9s
800:	learn: 1.0000000	total: 1m 25s	remaining: 21.1s
900:	learn: 1.0000000	total: 1m 35s	remaining: 10.5s
999:	learn: 1.0000000	total: 1m 45s	remaining: 0us


StackingClassifier(cv=None,
                   estimators=[('randomforest',
                                RandomForestClassifier(bootstrap=True,
                                                       ccp_alpha=0.0,
                                                       class_weight=None,
                                                       criterion='gini',
                                                       max_depth=15,
                                                       max_features='auto',
                                                       max_leaf_nodes=None,
                                                       max_samples=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                             

In [62]:
# 예측 및 내보내기
predict = model_stc.predict(test)
result = pd.read_csv('sample_submission.csv')
result['expected'] = predict
result['expected'] = result['expected'].map({1: 1, 0: -1})
result.to_csv('result/stacking.csv', index = False)

### <span style="color: skyblue"> **보팅(Voting)** </span>

In [60]:
model_v1 = model_cat_1
model_v2 = model_cat

estimators=[('model_v1', model_v1), ('model_v2', model_v2)]
model_voting = VotingClassifier(estimators=estimators, voting='soft')

model_voting.fit(X, y)

0:	learn: 0.9060716	total: 31.3ms	remaining: 31.3s
100:	learn: 1.0000000	total: 13.6s	remaining: 2m
200:	learn: 1.0000000	total: 26.3s	remaining: 1m 44s
300:	learn: 1.0000000	total: 39.5s	remaining: 1m 31s
400:	learn: 1.0000000	total: 52.3s	remaining: 1m 18s
500:	learn: 1.0000000	total: 1m 4s	remaining: 1m 4s
600:	learn: 1.0000000	total: 1m 16s	remaining: 50.9s
700:	learn: 1.0000000	total: 1m 28s	remaining: 37.8s
800:	learn: 1.0000000	total: 1m 40s	remaining: 25s
900:	learn: 1.0000000	total: 1m 52s	remaining: 12.3s
999:	learn: 1.0000000	total: 2m 4s	remaining: 0us
0:	learn: 0.9181626	total: 26.9ms	remaining: 26.9s
100:	learn: 0.9970374	total: 3.04s	remaining: 27s
200:	learn: 1.0000000	total: 6.17s	remaining: 24.5s
300:	learn: 1.0000000	total: 9.71s	remaining: 22.6s
400:	learn: 1.0000000	total: 12.6s	remaining: 18.9s
500:	learn: 1.0000000	total: 15.5s	remaining: 15.4s
600:	learn: 1.0000000	total: 18.3s	remaining: 12.1s
700:	learn: 1.0000000	total: 21s	remaining: 8.94s
800:	learn: 1.0000

VotingClassifier(estimators=[('model_v1',
                              <catboost.core.CatBoostClassifier object at 0x164251850>),
                             ('model_v2',
                              <catboost.core.CatBoostClassifier object at 0x1641f7b50>)],
                 voting='soft')

In [61]:
# 예측 및 내보내기
predict = model_voting.predict(test)
result = pd.read_csv('sample_submission.csv')
result['expected'] = predict
result['expected'] = result['expected'].map({1: 1, 0: -1})
result.to_csv('result/voting.csv', index = False)