In [1]:
# data analysis and wrangling
import numpy as np
import pandas as pd
import json

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.metrics import explained_variance_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV

# ignore Warnings
import warnings
warnings.filterwarnings('ignore')

from datetime import datetime

In [2]:
train_df = pd.read_csv("preprocessed_train.csv", encoding='euc-kr')

In [3]:
train_df.head()

Unnamed: 0,분석데이터,label,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
0,1,1,144,12.298611,1771,5.356616,0,0,0,1,...,10,4,10,9,4,0,1,0,0,0
1,2,1,804,9.580846,7703,6.063542,0,0,0,6,...,43,121,84,78,47,36,40,45,27,36
2,3,0,2205,12.736054,28083,6.10705,9,0,0,6,...,326,268,239,286,199,148,154,37,48,36
3,4,0,2602,10.28824,26770,5.373013,8,0,0,1,...,336,230,206,245,76,0,26,702,1,5
4,5,1,8980,23.252339,208806,5.775223,0,28,16,3,...,731,882,1171,1010,322,64,327,84,75,244


In [4]:
train_df.dtypes

분석데이터           int64
label           int64
numstrings      int64
avlength      float64
printables      int64
               ...   
dist_91         int64
dist_92         int64
dist_93         int64
dist_94         int64
dist_95         int64
Length: 618, dtype: object

In [5]:
train_df_x = train_df.drop(['분석데이터','label'], axis=1)

In [6]:
train_df_y = train_df['label']

In [7]:
train_df_y = train_df_y.astype('int')

In [73]:
x_train, x_test, y_train, y_test = train_test_split(train_df_x, train_df_y, test_size=0.2, random_state=42)

In [74]:
x_train

Unnamed: 0,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,a_0,a_1,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
9254,92,13.500000,1242,5.617271,0,0,0,1,16738,4323,...,9,11,5,12,3,0,2,3,2,0
1561,629,11.941176,7511,6.061898,0,0,0,4,36467,335,...,72,35,84,76,27,13,34,20,21,24
1670,362,5.748619,2081,6.047759,0,0,0,7,393967,32891,...,9,5,8,23,4,19,11,10,7,4
6087,27790,5.742965,159597,6.560686,0,0,0,75,4663,170,...,1796,1502,2255,1308,2056,1381,1569,1239,1632,1320
6669,2042,8.457884,17271,6.306899,0,9,0,3,16454,9185,...,138,138,141,171,114,84,122,99,98,82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,728,21.414835,15590,5.339247,0,0,0,1,5083,22,...,151,19,23,172,2,0,0,0,0,0
5191,31774,11.639391,369830,5.401709,0,31,0,1,125002,6049,...,1930,2522,2864,4418,690,381,338,440,231,335
5390,596,7.041946,4197,6.054880,0,0,0,6,220163,20051,...,28,40,19,27,24,18,37,24,11,15
860,205,11.965854,2453,5.786017,0,0,0,1,43248,24338,...,13,15,16,11,1,1,0,1,1,3


In [75]:
x_test

Unnamed: 0,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,a_0,a_1,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
6252,322,14.465839,4658,5.727650,1,0,0,1,16382,362,...,47,22,37,35,12,2,8,7,0,3
4684,905,11.585635,10485,5.863804,0,0,0,7,556624,141386,...,61,130,94,116,31,22,16,37,29,38
1731,71103,12.933730,919627,5.825820,0,0,0,42,32479,19871,...,3726,2856,7919,14652,1581,1086,2448,1175,1135,1118
4742,4162,9.887794,41153,6.116438,0,0,0,9,7687,502,...,212,156,196,217,165,149,141,144,142,5366
4521,64,16.078125,1029,5.794510,1,0,0,1,99747,7113,...,8,2,7,4,1,0,4,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6412,4,14.250000,57,4.404366,0,0,0,1,95534,7356,...,0,0,0,0,0,0,0,0,0,0
8285,16828,5.795401,97525,6.539994,1,0,0,39,37231,4295,...,1135,807,1476,658,1462,781,860,649,978,678
7853,3312,24.904287,82483,5.833135,0,39,0,8,30819,3000,...,437,969,803,851,116,123,181,100,75,86
1095,51256,11.418039,585243,5.888161,0,0,0,37,39342,2370,...,2888,2557,5104,8930,1519,962,1754,1001,1050,1016


In [76]:
y_train

9254    0
1561    1
1670    1
6087    0
6669    0
       ..
5734    1
5191    1
5390    0
860     1
7270    0
Name: label, Length: 8000, dtype: int32

In [77]:
y_test

6252    0
4684    1
1731    1
4742    1
4521    1
       ..
6412    1
8285    0
7853    1
1095    1
6929    0
Name: label, Length: 2000, dtype: int32

In [78]:
train_df_x.shape

(10000, 616)

In [79]:
x_train.dtypes

numstrings      int64
avlength      float64
printables      int64
entropy       float64
paths           int64
               ...   
dist_91         int64
dist_92         int64
dist_93         int64
dist_94         int64
dist_95         int64
Length: 616, dtype: object

In [15]:
y_train.dtypes

dtype('int32')

In [16]:
x_train.astype('float')

Unnamed: 0,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,a_0,a_1,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
3042,1160.0,13.968103,16203.0,5.592541,0.0,0.0,0.0,2.0,390735.0,29870.0,...,93.0,130.0,121.0,103.0,9.0,2.0,19.0,2.0,5.0,18.0
688,7256.0,6.037486,43808.0,6.565055,0.0,16.0,0.0,17.0,1131891.0,2778.0,...,486.0,477.0,451.0,428.0,446.0,449.0,415.0,401.0,434.0,419.0
2401,4.0,139.500000,558.0,3.311878,0.0,0.0,0.0,1.0,175208.0,9429.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7919,30385.0,5.665756,172154.0,6.583076,0.0,12.0,0.0,112.0,565634.0,151579.0,...,1801.0,1812.0,1644.0,1829.0,1764.0,1606.0,1685.0,1709.0,1707.0,1707.0
5888,582.0,16.383162,9535.0,5.765152,0.0,1.0,0.0,1.0,80061.0,4706.0,...,69.0,55.0,89.0,176.0,48.0,1.0,0.0,12.0,111.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2385,12946.0,22.648926,293213.0,6.212143,0.0,93.0,0.0,26.0,24649.0,2051.0,...,2249.0,2223.0,3324.0,2459.0,1424.0,1420.0,1623.0,1453.0,1248.0,671.0
5118,6790.0,206.887923,1404769.0,5.018292,0.0,9.0,0.0,11.0,50303.0,42452.0,...,21705.0,19933.0,27071.0,31742.0,74191.0,67.0,194.0,101.0,76.0,66.0
3563,542.0,20.575646,11152.0,5.279839,11.0,0.0,0.0,2.0,4703.0,1493.0,...,75.0,47.0,79.0,70.0,10.0,8.0,4.0,7.0,1.0,1.0
5511,3468.0,11.835928,41047.0,6.048495,0.0,188.0,0.0,32.0,25817.0,549.0,...,532.0,261.0,288.0,290.0,151.0,129.0,115.0,135.0,104.0,114.0


In [17]:
# Support Vector Machines
svc = SVC()
svc.fit(x_train, y_train)
svc.score(x_test, y_test)

0.7025

In [18]:
# KNN
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(x_train, y_train)
knn.score(x_test, y_test)

0.804

In [19]:
# Gaussian Naive Bayes
gaussian = GaussianNB()
gaussian.fit(x_train, y_train)
gaussian.score(x_test, y_test)

0.5705

In [20]:
# Perceptron
perceptron = Perceptron()
perceptron.fit(x_train, y_train)
perceptron.score(x_test, y_test)

0.742

In [21]:
sgd = SGDClassifier()
sgd.fit(x_train, y_train)
sgd.score(x_test, y_test)

0.651

In [22]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(x_train, y_train)
decision_tree.score(x_test, y_test)

0.846

In [23]:
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(x_train, y_train)
random_forest.score(x_test, y_test)

0.891

In [24]:
x_train.shape

(8000, 616)

In [30]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [32]:
xgb_clf = xgb.XGBClassifier(silent=False,
                            n_estimators = 500,
                            booster='gbtree',
                            tree_method='gpu_hist',
                            preidctor= 'gpu_predictor',
                            scale_pos_weight=1,
                            learning_rate=0.02,
                            objective='binary:logistic', 
                            seed=42)

In [33]:
xgb_param_grid = {'max_depth': [3, 5, 7, 9],
                  'subsample': [0.6, 0.8, 1.0],
                  'min_child_weight': [1, 3, 5, 10],
                  'colsample_bytree': [0.6, 0.8, 1.0],
                  'gamma' : [0.5, 1, 1.5, 2, 5],
                 }

hr_grid = GridSearchCV(estimator=xgb_clf,
                       param_grid=xgb_param_grid,
                       scoring='roc_auc',
                       n_jobs=5,
                       cv=3,
                       refit=True, 
                       return_train_score=True)

start_time = timer(None)
hr_grid.fit(x_train, y_train)
timer(start_time)

Parameters: { "preidctor", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 3 hours 58 minutes and 38.7 seconds.


In [34]:
hr_grid_df = pd.DataFrame(hr_grid.cv_results_)
hr_grid_df.loc[:, ['mean_test_score', "params"]]

Unnamed: 0,mean_test_score,params
0,0.949371,"{'colsample_bytree': 0.6, 'gamma': 0.5, 'max_d..."
1,0.948366,"{'colsample_bytree': 0.6, 'gamma': 0.5, 'max_d..."
2,0.945477,"{'colsample_bytree': 0.6, 'gamma': 0.5, 'max_d..."
3,0.948744,"{'colsample_bytree': 0.6, 'gamma': 0.5, 'max_d..."
4,0.947434,"{'colsample_bytree': 0.6, 'gamma': 0.5, 'max_d..."
...,...,...
715,0.960819,"{'colsample_bytree': 1.0, 'gamma': 5, 'max_dep..."
716,0.959442,"{'colsample_bytree': 1.0, 'gamma': 5, 'max_dep..."
717,0.959502,"{'colsample_bytree': 1.0, 'gamma': 5, 'max_dep..."
718,0.959925,"{'colsample_bytree': 1.0, 'gamma': 5, 'max_dep..."


In [35]:
hr_grid_df[hr_grid_df['rank_test_score'] == 1]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_gamma,param_max_depth,param_min_child_weight,param_subsample,params,...,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
277,82.392673,0.384263,0.029675,0.002494,0.8,0.5,9,1,0.8,"{'colsample_bytree': 0.8, 'gamma': 0.5, 'max_d...",...,0.969508,0.97082,0.970882,0.001148,1,1.0,1.0,1.0,1.0,0.0


In [39]:
## 최고성능
best_score = hr_grid.best_score_
# 최고성능을 내는 행을 찾아냄
best_row = hr_grid.best_index_

# 최적 초모수: max_depth, subsample
best_max_depth = hr_grid.best_params_["max_depth"]
best_max_subsample = hr_grid.best_params_["subsample"]
best_min_child_weight = hr_grid.best_params_["min_child_weight"]
best_colsample_bytree = hr_grid.best_params_["colsample_bytree"]
best_gamma = hr_grid.best_params_["gamma"]


nl = '\n'
print(f'예측모형성능(AUC):  \t {best_score:.3f}{nl}\
        인덱스:           \t {best_row}{nl}\
        max_depth:      \t {best_max_depth}{nl}\
        subsample:      \t {best_max_subsample}{nl}\
        colsample_bytree:      \t {best_colsample_bytree}{nl}\
        min_child_weight:      \t {best_min_child_weight}{nl}\
        gamma:      \t {best_gamma}')

예측모형성능(AUC):  	 0.971
        인덱스:           	 277
        max_depth:      	 9
        subsample:      	 0.8
        colsample_bytree:      	 0.8
        min_child_weight:      	 1
        gamma:      	 0.5


In [40]:
pred = hr_grid.predict(x_test)

In [41]:
accuracy_score(y_test, pred)

0.915

In [42]:
classification_report(y_test, pred)

'              precision    recall  f1-score   support\n\n           0       0.91      0.90      0.91       900\n           1       0.92      0.92      0.92      1100\n\n    accuracy                           0.92      2000\n   macro avg       0.91      0.91      0.91      2000\nweighted avg       0.91      0.92      0.91      2000\n'

In [81]:
xgb_final = xgb.XGBClassifier(silent=False,
                              n_estimators = 600,
                              booster='gbtree',
                              tree_method='gpu_hist',
                              preidctor= 'gpu_predictor',
                              scale_pos_weight=1,
                              learning_rate=0.01,
                              objective='binary:logistic',
                              max_depth = 9,
                              subsample = 0.8,
                              colsample_bytree = 0.8,
                              min_child_weight = 1,
                              gamma = 0.5,
                              seed=42)

In [82]:
xgb_final

XGBClassifier(base_score=None, booster='gbtree', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.8, gamma=0.5,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=0.01, max_delta_step=None, max_depth=9,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=600, n_jobs=None, num_parallel_tree=None,
              preidctor='gpu_predictor', random_state=None, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=1, seed=42, silent=False,
              subsample=0.8, tree_method='gpu_hist', validate_parameters=None,
              verbosity=None)

In [83]:
final_pred = xgb_final.fit(x_train, y_train).predict(x_test)

Parameters: { "preidctor", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [84]:
final_pred

array([0, 1, 1, ..., 1, 1, 0])

In [85]:
classification_report(y_test, final_pred)

'              precision    recall  f1-score   support\n\n           0       0.92      0.90      0.91       900\n           1       0.92      0.94      0.93      1100\n\n    accuracy                           0.92      2000\n   macro avg       0.92      0.92      0.92      2000\nweighted avg       0.92      0.92      0.92      2000\n'

In [86]:
accuracy_score(y_test, final_pred)

0.921

In [71]:
filename = 'xgb_model_by_minseok.model'

In [87]:
xgb_final.save_model(filename)