In [96]:
# data analysis and wrangling
import numpy as np
import pandas as pd
import json

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.metrics import explained_variance_score
from sklearn.metrics import classification_report, accuracy_score

from keras.models import Sequential
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from tensorflow import keras
import tensorflow as tf

# ignore Warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
train_df = pd.read_csv("preprocessed_train.csv", encoding='euc-kr')

In [3]:
train_df.head()

Unnamed: 0,분석데이터,label,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
0,1,1,144,12.298611,1771,5.356616,0,0,0,1,...,10,4,10,9,4,0,1,0,0,0
1,2,1,804,9.580846,7703,6.063542,0,0,0,6,...,43,121,84,78,47,36,40,45,27,36
2,3,0,2205,12.736054,28083,6.10705,9,0,0,6,...,326,268,239,286,199,148,154,37,48,36
3,4,0,2602,10.28824,26770,5.373013,8,0,0,1,...,336,230,206,245,76,0,26,702,1,5
4,5,1,8980,23.252339,208806,5.775223,0,28,16,3,...,731,882,1171,1010,322,64,327,84,75,244


In [4]:
train_df.dtypes

분석데이터           int64
label           int64
numstrings      int64
avlength      float64
printables      int64
               ...   
dist_91         int64
dist_92         int64
dist_93         int64
dist_94         int64
dist_95         int64
Length: 618, dtype: object

In [5]:
train_df_x = train_df.drop(['분석데이터','label'], axis=1)

In [6]:
train_df_y = train_df['label']

In [7]:
train_df_y = train_df_y.astype('int')

In [8]:
x_train, x_test, y_train, y_test = train_test_split(train_df_x, train_df_y, test_size=0.2, random_state=42, stratify=train_df_y)

In [9]:
x_train

Unnamed: 0,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,a_0,a_1,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
3042,1160,13.968103,16203,5.592541,0,0,0,2,390735,29870,...,93,130,121,103,9,2,19,2,5,18
688,7256,6.037486,43808,6.565055,0,16,0,17,1131891,2778,...,486,477,451,428,446,449,415,401,434,419
2401,4,139.500000,558,3.311878,0,0,0,1,175208,9429,...,0,0,0,0,0,0,0,0,0,0
7919,30385,5.665756,172154,6.583076,0,12,0,112,565634,151579,...,1801,1812,1644,1829,1764,1606,1685,1709,1707,1707
5888,582,16.383162,9535,5.765152,0,1,0,1,80061,4706,...,69,55,89,176,48,1,0,12,111,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2385,12946,22.648926,293213,6.212143,0,93,0,26,24649,2051,...,2249,2223,3324,2459,1424,1420,1623,1453,1248,671
5118,6790,206.887923,1404769,5.018292,0,9,0,11,50303,42452,...,21705,19933,27071,31742,74191,67,194,101,76,66
3563,542,20.575646,11152,5.279839,11,0,0,2,4703,1493,...,75,47,79,70,10,8,4,7,1,1
5511,3468,11.835928,41047,6.048495,0,188,0,32,25817,549,...,532,261,288,290,151,129,115,135,104,114


In [10]:
x_test

Unnamed: 0,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,a_0,a_1,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
4856,2016,13.085813,26381,6.024467,1,14,0,3,18560,416,...,133,143,189,164,56,29,105,25,38,33
488,4,43.250000,173,4.148725,0,0,0,1,349327,2935,...,1,1,0,0,0,0,0,0,0,0
7193,3,60.333333,181,3.884033,0,0,0,1,3058,50,...,0,0,0,0,0,0,0,0,0,0
9317,3446,18.289611,63026,5.796207,3,94,2,22,12583,509,...,410,625,388,542,83,15,201,24,347,45
72,1491,10.094567,15051,5.932281,0,0,0,1,82020,13439,...,178,45,121,58,21,27,56,37,22,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5044,149216,5.882171,877714,6.579395,1,1,0,501,23524,429,...,9027,8845,8741,9247,8740,8855,9306,8700,9056,9146
9504,264,10.359848,2735,6.015467,0,0,0,1,4825,95,...,14,9,26,27,5,3,15,4,7,3
3437,2757,9.026478,24886,5.759953,4,0,0,1,19979,1386,...,261,139,409,474,86,36,417,129,150,346
5854,5609,18.947049,106274,6.230731,4,64,5,12,7429,463,...,845,1178,1107,989,513,224,286,225,166,246


In [11]:
y_train

3042    0
688     1
2401    0
7919    1
5888    0
       ..
2385    1
5118    0
3563    1
5511    0
1212    0
Name: label, Length: 8000, dtype: int32

In [12]:
y_test

4856    1
488     1
7193    0
9317    1
72      0
       ..
5044    1
9504    0
3437    1
5854    1
3981    1
Name: label, Length: 2000, dtype: int32

In [13]:
train_df_x.shape

(10000, 616)

In [14]:
x_train.dtypes

numstrings      int64
avlength      float64
printables      int64
entropy       float64
paths           int64
               ...   
dist_91         int64
dist_92         int64
dist_93         int64
dist_94         int64
dist_95         int64
Length: 616, dtype: object

In [15]:
y_train.dtypes

dtype('int32')

In [16]:
x_train.astype('float')

Unnamed: 0,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,a_0,a_1,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
3042,1160.0,13.968103,16203.0,5.592541,0.0,0.0,0.0,2.0,390735.0,29870.0,...,93.0,130.0,121.0,103.0,9.0,2.0,19.0,2.0,5.0,18.0
688,7256.0,6.037486,43808.0,6.565055,0.0,16.0,0.0,17.0,1131891.0,2778.0,...,486.0,477.0,451.0,428.0,446.0,449.0,415.0,401.0,434.0,419.0
2401,4.0,139.500000,558.0,3.311878,0.0,0.0,0.0,1.0,175208.0,9429.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7919,30385.0,5.665756,172154.0,6.583076,0.0,12.0,0.0,112.0,565634.0,151579.0,...,1801.0,1812.0,1644.0,1829.0,1764.0,1606.0,1685.0,1709.0,1707.0,1707.0
5888,582.0,16.383162,9535.0,5.765152,0.0,1.0,0.0,1.0,80061.0,4706.0,...,69.0,55.0,89.0,176.0,48.0,1.0,0.0,12.0,111.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2385,12946.0,22.648926,293213.0,6.212143,0.0,93.0,0.0,26.0,24649.0,2051.0,...,2249.0,2223.0,3324.0,2459.0,1424.0,1420.0,1623.0,1453.0,1248.0,671.0
5118,6790.0,206.887923,1404769.0,5.018292,0.0,9.0,0.0,11.0,50303.0,42452.0,...,21705.0,19933.0,27071.0,31742.0,74191.0,67.0,194.0,101.0,76.0,66.0
3563,542.0,20.575646,11152.0,5.279839,11.0,0.0,0.0,2.0,4703.0,1493.0,...,75.0,47.0,79.0,70.0,10.0,8.0,4.0,7.0,1.0,1.0
5511,3468.0,11.835928,41047.0,6.048495,0.0,188.0,0.0,32.0,25817.0,549.0,...,532.0,261.0,288.0,290.0,151.0,129.0,115.0,135.0,104.0,114.0


In [17]:
# Support Vector Machines
svc = SVC()
svc.fit(x_train, y_train)
svc.score(x_test, y_test)

0.7025

In [18]:
# KNN
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(x_train, y_train)
knn.score(x_test, y_test)

0.804

In [19]:
# Gaussian Naive Bayes
gaussian = GaussianNB()
gaussian.fit(x_train, y_train)
gaussian.score(x_test, y_test)

0.5705

In [20]:
# Perceptron
perceptron = Perceptron()
perceptron.fit(x_train, y_train)
perceptron.score(x_test, y_test)

0.742

In [21]:
sgd = SGDClassifier()
sgd.fit(x_train, y_train)
sgd.score(x_test, y_test)

0.6645

In [22]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(x_train, y_train)
decision_tree.score(x_test, y_test)

0.833

In [23]:
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(x_train, y_train)
random_forest.score(x_test, y_test)

0.89

In [24]:
x_train.shape

(8000, 616)

In [25]:
xgb_clf = xgb.XGBClassifier(silent=False,
                            booster='gbtree',
                            scale_pos_weight=1,
                            learning_rate=0.01,
                            colsample_bytree = 0.4,
                            subsample = 0.8,
                            objective='binary:logistic', 
                            n_estimators=100, 
                            max_depth=4, 
                            gamma=10, 
                            seed=777)

In [26]:
pred = xgb_clf.fit(x_train, y_train).predict(x_test)

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [27]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.84      0.73      0.78       900
           1       0.80      0.89      0.84      1100

    accuracy                           0.81      2000
   macro avg       0.82      0.81      0.81      2000
weighted avg       0.82      0.81      0.81      2000



In [28]:
accuracy_list = []

max_depth_list = [3,5,7,9,10]

for max_depth in max_depth_list:
    xgb_model = xgb.XGBClassifier(max_depth=max_depth, seed=777)
    xgb_pred = xgb_model.fit(x_train, y_train).predict(x_test)
    xgb_accuracy = accuracy_score(y_test, xgb_pred) 
    accuracy_list.append(xgb_accuracy)
    
xgb_df = pd.DataFrame({'tree depth':max_depth_list, 'accuracy':accuracy_list})
xgb_df.head()    



Unnamed: 0,tree depth,accuracy
0,3,0.8915
1,5,0.9065
2,7,0.9125
3,9,0.9155
4,10,0.9155


In [29]:
# cnn ----------------------------------------------

In [80]:
x_train.values

array([[1.16000000e+03, 1.39681034e+01, 1.62030000e+04, ...,
        2.00000000e+00, 5.00000000e+00, 1.80000000e+01],
       [7.25600000e+03, 6.03748622e+00, 4.38080000e+04, ...,
        4.01000000e+02, 4.34000000e+02, 4.19000000e+02],
       [4.00000000e+00, 1.39500000e+02, 5.58000000e+02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [5.42000000e+02, 2.05756458e+01, 1.11520000e+04, ...,
        7.00000000e+00, 1.00000000e+00, 1.00000000e+00],
       [3.46800000e+03, 1.18359285e+01, 4.10470000e+04, ...,
        1.35000000e+02, 1.04000000e+02, 1.14000000e+02],
       [8.81000000e+02, 1.59489217e+01, 1.40510000e+04, ...,
        5.00000000e+00, 4.00000000e+00, 1.30000000e+01]])

In [79]:
x_train

Unnamed: 0,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,a_0,a_1,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
3042,1160,13.968103,16203,5.592541,0,0,0,2,390735,29870,...,93,130,121,103,9,2,19,2,5,18
688,7256,6.037486,43808,6.565055,0,16,0,17,1131891,2778,...,486,477,451,428,446,449,415,401,434,419
2401,4,139.500000,558,3.311878,0,0,0,1,175208,9429,...,0,0,0,0,0,0,0,0,0,0
7919,30385,5.665756,172154,6.583076,0,12,0,112,565634,151579,...,1801,1812,1644,1829,1764,1606,1685,1709,1707,1707
5888,582,16.383162,9535,5.765152,0,1,0,1,80061,4706,...,69,55,89,176,48,1,0,12,111,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2385,12946,22.648926,293213,6.212143,0,93,0,26,24649,2051,...,2249,2223,3324,2459,1424,1420,1623,1453,1248,671
5118,6790,206.887923,1404769,5.018292,0,9,0,11,50303,42452,...,21705,19933,27071,31742,74191,67,194,101,76,66
3563,542,20.575646,11152,5.279839,11,0,0,2,4703,1493,...,75,47,79,70,10,8,4,7,1,1
5511,3468,11.835928,41047,6.048495,0,188,0,32,25817,549,...,532,261,288,290,151,129,115,135,104,114


In [82]:
y_train

3042    0
688     1
2401    0
7919    1
5888    0
       ..
2385    1
5118    0
3563    1
5511    0
1212    0
Name: label, Length: 8000, dtype: int32

In [83]:
y_train.values

array([0, 1, 0, ..., 1, 0, 0])

In [78]:
x_train.shape[1]

616

In [142]:
tmodel = Sequential()
tmodel.add(Dense(input_dim=x_train.shape[1], units=128,
                 kernel_initializer='normal', bias_initializer='zeros'))
tmodel.add(Activation('relu'))

for i in range(0, 8):
    tmodel.add(Dense(units=64, kernel_initializer='normal',
                     bias_initializer='zeros'))
    tmodel.add(Activation('relu'))
    tmodel.add(Dropout(.25))

tmodel.add(Dense(units=2))
tmodel.add(Activation('linear'))

tmodel.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['accuracy'])

In [None]:
tmodel.fit(x_train.values, y_train.values, epochs=600, verbose=2)

Epoch 1/600
250/250 - 2s - loss: 2.1668 - accuracy: 0.4786
Epoch 2/600
250/250 - 2s - loss: 0.4407 - accuracy: 0.5030
Epoch 3/600
250/250 - 2s - loss: 0.4406 - accuracy: 0.5002
Epoch 4/600
250/250 - 2s - loss: 0.3121 - accuracy: 0.4965
Epoch 5/600
250/250 - 2s - loss: 0.2946 - accuracy: 0.5291
Epoch 6/600
250/250 - 2s - loss: 0.2746 - accuracy: 0.5236
Epoch 7/600
250/250 - 2s - loss: 0.2010 - accuracy: 0.5066
Epoch 8/600
250/250 - 2s - loss: 0.1851 - accuracy: 0.4810
Epoch 9/600
250/250 - 2s - loss: 0.1905 - accuracy: 0.4809
Epoch 10/600
250/250 - 2s - loss: 0.1796 - accuracy: 0.4971
Epoch 11/600
250/250 - 2s - loss: 0.2050 - accuracy: 0.4823
Epoch 12/600
250/250 - 2s - loss: 0.1791 - accuracy: 0.5042
Epoch 13/600
250/250 - 2s - loss: 0.1737 - accuracy: 0.5101
Epoch 14/600
250/250 - 2s - loss: 0.1767 - accuracy: 0.5102
Epoch 15/600
250/250 - 2s - loss: 0.1978 - accuracy: 0.4904
Epoch 16/600
250/250 - 2s - loss: 0.1880 - accuracy: 0.5146
Epoch 17/600
250/250 - 2s - loss: 0.1769 - accura

In [141]:
tmodel.evaluate(x_train, y_train)



[0.10647700726985931, 0.531125009059906]