In [1]:
# data analysis and wrangling
import numpy as np
import pandas as pd
import json

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold, StratifiedKFold

import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier

import tensorflow as tf

from tensorflow.keras import datasets, layers, models

# ignore Warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the preprocessed training table, decoding the file as EUC-KR.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, so the CSV's
# saved row index appears to be read back as an ordinary column and later ends
# up among the model features — confirm this is intended.
train_df = pd.read_csv("preprocessed_train.csv", encoding='euc-kr')

In [3]:
train_df.head()

Unnamed: 0,분석데이터,label,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
0,1,1,144,12.298611,1771,5.356616,0,0,0,1,...,10,4,10,9,4,0,1,0,0,0
1,2,1,804,9.580846,7703,6.063542,0,0,0,6,...,43,121,84,78,47,36,40,45,27,36
2,3,0,2205,12.736054,28083,6.10705,9,0,0,6,...,326,268,239,286,199,148,154,37,48,36
3,4,0,2602,10.28824,26770,5.373013,8,0,0,1,...,336,230,206,245,76,0,26,702,1,5
4,5,1,8980,23.252339,208806,5.775223,0,28,16,3,...,731,882,1171,1010,322,64,327,84,75,244


In [4]:
train_df.dtypes

분석데이터           int64
label           int64
numstrings      int64
avlength      float64
printables      int64
               ...   
dist_91         int64
dist_92         int64
dist_93         int64
dist_94         int64
dist_95         int64
Length: 618, dtype: object

In [5]:
# Copy of the table without the '분석데이터' column.
train_df2 = train_df.drop(columns=['분석데이터'])

In [6]:
# Feature columns: everything in train_df2 except the target.
train_df_x = train_df2.drop(columns=['label'])

In [7]:
# Target column.
train_df_y = train_df2.loc[:, 'label']

In [8]:
# Split the loaded table into target and features in one step.
# The target is cast to a plain integer dtype.
# NOTE(review): the "Unnamed: 0" column stays in the feature frame — confirm it
# is meant to be a model input.
train_df_y = train_df['label'].astype('int')
train_df_x = train_df.drop(columns=['분석데이터', 'label'])

In [9]:
# Hold out 20% of the rows for testing; the split is stratified on the label
# and seeded so it is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    train_df_x,
    train_df_y,
    test_size=0.2,
    stratify=train_df_y,
    random_state=42,
)

In [10]:
x_train

Unnamed: 0,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,a_0,a_1,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
3042,1160,13.968103,16203,5.592541,0,0,0,2,390735,29870,...,93,130,121,103,9,2,19,2,5,18
688,7256,6.037486,43808,6.565055,0,16,0,17,1131891,2778,...,486,477,451,428,446,449,415,401,434,419
2401,4,139.500000,558,3.311878,0,0,0,1,175208,9429,...,0,0,0,0,0,0,0,0,0,0
7919,30385,5.665756,172154,6.583076,0,12,0,112,565634,151579,...,1801,1812,1644,1829,1764,1606,1685,1709,1707,1707
5888,582,16.383162,9535,5.765152,0,1,0,1,80061,4706,...,69,55,89,176,48,1,0,12,111,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2385,12946,22.648926,293213,6.212143,0,93,0,26,24649,2051,...,2249,2223,3324,2459,1424,1420,1623,1453,1248,671
5118,6790,206.887923,1404769,5.018292,0,9,0,11,50303,42452,...,21705,19933,27071,31742,74191,67,194,101,76,66
3563,542,20.575646,11152,5.279839,11,0,0,2,4703,1493,...,75,47,79,70,10,8,4,7,1,1
5511,3468,11.835928,41047,6.048495,0,188,0,32,25817,549,...,532,261,288,290,151,129,115,135,104,114


In [11]:
x_test

Unnamed: 0,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,a_0,a_1,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
4856,2016,13.085813,26381,6.024467,1,14,0,3,18560,416,...,133,143,189,164,56,29,105,25,38,33
488,4,43.250000,173,4.148725,0,0,0,1,349327,2935,...,1,1,0,0,0,0,0,0,0,0
7193,3,60.333333,181,3.884033,0,0,0,1,3058,50,...,0,0,0,0,0,0,0,0,0,0
9317,3446,18.289611,63026,5.796207,3,94,2,22,12583,509,...,410,625,388,542,83,15,201,24,347,45
72,1491,10.094567,15051,5.932281,0,0,0,1,82020,13439,...,178,45,121,58,21,27,56,37,22,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5044,149216,5.882171,877714,6.579395,1,1,0,501,23524,429,...,9027,8845,8741,9247,8740,8855,9306,8700,9056,9146
9504,264,10.359848,2735,6.015467,0,0,0,1,4825,95,...,14,9,26,27,5,3,15,4,7,3
3437,2757,9.026478,24886,5.759953,4,0,0,1,19979,1386,...,261,139,409,474,86,36,417,129,150,346
5854,5609,18.947049,106274,6.230731,4,64,5,12,7429,463,...,845,1178,1107,989,513,224,286,225,166,246


In [12]:
y_train

3042    0
688     1
2401    0
7919    1
5888    0
       ..
2385    1
5118    0
3563    1
5511    0
1212    0
Name: label, Length: 8000, dtype: int32

In [13]:
y_test

4856    1
488     1
7193    0
9317    1
72      0
       ..
5044    1
9504    0
3437    1
5854    1
3981    1
Name: label, Length: 2000, dtype: int32

In [14]:
y_train.dtypes

dtype('int32')

In [15]:
# Scale every feature column by its largest value, then reshape each sample
# into a 22 x 28 single-channel grid for the Conv2D input.
# NOTE: x_train must still be the DataFrame produced by the split; a later cell
# rebinds it to the 4-D array, so re-run the split before re-running this cell.
x_raw = x_train.to_numpy(dtype=float)

# Find the largest value of every column. Maxima are floored at 0 so a column
# with only non-positive values never yields a negative divisor.
col_max = np.maximum(x_raw.max(axis=0), 0.0)
# A column whose maximum is 0 would give 0/0 = NaN (and silently poison
# training, since warnings are suppressed); divide by 1 so it is left as-is.
col_scale = np.where(col_max > 0, col_max, 1.0)

x2 = x_raw / col_scale   # (samples, features), new array: inputs are not mutated
x = x2.T                 # feature-major layout, kept under the name this cell has always defined

# Reshape the data into 2-D grids (one 22 x 28 grid per sample, plus a channel axis).
assert x2.shape[1] == 22 * 28, f"expected {22 * 28} features, got {x2.shape[1]}"
x3 = x2.reshape(-1, 22, 28)
x4 = x3.reshape(-1, 22, 28, 1)

In [16]:
# Rebind x_train to the scaled (samples, 22, 28, 1) array built in the previous
# cell. After this line the original feature DataFrame is no longer reachable
# under the name x_train.
x_train = x4

In [17]:
x_train.shape

(8000, 22, 28, 1)

In [18]:
# Convert the training labels to a NumPy array for model.fit.
# np.asarray is used instead of Series.to_numpy() so the cell can be re-run:
# on the second run y_train is already an ndarray, which has no .to_numpy().
y_train = np.asarray(y_train)

In [19]:
x_test.shape

(2000, 616)

In [20]:
# Apply the training-time preprocessing to the held-out split: scale every
# feature column, then reshape each sample into a 22 x 28 single-channel grid.
# NOTE: x_test must still be the DataFrame produced by the split.
xt_raw = x_test.to_numpy(dtype=float)

# Scale with the column maxima of the TRAINING rows, not of the test rows, so
# both splits go through the same transformation. The training rows are
# recovered as "every row of train_df_x that is not in the test split", because
# x_train itself has already been rebound to the reshaped array by this point.
train_rows = train_df_x.drop(index=x_test.index)
train_col_max = np.maximum(train_rows.to_numpy(dtype=float).max(axis=0), 0.0)
# A zero maximum would give a division by zero (NaN/inf); divide by 1 instead.
train_col_scale = np.where(train_col_max > 0, train_col_max, 1.0)

xt2 = xt_raw / train_col_scale   # (samples, features), new array: inputs are not mutated
xt = xt2.T                       # feature-major layout, kept under the name this cell has always defined

# Reshape the data into 2-D grids (one 22 x 28 grid per sample, plus a channel axis).
assert xt2.shape[1] == 22 * 28, f"expected {22 * 28} features, got {xt2.shape[1]}"
xt3 = xt2.reshape(-1, 22, 28)
xt4 = xt3.reshape(-1, 22, 28, 1)

In [21]:
# Rebind x_test to the scaled (samples, 22, 28, 1) array built in the previous
# cell. After this line the original test DataFrame is no longer reachable
# under the name x_test.
x_test = xt4

In [22]:
x_test.shape

(2000, 22, 28, 1)

In [23]:
# Build the convolutional layers

# Three 3x3 ReLU convolutions (32, 64, 64 filters); the first two are each
# followed by 2x2 max pooling and 25% dropout. Input is a 22 x 28 grid with
# one channel.
model = models.Sequential([
    layers.Conv2D(32, (3,3), activation='relu', input_shape=(22, 28, 1)),
    layers.MaxPooling2D((2,2)),
    layers.Dropout(0.25),
    layers.Conv2D(64, (3,3), activation='relu'),
    layers.MaxPooling2D((2,2)),
    layers.Dropout(0.25),
    layers.Conv2D(64, (3,3), activation='relu'),
])

In [24]:
# Add the Dense layers at the end
# Head: dropout -> flatten -> 64-unit ReLU layer -> dropout -> 2-unit softmax.
# NOTE: re-running this cell appends the head a second time; rebuild the model
# in the previous cell first.
for head_layer in (
    layers.Dropout(0.25),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.25),
    layers.Dense(2, activation='softmax'),
):
    model.add(head_layer)

In [25]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 20, 26, 32)        320       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 10, 13, 32)        0         
_________________________________________________________________
dropout (Dropout)            (None, 10, 13, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 8, 11, 64)         18496     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 4, 5, 64)          0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 4, 5, 64)          0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 2, 3, 64)          3

In [None]:
# Compile and train the model
# Integer class labels are paired with the 2-unit softmax output through the
# sparse categorical cross-entropy loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.fit(x=x_train, y=y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
 30/250 [==>...........................] - ETA: 0s - loss: 0.2953 - accuracy: 0.8802

In [None]:
# Evaluate the model
# model.evaluate returns the loss followed by the compiled metrics
# (here: accuracy); only the accuracy is printed.
evaluation = model.evaluate(x_test, y_test, verbose=2)
test_loss, test_acc = evaluation
print(test_acc)