In [4]:
from google.colab import drive
drive.mount('/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive


**1. 데이터 전처리 코드 (이전 파일과 동일함)**

In [5]:

import pandas as pd
import numpy as np
import glob

# 데이터 한꺼번에 불러와서 1개의 dataframe에 merge
all_files = glob.glob("/gdrive/My Drive/ARP/*.csv")
file_list = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0, low_memory=False)
    file_list.append(df)

arp_data = pd.concat(file_list, axis=0, ignore_index=True)
arp_data = arp_data.loc[:, ['ip.src', 'ip.dst', 'eth.src', 'eth.dst', 'tcp.seq', 'label']]
arp_data = arp_data.dropna()

#데이터 모습 확인
print(len(arp_data))
print(arp_data.columns)

#라벨링
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

#생성
encoder = LabelEncoder()

arp_data['ip.src'] = encoder.fit_transform(arp_data['ip.src'])
arp_data['ip.dst'] = encoder.fit_transform(arp_data['ip.dst'])
arp_data['eth.src'] = encoder.fit_transform(arp_data['eth.src'])
arp_data['eth.dst'] = encoder.fit_transform(arp_data['eth.dst'])

# Min-Max Scaling (정규화)
scaler = MinMaxScaler(feature_range=(0,1))
arp_data['tcp.seq'] = scaler.fit_transform(arp_data[['tcp.seq']])

#확인
arp_data['ip.src']
arp_data['label'] #int형

#이제 label과 데이터로 DF를 나눠준다
df_labels= arp_data['label']
df_packets=arp_data.drop(['label'],axis='columns')

#리스트로 되돌리기
packets= df_packets.values.tolist()
labels= df_labels.values.tolist()

#길이 확인
len(packets)
#len(labels)
len(packets[0]) #5 = 'ip.src', 'ip.dst', 'eth.src', 'eth.dst', 'tcp.seq',


189055
Index(['ip.src', 'ip.dst', 'eth.src', 'eth.dst', 'tcp.seq', 'label'], dtype='object')


5

**2. 교차 검증, Accuracy, Precision rate, Recall rate, F1-Score**

In [7]:
from keras import backend as K
#사용자 정의 metircs함수
def f1score(y_target, y_pred):
    _recall = recall(y_target, y_pred)
    _precision = precision(y_target, y_pred)
    _f1score = ( 2 * _recall * _precision) / (_recall + _precision+ K.epsilon())
    return _f1score

def recall(y_target, y_pred):
    y_target_yn = K.round(K.clip(y_target, 0, 1))
    y_pred_yn = K.round(K.clip(y_pred, 0, 1)) 
    count_true_positive = K.sum(y_target_yn * y_pred_yn) 
    count_true_positive_false_negative = K.sum(y_target_yn)
    recall = count_true_positive / (count_true_positive_false_negative + K.epsilon())
    return recall


def precision(y_target, y_pred):
    y_pred_yn = K.round(K.clip(y_pred, 0, 1)) 
    y_target_yn = K.round(K.clip(y_target, 0, 1)) 
    count_true_positive = K.sum(y_target_yn * y_pred_yn) 
    count_true_positive_false_positive = K.sum(y_pred_yn)
    precision = count_true_positive / (count_true_positive_false_positive + K.epsilon())
    return precision


Using TensorFlow backend.


In [8]:
#cross validation
import tensorflow as tf

from sklearn.model_selection import KFold
num_folds = 5 #폴드 수

#폴드별 정확도를 담을 리스트
ac=[]
re=[]
f1=[]
pr=[]


kfold = KFold(n_splits=num_folds, shuffle=True)

#test와 train으로 나눠주기
#전체189055개 패킷의 70%는 약 132,338
train_set = packets[:132338]
test_set = packets[132338:]
train_label =labels[:132338]
test_label = labels[132338:]

train_set = np.asarray(train_set)
train_label = np.asarray(train_label)
test_set = np.asarray(test_set)
test_label = np.asarray(test_label)

#잘 나눠졌나 확인하기
print(len(train_set) + len(test_set))
print(len(test_label) + len(train_label))

train_set = np.expand_dims(train_set, axis=2) #reshape
test_set = np.expand_dims(test_set, axis=2) #reshape


# Merge inputs and targets
inputs = np.concatenate((test_set, train_set), axis=0)
targets = np.concatenate((test_label, train_label), axis=0)


foldnum = 1
for train, test in kfold.split(inputs, targets):
  print("----------------fold 5/"+str(foldnum)+"----------------")
  model = tf.keras.Sequential([
  tf.keras.layers.Conv1D(16, (2), padding='same', activation='relu', input_shape=(5, 1)),
  tf.keras.layers.MaxPooling1D(2,padding='same'),
  tf.keras.layers.Conv1D(16, (2), padding='same', activation='relu'),
  tf.keras.layers.MaxPooling1D(2, padding='same'),
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(512, activation='relu'),
  tf.keras.layers.Dense(256, activation='relu'),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(15, activation='softmax')
  ])
  #compile
  model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy', precision, recall, f1score])
  #결과
  history = model.fit(inputs[train], targets[train], epochs=5) 
  _loss, _acc, _precision, _recall, _f1score = model.evaluate(inputs[test], targets[test], verbose=0)
  ac.append(_acc)
  pr.append(_precision)
  re.append(_recall)
  f1.append(_f1score)
  print('loss: {:.3f}, accuracy: {:.3f}, precision: {:.3f}, recall: {:.3f}, f1score: {:.3f}'.format(_loss, _acc, _precision, _recall, _f1score))

  #다음 fold로
  foldnum = foldnum + 1



189055
189055
----------------fold 5/1----------------
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
loss: 0.005, accuracy: 0.999, precision: 0.533, recall: 0.788, f1score: 0.619
----------------fold 5/2----------------
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
loss: 0.002, accuracy: 1.000, precision: 0.534, recall: 0.791, f1score: 0.620
----------------fold 5/3----------------
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
loss: 0.016, accuracy: 0.997, precision: 0.538, recall: 0.793, f1score: 0.624
----------------fold 5/4----------------
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
loss: 0.005, accuracy: 0.999, precision: 0.537, recall: 0.792, f1score: 0.623
----------------fold 5/5----------------
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
loss: 0.008, accuracy: 0.998, precision: 0.533, recall: 0.787, f1score: 0.619


In [9]:
print("----------------------전체 fold의 평균------------------------------")
print("Accuracy =" + str(sum(ac)/len(ac)))
print("Precesion =" + str(sum(pr)/len(pr)))
print("Recall =" + str(sum(re)/len(re)))
print(" F1-Score =" + str(sum(f1)/len(f1)))

----------------------전체 fold의 평균------------------------------
Accuracy =0.9986829161643982
Precesion =0.5349143266677856
Recall =0.7901861190795898
 F1-Score =0.6209705471992493


**3. 공격, 정상 여부 출력**

In [10]:
#기록하기
test=np.asarray(packets[0])
print('classification result+', model.predict_classes(test.reshape(1,5,1)))
print('정답='+str(labels[0]))


test2=np.asarray(packets[12478])
print('classification result+', model.predict_classes(test2.reshape(1,5,1)))
print('정답='+str(labels[12478]))

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
classification result+ [0]
정답=0
classification result+ [1]
정답=1
