# Data Pretreatment

In [1]:
import pandas as pd

## Network Intrusion Detection Dataset Features

이 리스트는 네트워크 침입 탐지 데이터셋의 특성(features)을 설명합니다. 이 데이터셋은 네트워크 트래픽 분석을 통해 침입 여부를 감지하는 데 사용됩니다.

1. **duration**: 연결 지속 시간.
2. **protocol_type**: 통신에 사용되는 프로토콜의 종류.
3. **service**: 서비스 유형.
4. **flag**: 연결 상태 플래그.
5. **src_bytes**: 송신자에서 전송된 바이트 수.
6. **dst_bytes**: 목적지에서 출발지로 전송된 데이터 바이트 수.
7. **land**: 출발지와 목적지 IP 및 포트가 동일한지 여부.
8. **wrong_fragment**: 잘못된 프래그먼트 수.
9. **urgent**: 긴급(urgent) 패킷의 수.
10. **hot**: "hot" 지표(indicator)의 수.
11. **num_failed_logins**: 실패한 로그인 시도 횟수.
12. **logged_in**: 로그인 상태 여부.
13. **num_compromised**: "compromised"(침해) 조건이 발생한 횟수.
14. **root_shell**: 루트 권한 셸 획득 여부.
15. **su_attempted**: su 명령어 시도 여부.
16. **num_root**: 루트 권한 접근 횟수.
17. **num_file_creations**: 파일 생성 횟수.
18. **num_shells**: 셸 실행 횟수.
19. **num_access_files**: 접근 파일 수.
20. **num_outbound_cmds**: 외부로 나가는 명령어 수 (이상적으로는 0).
21. **is_host_login**: 호스트 로그인 여부.
22. **is_guest_login**: 게스트 로그인 여부.
23. **count**: 최근 2초 동안 현재 연결과 동일한 호스트로 이루어진 연결 수.
24. **srv_count**: 최근 2초 동안 현재 연결과 동일한 서비스로 이루어진 연결 수.
25. **serror_rate**: SYN 에러가 발생한 연결의 비율.
26. **srv_serror_rate**: 서비스별 에러가 있는 연결 비율.
27. **rerror_rate**: REJ 에러가 발생한 연결의 비율.
28. **srv_rerror_rate**: 서비스별 리턴 에러 비율.
29. **same_srv_rate**: 동일한 서비스 비율.
30. **diff_srv_rate**: 다른 서비스 비율.
31. **srv_diff_host_rate**: 서비스별 다른 호스트 비율.
32. **dst_host_count**: 목적지 호스트 수.
33. **dst_host_srv_count**: 목적지 호스트의 서비스 수.
34. **dst_host_same_srv_rate**: 목적지 호스트의 동일한 서비스 비율.
35. **dst_host_diff_srv_rate**: 목적지 호스트의 다른 서비스 비율.
36. **dst_host_same_src_port_rate**: 목적지 호스트로부터 동일한 송신 포트 비율.
37. **dst_host_srv_diff_host_rate**: 목적지 호스트의 서비스별 다른 호스트 비율.
38. **dst_host_serror_rate**: 목적지 호스트의 에러가 있는 연결 비율.
39. **dst_host_srv_serror_rate**: 목적지 호스트의 서비스별 에러가 있는 연결 비율.
40. **dst_host_rerror_rate**: 목적지 호스트의 리턴 에러 비율.
41. **dst_host_srv_rerror_rate**: 목적지 호스트의 서비스별 리턴 에러 비율.
42. **attack_type**: 공격 유형.
43. **success_pred**: 성공 여부 예측 값.

이러한 특성들은 네트워크 보안 및 침입 감지 분야에서 중요한 정보를 제공하는 데 사용됩니다. 참고로, 위 목록의 43개 항목과 달리 아래에서 불러오는 CSV는 총 42개 열로 구성되어 있으며 마지막 열은 `class`입니다.

---

In [2]:
# Read the training CSV into a DataFrame; the path is relative to the notebook's working directory.
kddTrainDataOriginal = pd.read_csv("Datasets/KDD/KDDTrain+.csv")

In [3]:
# Show the training frame as the cell output (pandas abbreviates long/wide frames when rendering).
kddTrainDataOriginal

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0.0,0.0,0.0,5.0,491.0,0.0,0.0,0.0,0.0,0.0,...,25.0,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,0.0
1,0.0,1.0,53.0,5.0,146.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,0.0
2,0.0,0.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,26.0,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,1.0
3,0.0,0.0,19.0,5.0,232.0,8153.0,0.0,0.0,0.0,0.0,...,255.0,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,0.0
4,0.0,0.0,19.0,5.0,199.0,420.0,0.0,0.0,0.0,0.0,...,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0.0,0.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,25.0,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00,1.0
125969,8.0,1.0,10.0,5.0,105.0,145.0,0.0,0.0,0.0,0.0,...,244.0,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00,0.0
125970,0.0,0.0,3.0,5.0,2231.0,384.0,0.0,0.0,0.0,0.0,...,30.0,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00,0.0
125971,0.0,0.0,4.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00,1.0


In [4]:
# Print the column names, non-null counts and dtypes of the training frame.
kddTrainDataOriginal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125973 entries, 0 to 125972
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     125973 non-null  float64
 1   protocol_type                125973 non-null  float64
 2   service                      125973 non-null  float64
 3   flag                         125973 non-null  float64
 4   src_bytes                    125973 non-null  float64
 5   dst_bytes                    125973 non-null  float64
 6   land                         125973 non-null  float64
 7   wrong_fragment               125973 non-null  float64
 8   urgent                       125973 non-null  float64
 9   hot                          125973 non-null  float64
 10  num_failed_logins            125973 non-null  float64
 11  logged_in                    125973 non-null  float64
 12  num_compromised              125973 non-null  float64
 13 

## Test Dataset

In [5]:
# Read the test CSV into a DataFrame; the path is relative to the notebook's working directory.
kddTestDataOriginal = pd.read_csv("Datasets/KDD/KDDTest+.csv")

In [6]:
# Print the column names, non-null counts and dtypes of the test frame.
kddTestDataOriginal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22544 entries, 0 to 22543
Data columns (total 42 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   duration                     22544 non-null  float64
 1   protocol_type                22544 non-null  float64
 2   service                      22544 non-null  float64
 3   flag                         22544 non-null  float64
 4   src_bytes                    22544 non-null  float64
 5   dst_bytes                    22544 non-null  float64
 6   land                         22544 non-null  float64
 7   wrong_fragment               22544 non-null  float64
 8   urgent                       22544 non-null  float64
 9   hot                          22544 non-null  float64
 10  num_failed_logins            22544 non-null  float64
 11  logged_in                    22544 non-null  float64
 12  num_compromised              22544 non-null  float64
 13  root_shell      

# Models

In [7]:
from keras import models, layers

2023-08-20 19:17:46.987725: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-20 19:17:47.016056: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
# First network variant, assembled layer by layer.
# NOTE(review): a later cell rebinds `model` to "CNN-Models-2" before compile/fit,
# so this variant is never compiled or trained in the visible notebook.
model = models.Sequential(name="CNN-Models")

# Each sample is declared as a (41, 1) tensor.
model.add(layers.Input((41, 1)))
model.add(layers.Dense(41, activation='relu'))
model.add(layers.BatchNormalization())

# Two 1-D convolution stages, each followed by batch normalisation.
model.add(layers.Conv1D(84, kernel_size=5, activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.Conv1D(42, kernel_size=5, activation='relu'))
model.add(layers.BatchNormalization())

# Classification head: flatten, small hidden layer, single sigmoid unit.
model.add(layers.Flatten())
model.add(layers.Dense(8, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

2023-08-20 19:17:48.373080: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-20 19:17:48.404263: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-20 19:17:48.404443: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [9]:
# Second network variant, assembled layer by layer; this rebinding of `model`
# is the one that the following compile/fit cells operate on.
model = models.Sequential(name="CNN-Models-2")

# Each sample is declared as a (41, 1) tensor.
model.add(layers.Input((41, 1)))
model.add(layers.Dense(20, activation='relu'))
model.add(layers.BatchNormalization())

# Two 1-D convolution stages, each followed by batch normalisation.
model.add(layers.Conv1D(42, kernel_size=5, activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.Conv1D(82, kernel_size=5, activation='relu'))
model.add(layers.BatchNormalization())

# Classification head: flatten, hidden layer of 20 units, single sigmoid unit.
model.add(layers.Flatten())
model.add(layers.Dense(20, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

In [10]:
from keras import losses, metrics, optimizers

# Configure training: binary cross-entropy loss, RMSprop with its default
# settings, and binary accuracy as the reported metric.
model.compile(
    optimizer=optimizers.RMSprop(),
    loss=losses.binary_crossentropy,
    metrics=[metrics.BinaryAccuracy()],
)

In [12]:
from sklearn.model_selection import train_test_split

# Feature data: every column except the label column to be predicted.
X = kddTrainDataOriginal.drop(columns=['class'])

# Label data: the column to be predicted.
Y = kddTrainDataOriginal['class']


# Split into a training part and a held-out part with sklearn's train_test_split.
# random_state is fixed so that a Restart & Run All reproduces the same split
# (without it every run shuffles differently and results are not comparable).
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=42)

In [18]:
# Train for 10 epochs in mini-batches of 30 rows, evaluating on the held-out
# split (x_test, y_test) at the end of every epoch.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=30,
    epochs=10,
    validation_data=(x_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

KeyboardInterrupt: 

In [None]:
from sklearn.metrics import accuracy_score
import numpy as np

# Derive the test features/labels from the loaded test frame. These two names
# were previously used without ever being defined, so the cell raised a
# NameError on a fresh kernel.
# NOTE(review): assumes the test CSV carries the same `class` label column as
# the training CSV — confirm.
kddTestData = kddTestDataOriginal.drop(columns=['class'])
kddTestCheckData = kddTestDataOriginal['class']

# Raw model outputs for every test row.
pred_y = model.predict(kddTestData)

# Accuracy for each decision threshold in [0, 0.5) in steps of 0.005.
# Keys are rounded so the dict is not keyed by float noise such as
# 0.015000000000000001; the comparison itself uses the unrounded threshold.
acc = {}
for i in np.arange(0, 0.5, 0.005):
    pred = (pred_y > i).astype(int)
    acc.update({round(float(i), 3): accuracy_score(kddTestCheckData, pred)})

In [None]:
# Show the threshold -> accuracy mapping built in the previous cell.
acc

In [None]:
# Libraries needed to compute and draw the confusion matrix
import itertools

import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix


# Function that draws a confusion matrix
def plot_confusion_matrix(con_mat, labels, title='Confusion Matrix', cmap=plt.get_cmap('Blues'), normalize=False):
    """Draw a confusion matrix as a coloured grid with a value in every cell.

    Parameters
    ----------
    con_mat : array-like, 2-D
        Confusion matrix; rows are true classes, columns are predicted classes.
    labels : sequence
        Class labels, in the same order as the rows/columns of ``con_mat``.
    title : str
        Figure title.
    cmap : colormap
        Colormap passed to ``plt.imshow``.
    normalize : bool
        If True, each cell shows its share of its own row (true class) as a
        percentage; otherwise the raw count is shown. The colours always
        reflect the raw counts.
    """
    con_mat = np.asarray(con_mat)
    # Number of samples per true class; used for the y tick labels and as the
    # per-row denominator when normalising.
    row_totals = con_mat.sum(axis=1)

    plt.imshow(con_mat, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    marks = np.arange(len(labels))
    nlabels = ['{0}(n={1})'.format(labels[k], row_totals[k]) for k in range(len(con_mat))]
    plt.xticks(marks, labels)
    plt.yticks(marks, nlabels)

    # Cells darker than half of the maximum get white text for contrast.
    thresh = con_mat.max() / 2.
    for i, j in itertools.product(range(con_mat.shape[0]), range(con_mat.shape[1])):
        if normalize:
            # Divide by the total of *this* row. Previously the denominator was
            # a variable left over from the label loop, i.e. always the last
            # row's total. Empty rows are reported as 0% instead of dividing by 0.
            share = con_mat[i, j] * 100 / row_totals[i] if row_totals[i] else 0.0
            cell_text = '{0:.1f}%'.format(share)
        else:
            cell_text = con_mat[i, j]
        plt.text(j, i, cell_text, horizontalalignment="center", color="white" if con_mat[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()


# Predicted and true values on the test file.
# NOTE(review): assumes the test CSV has a `class` column holding 0/1 labels,
# like the training CSV — confirm.
labels = [0, 1]
true_labels = kddTestDataOriginal['class'].astype(int)
# The network ends in a single sigmoid unit, so the class is obtained by
# thresholding the output; argmax over one column would always return 0.
pred_labels = (model.predict(kddTestDataOriginal.drop(columns=['class'])) > 0.5).astype(int).ravel()

# Main execution. The result is stored under its own name so the imported
# `confusion_matrix` function is not overwritten and the cell can be re-run.
con_mat = confusion_matrix(true_labels, pred_labels, labels=labels)
plot_confusion_matrix(con_mat, labels=labels, normalize=True)