In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_curve 
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, losses
from tensorflow.keras.datasets import fashion_mnist
from tensorflow.keras.models import Model

In [None]:
# Load the DHCP CSV into a DataFrame (path is relative to the working directory).
data_frame = pd.read_csv(filepath_or_buffer='./data/IP/DHCP.csv')

In [None]:
# Replace every missing value in the frame with 0.
data_frame = data_frame.fillna(value=0)

In [None]:
# Element-wise sum of the three columns, kept both as a standalone array (`server`)
# and as a new 'server' column of the frame.
server = (
    data_frame['Svr_detect'] + data_frame['Svr_connect'] + data_frame['Ss_request']
).values
data_frame['server'] = server

In [None]:
# The three source columns have been folded into 'server'; remove them from the frame.
data_frame = data_frame.drop(columns=['Svr_detect', 'Svr_connect', 'Ss_request'])

In [None]:
# Read the 'y' column of each label file; missing entries count as 0 (False).
server_label = pd.read_csv('./server_label.csv')['y'].fillna(0).values.astype(bool)
ss_label = pd.read_csv('./ss_label.csv')['y'].fillna(0).values.astype(bool)

# A row is labelled 1.0 when either label file flags it, otherwise 0.0.
data_frame['y'] = (server_label | ss_label).astype(float)
data_frame

In [None]:
# Plot the 'server' column against the row index of the frame.
data_frame['server'].plot(figsize=(30,12))
plt.xlabel('Time')
plt.ylabel('Value')
plt.legend(loc='upper left')
# One tick every 1000 rows. The upper bound is taken from the frame itself rather than
# a fixed row count, so the ticks always span exactly the data that was loaded.
plt.xticks(np.arange(0, len(data_frame), 1000), fontsize=15, rotation=90)
plt.grid(True)
plt.show()

In [None]:
#idx_half = data_frame.index[data_frame['Timestamp'] == '20210630_2350-0000'].tolist()[0]
#print(idx_half)
#train_data = data_frame[:idx_half+1]
#test_data = data_frame[idx_half+1:]
#print(train_data.shape)
#print(test_data.shape)

In [None]:
# Chronological train / validation / test split.
#   train : rows [0, TRAIN_END)
#   val   : rows [TRAIN_END, test_start)
#   test  : rows [test_start, end), i.e. everything after SPLIT_TIMESTAMP
SPLIT_TIMESTAMP = '20210630_2350-0000'
TRAIN_END = 18064

split_matches = data_frame.index[data_frame['Timestamp'] == SPLIT_TIMESTAMP].tolist()
if not split_matches:
    # Fail with a readable message instead of a bare IndexError on an empty list.
    raise ValueError(f"Timestamp {SPLIT_TIMESTAMP!r} not found in data_frame['Timestamp']")
idx_half = split_matches[0]
test_start = idx_half + 1

train_data = data_frame.iloc[:TRAIN_END]
# Validation runs right up to the first test row, so the three splits are contiguous:
# no rows are left unassigned between validation and test, and none are shared.
val_data = data_frame.iloc[TRAIN_END:test_start]
test_data = data_frame.iloc[test_start:]
print(train_data.shape)
print(val_data.shape)
print(test_data.shape)

In [None]:
# Drop the 'Timestamp' column from each of the three splits.
train_data = train_data.drop(columns=['Timestamp'])
val_data = val_data.drop(columns=['Timestamp'])
test_data = test_data.drop(columns=['Timestamp'])

In [None]:
# Convert each split to a NumPy array.
train_values, test_values, val_values = (
    split.values for split in (train_data, test_data, val_data)
)

# Features are every column except the last; the last column is used as the label.
train_data, train_labels = train_values[:, :-1], train_values[:, -1]
test_data, test_labels = test_values[:, :-1], test_values[:, -1]
val_data, val_labels = val_values[:, :-1], val_values[:, -1]

# Report the resulting shapes: features first, then labels.
for split_array in (train_data, test_data, val_data,
                    train_labels, test_labels, val_labels):
    print(split_array.shape)

In [None]:
# Normalization: min-max scale every split using the minimum and maximum of the
# training features. `tf.reduce_min` / `tf.reduce_max` are called without an axis, so a
# single global min/max over all feature values is used (not one per column).
min_val = tf.reduce_min(train_data)
max_val = tf.reduce_max(train_data)


def _min_max_scale(values):
    """Scale `values` with the training min/max and return the result as float32."""
    return tf.cast((values - min_val) / (max_val - min_val), tf.float32)


train_data = _min_max_scale(train_data)
test_data = _min_max_scale(test_data)
val_data = _min_max_scale(val_data)

In [None]:
# Turn the train and test labels into boolean masks (non-zero -> True) and show them.
train_labels, test_labels = train_labels.astype(bool), test_labels.astype(bool)
print(train_labels)
print(test_labels)

In [None]:
# Keep only the rows whose label is False (normal samples) in the train and test splits.
normal_train_data = tf.boolean_mask(train_data, np.logical_not(train_labels))
normal_test_data = tf.boolean_mask(test_data, np.logical_not(test_labels))
print(f"정상 데이터(Train)의 shape: {normal_train_data.shape}")
print(f"정상 데이터(Test)의 shape: {normal_test_data.shape}")

In [None]:
# Keep only the rows whose label is True (anomalous samples) in the train and test splits.
anomalous_train_data = tf.boolean_mask(train_data, train_labels)
anomalous_test_data = tf.boolean_mask(test_data, test_labels)
print(f"비정상 데이터(Train)의 shape: {anomalous_train_data.shape}")
print(f"비정상 데이터(Test)의 shape: {anomalous_test_data.shape}")

In [None]:
# Build the model
class AnomalyDetector(Model):
    """Fully connected autoencoder whose output is a reconstruction of its input.

    The encoder stacks Dense layers of 128, 16 and 8 units (all ReLU). The decoder
    stacks Dense layers of 1, 2 and 2 units; the first two use ReLU and the last a
    sigmoid, so every reconstructed sample has 2 values in the range [0, 1].

    NOTE(review): the decoder starts with a single-unit layer, so the reconstruction
    passes through a 1-value bottleneck -- confirm this is intentional.
    """

    def __init__(self):
        super().__init__()

        # Layers are created in the same order as they are applied.
        self.encoder = tf.keras.Sequential(
            [layers.Dense(units, activation="relu") for units in (128, 16, 8)]
        )
        self.decoder = tf.keras.Sequential(
            [
                layers.Dense(units, activation=activation)
                for units, activation in ((1, "relu"), (2, "relu"), (2, "sigmoid"))
            ]
        )

    def call(self, x):
        """Encode `x`, then decode the result and return the reconstruction."""
        return self.decoder(self.encoder(x))


autoencoder = AnomalyDetector()

In [None]:
# Optimise the mean absolute reconstruction error with Adam.
autoencoder.compile(loss='mae', optimizer='adam')

In [None]:
# Train on normal samples only; input and target are the same tensor because the model
# learns to reconstruct its input. The validation split is passed as-is (it has not
# been filtered by label).
history = autoencoder.fit(
    x=normal_train_data,
    y=normal_train_data,
    validation_data=(val_data, val_data),
    batch_size=128,
    epochs=30,
    shuffle=True,
)

In [None]:
# Plot the training results: loss per epoch for the training and validation data.
for history_key, curve_label in (("loss", "Training Loss"),
                                 ("val_loss", "Validation Loss")):
    plt.plot(history.history[history_key], label=curve_label)
plt.legend()

In [None]:
# Derive the anomaly threshold from the reconstruction error on the validation split:
# threshold = mean(error) + one standard deviation of the error.
# NOTE(review): the error is measured on `val_data`, which has not been filtered by
# label, even though the result is stored in a variable called `train_loss` -- confirm
# that calibrating on the unfiltered validation split is intended.
reconstructions = autoencoder.predict(x=val_data)
train_loss = tf.keras.losses.mae(val_data, reconstructions)
threshold = np.mean(train_loss) + np.std(train_loss)
print("Threshold: ", threshold)

In [None]:
def predict(model, data, threshold):
    """Flag which samples the model reconstructs with an error below `threshold`.

    Args:
        model: callable that maps `data` to its reconstruction.
        data: samples to evaluate.
        threshold: upper bound (exclusive) on the reconstruction error.

    Returns:
        Boolean tensor that is True where the mean absolute error between the
        reconstruction and the input is strictly less than `threshold`.
    """
    reconstructed = model(data)
    reconstruction_error = tf.keras.losses.mae(reconstructed, data)
    return tf.math.less(reconstruction_error, threshold)

In [None]:
# `predict` returns True for samples below the threshold; invert it so that
# 1.0 marks a sample at or above the threshold and 0.0 a sample below it.
preds = np.array(~predict(autoencoder, test_data, threshold)).astype(float)
print(Counter(preds))


In [None]:
# Wrap the predictions in a single-column frame and write it out without the index.
preds = pd.DataFrame(data=preds, columns=['Prediction'])
preds.to_csv(path_or_buf='IP_answer.csv', index=False)