In [None]:
import os
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import MinMaxScaler
from torch.nn import functional as F
from torch.utils.data import DataLoader,Dataset
from torch.utils.data import Dataset
from tqdm.notebook import tqdm

In [None]:
### Static Data
# Addresses treated as "inside" hosts when orienting flows during loading.
inside_ip = [
    "192.168.10.50",
    "205.174.165.68",
    "192.168.10.51",
    "205.174.165.66",
    "192.168.10.19",
    "192.168.10.17",
    "192.168.10.16",
    "192.168.10.12",
    "192.168.10.9",
    "192.168.10.5",
    "192.168.10.8",
    "192.168.10.14",
    "192.168.10.15",
    "192.168.10.25",
    "8.8.8.8",
    "192.168.10.1",
]

# Directory holding the per-day CSV files ('_' is a placeholder to be edited).
dataset_path = '_'

# Column names of the CSV rows, in file order.
data_col = [
    'ts', 'te', 'td',
    'sa', 'da', 'sp', 'dp', 'pr',
    'ipkt', 'ibyt', 'opkt', 'obyt',
    'Label',
]

# Numeric columns fed to the model.
feature_col = ['td', 'ipkt', 'ibyt', 'opkt', 'obyt']

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

In [None]:
### Load Dataset + ip processing
ip_set = set(inside_ip)


def _orient_flow(fields, inside):
    """Swap the source/destination columns when the source is not an inside host.

    `fields` is one CSV row split on commas, laid out as `data_col`.
    If the source address (column 3) is not in `inside`, the address (3/4),
    port (5/6), packet-count (8/10) and byte-count (9/11) columns are swapped
    in place. The same list is returned.
    """
    if fields[3] not in inside:
        fields[3], fields[4] = fields[4], fields[3]
        fields[5], fields[6] = fields[6], fields[5]
        fields[8], fields[10] = fields[10], fields[8]
        fields[9], fields[11] = fields[11], fields[9]
    return fields


def _load_flows(csv_path, inside):
    """Read one CSV file (header line skipped) into a list of oriented rows.

    Each row is appended exactly once. The file is streamed line by line
    rather than loaded whole, and blank lines are skipped.
    """
    rows = []
    with open(csv_path, 'r', encoding='utf-8') as f:
        next(f, None)  # skip the header line
        for line in tqdm(f):
            line = line.strip()
            if not line:
                continue
            rows.append(_orient_flow(line.split(','), inside))
    return rows


# os.path.join keeps the path valid on both Windows and POSIX systems.
train_tmp = _load_flows(os.path.join(dataset_path, "Monday.csv"), ip_set)
train_dataset = pd.DataFrame(train_tmp, columns=data_col)

test_day = ['Tuesday', 'Wednesday', 'Thursday', 'Friday']
test_tmp = []
for day in test_day:
    test_tmp.extend(_load_flows(os.path.join(dataset_path, f"{day}.csv"), ip_set))
test_dataset = pd.DataFrame(test_tmp, columns=data_col)

In [None]:
# Parse the start/end timestamp columns of both frames.
for frame in (train_dataset, test_dataset):
    for time_col in ('ts', 'te'):
        frame[time_col] = pd.to_datetime(frame[time_col])

# The numeric columns arrive as strings from the CSV split; cast them to float.
float_cols = dict.fromkeys(['td', 'ipkt', 'ibyt', 'obyt', 'opkt'], 'float')
train_dataset = train_dataset.astype(float_cols)
test_dataset = test_dataset.astype(float_cols)

In [None]:
# Z-score every feature column using statistics of the training frame only,
# and apply the same shift/scale to the test frame.
# NOTE(review): this overwrites the columns in place, so re-running the cell
# normalises already-normalised data.
statistic_dict = {}
for key in feature_col:
    col_mean = np.mean(train_dataset[key])
    col_std = np.std(train_dataset[key])
    statistic_dict[key] = {'MEAN': col_mean, 'STD': col_std}
    train_dataset[key] = (train_dataset[key] - col_mean) / col_std
    test_dataset[key] = (test_dataset[key] - col_mean) / col_std

In [None]:
# Give every row a positional id so sequence losses can be mapped back to rows.
for frame in (test_dataset, train_dataset):
    frame['index'] = list(range(len(frame)))

### data preprocessing

In [None]:
def build_dataset(data, timewindow=5, seq_size=10, test=True):
    """Cut flows into fixed-length sequences per time window and destination.

    The frame is split into consecutive windows of `timewindow` minutes
    starting at the earliest `ts`. Inside each window, rows are grouped by
    the `da` column (groups are visited in order of first appearance).
    A group shorter than `seq_size` yields one sequence zero-padded at the end;
    a longer group yields every sliding window of `seq_size` consecutive rows.

    Args:
        data: DataFrame with at least the columns `ts`, `da`, `index`, the
            columns named in the module-level `feature_col`, and `Label`
            when `test` is true.
        timewindow: window length in minutes.
        seq_size: number of rows per emitted sequence.
        test: when true, a sequence is labelled with a non-"BENIGN" label of
            its group if one exists (the alphabetically first one, so the
            choice is deterministic); otherwise every label is "BENIGN".

    Returns:
        A tuple `(ret, label_list, document)` of equal-length lists:
        `ret` holds arrays of shape `(seq_size, len(feature_col))`,
        `label_list` the label of each sequence, and `document` the `index`
        values of the rows that make up each sequence (fewer than `seq_size`
        entries for padded sequences).
    """
    ret = []
    label_list = []
    document = []
    if len(data) == 0:
        # Nothing to window; avoid min()/max() on an empty column.
        return (ret, label_list, document)

    window = timedelta(minutes=timewindow)
    # Anchor the first window at the earliest timestamp so the window count
    # below covers every row even when the frame is not sorted by `ts`.
    first_ts = data['ts'].min()
    n_windows = int((data['ts'].max() - first_ts) / window) + 1

    st = first_ts
    for _ in tqdm(range(n_windows)):
        et = st + window
        in_window = data[(st <= data['ts']) & (data['ts'] < et)]
        # sort=False keeps groups in order of first appearance.
        for _, flows in in_window.groupby('da', sort=False):
            features = flows[feature_col].values
            row_ids = list(flows["index"].values)

            label = "BENIGN"
            if test:
                # {"BENIGN"} is a one-element set; set("BENIGN") would be a
                # set of single characters and never remove the label.
                attacks = set(flows['Label'].unique()) - {"BENIGN"}
                if attacks:
                    label = sorted(attacks)[0]

            if len(features) < seq_size:
                padded = np.pad(features, ((0, seq_size - len(features)), (0, 0)))
                ret.append(padded)
                document.append(row_ids)
                label_list.append(label)
            else:
                for i in range(len(features) - seq_size + 1):
                    ret.append(features[i:i + seq_size])
                    document.append(row_ids[i:i + seq_size])
                    label_list.append(label)
        st = et
    return (ret, label_list, document)

In [None]:
# Optimisation schedule: number of passes over the training set and batch size.
epoch, batch_size = 16, 16

# Model widths: per-step input features, encoder hidden size, latent size.
feature_size, hidden_size, latent = 5, 16, 8

# Sequence construction: window length in minutes and rows per sequence.
timewindow, seq_size = 10, 30

In [None]:
class NewDataset(Dataset):
    """Torch dataset of fixed-length flow sequences produced by `build_dataset`.

    Args:
        dataset: source DataFrame handed to `build_dataset`.
        timewindow: window length in minutes, forwarded to `build_dataset`.
        seq_size: rows per sequence, forwarded to `build_dataset`.
        test: forwarded to `build_dataset`; selects real labels vs. "BENIGN".
    """

    def __init__(self, dataset, timewindow, seq_size, test):
        super().__init__()
        self.dataset = dataset
        # Keep the constructor arguments so build_data uses them instead of
        # whatever globals of the same name happen to exist in the notebook.
        self.timewindow = timewindow
        self.seq_size = seq_size
        self.data = []
        self.label = []
        self.document = []
        self.build_data(dataset, test)

    def build_data(self, dataset, test):
        """Populate `data`, `label` and `document` from `dataset`."""
        self.data, self.label, self.document = build_dataset(
            dataset, self.timewindow, self.seq_size, test
        )

    def __len__(self):
        """Number of sequences."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return `(sequence as a float tensor, label)` for position `idx`."""
        data = torch.FloatTensor(self.data[idx])
        label = self.label[idx]
        return data, label

In [None]:
# Training sequences get a constant "BENIGN" label (test=False);
# test sequences carry labels derived from the rows (test=True).
train_data = NewDataset(
    dataset=train_dataset, timewindow=timewindow, seq_size=seq_size, test=False
)
test_data = NewDataset(
    dataset=test_dataset, timewindow=timewindow, seq_size=seq_size, test=True
)

In [None]:
# torch.save(train_dataset, 'train.pkl')
# torch.save(test_dataset, 'test.pkl')

In [None]:
# Shuffle only the training batches; the test order must stay fixed so that
# losses can be matched back to test_data.document by position.
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

In [None]:
## Model
class GRU(nn.Module):
    """Stacked-GRU sequence autoencoder.

    Three GRU stacks are chained: an encoder (input -> hidden), a latent
    stack (hidden -> latent) and a decoder (latent -> input width).

    Args:
        input_size: number of features per time step.
        hidden_size: hidden width of the encoder stack.
        latent_dim: hidden width of the latent stack.
        dropout: dropout between stacked GRU layers (passed to every stack).
        bidirectional: when true the encoder and latent stacks run in both
            directions, so the latent sequence has `2 * latent_dim` features.
            The decoder always stays unidirectional so the reconstruction
            keeps `input_size` features.

    NOTE(review): the decoder output is a GRU hidden state, which by the GRU
    update equations lies in (-1, 1); standardised inputs outside that range
    cannot be reconstructed exactly. Consider a linear output layer - confirm
    whether the bounded output is intended.
    """

    def __init__(self, input_size, hidden_size, latent_dim,dropout=0, bidirectional=False):
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.latent_dim = latent_dim
        self.num_direction = int(bidirectional) + 1

        # A bidirectional GRU emits num_direction * hidden features per step,
        # so each following stack takes the widened input.
        self.gru_enc = nn.GRU(
            input_size, hidden_size,
            num_layers=3, dropout=dropout, batch_first=True,
            bidirectional=bidirectional,
        )
        self.lat_layer = nn.GRU(
            hidden_size * self.num_direction, latent_dim,
            num_layers=2, dropout=dropout, batch_first=True,
            bidirectional=bidirectional,
        )
        self.gru_dec = nn.GRU(
            latent_dim * self.num_direction, input_size,
            num_layers=3, dropout=dropout, batch_first=True,
        )

    def forward(self, input_):
        """Encode and reconstruct a batch of sequences.

        Args:
            input_: tensor laid out batch-first, `(batch, seq, input_size)`.

        Returns:
            A list `[reconstruction, input_, latent_sequence]`.
        """
        encoded, _ = self.gru_enc(input_)
        en_vec, _ = self.lat_layer(encoded)
        output, _ = self.gru_dec(en_vec)
        return [output, input_, en_vec]

In [None]:
# Build the autoencoder and move it to the selected device.
# nn.Module.to returns the module itself, so the assignment can be chained.
model = GRU(input_size=feature_size, hidden_size=hidden_size, latent_dim=latent).to(device)
model

In [None]:
# AdamW with its default learning rate and a weight decay of 0.1.
optimizer = torch.optim.AdamW(params=model.parameters(), weight_decay=0.1)
# Reconstruction loss: mean squared error averaged over all elements.
criterion = nn.MSELoss()

In [None]:
pbar = tqdm(range(epoch), desc="training")
train_loss = []  # mean reconstruction loss of each epoch
# Put the model back in training mode explicitly: the evaluation cell calls
# model.eval(), so re-running this cell afterwards would otherwise train in
# eval mode.
model.train()
for e in pbar:
    losses = []
    for data, _label in train_dataloader:
        data = data.to(device)
        optimizer.zero_grad()
        reconstruction = model(data)[0]
        # MSELoss takes (input, target); the value is the same either way,
        # but this order matches the documented signature.
        loss = criterion(reconstruction, data)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
        pbar.set_postfix(epoch=f"{e + 1} of {epoch}", loss=f"{losses[-1]:.5f}")
    train_loss.append(sum(losses)/len(losses))

In [None]:
# torch.save(model, 'model.pt')

In [None]:
# Training curve: one point per epoch (mean batch loss of that epoch).
fig, ax = plt.subplots()
ax.plot(train_loss)
ax.set(title="Training reconstruction loss", xlabel="epoch", ylabel="mean MSE loss");

In [None]:
label_list = []
predict_loss = []
with torch.inference_mode():
    model.eval()
    for batch in tqdm(test_dataloader):
        data, label = batch
        data = data.to(device)
        result = model(data)[0]
        loss = torch.mean(torch.mean(F.mse_loss(result, data, reduction='none'),dim=1), dim=1)
        for i in loss:
            predict_loss.append(i)
        losses = criterion(data, result)
        predict_loss.append(losses)
        label_list.append(label)

In [None]:
threshold = np.mean(train_loss[:5])

In [None]:
detected = []
for i in range(len(predict_loss)):
    if predict_loss[i] >= threshold:
        detected.append((label_list[i][0], predict_loss[i]))
    if label_list[i][0] != 'BENIGN':
        print(label_list[i][0], predict_loss[i])

In [None]:
# One bucket per test row, collecting the loss of every sequence that
# contains that row (rows are addressed by their `index` value).
loss_list = [[] for _ in range(len(test_dataset))]

for i, idx_list in enumerate(tqdm(test_data.document)):
    tmp_loss = predict_loss[i]
    for idx in idx_list:
        loss_list[idx].append(tmp_loss)

# Stack each bucket into a 1-D tensor of losses.
# NOTE(review): torch.stack raises on an empty bucket, i.e. a row that is in
# no sequence.
sibal = [torch.stack(loss_group, dim=0) for loss_group in tqdm(loss_list)]

In [None]:
# Summarise the losses collected for each test row.
# Values are stored as plain Python floats (via .item()) so the DataFrame
# columns built from them get a numeric dtype instead of an object column
# of 0-dim tensors.
st_dict = {"MAX":[], "MIN":[], "MEAN":[]}

for flow_losses in tqdm(sibal):
    st_dict["MEAN"].append(flow_losses.mean().item())
    st_dict["MAX"].append(flow_losses.max().item())
    st_dict["MIN"].append(flow_losses.min().item())

In [None]:
# Attach the per-row loss statistics to the test frame.
for column, stat in (('predict_mean', "MEAN"), ('predict_min', 'MIN'), ('predict_max', "MAX")):
    test_dataset[column] = st_dict[stat]

In [None]:
# Row-level confusion counts: a row is flagged when the largest loss of any
# sequence containing it reaches the threshold.
is_attack = test_dataset["Label"] != "BENIGN"
is_benign = test_dataset["Label"] == "BENIGN"
flagged = test_dataset["predict_max"] >= threshold
passed = test_dataset["predict_max"] < threshold

tp = len(test_dataset[is_attack & flagged])
fp = len(test_dataset[is_benign & flagged])
tn = len(test_dataset[is_benign & passed])
fn = len(test_dataset[is_attack & passed])

In [None]:
# Precision / recall / F1 from the confusion counts.
# Each ratio falls back to 0.0 when its denominator is zero (nothing flagged,
# no attack rows, or no true positives) instead of raising ZeroDivisionError.
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
# Harmonic mean; undefined when either term is zero, reported as 0.0.
f1 = 2 / ((1 / precision) + (1 / recall)) if (precision and recall) else 0.0
print(precision, recall, f1)

In [None]:
# Split the test rows by label, then stack them benign-first with a fresh index.
attack_df = test_dataset[test_dataset["Label"] != "BENIGN"]
benign_df = test_dataset[test_dataset["Label"] == "BENIGN"]
total_df = pd.concat((benign_df, attack_df), ignore_index=True)

# Binarise the label: 1 for any attack, 0 for BENIGN.
attack_rows = total_df["Label"] != 'BENIGN'
total_df.loc[attack_rows, "Label"] = 1
total_df.loc[~attack_rows, "Label"] = 0

# ROC over the per-row maximum loss, then the area under that curve.
fpr_list, tpr_list, threshold_list = roc_curve(
    total_df["Label"].astype(int),
    total_df['predict_max'],
)
roc_auc = auc(fpr_list, tpr_list)
print(f"total,{roc_auc}")

In [None]:
# Display the combined frame (benign rows first, then attack rows) with the
# binarised Label column and the per-row loss statistics.
total_df