In [34]:
import os

import torch
import torch.nn as nn

import pandas as pd
import numpy as np

from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from tqdm.notebook import tqdm
from torch.nn import functional as F
from torch.utils.data import DataLoader,Dataset
from datetime import datetime, timedelta
from torch.utils.data import Dataset
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import StandardScaler

In [95]:
### Static Data
inside_ip = [
    "192.168.10.50", "205.174.165.68", "192.168.10.51", "205.174.165.66","192.168.10.19", "192.168.10.17",
    "192.168.10.16", "192.168.10.12", "192.168.10.9", "192.168.10.5", "192.168.10.8", "192.168.10.14", "192.168.10.15",
    "192.168.10.25", "8.8.8.8", "192.168.10.1"
    ]
dataset_path = r"_"
data_col = ['ts', 'te', 'td', 'sa', 'da', 'sp', 'dp', 'pr', 'ipkt', 'ibyt', 'opkt', 'obyt', 'Label']
feature_col = ['td', 'ipkt', 'ibyt', 'opkt', 'obyt']
device = f'cuda:0' if torch.cuda.is_available() else 'cpu'

In [96]:
### Load Dataset + ip processing
train_tmp = []
test_tmp = []
ip_set = set(inside_ip)

with open(rf"{dataset_path}\Monday.csv", 'r', encoding='utf-8') as f:
    for line in tqdm(f.readlines()[1:]):
        tmp = line.strip().split(',')
        if tmp[3] in ip_set:
            train_tmp.append(tmp)
        else:
            tmp[3], tmp[4] = tmp[4], tmp[3]
            tmp[5], tmp[6] = tmp[6], tmp[5]
            tmp[8], tmp[10] = tmp[10], tmp[8]
            tmp[9], tmp[11] = tmp[11], tmp[9]
            train_tmp.append(tmp)
            
train_dataset  = pd.DataFrame(train_tmp, columns = data_col)

test_day = ['Tuesday', 'Wednesday', 'Thursday', 'Friday']
for day in test_day:
    with open(rf"{dataset_path}\{day}.csv", 'r', encoding='utf-8') as f:
        for line in tqdm(f.readlines()[1:]):
            tmp = line.strip().split(',')
            if tmp[3] in ip_set:
                test_tmp.append(tmp)
            else:
                tmp[3], tmp[4] = tmp[4], tmp[3]
                tmp[5], tmp[6] = tmp[6], tmp[5]
                tmp[8], tmp[10] = tmp[10], tmp[8]
                tmp[9], tmp[11] = tmp[11], tmp[9]
            test_tmp.append(tmp)
test_dataset = pd.DataFrame(test_tmp, columns = data_col)

  0%|          | 0/529918 [00:00<?, ?it/s]

  0%|          | 0/445909 [00:00<?, ?it/s]

  0%|          | 0/692703 [00:00<?, ?it/s]

  0%|          | 0/458968 [00:00<?, ?it/s]

  0%|          | 0/703245 [00:00<?, ?it/s]

In [97]:
train_dataset['ts'] = pd.to_datetime(train_dataset['ts'])
train_dataset['te'] = pd.to_datetime(train_dataset['te'])
test_dataset['ts'] = pd.to_datetime(test_dataset['ts'])
test_dataset['te'] = pd.to_datetime(test_dataset['te'])
train_dataset = train_dataset.astype({'td':'float', 'ipkt':'float', 'ibyt':'float', 'obyt':'float', 'opkt':'float'})
test_dataset = test_dataset.astype({'td':'float', 'ipkt':'float', 'ibyt':'float', 'obyt':'float', 'opkt':'float'})

In [98]:
# statistic_dict = {key : {'MEAN': 0, 'STD' : 0} for key in feature_col}
# for key in feature_col:
#     statistic_dict[key]['MEAN'] = np.mean(train_dataset[key])
#     statistic_dict[key]['STD'] = np.std(train_dataset[key])
# for key in feature_col:
#     train_dataset[key] = (train_dataset[key] - statistic_dict[key]['MEAN']) / statistic_dict[key]['STD']
#     test_dataset[key] = (test_dataset[key] - statistic_dict[key]['MEAN']) / statistic_dict[key]['STD']

In [99]:
test_dataset['index'] = [i for i in range(len(test_dataset))]
train_dataset['index'] = [i for i in range(len(train_dataset))]

### data preprocessing

In [174]:
def build_dataset(data, timewindow=5, test=True):
    document = []
    ret = []
    st = data['ts'][0]
    et = st + timedelta(minutes=timewindow)
    label_list = []
    window_size = int((max(data['ts']) - min(data['ts']))/ timedelta(minutes=timewindow)) + 2
    for idx in tqdm(range(0,window_size)):
        tmp_pd = data[(st <= data['ts']) & (data['ts'] < et)]
        ip_list = list(tmp_pd.da.unique())
        ip_group = tmp_pd.groupby('da')
        for ip in ip_list:
            tmp_list = ip_group.get_group(ip)[feature_col]
            tmp_idx = list(ip_group.get_group(ip)["index"].values)
            if test:
                tmp_label = set(ip_group.get_group(ip)['Label'].unique()) - set("BENIGN")
                if len(tmp_label) == 0:
                    label = "BENIGN"
                else:
                    label = list(tmp_label)[0]
            ret.append([len(tmp_list['td']), np.mean(tmp_list['td']), np.mean(tmp_list['ipkt']), np.mean(tmp_list['opkt']),
                       np.mean(tmp_list['ibyt']), np.mean(tmp_list['obyt']), np.std(tmp_list['td']), np.std(tmp_list['ipkt']), 
                       np.std(tmp_list['opkt']), np.std(tmp_list['ibyt']), np.std(tmp_list['obyt']), np.max(tmp_list['td']), 
                       np.max(tmp_list['ipkt']), np.max(tmp_list['opkt']), np.max(tmp_list['ibyt']), np.max(tmp_list['obyt']), 
                        np.min(tmp_list['td']), np.min(tmp_list['ipkt']), np.min(tmp_list['opkt']),
                       np.min(tmp_list['ibyt']), np.min(tmp_list['obyt']),])
            document.append(tmp_idx)
            if test:
                label_list.append(label)
        st = et
        et = st + timedelta(minutes=timewindow)

    if test != True:
        label_list = ["BENIGN"] * len(ret)
    return (ret, label_list, document)

In [175]:
timewindow = 30

In [176]:
std_scaler = StandardScaler()
class NewDataset(Dataset):
    def __init__(self, dataset, timewindow, test):
        super(NewDataset, self).__init__()
        self.dataset = dataset
        self.data = []
        self.label = []
        self.document = []
        self.build_data(dataset, timewindow, test)
        
    def build_data(self, dataset, timewindow, test):
        self.data, self.label, self.document = build_dataset(dataset, timewindow, test)
        if test != True:
            std_scaler.fit(self.data)
            self.data = std_scaler.transform(self.data)
        else:
            self.data = std_scaler.transform(self.data)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        data = torch.FloatTensor(self.data[idx])
        label = self.label[idx]
        return data, label

In [177]:
train_data = NewDataset(train_dataset, timewindow, False)
test_data = NewDataset(test_dataset, timewindow, True)

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/169 [00:00<?, ?it/s]

In [178]:
# torch.save(train_dataset, 'train.pkl')
# torch.save(test_dataset, 'test.pkl')

In [179]:
def inner_product(va, vb):
    dot = 0
    for i in range(len(va)):
        dot += (va[i] * vb[i])
    return dot

def normaly_set(v):
    sum = 0
    for i in range(0, len(v)):
        sum += (v[i] * v[i])
    return sum ** 0.5

def cal_cos_sim(va, vb):
    # Dot and norm
    dot = inner_product(va, vb)
    norm_a = normaly_set(va)
    norm_b = normaly_set(vb)

    # Cosine similarity
    if norm_a == 0 or norm_b == 0: return 0.
    return dot / (norm_a*norm_b)

In [180]:
model_DB = DBSCAN(eps = 0.01, min_samples = 5, metric = 'cosine', n_jobs=4)

In [181]:
model_DB = model_DB.fit(train_data.data)

In [182]:
predict = model_DB.fit_predict(test_data.data)

In [183]:
predict_dict = {}
for i in tqdm(predict):
    if i not in predict_dict:
        predict_dict[i] = 0
    predict_dict[i] += 1

  0%|          | 0/86526 [00:00<?, ?it/s]

In [184]:
predict_dict

{0: 82822,
 1: 60,
 -1: 3248,
 2: 6,
 3: 5,
 4: 11,
 5: 7,
 10: 7,
 32: 8,
 37: 4,
 6: 50,
 7: 13,
 30: 5,
 48: 5,
 19: 8,
 8: 5,
 41: 7,
 34: 12,
 24: 6,
 33: 5,
 9: 21,
 47: 6,
 14: 7,
 21: 9,
 11: 10,
 29: 5,
 38: 8,
 26: 6,
 12: 6,
 13: 10,
 15: 8,
 16: 5,
 17: 7,
 18: 8,
 20: 9,
 43: 5,
 35: 8,
 22: 12,
 36: 5,
 23: 22,
 25: 6,
 28: 8,
 27: 5,
 46: 5,
 31: 6,
 40: 3,
 45: 8,
 39: 5,
 42: 5,
 44: 4}

In [185]:
pred_list = [0] * len(test_dataset)

for i in tqdm(range(len(test_data.document))):
    idx_list = test_data.document[i]
    tmp_pred = predict[i]
    for idx in idx_list:
        pred_list[idx] = tmp_pred

  0%|          | 0/86526 [00:00<?, ?it/s]

In [186]:
test_dataset['pred'] = pred_list

In [187]:
tp = len(test_dataset[(test_dataset["Label"] != "BENIGN") & (test_dataset["pred"] < 0)])
fp = len(test_dataset[(test_dataset["Label"] == "BENIGN") & (test_dataset["pred"] < 0)])
tn = len(test_dataset[(test_dataset["Label"] == "BENIGN") & (test_dataset["pred"] >= 0)])
fn = len(test_dataset[(test_dataset["Label"] != "BENIGN") & (test_dataset["pred"] >= 0)])

In [188]:
test_dataset[(test_dataset["Label"] != "BENIGN") & (test_dataset["pred"] >= 0)]

Unnamed: 0,ts,te,td,sa,da,sp,dp,pr,ipkt,ibyt,opkt,obyt,Label,index,pred
77188,2017-07-04 02:09:00,2017-07-04 02:09:14,14003899.0,192.168.10.50,172.16.0.1,22,46398,6,25.0,2200.0,35.0,2745.0,SSH-Patator,77188,0
77189,2017-07-04 02:09:00,2017-07-04 02:09:14,14005429.0,192.168.10.50,172.16.0.1,22,46396,6,25.0,2200.0,35.0,2745.0,SSH-Patator,77189,0
77190,2017-07-04 02:09:00,2017-07-04 02:09:12,12497639.0,192.168.10.50,172.16.0.1,22,46392,6,22.0,2008.0,32.0,2745.0,SSH-Patator,77190,0
77191,2017-07-04 02:09:00,2017-07-04 02:09:14,14034050.0,192.168.10.50,172.16.0.1,22,46406,6,27.0,2552.0,38.0,5673.0,SSH-Patator,77191,0
77192,2017-07-04 02:09:00,2017-07-04 02:09:14,14046356.0,192.168.10.50,172.16.0.1,22,46402,6,28.0,2824.0,39.0,5673.0,SSH-Patator,77192,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3551292,2017-07-07 12:59:00,2017-07-07 12:59:01,1032755.0,192.168.10.5,205.174.165.73,52376,8080,6,3.0,18.0,3.0,0.0,Bot,3551292,0
3551317,2017-07-07 12:59:00,2017-07-07 12:59:01,1045087.0,192.168.10.15,205.174.165.73,54013,8080,6,3.0,18.0,3.0,0.0,Bot,3551317,0
3551318,2017-07-07 12:59:00,2017-07-07 12:59:01,1045087.0,192.168.10.15,205.174.165.73,54013,8080,6,3.0,18.0,3.0,0.0,Bot,3551318,0
3551331,2017-07-07 12:59:00,2017-07-07 12:59:00,997161.0,192.168.10.8,205.174.165.73,4999,8080,6,3.0,18.0,3.0,0.0,Bot,3551331,0


In [189]:
test_dataset[(test_dataset["Label"] != "BENIGN") & (test_dataset["pred"] < 0)]

Unnamed: 0,ts,te,td,sa,da,sp,dp,pr,ipkt,ibyt,opkt,obyt,Label,index,pred
179696,2017-07-04 03:00:00,2017-07-04 03:00:12,12772224.0,192.168.10.50,172.16.0.1,22,51364,6,20.0,2008.0,33.0,2745.0,SSH-Patator,179696,-1
179697,2017-07-04 03:00:00,2017-07-04 03:00:12,12750658.0,192.168.10.50,172.16.0.1,22,51362,6,22.0,2008.0,32.0,2745.0,SSH-Patator,179697,-1
179698,2017-07-04 03:00:00,2017-07-04 03:00:14,14327322.0,192.168.10.50,172.16.0.1,22,51360,6,21.0,2008.0,32.0,2745.0,SSH-Patator,179698,-1
179699,2017-07-04 03:00:00,2017-07-04 03:00:14,14458907.0,192.168.10.50,172.16.0.1,22,51358,6,21.0,2008.0,32.0,2745.0,SSH-Patator,179699,-1
179700,2017-07-04 03:00:00,2017-07-04 03:00:00,102.0,192.168.10.50,172.16.0.1,22,51336,6,1.0,0.0,1.0,0.0,SSH-Patator,179700,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3432656,2017-07-07 10:59:00,2017-07-07 10:59:00,72540.0,192.168.10.15,205.174.165.73,53103,8080,6,3.0,134.0,4.0,207.0,Bot,3432656,-1
3432659,2017-07-07 10:59:00,2017-07-07 10:59:00,34.0,192.168.10.15,205.174.165.73,53103,8080,6,1.0,6.0,1.0,6.0,Bot,3432659,-1
3432662,2017-07-07 10:59:00,2017-07-07 10:59:00,89102.0,192.168.10.8,205.174.165.73,3046,8080,6,3.0,134.0,4.0,210.0,Bot,3432662,-1
3432663,2017-07-07 10:59:00,2017-07-07 10:59:00,89102.0,192.168.10.8,205.174.165.73,3046,8080,6,3.0,134.0,4.0,210.0,Bot,3432663,-1


In [190]:
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1 = 2/((1/precision) + (1/recall))
print(precision, recall, f1)

0.036551580698835275 0.015720659956417974 0.0219854731242791
