In [65]:
import pandas as pd
import numpy as np
import torch
from sklearn import preprocessing
from dgl.data import DGLDataset
import dgl
import time
import networkx as nx
import category_encoders as ce
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
import torch
import tqdm
import math

from typing import *
from sklearn.preprocessing import StandardScaler, Normalizer
import socket
import struct
import random
from sklearn.model_selection import train_test_split

In [66]:
file_name = "Self-Collected-NF-CSE-CICIDS.parquet"
data = pd.read_parquet(file_name)

In [67]:
data.Label.value_counts()

Label
0    636608
1    349522
Name: count, dtype: int64

In [68]:
data.rename(columns=lambda x: x.strip(), inplace=True)
data['IPV4_SRC_ADDR'] = data["IPV4_SRC_ADDR"].apply(str)
data['L4_SRC_PORT'] = data["L4_SRC_PORT"].apply(str)
data['IPV4_DST_ADDR'] = data["IPV4_DST_ADDR"].apply(str)
data['L4_DST_PORT'] = data["L4_DST_PORT"].apply(str)

In [69]:
data.drop(columns=["L4_SRC_PORT", "L4_DST_PORT"], inplace=True)

In [70]:
data.Attack.unique()

array(['DoS attacks-GoldenEye', 'Benign', 'FTP-BruteForce',
       'DoS attacks-Hulk', 'Infilteration', 'DoS attacks-SlowHTTPTest',
       'DoS attacks-Slowloris', 'SQL Injection', 'SSH-Bruteforce',
       'Brute Force -Web', 'Brute Force -XSS'], dtype=object)

In [71]:
# Scale the dataset based on your needs (machine capacity) ideally ~500k rows
data_attack = data[data['Label'] == 1].sample(frac=0.2, random_state=13)
data_benign = data[data['Label'] == 0].sample(frac=1, random_state=13)
data = pd.concat([data_attack, data_benign], axis=0)
data = data.sample(frac=0.5, random_state=13).reset_index(drop=True)
data.Label.value_counts()

Label
0    318222
1     35034
Name: count, dtype: int64

In [72]:
data.groupby(by="Attack").count()

Unnamed: 0_level_0,IPV4_SRC_ADDR,IPV4_DST_ADDR,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,NUM_PKTS_1024_TO_1514_BYTES,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label
Attack,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Benign,318222,318222,318222,318222,318222,318222,318222,318222,318222,318222,...,318222,318222,318222,318222,318222,318222,318222,318222,318222,318222
Brute Force -Web,2895,2895,2895,2895,2895,2895,2895,2895,2895,2895,...,2895,2895,2895,2895,2895,2895,2895,2895,2895,2895
Brute Force -XSS,1739,1739,1739,1739,1739,1739,1739,1739,1739,1739,...,1739,1739,1739,1739,1739,1739,1739,1739,1739,1739
DoS attacks-GoldenEye,9672,9672,9672,9672,9672,9672,9672,9672,9672,9672,...,9672,9672,9672,9672,9672,9672,9672,9672,9672,9672
DoS attacks-Hulk,5810,5810,5810,5810,5810,5810,5810,5810,5810,5810,...,5810,5810,5810,5810,5810,5810,5810,5810,5810,5810
DoS attacks-SlowHTTPTest,3355,3355,3355,3355,3355,3355,3355,3355,3355,3355,...,3355,3355,3355,3355,3355,3355,3355,3355,3355,3355
DoS attacks-Slowloris,5289,5289,5289,5289,5289,5289,5289,5289,5289,5289,...,5289,5289,5289,5289,5289,5289,5289,5289,5289,5289
FTP-BruteForce,760,760,760,760,760,760,760,760,760,760,...,760,760,760,760,760,760,760,760,760,760
Infilteration,4967,4967,4967,4967,4967,4967,4967,4967,4967,4967,...,4967,4967,4967,4967,4967,4967,4967,4967,4967,4967
SQL Injection,504,504,504,504,504,504,504,504,504,504,...,504,504,504,504,504,504,504,504,504,504


In [73]:
X = data.drop(columns=["Attack", "Label"])
y = data[["Attack", "Label"]]

X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=13, stratify=y)

In [74]:
encoder = ce.TargetEncoder(cols=['TCP_FLAGS','L7_PROTO','PROTOCOL',
                                  'CLIENT_TCP_FLAGS','SERVER_TCP_FLAGS','ICMP_TYPE',
                                  'ICMP_IPV4_TYPE','DNS_QUERY_ID','DNS_QUERY_TYPE',
                                  'FTP_COMMAND_RET_CODE'])
encoder.fit(X_train, y_train.Label)

# Transform on training set
X_train = encoder.transform(X_train)

# Transform on testing set
X_test = encoder.transform(X_test)

In [75]:
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

In [76]:
scaler = Normalizer()
cols_to_norm = list(set(list(X_train.iloc[:, 2:].columns))) # Ignore first two as the represents IP addresses
scaler.fit(X_train[cols_to_norm])

# Transform on training set
X_train[cols_to_norm] = scaler.transform(X_train[cols_to_norm])
X_train['h'] = X_train.iloc[:, 2:].values.tolist()

# Transform on testing set
X_test[cols_to_norm] = scaler.transform(X_test[cols_to_norm])
X_test['h'] = X_test.iloc[:, 2:].values.tolist()

train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [77]:
X_train.head()

Unnamed: 0,IPV4_SRC_ADDR,IPV4_DST_ADDR,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,NUM_PKTS_1024_TO_1514_BYTES,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,h
351545,145.82.199.44,182.129.67.88,1.965582e-08,1.822779e-08,0.000275,9.909622e-07,0.001179,3.963849e-07,6.720525e-09,6.973776e-09,...,0.0,0.00015,9.9e-05,1.965582e-08,1.965582e-08,1.965582e-08,1.965582e-08,-1.981924e-07,2.073847e-08,"[1.9655819145400368e-08, 1.8227790718695328e-0..."
137232,145.82.199.44,182.129.67.88,3.922741e-07,4.584857e-09,0.026908,9.097318e-05,0.024215,7.910712e-05,1.341225e-07,1.391767e-07,...,1.6e-05,0.002164,0.002148,3.922741e-07,3.922741e-07,3.922741e-07,3.922741e-07,-3.955356e-06,4.138807e-07,"[3.922740972325911e-07, 4.5848566531532795e-09..."
17579,103.57.14.23,182.129.67.88,2.072046e-07,6.485942e-07,0.001103,1.462492e-05,0.000635,8.357095e-06,0.0,1.337563e-10,...,0.0,0.134215,0.136137,2.072046e-07,2.072046e-07,2.072046e-07,2.072046e-07,-2.089274e-06,2.186175e-07,"[2.0720461815271374e-07, 6.48594187295139e-07,..."
55026,103.57.14.23,182.129.67.88,4.382958e-09,4.064529e-09,1.1e-05,1.32582e-07,0.0,0.0,4.416093e-08,4.417976e-08,...,0.0,0.002839,0.0,4.382958e-09,4.382958e-09,4.382958e-09,4.382958e-09,-4.4194e-08,4.624373e-09,"[4.382958322053996e-09, 4.0645290044734464e-09..."
336057,103.57.14.23,190.201.33.51,1.836563e-07,5.74883e-07,0.000978,1.296283e-05,0.000563,7.407332e-06,0.0,1.185552e-10,...,0.0,0.118962,0.120665,1.836563e-07,1.836563e-07,1.836563e-07,1.836563e-07,-1.851833e-06,1.937722e-07,"[1.8365631932842353e-07, 5.748830418858853e-07..."


In [78]:
lab_enc = preprocessing.LabelEncoder()
lab_enc.fit(data["Attack"])

# Transform on training set
train["Attack"] = lab_enc.transform(train["Attack"])

# Transform on testing set
test["Attack"] = lab_enc.transform(test["Attack"])

In [79]:
# Training graph

train_g = nx.from_pandas_edgelist(train, "IPV4_SRC_ADDR", "IPV4_DST_ADDR",
            ["h", "Label", "Attack"], create_using=nx.MultiGraph())

train_g = train_g.to_directed()
train_g = dgl.from_networkx(train_g, edge_attrs=['h', 'Attack', 'Label'])
nfeat_weight = torch.ones([train_g.number_of_nodes(),
train_g.edata['h'].shape[1]])
train_g.ndata['h'] = nfeat_weight

# Testing graph
test_g = nx.from_pandas_edgelist(test, "IPV4_SRC_ADDR", "IPV4_DST_ADDR",
            ["h", "Label", "Attack"], create_using=nx.MultiGraph())

test_g = test_g.to_directed()
test_g = dgl.from_networkx(test_g, edge_attrs=['h', 'Attack', 'Label'])
nfeat_weight = torch.ones([test_g.number_of_nodes(),
test_g.edata['h'].shape[1]])
test_g.ndata['h'] = nfeat_weight

In [80]:
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
import tqdm
import gc

class SAGELayer(nn.Module):
    def __init__(self, ndim_in, edims, ndim_out, activation):
      super(SAGELayer, self).__init__()
      self.W_apply = nn.Linear(ndim_in + edims , ndim_out)
      self.activation = F.relu
      self.W_edge = nn.Linear(128 * 2, 256)
      self.reset_parameters()

    def reset_parameters(self):
      gain = nn.init.calculate_gain('relu')
      nn.init.xavier_uniform_(self.W_apply.weight, gain=gain)

    def message_func(self, edges):
      return {'m':  edges.data['h']}

    def forward(self, g_dgl, nfeats, efeats):
      with g_dgl.local_scope():
        g = g_dgl
        g.ndata['h'] = nfeats
        g.edata['h'] = efeats
        g.update_all(self.message_func, fn.mean('m', 'h_neigh'))
        g.ndata['h'] = F.relu(self.W_apply(torch.cat([g.ndata['h'], g.ndata['h_neigh']], 2)))

        # Compute edge embeddings
        u, v = g.edges()
        edge = self.W_edge(torch.cat((g.srcdata['h'][u], g.dstdata['h'][v]), 2))
        return g.ndata['h'], edge

In [81]:
class SAGE(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim,  activation):
      super(SAGE, self).__init__()
      self.layers = nn.ModuleList()
      self.layers.append(SAGELayer(ndim_in, edim, 128, F.relu))

    def forward(self, g, nfeats, efeats, corrupt=False):
      if corrupt:
        e_perm = torch.randperm(g.number_of_edges())
        #n_perm = torch.randperm(g.number_of_nodes())
        efeats = efeats[e_perm]
        #nfeats = nfeats[n_perm]
      for i, layer in enumerate(self.layers):
        #nfeats = layer(g, nfeats, efeats)
        nfeats, e_feats = layer(g, nfeats, efeats)
      #return nfeats.sum(1)
      return nfeats.sum(1), e_feats.sum(1)

In [82]:
class Discriminator(nn.Module):
    def __init__(self, n_hidden):
      super(Discriminator, self).__init__()
      self.weight = nn.Parameter(torch.Tensor(n_hidden, n_hidden))
      self.reset_parameters()

    def uniform(self, size, tensor):
      bound = 1.0 / math.sqrt(size)
      if tensor is not None:
        tensor.data.uniform_(-bound, bound)

    def reset_parameters(self):
      size = self.weight.size(0)
      self.uniform(size, self.weight)

    def forward(self, features, summary):
      features = torch.matmul(features, torch.matmul(self.weight, summary))
      return features

In [83]:
class DGI(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim, activation):
      super(DGI, self).__init__()
      self.encoder = SAGE(ndim_in, ndim_out, edim,  F.relu)
      #self.discriminator = Discriminator(128)
      self.discriminator = Discriminator(256)
      self.loss = nn.BCEWithLogitsLoss()

    def forward(self, g, n_features, e_features):
      positive = self.encoder(g, n_features, e_features, corrupt=False)
      negative = self.encoder(g, n_features, e_features, corrupt=True)
      self.loss = nn.BCEWithLogitsLoss()

    def forward(self, g, n_features, e_features):
      positive = self.encoder(g, n_features, e_features, corrupt=False)
      negative = self.encoder(g, n_features, e_features, corrupt=True)

      positive = positive[1]
      negative = negative[1]

      summary = torch.sigmoid(positive.mean(dim=0))

      positive = self.discriminator(positive, summary)
      negative = self.discriminator(negative, summary)

      l1 = self.loss(positive, torch.ones_like(positive))
      l2 = self.loss(negative, torch.zeros_like(negative))

      return l1 + l2

In [84]:
ndim_in = train_g.ndata['h'].shape[1]
hidden_features = 128
ndim_out = 128
num_layers = 1
edim = train_g.edata['h'].shape[1]
learning_rate = 1e-3
epochs = 4000

In [85]:
dgi = DGI(ndim_in,
    ndim_out,
    edim,
    F.relu)

dgi = dgi.to('cuda')

dgi_optimizer = torch.optim.Adam(dgi.parameters(),
                lr=1e-3,
                weight_decay=0.)

In [86]:
# Format node and edge features for E-GraphSAGE
train_g.ndata['h'] = torch.reshape(train_g.ndata['h'],
                                   (train_g.ndata['h'].shape[0], 1,
                                    train_g.ndata['h'].shape[1]))

train_g.edata['h'] = torch.reshape(train_g.edata['h'],
                                   (train_g.edata['h'].shape[0], 1,
                                    train_g.edata['h'].shape[1]))

In [87]:
# Convert to GPU
train_g = train_g.to('cuda')

In [88]:
cnt_wait = 0
best = 1e9
best_t = 0
dur = []
node_features = train_g.ndata['h']
edge_features = train_g.edata['h']

for epoch in range(epochs):
    dgi.train()
    if epoch >= 3:
        t0 = time.time()

    dgi_optimizer.zero_grad()
    loss = dgi(train_g, node_features, edge_features)
    loss.backward()
    dgi_optimizer.step()

    if loss < best:
        best = loss
        best_t = epoch
        cnt_wait = 0
        torch.save(dgi.state_dict(), 'best_dgi_CSE_self_collected.pkl')
    else:
        cnt_wait += 1

  # if cnt_wait == patience:
  #     print('Early stopping!')
  #     break

    if epoch >= 3:
        dur.append(time.time() - t0)

    if epoch % 50 == 0:

        print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | "
            "ETputs(KTEPS) {:.2f}".format(epoch, np.mean(dur),
              loss.item(),
              train_g.num_edges() / np.mean(dur) / 1000))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Epoch 00000 | Time(s) nan | Loss 3.9095 | ETputs(KTEPS) nan
Epoch 00050 | Time(s) 0.4888 | Loss 1.3885 | ETputs(KTEPS) 1011.76
Epoch 00100 | Time(s) 0.4891 | Loss 1.3863 | ETputs(KTEPS) 1011.18
Epoch 00150 | Time(s) 0.4896 | Loss 1.3863 | ETputs(KTEPS) 1010.07
Epoch 00200 | Time(s) 0.4903 | Loss 1.3863 | ETputs(KTEPS) 1008.77
Epoch 00250 | Time(s) 0.4906 | Loss 1.3863 | ETputs(KTEPS) 1008.14
Epoch 00300 | Time(s) 0.4908 | Loss 1.3863 | ETputs(KTEPS) 1007.71
Epoch 00350 | Time(s) 0.4910 | Loss 1.3863 | ETputs(KTEPS) 1007.30
Epoch 00400 | Time(s) 0.4912 | Loss 1.3863 | ETputs(KTEPS) 1006.93
Epoch 00450 | Time(s) 0.4913 | Loss 1.3863 | ETputs(KTEPS) 1006.53
Epoch 00500 | Time(s) 0.4914 | Loss 1.3863 | ETputs(KTEPS) 1006.44
Epoch 00550 | Time(s) 0.4916 | Loss 1.3863 | ETputs(KTEPS) 1006.00
Epoch 00600 | Time(s) 0.4918 | Loss 1.3863 | ETputs(KTEPS) 1005.51
Epoch 00650 | Time(s) 0.4921 | Loss 1.3863 | ETputs(KTEPS) 1005.09
Epoch 00700 | Time(s) 0.4922 | Loss 1.3863 | ETputs(KTEPS) 1004.70
Ep

KeyboardInterrupt: 

In [89]:
dgi.load_state_dict(torch.load('best_dgi_CSE_self_collected.pkl'))

  dgi.load_state_dict(torch.load('best_dgi_CSE_self_collected.pkl'))


<All keys matched successfully>

In [90]:
training_emb = dgi.encoder(train_g, train_g.ndata['h'], train_g.edata['h'])[1]
training_emb = training_emb.detach().cpu().numpy()

In [91]:
test_g.ndata['h'] = torch.reshape(test_g.ndata['h'],
                                   (test_g.ndata['h'].shape[0], 1,
                                    test_g.ndata['h'].shape[1]))



test_g.edata['h'] = torch.reshape(test_g.edata['h'],
                                   (test_g.edata['h'].shape[0], 1,
                                    test_g.edata['h'].shape[1]))

In [92]:
# Convert to GPU
test_g = test_g.to('cuda')

In [93]:
testing_emb = dgi.encoder(test_g, test_g.ndata['h'], test_g.edata['h'])[1]
testing_emb = testing_emb.detach().cpu().numpy()

In [94]:
df_train = pd.DataFrame(training_emb, )
df_train["Attack"] = lab_enc.inverse_transform(
        train_g.edata['Attack'].detach().cpu().numpy())
df_train["Label"] = train_g.edata['Label'].detach().cpu().numpy()

df_test = pd.DataFrame(testing_emb, )
df_test["Attack"] = lab_enc.inverse_transform(
        test_g.edata['Attack'].detach().cpu().numpy())
df_test["Label"] = test_g.edata['Label'].detach().cpu().numpy()

In [95]:
df_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,248,249,250,251,252,253,254,255,Attack,Label
0,0.112324,-0.012708,-0.002396,-0.005498,-0.056462,-0.043737,-0.001765,0.005954,0.032609,-0.015436,...,-0.031268,0.033688,0.007348,-0.019403,0.071821,-0.027189,-0.018634,-0.007879,Benign,0
1,0.112324,-0.012708,-0.002396,-0.005498,-0.056462,-0.043737,-0.001765,0.005954,0.032609,-0.015436,...,-0.031268,0.033688,0.007348,-0.019403,0.071821,-0.027189,-0.018634,-0.007879,Benign,0
2,0.112324,-0.012708,-0.002396,-0.005498,-0.056462,-0.043737,-0.001765,0.005954,0.032609,-0.015436,...,-0.031268,0.033688,0.007348,-0.019403,0.071821,-0.027189,-0.018634,-0.007879,Benign,0
3,0.112324,-0.012708,-0.002396,-0.005498,-0.056462,-0.043737,-0.001765,0.005954,0.032609,-0.015436,...,-0.031268,0.033688,0.007348,-0.019403,0.071821,-0.027189,-0.018634,-0.007879,Benign,0
4,0.112324,-0.012708,-0.002396,-0.005498,-0.056462,-0.043737,-0.001765,0.005954,0.032609,-0.015436,...,-0.031268,0.033688,0.007348,-0.019403,0.071821,-0.027189,-0.018634,-0.007879,Benign,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494553,0.139969,-0.002682,0.015117,-0.016732,-0.083579,-0.037787,0.012863,0.001105,0.012918,0.005574,...,-0.021651,0.046451,0.006902,-0.020827,0.078794,-0.054731,-0.026369,-0.010277,Brute Force -XSS,1
494554,0.139969,-0.002682,0.015117,-0.016732,-0.083579,-0.037787,0.012863,0.001105,0.012918,0.005574,...,-0.021651,0.046451,0.006902,-0.020827,0.078794,-0.054731,-0.026369,-0.010277,Brute Force -XSS,1
494555,0.139969,-0.002682,0.015117,-0.016732,-0.083579,-0.037787,0.012863,0.001105,0.012918,0.005574,...,-0.021651,0.046451,0.006902,-0.020827,0.078794,-0.054731,-0.026369,-0.010277,Brute Force -XSS,1
494556,0.139969,-0.002682,0.015117,-0.016732,-0.083579,-0.037787,0.012863,0.001105,0.012918,0.005574,...,-0.021651,0.046451,0.006902,-0.020827,0.078794,-0.054731,-0.026369,-0.010277,Brute Force -XSS,1


# Embeddings CBLOF  Embeddings

In [102]:
import torch
import dgl
import numpy as np
import pandas as pd
import torch.optim as optim
import time
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.ensemble import IsolationForest
import gc

from tqdm import tqdm
import itertools

In [103]:
benign_train_samples = df_train[df_train.Label == 0].drop(columns=["Label", "Attack"])
normal_train_samples = df_train.drop(columns=["Label", "Attack"])

train_labels = df_train["Label"]
test_labels = df_test["Label"]

test_samples = df_test.drop(columns=["Label", "Attack"])

In [104]:
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]

In [106]:
from pyod.models.cblof import CBLOF
n_est = [2,3,5,7,9,10]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    try:
        clf_if = CBLOF(n_clusters=n_est, contamination=con)
        clf_if.fit(benign_train_samples)
    except:
        continue
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [02:44<00:00,  4.56s/it]

{'n_estimators': 2, 'con': 0.01}
0.48537701984966464
              precision    recall  f1-score   support

           0     0.9022    0.7583    0.8240    190934
           1     0.1033    0.2530    0.1467     21020

    accuracy                         0.7082    211954
   macro avg     0.5028    0.5057    0.4854    211954
weighted avg     0.8229    0.7082    0.7568    211954






In [107]:
n_est = [2,3,5,7,9,10]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    try:
        clf_if = CBLOF(n_clusters=n_est, contamination=con)
        clf_if.fit(normal_train_samples)
    except:
        continue
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [02:44<00:00,  4.58s/it]

{'n_estimators': 2, 'con': 0.05}
0.48537701984966464
              precision    recall  f1-score   support

           0     0.9022    0.7583    0.8240    190934
           1     0.1033    0.2530    0.1467     21020

    accuracy                         0.7082    211954
   macro avg     0.5028    0.5057    0.4854    211954
weighted avg     0.8229    0.7082    0.7568    211954






In [108]:
###  CBLOF RAW

In [109]:
df_raw_train = pd.concat([X_train.drop(columns=["IPV4_SRC_ADDR","IPV4_DST_ADDR", "h"]), y_train], axis=1)
df_raw_test = pd.concat([X_test.drop(columns=["IPV4_SRC_ADDR","IPV4_DST_ADDR", "h"]), y_test], axis=1)

In [110]:
raw_benign_train_samples = df_raw_train[df_raw_train.Label == 0].drop(columns=["Label", "Attack"])
raw_normal_train_samples = df_raw_train.drop(columns=["Label", "Attack"])

raw_train_labels = df_raw_train["Label"]
raw_test_labels = df_raw_test["Label"]

raw_test_samples = df_raw_test.drop(columns=["Label", "Attack"])

In [111]:
from pyod.models.cblof import CBLOF

n_est = [2,3,5,7,9,10]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    try:
        clf_b = CBLOF(n_clusters=n_est, contamination=con)
        clf_b.fit(raw_benign_train_samples)
    except ValueError as e:
        print(n_est)
        continue  
   
    y_pred = clf_b.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                        "con": con
                }
        bs = test_pred
    del clf_b
    gc.collect()

  

print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

  3%|▎         | 1/36 [00:00<00:08,  4.29it/s]

2


  6%|▌         | 2/36 [00:00<00:07,  4.45it/s]

2


  8%|▊         | 3/36 [00:00<00:07,  4.51it/s]

2


 11%|█         | 4/36 [00:00<00:07,  4.51it/s]

2


 14%|█▍        | 5/36 [00:01<00:06,  4.45it/s]

2


 17%|█▋        | 6/36 [00:01<00:06,  4.51it/s]

2


 19%|█▉        | 7/36 [00:01<00:06,  4.38it/s]

3


 22%|██▏       | 8/36 [00:01<00:06,  4.23it/s]

3


 25%|██▌       | 9/36 [00:02<00:06,  4.17it/s]

3


 28%|██▊       | 10/36 [00:02<00:06,  3.97it/s]

3


 31%|███       | 11/36 [00:02<00:06,  3.98it/s]

3


 33%|███▎      | 12/36 [00:02<00:06,  3.92it/s]

3


 42%|████▏     | 15/36 [00:05<00:11,  1.76it/s]

5


100%|██████████| 36/36 [00:24<00:00,  1.47it/s]

{'n_estimators': 10, 'con': 0.2}
0.5468225176997928
              precision    recall  f1-score   support

           0     0.9211    0.8011    0.8569     95467
           1     0.1726    0.3769    0.2367     10510

    accuracy                         0.7590    105977
   macro avg     0.5468    0.5890    0.5468    105977
weighted avg     0.8469    0.7590    0.7954    105977






In [112]:
n_est = [2,3,5,7,9,10]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    try:
        clf_if = CBLOF(n_clusters=n_est, contamination=con)
        clf_if.fit(raw_normal_train_samples)
    except ValueError as e:
        print(n_est)
        continue  
    
    y_pred = clf_if.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est
                }
        bs = test_pred
    del clf_if
    gc.collect()

        

print("benign only")
print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

  3%|▎         | 1/36 [00:00<00:08,  4.15it/s]

2


  6%|▌         | 2/36 [00:00<00:08,  4.16it/s]

2


  8%|▊         | 3/36 [00:00<00:07,  4.14it/s]

2


 11%|█         | 4/36 [00:00<00:07,  4.15it/s]

2


 14%|█▍        | 5/36 [00:01<00:07,  4.20it/s]

2


 17%|█▋        | 6/36 [00:01<00:07,  4.15it/s]

2


 19%|█▉        | 7/36 [00:01<00:07,  3.81it/s]

3


 22%|██▏       | 8/36 [00:02<00:07,  3.76it/s]

3


 25%|██▌       | 9/36 [00:02<00:07,  3.78it/s]

3


 28%|██▊       | 10/36 [00:02<00:06,  3.86it/s]

3


 31%|███       | 11/36 [00:02<00:06,  3.76it/s]

3


 33%|███▎      | 12/36 [00:03<00:06,  3.84it/s]

3


 39%|███▉      | 14/36 [00:04<00:09,  2.38it/s]

5


 42%|████▏     | 15/36 [00:04<00:08,  2.45it/s]

5


100%|██████████| 36/36 [00:25<00:00,  1.41it/s]

benign only
{'n_estimators': 9}
0.5173713715328907
              precision    recall  f1-score   support

           0     0.9043    0.9022    0.9033     95467
           1     0.1301    0.1328    0.1315     10510

    accuracy                         0.8259    105977
   macro avg     0.5172    0.5175    0.5174    105977
weighted avg     0.8275    0.8259    0.8267    105977






In [113]:
# HBOS  Embeddings

In [114]:
benign_train_samples = df_train[df_train.Label == 0].drop(columns=["Label", "Attack"])
normal_train_samples = df_train.drop(columns=["Label", "Attack"])

train_labels = df_train["Label"]
test_labels = df_test["Label"]

test_samples = df_test.drop(columns=["Label", "Attack"])

In [115]:
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]

In [116]:
from pyod.models.hbos import HBOS

n_est = [5,10,15,20,25,30]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    clf_if = HBOS(n_bins=n_est, contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [05:40<00:00,  9.47s/it]

{'n_estimators': 5, 'con': 0.001}
0.48537701984966464
              precision    recall  f1-score   support

           0     0.9022    0.7583    0.8240    190934
           1     0.1033    0.2530    0.1467     21020

    accuracy                         0.7082    211954
   macro avg     0.5028    0.5057    0.4854    211954
weighted avg     0.8229    0.7082    0.7568    211954






In [117]:
n_est = [5,10,15,20,25,30]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    clf_if = HBOS(n_bins=n_est, contamination=con)
    clf_if.fit(normal_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [06:24<00:00, 10.67s/it]

{'n_estimators': 5, 'con': 0.001}
0.48537701984966464
              precision    recall  f1-score   support

           0     0.9022    0.7583    0.8240    190934
           1     0.1033    0.2530    0.1467     21020

    accuracy                         0.7082    211954
   macro avg     0.5028    0.5057    0.4854    211954
weighted avg     0.8229    0.7082    0.7568    211954






In [118]:
##  HBOS  RAw

In [119]:
from pyod.models.cblof import CBLOF

n_est = [5,10,15,20,25,30]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    try:
        clf_b = HBOS(n_bins=n_est, contamination=con)
        clf_b.fit(raw_benign_train_samples)
    except ValueError as e:
        print(n_est)
        continue  
   
    y_pred = clf_b.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                        "con": con
                }
        bs = test_pred
    del clf_b
    gc.collect()

  

print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 36/36 [00:30<00:00,  1.19it/s]

{'n_estimators': 5, 'con': 0.2}
0.48766986843324445
              precision    recall  f1-score   support

           0     0.8985    0.8433    0.8700     95467
           1     0.0865    0.1347    0.1053     10510

    accuracy                         0.7730    105977
   macro avg     0.4925    0.4890    0.4877    105977
weighted avg     0.8180    0.7730    0.7942    105977






In [120]:
n_est = [5,10,15,20,25,30]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    try:
        clf_if = HBOS(n_bins=n_est, contamination=con)
        clf_if.fit(raw_normal_train_samples)
    except ValueError as e:
        print(n_est)
        continue  
    
    y_pred = clf_if.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est
                }
        bs = test_pred
    del clf_if
    gc.collect()

        

print("benign only")
print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 36/36 [00:30<00:00,  1.20it/s]

benign only
{'n_estimators': 5}
0.4773490180287931
              precision    recall  f1-score   support

           0     0.9005    0.9902    0.9432     95467
           1     0.0659    0.0063    0.0115     10510

    accuracy                         0.8926    105977
   macro avg     0.4832    0.4982    0.4773    105977
weighted avg     0.8177    0.8926    0.8508    105977






In [121]:
##  PCA  Emb

In [122]:
from pyod.models.pca import PCA
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = PCA(n_components=n_est, contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [02:10<00:00,  3.63s/it]

{'n_estimators': 5, 'con': 0.001}
0.4142942827454079
              precision    recall  f1-score   support

           0     0.9044    0.5167    0.6576    190934
           1     0.1029    0.5038    0.1710     21020

    accuracy                         0.5154    211954
   macro avg     0.5037    0.5102    0.4143    211954
weighted avg     0.8249    0.5154    0.6094    211954






In [123]:
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = PCA(n_components=n_est, contamination=con)
    clf_if.fit(normal_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [02:14<00:00,  3.74s/it]

{'n_estimators': 5, 'con': 0.001}
0.48537701984966464
              precision    recall  f1-score   support

           0     0.9022    0.7583    0.8240    190934
           1     0.1033    0.2530    0.1467     21020

    accuracy                         0.7082    211954
   macro avg     0.5028    0.5057    0.4854    211954
weighted avg     0.8229    0.7082    0.7568    211954






In [124]:
##  PCA  RAw

In [125]:
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = PCA(n_components=n_est, contamination=con)
    clf_if.fit(raw_benign_train_samples)
   
    y_pred = clf_if.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                        "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()

  

print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 36/36 [00:24<00:00,  1.49it/s]

{'n_estimators': 30, 'con': 0.01}
0.4785262330235561
              precision    recall  f1-score   support

           0     0.9006    0.9894    0.9429     95467
           1     0.0752    0.0078    0.0141     10510

    accuracy                         0.8921    105977
   macro avg     0.4879    0.4986    0.4785    105977
weighted avg     0.8187    0.8921    0.8508    105977






In [126]:
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = PCA(n_components=n_est, contamination=con)
    clf_if.fit(raw_normal_train_samples)

    y_pred = clf_if.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est
                }
        bs = test_pred
    del clf_if
    gc.collect()

        

print("benign only")
print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 36/36 [00:24<00:00,  1.47it/s]

benign only
{'n_estimators': 5}
0.47848077792257493
              precision    recall  f1-score   support

           0     0.9006    0.9893    0.9428     95467
           1     0.0742    0.0078    0.0141     10510

    accuracy                         0.8919    105977
   macro avg     0.4874    0.4985    0.4785    105977
weighted avg     0.8186    0.8919    0.8507    105977






In [127]:
##  IF  Emb

In [128]:
from sklearn.ensemble import IsolationForest
n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = IsolationForest(n_estimators=n_est, contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 24/24 [00:58<00:00,  2.43s/it]

{'n_estimators': 20, 'con': 0.1}
0.49249823910626217
              precision    recall  f1-score   support

           0     0.8993    0.8909    0.8951    190934
           1     0.0864    0.0938    0.0899     21020

    accuracy                         0.8118    211954
   macro avg     0.4929    0.4923    0.4925    211954
weighted avg     0.8187    0.8118    0.8152    211954






In [129]:
from sklearn.ensemble import IsolationForest

n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = IsolationForest(n_estimators=n_est, contamination=con)
    clf_if.fit(normal_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 24/24 [01:02<00:00,  2.59s/it]

{'n_estimators': 20, 'con': 0.1}
0.49217659911318623
              precision    recall  f1-score   support

           0     0.8992    0.8909    0.8950    190934
           1     0.0859    0.0931    0.0893     21020

    accuracy                         0.8117    211954
   macro avg     0.4925    0.4920    0.4922    211954
weighted avg     0.8186    0.8117    0.8151    211954






In [130]:
##  IF  Raw

In [131]:
from sklearn.ensemble import IsolationForest

n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = IsolationForest(n_estimators=n_est, contamination=con)
    clf_if.fit(raw_benign_train_samples.to_numpy())
   
    y_pred = clf_if.predict(raw_test_samples.to_numpy())
    test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                        "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()

  

print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

  0%|          | 0/24 [00:00<?, ?it/s]

100%|██████████| 24/24 [00:33<00:00,  1.38s/it]

{'n_estimators': 150, 'con': 0.01}
0.47872370667496134
              precision    recall  f1-score   support

           0     0.9007    0.9904    0.9434     95467
           1     0.0808    0.0077    0.0141     10510

    accuracy                         0.8929    105977
   macro avg     0.4907    0.4990    0.4787    105977
weighted avg     0.8193    0.8929    0.8512    105977






In [132]:
n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = IsolationForest(n_estimators=n_est, contamination=con)
    clf_if.fit(raw_normal_train_samples.to_numpy())

    y_pred = clf_if.predict(raw_test_samples.to_numpy())
    test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est
                }
        bs = test_pred
    del clf_if
    gc.collect()

        

print("benign only")
print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 24/24 [00:33<00:00,  1.40s/it]

benign only
{'n_estimators': 20}
0.47970922171562214
              precision    recall  f1-score   support

           0     0.9007    0.9901    0.9433     95467
           1     0.0897    0.0088    0.0161     10510

    accuracy                         0.8928    105977
   macro avg     0.4952    0.4995    0.4797    105977
weighted avg     0.8203    0.8928    0.8514    105977




