In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn import preprocessing
from dgl.data import DGLDataset
import dgl
import time
import networkx as nx
import category_encoders as ce
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
import torch
import tqdm
import math

from typing import *
from sklearn.preprocessing import StandardScaler, Normalizer
import socket
import struct
import random
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# file_name = "NF-CSE-CIC-IDS2018-v2.csv"
file_name = "NF-UNSW-NB15-v2.csv"
data = pd.read_csv(file_name)

In [3]:
data.Label.value_counts()

Label
0    2295222
1      95053
Name: count, dtype: int64

In [4]:
data.rename(columns=lambda x: x.strip(), inplace=True)
data['IPV4_SRC_ADDR'] = data["IPV4_SRC_ADDR"].apply(str)
data['L4_SRC_PORT'] = data["L4_SRC_PORT"].apply(str)
data['IPV4_DST_ADDR'] = data["IPV4_DST_ADDR"].apply(str)
data['L4_DST_PORT'] = data["L4_DST_PORT"].apply(str)

In [5]:
data.drop(columns=["L4_SRC_PORT", "L4_DST_PORT"], inplace=True)

In [6]:
data.Attack.unique()

array(['Benign', 'Exploits', 'Generic', 'Fuzzers', 'Backdoor', 'DoS',
       'Reconnaissance', 'Shellcode', 'Worms', 'Analysis'], dtype=object)

In [7]:
data = data.groupby(by='Attack').sample(frac=0.1, random_state=13)

In [8]:
data.groupby(by="Attack").count()

Unnamed: 0_level_0,IPV4_SRC_ADDR,IPV4_DST_ADDR,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,NUM_PKTS_1024_TO_1514_BYTES,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label
Attack,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Analysis,230,230,230,230,230,230,230,230,230,230,...,230,230,230,230,230,230,230,230,230,230
Backdoor,217,217,217,217,217,217,217,217,217,217,...,217,217,217,217,217,217,217,217,217,217
Benign,229522,229522,229522,229522,229522,229522,229522,229522,229522,229522,...,229522,229522,229522,229522,229522,229522,229522,229522,229522,229522
DoS,579,579,579,579,579,579,579,579,579,579,...,579,579,579,579,579,579,579,579,579,579
Exploits,3155,3155,3155,3155,3155,3155,3155,3155,3155,3155,...,3155,3155,3155,3155,3155,3155,3155,3155,3155,3155
Fuzzers,2231,2231,2231,2231,2231,2231,2231,2231,2231,2231,...,2231,2231,2231,2231,2231,2231,2231,2231,2231,2231
Generic,1656,1656,1656,1656,1656,1656,1656,1656,1656,1656,...,1656,1656,1656,1656,1656,1656,1656,1656,1656,1656
Reconnaissance,1278,1278,1278,1278,1278,1278,1278,1278,1278,1278,...,1278,1278,1278,1278,1278,1278,1278,1278,1278,1278
Shellcode,143,143,143,143,143,143,143,143,143,143,...,143,143,143,143,143,143,143,143,143,143
Worms,16,16,16,16,16,16,16,16,16,16,...,16,16,16,16,16,16,16,16,16,16


In [9]:
X = data.drop(columns=["Attack", "Label"])
y = data[["Attack", "Label"]]

X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=13, stratify=y)

In [10]:
encoder = ce.TargetEncoder(cols=['TCP_FLAGS','L7_PROTO','PROTOCOL',
                                  'CLIENT_TCP_FLAGS','SERVER_TCP_FLAGS','ICMP_TYPE',
                                  'ICMP_IPV4_TYPE','DNS_QUERY_ID','DNS_QUERY_TYPE',
                                  'FTP_COMMAND_RET_CODE'])
encoder.fit(X_train, y_train.Label)

# Transform on training set
X_train = encoder.transform(X_train)

# Transform on testing set
X_test = encoder.transform(X_test)

In [11]:
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

In [12]:
scaler = Normalizer()
cols_to_norm = list(set(list(X_train.iloc[:, 2:].columns))) # Ignore first two as the represents IP addresses
scaler.fit(X_train[cols_to_norm])

# Transform on training set
X_train[cols_to_norm] = scaler.transform(X_train[cols_to_norm])
X_train['h'] = X_train.iloc[:, 2:].values.tolist()

# Transform on testing set
X_test[cols_to_norm] = scaler.transform(X_test[cols_to_norm])
X_test['h'] = X_test.iloc[:, 2:].values.tolist()

train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [13]:
X_train.head()

Unnamed: 0,IPV4_SRC_ADDR,IPV4_DST_ADDR,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,NUM_PKTS_1024_TO_1514_BYTES,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,h
220614,59.166.0.4,149.171.126.1,4.802078e-09,3.048648e-09,3.994566e-05,7.652425e-07,7.2e-05,1.071339e-06,1.980229e-09,1.98036e-09,...,0.0,0.001329685,0.001329685,4.043553e-10,4.043553e-10,6.314168e-09,6.314212e-09,0.0,3.618463e-09,"[4.802078220820267e-09, 3.0486475517277593e-09..."
329844,59.166.0.1,149.171.126.8,9.025364e-09,8.052833e-09,0.0001079121,8.300931e-07,6.3e-05,8.300931e-07,1.397062e-08,1.396955e-08,...,0.0,0.0,0.0,1.181795e-08,1.182599e-08,8.561578e-09,8.561639e-09,0.0,9.271523e-09,"[9.025363504432674e-09, 8.052833119952719e-09,..."
1093665,59.166.0.3,149.171.126.8,3.068685e-13,8.437668e-13,1.638983e-07,3.061234e-09,1e-05,7.129838e-09,1.188386e-13,1.274923e-13,...,6.87555e-09,6.372843e-07,7.080936e-08,4.545956e-14,5.19193e-14,4.034959e-13,4.034988e-13,0.0,4.369547e-13,"[3.068684887633906e-13, 8.437667728858196e-13,..."
1764268,59.166.0.4,149.171.126.0,2.361402e-08,2.106949e-08,7.927293e-05,1.085931e-06,9.7e-05,1.085931e-06,3.655283e-08,3.655002e-08,...,0.0,0.0,0.0,3.092057e-08,3.094159e-08,1.878071e-08,3.577436e-10,3.3e-05,2.425808e-08,"[2.3614022984160315e-08, 2.1069487814953574e-0..."
1525176,59.166.0.9,149.171.126.5,2.309362e-10,2.856102e-10,1.71788e-05,2.649686e-07,0.000124,2.79689e-07,8.94329e-11,9.59453e-11,...,7.360238e-08,0.0001598644,0.0001065762,5.174884e-12,5.174884e-12,3.036539e-10,3.036561e-10,0.0,3.288336e-10,"[2.309362039695701e-10, 2.856102367918858e-10,..."


In [14]:
lab_enc = preprocessing.LabelEncoder()
lab_enc.fit(data["Attack"])

# Transform on training set
train["Attack"] = lab_enc.transform(train["Attack"])

# Transform on testing set
test["Attack"] = lab_enc.transform(test["Attack"])

In [15]:
# Training graph

train_g = nx.from_pandas_edgelist(train, "IPV4_SRC_ADDR", "IPV4_DST_ADDR",
           ["h", "Label", "Attack"], create_using=nx.MultiGraph())
print(train_g) # 39 nodes, 167318 edges
train_g = train_g.to_directed()
print(train_g) # 39 nodes, 334636 edges (doubled because of directed graph)
train_g = dgl.from_networkx(train_g, edge_attrs=['h', 'Attack', 'Label'])
nfeat_weight = torch.ones([train_g.number_of_nodes(),
train_g.edata['h'].shape[1]])
train_g.ndata['h'] = nfeat_weight

# Testing graph
test_g = nx.from_pandas_edgelist(test, "IPV4_SRC_ADDR", "IPV4_DST_ADDR",
            ["h", "Label", "Attack"], create_using=nx.MultiGraph())

test_g = test_g.to_directed()
test_g = dgl.from_networkx(test_g, edge_attrs=['h', 'Attack', 'Label'])
nfeat_weight = torch.ones([test_g.number_of_nodes(),
test_g.edata['h'].shape[1]])
test_g.ndata['h'] = nfeat_weight

MultiGraph with 39 nodes and 167318 edges
MultiDiGraph with 39 nodes and 334636 edges


In [16]:
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
import tqdm
import gc

class SAGELayer(nn.Module):
    def __init__(self, ndim_in, edims, ndim_out, activation):
      super(SAGELayer, self).__init__()
      self.W_apply = nn.Linear(ndim_in + edims , ndim_out)
      self.activation = F.relu
      self.W_edge = nn.Linear(128 * 2, 256)
      self.reset_parameters()

    def reset_parameters(self):
      gain = nn.init.calculate_gain('relu')
      nn.init.xavier_uniform_(self.W_apply.weight, gain=gain)

    def message_func(self, edges):
      return {'m':  edges.data['h']}

    def forward(self, g_dgl, nfeats, efeats):
      with g_dgl.local_scope():
        g = g_dgl
        g.ndata['h'] = nfeats
        g.edata['h'] = efeats
        g.update_all(self.message_func, fn.mean('m', 'h_neigh'))
        g.ndata['h'] = F.relu(self.W_apply(torch.cat([g.ndata['h'], g.ndata['h_neigh']], 2)))

        # Compute edge embeddings
        u, v = g.edges()
        edge = self.W_edge(torch.cat((g.srcdata['h'][u], g.dstdata['h'][v]), 2))
        return g.ndata['h'], edge

In [17]:
class SAGE(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim,  activation):
      super(SAGE, self).__init__()
      self.layers = nn.ModuleList()
      self.layers.append(SAGELayer(ndim_in, edim, 128, F.relu))

    def forward(self, g, nfeats, efeats, corrupt=False):
      if corrupt:
        e_perm = torch.randperm(g.number_of_edges())
        #n_perm = torch.randperm(g.number_of_nodes())
        efeats = efeats[e_perm]
        #nfeats = nfeats[n_perm]
      for i, layer in enumerate(self.layers):
        #nfeats = layer(g, nfeats, efeats)
        nfeats, e_feats = layer(g, nfeats, efeats)
      #return nfeats.sum(1)
      return nfeats.sum(1), e_feats.sum(1)

In [18]:
class Discriminator(nn.Module):
    def __init__(self, n_hidden):
      super(Discriminator, self).__init__()
      self.weight = nn.Parameter(torch.Tensor(n_hidden, n_hidden))
      self.reset_parameters()

    def uniform(self, size, tensor):
      bound = 1.0 / math.sqrt(size)
      if tensor is not None:
        tensor.data.uniform_(-bound, bound)

    def reset_parameters(self):
      size = self.weight.size(0)
      self.uniform(size, self.weight)

    def forward(self, features, summary):
      features = torch.matmul(features, torch.matmul(self.weight, summary))
      return features

In [19]:
class DGI(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim, activation):
      super(DGI, self).__init__()
      self.encoder = SAGE(ndim_in, ndim_out, edim,  F.relu)
      #self.discriminator = Discriminator(128)
      self.discriminator = Discriminator(256)
      self.loss = nn.BCEWithLogitsLoss()

    def forward(self, g, n_features, e_features):
      positive = self.encoder(g, n_features, e_features, corrupt=False)
      negative = self.encoder(g, n_features, e_features, corrupt=True)
      self.loss = nn.BCEWithLogitsLoss()

    def forward(self, g, n_features, e_features):
      positive = self.encoder(g, n_features, e_features, corrupt=False)
      negative = self.encoder(g, n_features, e_features, corrupt=True)

      positive = positive[1]
      negative = negative[1]

      summary = torch.sigmoid(positive.mean(dim=0))

      positive = self.discriminator(positive, summary)
      negative = self.discriminator(negative, summary)

      l1 = self.loss(positive, torch.ones_like(positive))
      l2 = self.loss(negative, torch.zeros_like(negative))

      return l1 + l2

In [53]:
ndim_in = train_g.ndata['h'].shape[1]
hidden_features = 128
ndim_out = 128
num_layers = 1
edim = train_g.edata['h'].shape[1]
learning_rate = 1e-3
epochs = 4000

In [54]:
dgi = DGI(ndim_in,
    ndim_out,
    edim,
    F.relu)

dgi = dgi.to('cuda')

dgi_optimizer = torch.optim.Adam(dgi.parameters(),
                lr=1e-3,
                weight_decay=0.)

In [23]:
# Format node and edge features for E-GraphSAGE
train_g.ndata['h'] = torch.reshape(train_g.ndata['h'],
                                   (train_g.ndata['h'].shape[0], 1,
                                    train_g.ndata['h'].shape[1]))

train_g.edata['h'] = torch.reshape(train_g.edata['h'],
                                   (train_g.edata['h'].shape[0], 1,
                                    train_g.edata['h'].shape[1]))

In [24]:
# Convert to GPU
train_g = train_g.to('cuda')

In [25]:
# cnt_wait = 0
# best = 1e9
# best_t = 0
# dur = []
# node_features = train_g.ndata['h'] 
# edge_features = train_g.edata['h']

# for epoch in range(epochs):
#     dgi.train()
#     if epoch >= 3:
#         t0 = time.time()

#     dgi_optimizer.zero_grad()
#     loss = dgi(train_g, node_features, edge_features)
#     loss.backward()
#     dgi_optimizer.step()

#     if loss < best:
#         best = loss
#         best_t = epoch
#         cnt_wait = 0
#         torch.save(dgi.state_dict(), 'best_dgi.pkl')
#     else:
#         cnt_wait += 1

#   # if cnt_wait == patience:
#   #     print('Early stopping!')
#   #     break

#     if epoch >= 3:
#         dur.append(time.time() - t0)

#     if epoch % 50 == 0:

#         print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | "
#             "ETputs(KTEPS) {:.2f}".format(epoch, np.mean(dur),
#               loss.item(),
#               train_g.num_edges() / np.mean(dur) / 1000))

In [26]:
dgi.load_state_dict(torch.load('best_dgi_UNSW.pkl'))

<All keys matched successfully>

In [27]:
training_emb = dgi.encoder(train_g, train_g.ndata['h'], train_g.edata['h'])[1]
training_emb = training_emb.detach().cpu().numpy()

In [28]:
test_g.ndata['h'] = torch.reshape(test_g.ndata['h'],
                                   (test_g.ndata['h'].shape[0], 1,
                                    test_g.ndata['h'].shape[1]))



test_g.edata['h'] = torch.reshape(test_g.edata['h'],
                                   (test_g.edata['h'].shape[0], 1,
                                    test_g.edata['h'].shape[1]))

In [29]:
# Convert to GPU
test_g = test_g.to('cuda')

In [30]:
testing_emb = dgi.encoder(test_g, test_g.ndata['h'], test_g.edata['h'])[1]
testing_emb = testing_emb.detach().cpu().numpy()

In [40]:
df_train = pd.DataFrame(training_emb, )
df_train["Attack"] = lab_enc.inverse_transform(
        train_g.edata['Attack'].detach().cpu().numpy())
df_train["Label"] = train_g.edata['Label'].detach().cpu().numpy()

df_test = pd.DataFrame(testing_emb, )
df_test["Attack"] = lab_enc.inverse_transform(
        test_g.edata['Attack'].detach().cpu().numpy())
df_test["Label"] = test_g.edata['Label'].detach().cpu().numpy()

In [41]:
df_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,248,249,250,251,252,253,254,255,Attack,Label
0,-0.036994,-0.010234,-0.021026,0.034767,0.016113,0.010732,-0.010567,0.006373,-0.011485,-0.010453,...,-0.001256,0.020352,-0.029813,-0.002278,-0.020348,0.001581,-0.002483,0.003740,Benign,0
1,-0.036994,-0.010234,-0.021026,0.034767,0.016113,0.010732,-0.010567,0.006373,-0.011485,-0.010453,...,-0.001256,0.020352,-0.029813,-0.002278,-0.020348,0.001581,-0.002483,0.003740,Benign,0
2,-0.036994,-0.010234,-0.021026,0.034767,0.016113,0.010732,-0.010567,0.006373,-0.011485,-0.010453,...,-0.001256,0.020352,-0.029813,-0.002278,-0.020348,0.001581,-0.002483,0.003740,Benign,0
3,-0.036994,-0.010234,-0.021026,0.034767,0.016113,0.010732,-0.010567,0.006373,-0.011485,-0.010453,...,-0.001256,0.020352,-0.029813,-0.002278,-0.020348,0.001581,-0.002483,0.003740,Benign,0
4,-0.036994,-0.010234,-0.021026,0.034767,0.016113,0.010732,-0.010567,0.006373,-0.011485,-0.010453,...,-0.001256,0.020352,-0.029813,-0.002278,-0.020348,0.001581,-0.002483,0.003740,Benign,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
334631,0.279057,-0.084344,0.071967,-0.374351,0.033641,-0.090723,-0.040373,-0.053807,-0.099703,-0.059421,...,0.069546,-0.008771,0.040567,-0.107941,0.011406,0.006651,-0.186833,-0.019512,Benign,0
334632,0.279057,-0.084344,0.071967,-0.374351,0.033641,-0.090723,-0.040373,-0.053807,-0.099703,-0.059421,...,0.069546,-0.008771,0.040567,-0.107941,0.011406,0.006651,-0.186833,-0.019512,Benign,0
334633,0.279057,-0.084344,0.071967,-0.374351,0.033641,-0.090723,-0.040373,-0.053807,-0.099703,-0.059421,...,0.069546,-0.008771,0.040567,-0.107941,0.011406,0.006651,-0.186833,-0.019512,Benign,0
334634,0.279057,-0.084344,0.071967,-0.374351,0.033641,-0.090723,-0.040373,-0.053807,-0.099703,-0.059421,...,0.069546,-0.008771,0.040567,-0.107941,0.011406,0.006651,-0.186833,-0.019512,Benign,0


# Embeddings CBLOF  Embeddings

In [33]:
import torch
import dgl
import numpy as np
import pandas as pd
import torch.optim as optim
import time
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.ensemble import IsolationForest
import gc

from tqdm import tqdm
import itertools

In [34]:
benign_train_samples = df_train[df_train.Label == 0].drop(columns=["Label", "Attack"])
normal_train_samples = df_train.drop(columns=["Label", "Attack"])

train_labels = df_train["Label"]
test_labels = df_test["Label"]

test_samples = df_test.drop(columns=["Label", "Attack"])

In [35]:
test_samples

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
0,-0.036773,-0.010351,-0.020939,0.034704,0.016094,0.010692,-0.010801,0.006459,-0.011468,-0.010529,...,0.017052,-0.037851,-0.000985,0.020440,-0.029494,-0.002341,-0.020560,0.001650,-0.002542,0.003791
1,-0.036773,-0.010351,-0.020939,0.034704,0.016094,0.010692,-0.010801,0.006459,-0.011468,-0.010529,...,0.017052,-0.037851,-0.000985,0.020440,-0.029494,-0.002341,-0.020560,0.001650,-0.002542,0.003791
2,-0.036773,-0.010351,-0.020939,0.034704,0.016094,0.010692,-0.010801,0.006459,-0.011468,-0.010529,...,0.017052,-0.037851,-0.000985,0.020440,-0.029494,-0.002341,-0.020560,0.001650,-0.002542,0.003791
3,-0.036773,-0.010351,-0.020939,0.034704,0.016094,0.010692,-0.010801,0.006459,-0.011468,-0.010529,...,0.017052,-0.037851,-0.000985,0.020440,-0.029494,-0.002341,-0.020560,0.001650,-0.002542,0.003791
4,-0.036773,-0.010351,-0.020939,0.034704,0.016094,0.010692,-0.010801,0.006459,-0.011468,-0.010529,...,0.017052,-0.037851,-0.000985,0.020440,-0.029494,-0.002341,-0.020560,0.001650,-0.002542,0.003791
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143413,0.327797,-0.085205,0.064469,-0.550351,0.044841,-0.241439,-0.093871,-0.116196,-0.212388,-0.102773,...,-0.013743,0.444237,0.125920,-0.133720,0.045855,-0.151664,0.015843,-0.059231,-0.394932,0.033923
143414,0.327797,-0.085205,0.064469,-0.550351,0.044841,-0.241439,-0.093871,-0.116196,-0.212388,-0.102773,...,-0.013743,0.444237,0.125920,-0.133720,0.045855,-0.151664,0.015843,-0.059231,-0.394932,0.033923
143415,0.327797,-0.085205,0.064469,-0.550351,0.044841,-0.241439,-0.093871,-0.116196,-0.212388,-0.102773,...,-0.013743,0.444237,0.125920,-0.133720,0.045855,-0.151664,0.015843,-0.059231,-0.394932,0.033923
143416,0.327797,-0.085205,0.064469,-0.550351,0.044841,-0.241439,-0.093871,-0.116196,-0.212388,-0.102773,...,-0.013743,0.444237,0.125920,-0.133720,0.045855,-0.151664,0.015843,-0.059231,-0.394932,0.033923


In [36]:
df_raw_train = pd.concat([X_train.drop(columns=["IPV4_SRC_ADDR","IPV4_DST_ADDR", "h"]), y_train], axis=1)
df_raw_test = pd.concat([X_test.drop(columns=["IPV4_SRC_ADDR","IPV4_DST_ADDR", "h"]), y_test], axis=1)

In [37]:
df_raw_train

Unnamed: 0,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,SERVER_TCP_FLAGS,FLOW_DURATION_MILLISECONDS,...,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Attack,Label
220614,4.802078e-09,3.048648e-09,3.994566e-05,7.652425e-07,0.000072,1.071339e-06,1.980229e-09,1.980360e-09,0.000000e+00,0.000000,...,1.329685e-03,1.329685e-03,4.043553e-10,4.043553e-10,6.314168e-09,6.314212e-09,0.000000,3.618463e-09,Benign,0
329844,9.025364e-09,8.052833e-09,1.079121e-04,8.300931e-07,0.000063,8.300931e-07,1.397062e-08,1.396955e-08,1.452568e-08,0.000000,...,0.000000e+00,0.000000e+00,1.181795e-08,1.182599e-08,8.561578e-09,8.561639e-09,0.000000,9.271523e-09,Benign,0
1093665,3.068685e-13,8.437668e-13,1.638983e-07,3.061234e-09,0.000010,7.129838e-09,1.188386e-13,1.274923e-13,0.000000e+00,0.000042,...,6.372843e-07,7.080936e-08,4.545956e-14,5.191930e-14,4.034959e-13,4.034988e-13,0.000000,4.369547e-13,Benign,0
1764268,2.361402e-08,2.106949e-08,7.927293e-05,1.085931e-06,0.000097,1.085931e-06,3.655283e-08,3.655002e-08,3.800509e-08,0.000000,...,0.000000e+00,0.000000e+00,3.092057e-08,3.094159e-08,1.878071e-08,3.577436e-10,0.000033,2.425808e-08,Benign,0
1525176,2.309362e-10,2.856102e-10,1.717880e-05,2.649686e-07,0.000124,2.796890e-07,8.943290e-11,9.594530e-11,0.000000e+00,0.000000,...,1.598644e-04,1.065762e-04,5.174884e-12,5.174884e-12,3.036539e-10,3.036561e-10,0.000000,3.288336e-10,Benign,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1525437,1.940855e-09,1.232171e-09,1.123952e-04,2.041300e-06,0.000155,2.165015e-06,8.003488e-10,8.004016e-10,0.000000e+00,0.000000,...,1.253977e-03,1.074837e-03,1.006141e-10,1.006141e-10,2.551995e-09,2.552013e-09,0.000000,0.000000e+00,Benign,0
1636406,2.617265e-08,2.335241e-08,7.823357e-05,1.203593e-06,0.000097,1.203593e-06,4.051340e-08,4.051029e-08,4.212302e-08,0.000000,...,0.000000e+00,0.000000e+00,3.427088e-08,3.429417e-08,2.053465e-08,3.965058e-10,0.000036,2.688649e-08,Benign,0
152133,4.439600e-10,5.490674e-10,5.999424e-06,1.131967e-07,0.000125,1.697950e-07,1.719290e-10,3.618107e-09,0.000000e+00,0.000000,...,1.434202e-04,8.195440e-05,8.057850e-10,8.063327e-10,5.837552e-10,5.837593e-10,0.000000,6.321614e-10,Benign,0
1164696,2.451807e-09,1.556554e-09,9.775598e-05,1.797272e-06,0.000137,1.953557e-06,1.011050e-09,1.011116e-09,0.000000e+00,0.000000,...,1.244650e-03,1.131500e-03,4.425931e-10,4.423147e-10,3.223837e-09,3.223860e-09,0.000000,0.000000e+00,Benign,0


In [36]:
raw_benign_train_samples = df_raw_train[df_raw_train.Label == 0].drop(columns=["Label", "Attack"])
raw_normal_train_samples = df_raw_train.drop(columns=["Label", "Attack"])

raw_train_labels = df_raw_train["Label"]
raw_test_labels = df_raw_test["Label"]

raw_test_samples = df_raw_test.drop(columns=["Label", "Attack"])

In [None]:
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]

In [None]:
from pyod.models.cblof import CBLOF
n_est = [2,3,5,7,9,10]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    clf_if = CBLOF(n_clusters=n_est, contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [04:24<00:00,  7.35s/it]


{'n_estimators': 2, 'con': 0.01}
0.8076747200514057
              precision    recall  f1-score   support

           0     0.9817    0.9914    0.9865    137714
           1     0.7282    0.5533    0.6288      5704

    accuracy                         0.9740    143418
   macro avg     0.8549    0.7724    0.8077    143418
weighted avg     0.9716    0.9740    0.9723    143418



In [None]:
n_est = [2,3,5,7,9,10]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    clf_if = CBLOF(n_clusters=n_est, contamination=con)
    clf_if.fit(normal_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [04:00<00:00,  6.67s/it]


{'n_estimators': 2, 'con': 0.05}
0.9061445972752045
              precision    recall  f1-score   support

           0     0.9961    0.9874    0.9917    137714
           1     0.7488    0.9074    0.8205      5704

    accuracy                         0.9842    143418
   macro avg     0.8725    0.9474    0.9061    143418
weighted avg     0.9863    0.9842    0.9849    143418



In [None]:
###  CBLOF RAW

In [41]:
from pyod.models.cblof import CBLOF

n_est = [2,3,5,7,9,10]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    try:
        clf_b = CBLOF(n_clusters=n_est, contamination=con)
        clf_b.fit(raw_benign_train_samples)
    except ValueError as e:
        print(n_est)
        continue  
   
    y_pred = clf_b.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                        "con": con
                }
        bs = test_pred
    del clf_b
    gc.collect()

  

print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

  3%|▎         | 1/36 [00:00<00:07,  4.52it/s]

2


  6%|▌         | 2/36 [00:00<00:07,  4.64it/s]

2


  8%|▊         | 3/36 [00:00<00:07,  4.71it/s]

2


 11%|█         | 4/36 [00:00<00:06,  4.59it/s]

2


 14%|█▍        | 5/36 [00:01<00:06,  4.65it/s]

2


 17%|█▋        | 6/36 [00:01<00:06,  4.63it/s]

2


 22%|██▏       | 8/36 [00:03<00:18,  1.52it/s]

3


 28%|██▊       | 10/36 [00:04<00:14,  1.77it/s]

3


 33%|███▎      | 12/36 [00:06<00:15,  1.54it/s]

3


100%|██████████| 36/36 [00:30<00:00,  1.18it/s]

{'n_estimators': 3, 'con': 0.04}
0.7183189342694002
              precision    recall  f1-score   support

           0     0.9793    0.9716    0.9755     68857
           1     0.4244    0.5049    0.4612      2852

    accuracy                         0.9531     71709
   macro avg     0.7019    0.7383    0.7183     71709
weighted avg     0.9573    0.9531    0.9550     71709






In [42]:
n_est = [2,3,5,7,9,10]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    try:
        clf_if = CBLOF(n_clusters=n_est, contamination=con)
        clf_if.fit(raw_normal_train_samples)
    except ValueError as e:
        print(n_est)
        continue  
    
    y_pred = clf_if.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est
                }
        bs = test_pred
    del clf_if
    gc.collect()

        

print("benign only")
print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

  3%|▎         | 1/36 [00:00<00:17,  2.02it/s]

2


  6%|▌         | 2/36 [00:00<00:16,  2.11it/s]

2


  8%|▊         | 3/36 [00:01<00:13,  2.44it/s]

2


 11%|█         | 4/36 [00:01<00:12,  2.62it/s]

2


 14%|█▍        | 5/36 [00:01<00:10,  2.91it/s]

2


 17%|█▋        | 6/36 [00:02<00:09,  3.20it/s]

2


 19%|█▉        | 7/36 [00:02<00:09,  3.03it/s]

3


 22%|██▏       | 8/36 [00:02<00:09,  2.90it/s]

3


 31%|███       | 11/36 [00:04<00:12,  1.97it/s]

3


100%|██████████| 36/36 [00:29<00:00,  1.23it/s]

benign only
{'n_estimators': 3}
0.7173358973597662
              precision    recall  f1-score   support

           0     0.9793    0.9714    0.9753     68857
           1     0.4218    0.5042    0.4594      2852

    accuracy                         0.9528     71709
   macro avg     0.7006    0.7378    0.7173     71709
weighted avg     0.9571    0.9528    0.9548     71709






In [43]:
# HBOS  Embeddings

In [37]:
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]

In [46]:
from pyod.models.hbos import HBOS

n_est = [5,10,15,20,25,30]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    clf_if = HBOS(n_bins=n_est, contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [05:06<00:00,  8.50s/it]


{'n_estimators': 25, 'con': 0.01}
0.9200514280151223
              precision    recall  f1-score   support

           0     0.9984    0.9873    0.9928    137714
           1     0.7578    0.9607    0.8473      5704

    accuracy                         0.9862    143418
   macro avg     0.8781    0.9740    0.9201    143418
weighted avg     0.9888    0.9862    0.9870    143418



In [47]:
n_est = [5,10,15,20,25,30]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    clf_if = HBOS(n_bins=n_est, contamination=con)
    clf_if.fit(normal_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [05:11<00:00,  8.64s/it]

{'n_estimators': 5, 'con': 0.05}
0.9264385476143018
              precision    recall  f1-score   support

           0     1.0000    0.9865    0.9932    137714
           1     0.7539    1.0000    0.8597      5704

    accuracy                         0.9870    143418
   macro avg     0.8769    0.9932    0.9264    143418
weighted avg     0.9902    0.9870    0.9879    143418






In [48]:
##  HBOS  RAw

In [49]:
from pyod.models.cblof import CBLOF

n_est = [5,10,15,20,25,30]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    try:
        clf_b = HBOS(n_bins=n_est, contamination=con)
        clf_b.fit(raw_benign_train_samples)
    except ValueError as e:
        print(n_est)
        continue  
   
    y_pred = clf_b.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                        "con": con
                }
        bs = test_pred
    del clf_b
    gc.collect()

  

print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 36/36 [00:19<00:00,  1.84it/s]

{'n_estimators': 30, 'con': 0.04}
0.7146257915510277
              precision    recall  f1-score   support

           0     0.9820    0.9613    0.9715     68857
           1     0.3805    0.5743    0.4577      2852

    accuracy                         0.9459     71709
   macro avg     0.6812    0.7678    0.7146     71709
weighted avg     0.9581    0.9459    0.9511     71709






In [50]:
n_est = [5,10,15,20,25,30]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    try:
        clf_if = HBOS(n_bins=n_est, contamination=con)
        clf_if.fit(raw_normal_train_samples)
    except ValueError as e:
        print(n_est)
        continue  
    
    y_pred = clf_if.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est
                }
        bs = test_pred
    del clf_if
    gc.collect()

        

print("benign only")
print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 36/36 [00:19<00:00,  1.85it/s]

benign only
{'n_estimators': 30}
0.6688907552042004
              precision    recall  f1-score   support

           0     0.9740    0.9719    0.9730     68857
           1     0.3556    0.3745    0.3648      2852

    accuracy                         0.9481     71709
   macro avg     0.6648    0.6732    0.6689     71709
weighted avg     0.9494    0.9481    0.9488     71709






In [51]:
##  PCA  Emb

In [52]:
from pyod.models.pca import PCA
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = PCA(n_components=n_est, contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

  0%|          | 0/36 [00:00<?, ?it/s]

100%|██████████| 36/36 [01:53<00:00,  3.16s/it]


{'n_estimators': 5, 'con': 0.01}
0.8076747200514057
              precision    recall  f1-score   support

           0     0.9817    0.9914    0.9865    137714
           1     0.7282    0.5533    0.6288      5704

    accuracy                         0.9740    143418
   macro avg     0.8549    0.7724    0.8077    143418
weighted avg     0.9716    0.9740    0.9723    143418



In [53]:
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = PCA(n_components=n_est, contamination=con)
    clf_if.fit(normal_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [01:52<00:00,  3.13s/it]

{'n_estimators': 5, 'con': 0.05}
0.9102031301543931
              precision    recall  f1-score   support

           0     0.9969    0.9871    0.9920    137714
           1     0.7489    0.9267    0.8284      5704

    accuracy                         0.9847    143418
   macro avg     0.8729    0.9569    0.9102    143418
weighted avg     0.9871    0.9847    0.9855    143418






In [54]:
##  PCA  RAw

In [55]:
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = PCA(n_components=n_est, contamination=con)
    clf_if.fit(raw_benign_train_samples)
   
    y_pred = clf_if.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                        "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()

  

print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

  0%|          | 0/36 [00:00<?, ?it/s]

100%|██████████| 36/36 [00:16<00:00,  2.17it/s]

{'n_estimators': 15, 'con': 0.04}
0.707439747737215
              precision    recall  f1-score   support

           0     0.9815    0.9598    0.9705     68857
           1     0.3671    0.5628    0.4444      2852

    accuracy                         0.9440     71709
   macro avg     0.6743    0.7613    0.7074     71709
weighted avg     0.9570    0.9440    0.9496     71709






In [56]:
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = PCA(n_components=n_est, contamination=con)
    clf_if.fit(raw_normal_train_samples)

    y_pred = clf_if.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est
                }
        bs = test_pred
    del clf_if
    gc.collect()

        

print("benign only")
print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 36/36 [00:17<00:00,  2.07it/s]

benign only
{'n_estimators': 5}
0.6589707768342642
              precision    recall  f1-score   support

           0     0.9843    0.9222    0.9522     68857
           1     0.2554    0.6441    0.3657      2852

    accuracy                         0.9111     71709
   macro avg     0.6198    0.7832    0.6590     71709
weighted avg     0.9553    0.9111    0.9289     71709






In [57]:
##  IF  Emb

In [58]:
from sklearn.ensemble import IsolationForest
n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = IsolationForest(n_estimators=n_est, contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

  0%|          | 0/24 [00:00<?, ?it/s]

100%|██████████| 24/24 [01:15<00:00,  3.13s/it]


{'n_estimators': 20, 'con': 0.01}
0.8751162777785272
              precision    recall  f1-score   support

           0     0.9901    0.9900    0.9900    137714
           1     0.7585    0.7619    0.7602      5704

    accuracy                         0.9809    143418
   macro avg     0.8743    0.8759    0.8751    143418
weighted avg     0.9809    0.9809    0.9809    143418



In [59]:
from sklearn.ensemble import IsolationForest

n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = IsolationForest(n_estimators=n_est, contamination=con)
    clf_if.fit(normal_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 24/24 [01:18<00:00,  3.28s/it]


{'n_estimators': 50, 'con': 0.05}
0.9264385476143018
              precision    recall  f1-score   support

           0     1.0000    0.9865    0.9932    137714
           1     0.7539    1.0000    0.8597      5704

    accuracy                         0.9870    143418
   macro avg     0.8769    0.9932    0.9264    143418
weighted avg     0.9902    0.9870    0.9879    143418



In [60]:
##  IF  Raw

In [37]:
from sklearn.ensemble import IsolationForest

n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = IsolationForest(n_estimators=n_est, contamination=con)
    clf_if.fit(raw_benign_train_samples.to_numpy())
   
    y_pred = clf_if.predict(raw_test_samples.to_numpy())
    test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                        "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()

  

print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 24/24 [00:24<00:00,  1.04s/it]

{'n_estimators': 20, 'con': 0.1}
0.6790403287090715
              precision    recall  f1-score   support

           0     0.9923    0.9081    0.9483     68857
           1     0.2721    0.8296    0.4098      2852

    accuracy                         0.9049     71709
   macro avg     0.6322    0.8688    0.6790     71709
weighted avg     0.9636    0.9049    0.9269     71709






In [62]:
n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = IsolationForest(n_estimators=n_est, contamination=con)
    clf_if.fit(raw_normal_train_samples.to_numpy())

    y_pred = clf_if.predict(raw_test_samples.to_numpy())
    test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est
                }
        bs = test_pred
    del clf_if
    gc.collect()

        

print("benign only")
print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 24/24 [00:24<00:00,  1.00s/it]

benign only
{'n_estimators': 50}
0.6661127579055507
              precision    recall  f1-score   support

           0     0.9853    0.9231    0.9532     68857
           1     0.2646    0.6680    0.3790      2852

    accuracy                         0.9130     71709
   macro avg     0.6250    0.7955    0.6661     71709
weighted avg     0.9567    0.9130    0.9304     71709






In [None]:
# LOF Emb

In [None]:
from pyod.models.lof import LOF
n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = LOF(n_neighbors=n_est, contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = list(map(lambda x: 0 if x == 1 else 1, y_pred))

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                       }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

  0%|          | 0/24 [00:00<?, ?it/s]

In [None]:
#kNN Emb

In [None]:
from pyod.models.knn import KNN
n_est = [1,2,3,4,5]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = KNN(method="mean", contamination=con, n_neighbors=n_est)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = list(map(lambda x: 0 if x == 1 else 1, y_pred))

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                       }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

  0%|          | 0/30 [00:00<?, ?it/s]

In [1]:
# EllipticEnvelope Emb

In [41]:
from sklearn.covariance import EllipticEnvelope
n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = EllipticEnvelope(contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = list(map(lambda x: 0 if x == 1 else 1, y_pred))

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                       }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

  0%|          | 0/24 [03:27<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Isolation Kernel Emb

In [45]:
from sklearn.cluster import KMeans
n_est = [20, 50, 100, 150]
score = -1
bs = None

for n_es in tqdm(n_est):
    clf_if = KMeans(n_clusters=n_es,random_state=42)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = list(map(lambda x: 0 if x == 1 else 1, y_pred))

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_es,
                       "con": con
                       }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

{'n_estimators': 50, 'con': 0.01}
0.03825055994420676
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000    137714
           1     0.0398    1.0000    0.0765      5704

    accuracy                         0.0398    143418
   macro avg     0.0199    0.5000    0.0383    143418
weighted avg     0.0016    0.0398    0.0030    143418



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [44]:
# COF emb

In [39]:
from pyod.models.cof import COF
n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = COF(contamination=con,n_neighbors=n_est,method='memory')
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = list(map(lambda x: 0 if x == 1 else 1, y_pred))

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                       }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

  0%|          | 0/24 [02:22<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# COF emb

In [None]:
from pyod.models.sod import SOD
n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = SOD(contamination=con, n_neighbors=n_est)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = list(map(lambda x: 0 if x == 1 else 1, y_pred))

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                       }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

  0%|          | 0/24 [00:00<?, ?it/s]

In [1]:
# SOS emb

In [83]:
from pyod.models.sos import SOS
n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for con in tqdm(cont):
    clf_if = SOS(contamination=con)
    clf_if.fit(raw_normal_train_samples)
    y_pred = clf_if.predict(raw_test_samples)
    test_pred = list(map(lambda x: 0 if x == 1 else 1, y_pred))

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                       }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

  0%|          | 0/6 [00:00<?, ?it/s]


MemoryError: Unable to allocate 209. GiB for an array with shape (167318, 167318) and data type float64

In [96]:
# SUOD emb

In [98]:
from pyod.models.suod import SUOD
from pyod.models.iforest import IForest
from pyod.models.pca import PCA

n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for con in tqdm(cont):
    base_est = [IForest(), PCA()]
    clf_if = SUOD(base_estimators=base_est, contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = list(map(lambda x: 0 if x == 1 else 1, y_pred))

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                       }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

  0%|          | 0/6 [00:00<?, ?it/s]

RandomForestRegressor()



  0%|          | 0/6 [00:17<?, ?it/s]


KeyboardInterrupt: 

In [39]:
# Feature Bragging Emb

In [37]:
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.knn import KNN
from pyod.models.cblof import CBLOF
from pyod.models.iforest import IForest
from sklearn.ensemble import IsolationForest

n_est = [20]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    base_est = IForest()
    clf_if = FeatureBagging(base_estimator=base_est, contamination=con, n_estimators=n_est)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = list(map(lambda x: 0 if x == 1 else 1, y_pred))

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                       }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 6/6 [04:17<00:00, 42.87s/it]

{'n_estimators': 20, 'con': 0.2}
0.4246479720784691
              precision    recall  f1-score   support

           0     0.9489    0.7686    0.8493    137714
           1     0.0000    0.0000    0.0000      5704

    accuracy                         0.7381    143418
   macro avg     0.4744    0.3843    0.4246    143418
weighted avg     0.9111    0.7381    0.8155    143418






In [38]:
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.knn import KNN
from pyod.models.cblof import CBLOF
from pyod.models.iforest import IForest
from sklearn.ensemble import IsolationForest
from pyod.models.auto_encoder import AutoEncoder

n_est = [20]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params[:1]):
    clf_if = AutoEncoder(contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = list(map(lambda x: 0 if x == 1 else 1, y_pred))

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                       }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

Training: 100%|██████████| 10/10 [02:45<00:00, 16.58s/it]
100%|██████████| 1/1 [02:56<00:00, 176.11s/it]

{'n_estimators': 20, 'con': 0.001}
0.03424816280638335
              precision    recall  f1-score   support

           0     0.2819    0.0032    0.0063    137714
           1     0.0324    0.8049    0.0622      5704

    accuracy                         0.0351    143418
   macro avg     0.1571    0.4040    0.0342    143418
weighted avg     0.2720    0.0351    0.0085    143418




