In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn import preprocessing
from dgl.data import DGLDataset
import dgl
import time
import networkx as nx
import category_encoders as ce
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
import torch
import tqdm
import math

from typing import *
from sklearn.preprocessing import StandardScaler, Normalizer
import socket
import struct
import random
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
file_name = "NF-UNSW-NB15-v3.parquet"
data = pd.read_parquet(file_name)

In [3]:
data.Label.value_counts()

Label
0    2151027
1      91904
Name: count, dtype: int64

In [4]:
data.rename(columns=lambda x: x.strip(), inplace=True)
data['IPV4_SRC_ADDR'] = data["IPV4_SRC_ADDR"].apply(str)
data['L4_SRC_PORT'] = data["L4_SRC_PORT"].apply(str)
data['IPV4_DST_ADDR'] = data["IPV4_DST_ADDR"].apply(str)
data['L4_DST_PORT'] = data["L4_DST_PORT"].apply(str)

In [5]:
data.drop(columns=["L4_SRC_PORT", "L4_DST_PORT"], inplace=True)

In [6]:
data.Attack.unique()

array(['Benign', 'Fuzzers', 'Exploits', 'Backdoor', 'Generic', 'DoS',
       'Reconnaissance', 'Shellcode', 'Analysis', 'Worms'], dtype=object)

In [7]:
data = data.groupby(by='Attack').sample(frac=0.1, random_state=13)

In [8]:
data.groupby(by="Attack").count()

Unnamed: 0_level_0,FLOW_START_MILLISECONDS,FLOW_END_MILLISECONDS,IPV4_SRC_ADDR,IPV4_DST_ADDR,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,...,FTP_COMMAND_RET_CODE,SRC_TO_DST_IAT_MIN,SRC_TO_DST_IAT_MAX,SRC_TO_DST_IAT_AVG,SRC_TO_DST_IAT_STDDEV,DST_TO_SRC_IAT_MIN,DST_TO_SRC_IAT_MAX,DST_TO_SRC_IAT_AVG,DST_TO_SRC_IAT_STDDEV,Label
Attack,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Analysis,123,123,123,123,123,123,123,123,123,123,...,123,123,123,123,123,123,123,123,123,123
Backdoor,345,345,345,345,345,345,345,345,345,345,...,345,345,345,345,345,345,345,345,345,345
Benign,215103,215103,215103,215103,215103,215103,215103,215103,215103,215103,...,215103,215103,215103,215103,215103,215103,215103,215103,215103,215103
DoS,505,505,505,505,505,505,505,505,505,505,...,505,505,505,505,505,505,505,505,505,505
Exploits,3882,3882,3882,3882,3882,3882,3882,3882,3882,3882,...,3882,3882,3882,3882,3882,3882,3882,3882,3882,3882
Fuzzers,2559,2559,2559,2559,2559,2559,2559,2559,2559,2559,...,2559,2559,2559,2559,2559,2559,2559,2559,2559,2559
Generic,476,476,476,476,476,476,476,476,476,476,...,476,476,476,476,476,476,476,476,476,476
Reconnaissance,1129,1129,1129,1129,1129,1129,1129,1129,1129,1129,...,1129,1129,1129,1129,1129,1129,1129,1129,1129,1129
Shellcode,159,159,159,159,159,159,159,159,159,159,...,159,159,159,159,159,159,159,159,159,159
Worms,14,14,14,14,14,14,14,14,14,14,...,14,14,14,14,14,14,14,14,14,14


In [9]:
# X = data.drop(columns=["Attack", "Label"])
# calculate the distance miliseconds betweeen columns 2 and 1, put the new column name FLOW_TIME_DIFF (Modified)
# data['FLOW_DIFF_MILISECONDS'] = (data['FLOW_END_MILLISECONDS'] - data['FLOW_START_MILLISECONDS'])
X = data.drop(columns=["Attack", "Label", "FLOW_START_MILLISECONDS", "FLOW_END_MILLISECONDS",
                       "SRC_TO_DST_IAT_MIN", "SRC_TO_DST_IAT_MAX", "SRC_TO_DST_IAT_AVG",
                       "SRC_TO_DST_IAT_STDDEV", "DST_TO_SRC_IAT_MIN", "DST_TO_SRC_IAT_MAX",
                       "DST_TO_SRC_IAT_AVG", "DST_TO_SRC_IAT_STDDEV"])
y = data[["Attack", "Label"]]

X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=13, stratify=y)

In [10]:
encoder = ce.TargetEncoder(cols=['TCP_FLAGS','L7_PROTO','PROTOCOL',
                                  'CLIENT_TCP_FLAGS','SERVER_TCP_FLAGS','ICMP_TYPE',
                                  'ICMP_IPV4_TYPE','DNS_QUERY_ID','DNS_QUERY_TYPE',
                                  'FTP_COMMAND_RET_CODE'])
encoder.fit(X_train, y_train.Label)

# Transform on training set
X_train = encoder.transform(X_train)

# Transform on testing set
X_test = encoder.transform(X_test)

In [11]:
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

In [12]:
scaler = Normalizer()
cols_to_norm = list(set(list(X_train.iloc[:, 2:].columns))) # Ignore first two as the represents IP addresses
scaler.fit(X_train[cols_to_norm])

# Transform on training set
X_train[cols_to_norm] = scaler.transform(X_train[cols_to_norm])
X_train['h'] = X_train.iloc[:, 2:].values.tolist()

# Transform on testing set
X_test[cols_to_norm] = scaler.transform(X_test[cols_to_norm])
X_test['h'] = X_test.iloc[:, 2:].values.tolist()

train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [13]:
X_train.head()

Unnamed: 0,IPV4_SRC_ADDR,IPV4_DST_ADDR,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,NUM_PKTS_1024_TO_1514_BYTES,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,h
287406,59.166.0.8,149.171.126.4,4.732152e-09,0.0,0.003486,5e-06,0.000317,4e-06,1.5059e-09,1.61155e-09,...,2.249372e-06,0.001493,0.00285,2.127056e-08,2.097309e-08,4.396601e-09,4.397286e-09,0.0,4.240589e-09,"[4.732152333835413e-09, 0.0, 0.003485589444774..."
610356,59.166.0.9,149.171.126.6,6.932495e-10,7.976222e-09,0.000215,2e-06,0.000128,2e-06,9.621527e-10,9.621527e-10,...,0.0,0.0,0.0,4.010061e-09,4.009943e-09,1.969528e-08,1.969835e-08,0.0,1.89964e-08,"[6.932494894241551e-10, 7.976221604219919e-09,..."
1456518,59.166.0.2,149.171.126.2,6.381741e-07,2.401223e-07,0.060366,0.000354,0.03893,0.000379,2.030844e-07,2.173323e-07,...,0.0,0.18302,0.18302,4.321202e-07,4.840666e-07,5.929219e-07,5.930143e-07,0.0,5.718822e-07,"[6.381740567526722e-07, 2.40122348914616e-07, ..."
2204716,59.166.0.0,149.171.126.0,1.390953e-08,2.334703e-08,0.000405,6e-06,0.000476,4e-06,4.426392e-09,4.736937e-09,...,0.0,0.002792,0.003191,4.531849e-09,4.766241e-09,1.292322e-08,1.292524e-08,0.0,1.246465e-08,"[1.3909530687793816e-08, 2.334702611325599e-08..."
1090799,59.166.0.1,149.171.126.3,4.602189e-09,0.0,0.000213,3e-06,0.001485,3e-06,1.464542e-09,1.567291e-09,...,9.114981e-07,0.00198,0.00132,8.530939e-11,8.530939e-11,4.275853e-09,4.276519e-09,0.0,4.124125e-09,"[4.602188564242696e-09, 0.0, 0.000212743645752..."


In [14]:
lab_enc = preprocessing.LabelEncoder()
lab_enc.fit(data["Attack"])

# Transform on training set
train["Attack"] = lab_enc.transform(train["Attack"])

# Transform on testing set
test["Attack"] = lab_enc.transform(test["Attack"])

In [15]:
# Training graph

train_g = nx.from_pandas_edgelist(train, "IPV4_SRC_ADDR", "IPV4_DST_ADDR",
           ["h", "Label", "Attack"], create_using=nx.MultiGraph())
train_g = train_g.to_directed()
train_g = dgl.from_networkx(train_g, edge_attrs=['h', 'Attack', 'Label'])
nfeat_weight = torch.ones([train_g.number_of_nodes(),
train_g.edata['h'].shape[1]])
train_g.ndata['h'] = nfeat_weight

# Testing graph
test_g = nx.from_pandas_edgelist(test, "IPV4_SRC_ADDR", "IPV4_DST_ADDR",
            ["h", "Label", "Attack"], create_using=nx.MultiGraph())

test_g = test_g.to_directed()
test_g = dgl.from_networkx(test_g, edge_attrs=['h', 'Attack', 'Label'])
nfeat_weight = torch.ones([test_g.number_of_nodes(),
test_g.edata['h'].shape[1]])
test_g.ndata['h'] = nfeat_weight

In [16]:
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
import tqdm
import gc

class SAGELayer(nn.Module):
    def __init__(self, ndim_in, edims, ndim_out, activation):
      super(SAGELayer, self).__init__()
      self.W_apply = nn.Linear(ndim_in + edims , ndim_out)
      self.activation = F.relu
      self.W_edge = nn.Linear(128 * 2, 256)
      self.reset_parameters()

    def reset_parameters(self):
      gain = nn.init.calculate_gain('relu')
      nn.init.xavier_uniform_(self.W_apply.weight, gain=gain)

    def message_func(self, edges):
      return {'m':  edges.data['h']}

    def forward(self, g_dgl, nfeats, efeats):
      with g_dgl.local_scope():
        g = g_dgl
        g.ndata['h'] = nfeats
        g.edata['h'] = efeats
        g.update_all(self.message_func, fn.mean('m', 'h_neigh'))
        g.ndata['h'] = F.relu(self.W_apply(torch.cat([g.ndata['h'], g.ndata['h_neigh']], 2)))

        # Compute edge embeddings
        u, v = g.edges()
        edge = self.W_edge(torch.cat((g.srcdata['h'][u], g.dstdata['h'][v]), 2))
        return g.ndata['h'], edge

In [17]:
class SAGE(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim,  activation):
      super(SAGE, self).__init__()
      self.layers = nn.ModuleList()
      self.layers.append(SAGELayer(ndim_in, edim, 128, F.relu))

    def forward(self, g, nfeats, efeats, corrupt=False):
      if corrupt:
        e_perm = torch.randperm(g.number_of_edges())
        #n_perm = torch.randperm(g.number_of_nodes())
        efeats = efeats[e_perm]
        #nfeats = nfeats[n_perm]
      for i, layer in enumerate(self.layers):
        #nfeats = layer(g, nfeats, efeats)
        nfeats, e_feats = layer(g, nfeats, efeats)
      #return nfeats.sum(1)
      return nfeats.sum(1), e_feats.sum(1)

In [18]:
class Discriminator(nn.Module):
    def __init__(self, n_hidden):
      super(Discriminator, self).__init__()
      self.weight = nn.Parameter(torch.Tensor(n_hidden, n_hidden))
      self.reset_parameters()

    def uniform(self, size, tensor):
      bound = 1.0 / math.sqrt(size)
      if tensor is not None:
        tensor.data.uniform_(-bound, bound)

    def reset_parameters(self):
      size = self.weight.size(0)
      self.uniform(size, self.weight)

    def forward(self, features, summary):
      features = torch.matmul(features, torch.matmul(self.weight, summary))
      return features

In [19]:
class DGI(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim, activation):
      super(DGI, self).__init__()
      self.encoder = SAGE(ndim_in, ndim_out, edim,  F.relu)
      #self.discriminator = Discriminator(128)
      self.discriminator = Discriminator(256)
      self.loss = nn.BCEWithLogitsLoss()

    def forward(self, g, n_features, e_features):
      positive = self.encoder(g, n_features, e_features, corrupt=False)
      negative = self.encoder(g, n_features, e_features, corrupt=True)
      self.loss = nn.BCEWithLogitsLoss()

    def forward(self, g, n_features, e_features):
      positive = self.encoder(g, n_features, e_features, corrupt=False)
      negative = self.encoder(g, n_features, e_features, corrupt=True)

      positive = positive[1]
      negative = negative[1]

      summary = torch.sigmoid(positive.mean(dim=0))

      positive = self.discriminator(positive, summary)
      negative = self.discriminator(negative, summary)

      l1 = self.loss(positive, torch.ones_like(positive))
      l2 = self.loss(negative, torch.zeros_like(negative))

      return l1 + l2

In [20]:
ndim_in = train_g.ndata['h'].shape[1]
hidden_features = 128
ndim_out = 128
num_layers = 1
edim = train_g.edata['h'].shape[1]
learning_rate = 1e-3
epochs = 4000

In [21]:
dgi = DGI(ndim_in,
    ndim_out,
    edim,
    F.relu)

dgi = dgi.to('cuda')

dgi_optimizer = torch.optim.Adam(dgi.parameters(),
                lr=1e-3,
                weight_decay=0.)

In [22]:
# Format node and edge features for E-GraphSAGE
train_g.ndata['h'] = torch.reshape(train_g.ndata['h'],
                                   (train_g.ndata['h'].shape[0], 1,
                                    train_g.ndata['h'].shape[1]))

train_g.edata['h'] = torch.reshape(train_g.edata['h'],
                                   (train_g.edata['h'].shape[0], 1,
                                    train_g.edata['h'].shape[1]))

In [23]:
# Convert to GPU
train_g = train_g.to('cuda')

In [24]:
cnt_wait = 0
best = 1e9
best_t = 0
dur = []
node_features = train_g.ndata['h'] 
edge_features = train_g.edata['h']

for epoch in range(epochs):
    dgi.train()
    if epoch >= 3:
        t0 = time.time()

    dgi_optimizer.zero_grad()
    loss = dgi(train_g, node_features, edge_features)
    loss.backward()
    dgi_optimizer.step()

    if loss < best:
        best = loss
        best_t = epoch
        cnt_wait = 0
        torch.save(dgi.state_dict(), 'best_dgi_UNSW_v3.pkl')
    else:
        cnt_wait += 1

  # if cnt_wait == patience:
  #     print('Early stopping!')
  #     break

    if epoch >= 3:
        dur.append(time.time() - t0)

    if epoch % 50 == 0:

        print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | "
            "ETputs(KTEPS) {:.2f}".format(epoch, np.mean(dur),
              loss.item(),
              train_g.num_edges() / np.mean(dur) / 1000))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Epoch 00000 | Time(s) nan | Loss 1.9401 | ETputs(KTEPS) nan
Epoch 00050 | Time(s) 0.0902 | Loss 1.3919 | ETputs(KTEPS) 3481.73
Epoch 00100 | Time(s) 0.0922 | Loss 1.3774 | ETputs(KTEPS) 3407.37
Epoch 00150 | Time(s) 0.0922 | Loss 1.3526 | ETputs(KTEPS) 3406.03
Epoch 00200 | Time(s) 0.0919 | Loss 1.3576 | ETputs(KTEPS) 3417.49
Epoch 00250 | Time(s) 0.0917 | Loss 1.3371 | ETputs(KTEPS) 3422.94
Epoch 00300 | Time(s) 0.0915 | Loss 1.2568 | ETputs(KTEPS) 3432.78
Epoch 00350 | Time(s) 0.0912 | Loss 1.0818 | ETputs(KTEPS) 3443.23
Epoch 00400 | Time(s) 0.0910 | Loss 0.6947 | ETputs(KTEPS) 3451.14
Epoch 00450 | Time(s) 0.0909 | Loss 0.2463 | ETputs(KTEPS) 3456.13
Epoch 00500 | Time(s) 0.0908 | Loss 0.1004 | ETputs(KTEPS) 3458.52
Epoch 00550 | Time(s) 0.0907 | Loss 0.0519 | ETputs(KTEPS) 3461.74
Epoch 00600 | Time(s) 0.0906 | Loss 0.0413 | ETputs(KTEPS) 3465.10
Epoch 00650 | Time(s) 0.0906 | Loss 0.0243 | ETputs(KTEPS) 3467.75
Epoch 00700 | Time(s) 0.0905 | Loss 0.0175 | ETputs(KTEPS) 3469.76
Ep

In [25]:
dgi.load_state_dict(torch.load('best_dgi_UNSW_v3.pkl'))

  dgi.load_state_dict(torch.load('best_dgi_UNSW_v3.pkl'))


<All keys matched successfully>

In [26]:
training_emb = dgi.encoder(train_g, train_g.ndata['h'], train_g.edata['h'])[1]
training_emb = training_emb.detach().cpu().numpy()

In [27]:
test_g.ndata['h'] = torch.reshape(test_g.ndata['h'],
                                   (test_g.ndata['h'].shape[0], 1,
                                    test_g.ndata['h'].shape[1]))



test_g.edata['h'] = torch.reshape(test_g.edata['h'],
                                   (test_g.edata['h'].shape[0], 1,
                                    test_g.edata['h'].shape[1]))

In [28]:
# Convert to GPU
test_g = test_g.to('cuda')

In [29]:
testing_emb = dgi.encoder(test_g, test_g.ndata['h'], test_g.edata['h'])[1]
testing_emb = testing_emb.detach().cpu().numpy()

In [30]:
df_train = pd.DataFrame(training_emb, )
df_train["Attack"] = lab_enc.inverse_transform(
        train_g.edata['Attack'].detach().cpu().numpy())
df_train["Label"] = train_g.edata['Label'].detach().cpu().numpy()

df_test = pd.DataFrame(testing_emb, )
df_test["Attack"] = lab_enc.inverse_transform(
        test_g.edata['Attack'].detach().cpu().numpy())
df_test["Label"] = test_g.edata['Label'].detach().cpu().numpy()

In [31]:
df_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,248,249,250,251,252,253,254,255,Attack,Label
0,0.086565,-0.078549,0.047042,-0.007064,0.074815,0.074366,0.079844,0.014425,0.006140,0.175500,...,0.060640,0.072725,-0.035717,0.003195,-0.009364,-0.047279,-0.045267,0.116082,Benign,0
1,0.086565,-0.078549,0.047042,-0.007064,0.074815,0.074366,0.079844,0.014425,0.006140,0.175500,...,0.060640,0.072725,-0.035717,0.003195,-0.009364,-0.047279,-0.045267,0.116082,Benign,0
2,0.086565,-0.078549,0.047042,-0.007064,0.074815,0.074366,0.079844,0.014425,0.006140,0.175500,...,0.060640,0.072725,-0.035717,0.003195,-0.009364,-0.047279,-0.045267,0.116082,Benign,0
3,0.086565,-0.078549,0.047042,-0.007064,0.074815,0.074366,0.079844,0.014425,0.006140,0.175500,...,0.060640,0.072725,-0.035717,0.003195,-0.009364,-0.047279,-0.045267,0.116082,Benign,0
4,0.086565,-0.078549,0.047042,-0.007064,0.074815,0.074366,0.079844,0.014425,0.006140,0.175500,...,0.060640,0.072725,-0.035717,0.003195,-0.009364,-0.047279,-0.045267,0.116082,Benign,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
314005,0.074496,-0.016501,0.164899,0.047463,-0.003604,0.042451,0.055327,-0.040595,0.019819,0.295323,...,0.072077,0.107858,-0.025936,0.070601,-0.004867,0.010269,0.003222,0.056055,Benign,0
314006,0.074496,-0.016501,0.164899,0.047463,-0.003604,0.042451,0.055327,-0.040595,0.019819,0.295323,...,0.072077,0.107858,-0.025936,0.070601,-0.004867,0.010269,0.003222,0.056055,Benign,0
314007,0.075656,-0.013816,0.163346,0.043606,-0.003214,0.038514,0.052368,-0.043132,0.019450,0.294129,...,0.071256,0.107451,-0.026052,0.070893,-0.005146,0.010128,0.006694,0.058039,Benign,0
314008,0.075741,-0.012053,0.165243,0.042784,-0.002876,0.033116,0.050093,-0.041187,0.021071,0.292281,...,0.072707,0.107472,-0.027843,0.070031,-0.005759,0.010884,0.010045,0.058938,Benign,0


# Embeddings CBLOF  Embeddings

In [32]:
import torch
import dgl
import numpy as np
import pandas as pd
import torch.optim as optim
import time
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.ensemble import IsolationForest
import gc

from tqdm import tqdm
import itertools

In [33]:
benign_train_samples = df_train[df_train.Label == 0].drop(columns=["Label", "Attack"])
normal_train_samples = df_train.drop(columns=["Label", "Attack"])

train_labels = df_train["Label"]
test_labels = df_test["Label"]

test_samples = df_test.drop(columns=["Label", "Attack"])

In [34]:
test_samples

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
0,0.086665,-0.077150,0.047454,-0.007279,0.074681,0.073748,0.078764,0.015200,0.006211,0.175336,...,0.042621,0.038639,0.060561,0.072690,-0.035716,0.004154,-0.009358,-0.047378,-0.044947,0.116477
1,0.086665,-0.077150,0.047454,-0.007279,0.074681,0.073748,0.078764,0.015200,0.006211,0.175336,...,0.042621,0.038639,0.060561,0.072690,-0.035716,0.004154,-0.009358,-0.047378,-0.044947,0.116477
2,0.086665,-0.077150,0.047454,-0.007279,0.074681,0.073748,0.078764,0.015200,0.006211,0.175336,...,0.042621,0.038639,0.060561,0.072690,-0.035716,0.004154,-0.009358,-0.047378,-0.044947,0.116477
3,0.086665,-0.077150,0.047454,-0.007279,0.074681,0.073748,0.078764,0.015200,0.006211,0.175336,...,0.042621,0.038639,0.060561,0.072690,-0.035716,0.004154,-0.009358,-0.047378,-0.044947,0.116477
4,0.086665,-0.077150,0.047454,-0.007279,0.074681,0.073748,0.078764,0.015200,0.006211,0.175336,...,0.042621,0.038639,0.060561,0.072690,-0.035716,0.004154,-0.009358,-0.047378,-0.044947,0.116477
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134570,0.076041,-0.011342,0.164471,0.042110,-0.003041,0.031472,0.049593,-0.040263,0.021010,0.291303,...,0.036148,-0.022990,0.072594,0.106978,-0.027857,0.070675,-0.006553,0.010637,0.010740,0.059727
134571,0.075704,-0.012132,0.165324,0.042877,-0.002877,0.033308,0.050156,-0.041293,0.021071,0.292362,...,0.037042,-0.023410,0.072710,0.107524,-0.027825,0.069943,-0.005669,0.010920,0.009956,0.058846
134572,0.075704,-0.012132,0.165324,0.042877,-0.002877,0.033308,0.050156,-0.041293,0.021071,0.292362,...,0.037042,-0.023410,0.072710,0.107524,-0.027825,0.069943,-0.005669,0.010920,0.009956,0.058846
134573,0.075704,-0.012132,0.165324,0.042877,-0.002877,0.033308,0.050156,-0.041293,0.021071,0.292362,...,0.037042,-0.023410,0.072710,0.107524,-0.027825,0.069943,-0.005669,0.010920,0.009956,0.058846


In [35]:
df_raw_train = pd.concat([X_train.drop(columns=["IPV4_SRC_ADDR","IPV4_DST_ADDR", "h"]), y_train], axis=1)
df_raw_test = pd.concat([X_test.drop(columns=["IPV4_SRC_ADDR","IPV4_DST_ADDR", "h"]), y_test], axis=1)

In [36]:
df_raw_train

Unnamed: 0,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,SERVER_TCP_FLAGS,FLOW_DURATION_MILLISECONDS,...,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Attack,Label
287406,4.732152e-09,0.000000e+00,0.003486,0.000005,0.000317,0.000004,1.505900e-09,1.611550e-09,0.000000e+00,2.530544e-06,...,0.001493,0.002850,2.127056e-08,2.097309e-08,4.396601e-09,4.397286e-09,0.000000,4.240589e-09,Benign,0
610356,6.932495e-10,7.976222e-09,0.000215,0.000002,0.000128,0.000002,9.621527e-10,9.621527e-10,4.168656e-09,4.198509e-07,...,0.000000,0.000000,4.010061e-09,4.009943e-09,1.969528e-08,1.969835e-08,0.000000,1.899640e-08,Benign,0
1456518,6.381741e-07,2.401223e-07,0.060366,0.000354,0.038930,0.000379,2.030844e-07,2.173323e-07,0.000000e+00,7.533152e-03,...,0.183020,0.183020,4.321202e-07,4.840666e-07,5.929219e-07,5.930143e-07,0.000000,5.718822e-07,Benign,0
2204716,1.390953e-08,2.334703e-08,0.000405,0.000006,0.000476,0.000004,4.426392e-09,4.736937e-09,0.000000e+00,1.101955e-06,...,0.002792,0.003191,4.531849e-09,4.766241e-09,1.292322e-08,1.292524e-08,0.000000,1.246465e-08,Benign,0
1090799,4.602189e-09,0.000000e+00,0.000213,0.000003,0.001485,0.000003,1.464542e-09,1.567291e-09,0.000000e+00,1.002648e-06,...,0.001980,0.001320,8.530939e-11,8.530939e-11,4.275853e-09,4.276519e-09,0.000000,4.124125e-09,Benign,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1225204,1.960806e-09,1.118412e-09,0.000157,0.000002,0.000195,0.000002,2.721379e-09,2.721379e-09,1.179074e-08,1.187518e-06,...,0.000000,0.000000,1.134216e-08,1.134183e-08,4.233212e-08,4.713307e-10,0.000071,5.372992e-08,Benign,0
1161575,3.553645e-09,0.000000e+00,0.000195,0.000003,0.001865,0.000003,1.130867e-09,1.210206e-09,0.000000e+00,9.853564e-07,...,0.001936,0.001019,1.647801e-10,1.647801e-10,3.301660e-09,3.302174e-09,0.000000,3.184501e-09,Benign,0
1927342,7.894386e-07,0.000000e+00,0.030020,0.000344,0.067420,0.000375,2.512209e-07,2.688460e-07,0.000000e+00,9.662692e-03,...,0.203761,0.203761,6.386150e-08,6.386150e-08,7.334604e-07,7.335747e-07,0.000000,7.074338e-07,Benign,0
844639,2.088732e-06,0.000000e+00,0.042651,0.000786,0.060978,0.000869,5.627404e-07,5.628146e-07,0.000000e+00,4.223766e-02,...,0.599022,0.539120,1.482757e-07,1.751208e-07,1.940623e-06,1.940925e-06,0.000000,1.339281e-06,Benign,0


In [37]:
raw_benign_train_samples = df_raw_train[df_raw_train.Label == 0].drop(columns=["Label", "Attack"])
raw_normal_train_samples = df_raw_train.drop(columns=["Label", "Attack"])

raw_train_labels = df_raw_train["Label"]
raw_test_labels = df_raw_test["Label"]

raw_test_samples = df_raw_test.drop(columns=["Label", "Attack"])

In [38]:
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]

In [39]:
from pyod.models.cblof import CBLOF
n_est = [2,3,5,7,9,10]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    clf_if = CBLOF(n_clusters=n_est, contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [01:48<00:00,  3.01s/it]

{'n_estimators': 2, 'con': 0.001}
0.9373418473269503
              precision    recall  f1-score   support

           0     1.0000    0.9884    0.9942    129059
           1     0.7865    1.0000    0.8805      5516

    accuracy                         0.9889    134575
   macro avg     0.8933    0.9942    0.9373    134575
weighted avg     0.9913    0.9889    0.9895    134575






In [40]:
n_est = [2,3,5,7,9,10]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    clf_if = CBLOF(n_clusters=n_est, contamination=con)
    clf_if.fit(normal_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [01:28<00:00,  2.46s/it]

{'n_estimators': 2, 'con': 0.04}
0.9937861202229019
              precision    recall  f1-score   support

           0     1.0000    0.9990    0.9995    129059
           1     0.9765    1.0000    0.9881      5516

    accuracy                         0.9990    134575
   macro avg     0.9882    0.9995    0.9938    134575
weighted avg     0.9990    0.9990    0.9990    134575






In [41]:
###  CBLOF RAW

In [42]:
from pyod.models.cblof import CBLOF

n_est = [2,3,5,7,9,10]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    try:
        clf_b = CBLOF(n_clusters=n_est, contamination=con)
        clf_b.fit(raw_benign_train_samples)
    except ValueError as e:
        print(n_est)
        continue  
   
    y_pred = clf_b.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                        "con": con
                }
        bs = test_pred
    del clf_b
    gc.collect()

  

print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

  0%|          | 0/36 [00:00<?, ?it/s]

  3%|▎         | 1/36 [00:00<00:06,  5.15it/s]

2


 11%|█         | 4/36 [00:01<00:09,  3.20it/s]

2
2


 17%|█▋        | 6/36 [00:01<00:07,  4.16it/s]

2
2


 22%|██▏       | 8/36 [00:02<00:05,  4.69it/s]

3
3


 28%|██▊       | 10/36 [00:02<00:04,  5.30it/s]

3
3


 33%|███▎      | 12/36 [00:02<00:04,  5.33it/s]

3
3


100%|██████████| 36/36 [00:13<00:00,  2.74it/s]

{'n_estimators': 9, 'con': 0.04}
0.7471535377464055
              precision    recall  f1-score   support

           0     0.9866    0.9585    0.9724     64531
           1     0.4175    0.6962    0.5220      2758

    accuracy                         0.9477     67289
   macro avg     0.7021    0.8273    0.7472     67289
weighted avg     0.9633    0.9477    0.9539     67289






In [43]:
n_est = [2,3,5,7,9,10]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    try:
        clf_if = CBLOF(n_clusters=n_est, contamination=con)
        clf_if.fit(raw_normal_train_samples)
    except ValueError as e:
        print(n_est)
        continue  
    
    y_pred = clf_if.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est
                }
        bs = test_pred
    del clf_if
    gc.collect()

        

print("benign only")
print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

  6%|▌         | 2/36 [00:00<00:06,  5.46it/s]

2
2


 11%|█         | 4/36 [00:00<00:05,  5.38it/s]

2
2


 17%|█▋        | 6/36 [00:01<00:05,  5.40it/s]

2
2


 22%|██▏       | 8/36 [00:01<00:05,  5.31it/s]

3
3


 25%|██▌       | 9/36 [00:01<00:05,  5.25it/s]

3


 31%|███       | 11/36 [00:02<00:04,  5.13it/s]

3
3


 33%|███▎      | 12/36 [00:02<00:04,  5.18it/s]

3


100%|██████████| 36/36 [00:12<00:00,  2.86it/s]

benign only
{'n_estimators': 10}
0.7467783775952068
              precision    recall  f1-score   support

           0     0.9820    0.9711    0.9765     64531
           1     0.4637    0.5841    0.5170      2758

    accuracy                         0.9553     67289
   macro avg     0.7229    0.7776    0.7468     67289
weighted avg     0.9608    0.9553    0.9577     67289






In [44]:
# HBOS  Embeddings

In [45]:
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]

In [46]:
from pyod.models.hbos import HBOS

n_est = [5,10,15,20,25,30]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    clf_if = HBOS(n_bins=n_est, contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [02:50<00:00,  4.74s/it]

{'n_estimators': 5, 'con': 0.001}
0.9937861202229019
              precision    recall  f1-score   support

           0     1.0000    0.9990    0.9995    129059
           1     0.9765    1.0000    0.9881      5516

    accuracy                         0.9990    134575
   macro avg     0.9882    0.9995    0.9938    134575
weighted avg     0.9990    0.9990    0.9990    134575






In [47]:
n_est = [5,10,15,20,25,30]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    clf_if = HBOS(n_bins=n_est, contamination=con)
    clf_if.fit(normal_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [03:11<00:00,  5.33s/it]

{'n_estimators': 15, 'con': 0.04}
0.9937861202229019
              precision    recall  f1-score   support

           0     1.0000    0.9990    0.9995    129059
           1     0.9765    1.0000    0.9881      5516

    accuracy                         0.9990    134575
   macro avg     0.9882    0.9995    0.9938    134575
weighted avg     0.9990    0.9990    0.9990    134575






In [48]:
##  HBOS  RAw

In [49]:
from pyod.models.cblof import CBLOF

n_est = [5,10,15,20,25,30]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    try:
        clf_b = HBOS(n_bins=n_est, contamination=con)
        clf_b.fit(raw_benign_train_samples)
    except ValueError as e:
        print(n_est)
        continue  
   
    y_pred = clf_b.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                        "con": con
                }
        bs = test_pred
    del clf_b
    gc.collect()

  

print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 36/36 [00:14<00:00,  2.46it/s]

{'n_estimators': 30, 'con': 0.01}
0.8525989286730036
              precision    recall  f1-score   support

           0     0.9875    0.9887    0.9881     64531
           1     0.7282    0.7063    0.7171      2758

    accuracy                         0.9772     67289
   macro avg     0.8578    0.8475    0.8526     67289
weighted avg     0.9768    0.9772    0.9770     67289






In [50]:
n_est = [5,10,15,20,25,30]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    try:
        clf_if = HBOS(n_bins=n_est, contamination=con)
        clf_if.fit(raw_normal_train_samples)
    except ValueError as e:
        print(n_est)
        continue  
    
    y_pred = clf_if.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est
                }
        bs = test_pred
    del clf_if
    gc.collect()

        

print("benign only")
print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 36/36 [00:14<00:00,  2.55it/s]

benign only
{'n_estimators': 5}
0.8078362549428174
              precision    recall  f1-score   support

           0     0.9857    0.9811    0.9834     64531
           1     0.6012    0.6668    0.6323      2758

    accuracy                         0.9682     67289
   macro avg     0.7934    0.8239    0.8078     67289
weighted avg     0.9699    0.9682    0.9690     67289






In [51]:
##  PCA  Emb

In [52]:
from pyod.models.pca import PCA
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = PCA(n_components=n_est, contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

  0%|          | 0/36 [00:00<?, ?it/s]

100%|██████████| 36/36 [01:11<00:00,  1.98s/it]

{'n_estimators': 5, 'con': 0.001}
0.9373418473269503
              precision    recall  f1-score   support

           0     1.0000    0.9884    0.9942    129059
           1     0.7865    1.0000    0.8805      5516

    accuracy                         0.9889    134575
   macro avg     0.8933    0.9942    0.9373    134575
weighted avg     0.9913    0.9889    0.9895    134575






In [53]:
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = PCA(n_components=n_est, contamination=con)
    clf_if.fit(normal_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [01:29<00:00,  2.48s/it]

{'n_estimators': 5, 'con': 0.04}
0.9937861202229019
              precision    recall  f1-score   support

           0     1.0000    0.9990    0.9995    129059
           1     0.9765    1.0000    0.9881      5516

    accuracy                         0.9990    134575
   macro avg     0.9882    0.9995    0.9938    134575
weighted avg     0.9990    0.9990    0.9990    134575






In [54]:
##  PCA  RAw

In [55]:
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = PCA(n_components=n_est, contamination=con)
    clf_if.fit(raw_benign_train_samples)
   
    y_pred = clf_if.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                        "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()

  

print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 36/36 [00:12<00:00,  2.92it/s]

{'n_estimators': 5, 'con': 0.001}
0.9079572980994904
              precision    recall  f1-score   support

           0     0.9881    0.9988    0.9934     64531
           1     0.9621    0.7183    0.8225      2758

    accuracy                         0.9873     67289
   macro avg     0.9751    0.8585    0.9080     67289
weighted avg     0.9870    0.9873    0.9864     67289






In [56]:
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = PCA(n_components=n_est, contamination=con)
    clf_if.fit(raw_normal_train_samples)

    y_pred = clf_if.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est
                }
        bs = test_pred
    del clf_if
    gc.collect()

        

print("benign only")
print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 36/36 [00:12<00:00,  2.88it/s]

benign only
{'n_estimators': 15}
0.7820758463846016
              precision    recall  f1-score   support

           0     0.9823    0.9817    0.9820     64531
           1     0.5784    0.5859    0.5821      2758

    accuracy                         0.9655     67289
   macro avg     0.7803    0.7838    0.7821     67289
weighted avg     0.9657    0.9655    0.9656     67289






In [57]:
##  IF  Emb

In [58]:
from sklearn.ensemble import IsolationForest
n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = IsolationForest(n_estimators=n_est, contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

  0%|          | 0/24 [00:00<?, ?it/s]

100%|██████████| 24/24 [00:53<00:00,  2.21s/it]

{'n_estimators': 150, 'con': 0.001}
0.9937861202229019
              precision    recall  f1-score   support

           0     1.0000    0.9990    0.9995    129059
           1     0.9765    1.0000    0.9881      5516

    accuracy                         0.9990    134575
   macro avg     0.9882    0.9995    0.9938    134575
weighted avg     0.9990    0.9990    0.9990    134575






In [59]:
from sklearn.ensemble import IsolationForest

n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = IsolationForest(n_estimators=n_est, contamination=con)
    clf_if.fit(normal_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 24/24 [00:55<00:00,  2.32s/it]

{'n_estimators': 150, 'con': 0.04}
0.9937861202229019
              precision    recall  f1-score   support

           0     1.0000    0.9990    0.9995    129059
           1     0.9765    1.0000    0.9881      5516

    accuracy                         0.9990    134575
   macro avg     0.9882    0.9995    0.9938    134575
weighted avg     0.9990    0.9990    0.9990    134575






In [60]:
##  IF  Raw

In [61]:
from sklearn.ensemble import IsolationForest

n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = IsolationForest(n_estimators=n_est, contamination=con)
    clf_if.fit(raw_benign_train_samples.to_numpy())
   
    y_pred = clf_if.predict(raw_test_samples.to_numpy())
    test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                        "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()

  

print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

  0%|          | 0/24 [00:00<?, ?it/s]

100%|██████████| 24/24 [00:16<00:00,  1.41it/s]

{'n_estimators': 150, 'con': 0.01}
0.879820091825837
              precision    recall  f1-score   support

           0     0.9910    0.9888    0.9899     64531
           1     0.7508    0.7897    0.7697      2758

    accuracy                         0.9806     67289
   macro avg     0.8709    0.8892    0.8798     67289
weighted avg     0.9811    0.9806    0.9809     67289






In [62]:
n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = IsolationForest(n_estimators=n_est, contamination=con)
    clf_if.fit(raw_normal_train_samples.to_numpy())

    y_pred = clf_if.predict(raw_test_samples.to_numpy())
    test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est
                }
        bs = test_pred
    del clf_if
    gc.collect()

        

print("benign only")
print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 24/24 [00:16<00:00,  1.43it/s]

benign only
{'n_estimators': 50}
0.8354279839346712
              precision    recall  f1-score   support

           0     0.9900    0.9798    0.9849     64531
           1     0.6193    0.7687    0.6860      2758

    accuracy                         0.9712     67289
   macro avg     0.8047    0.8742    0.8354     67289
weighted avg     0.9748    0.9712    0.9726     67289




