In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn import preprocessing
from dgl.data import DGLDataset
import dgl
import time
import networkx as nx
import category_encoders as ce
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
import torch
import tqdm
import math

from typing import *
from sklearn.preprocessing import StandardScaler, Normalizer
import socket
import struct
import random
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
file_name = "NF-CICIDS2018-v3.parquet"
data = pd.read_parquet(file_name)

In [3]:
data.Label.value_counts()

Label
0    17514626
1     2600903
Name: count, dtype: int64

In [4]:
data.rename(columns=lambda x: x.strip(), inplace=True)
data['IPV4_SRC_ADDR'] = data["IPV4_SRC_ADDR"].apply(str)
data['L4_SRC_PORT'] = data["L4_SRC_PORT"].apply(str)
data['IPV4_DST_ADDR'] = data["IPV4_DST_ADDR"].apply(str)
data['L4_DST_PORT'] = data["L4_DST_PORT"].apply(str)

In [5]:
data.drop(columns=["L4_SRC_PORT", "L4_DST_PORT"], inplace=True)

In [6]:
data.Attack.unique()

array(['Benign', 'FTP-BruteForce', 'SSH-Bruteforce',
       'DoS_attacks-GoldenEye', 'DoS_attacks-Slowloris',
       'DoS_attacks-SlowHTTPTest', 'DoS_attacks-Hulk',
       'DDoS_attacks-LOIC-HTTP', 'DDOS_attack-LOIC-UDP',
       'DDOS_attack-HOIC', 'Brute_Force_-Web', 'Brute_Force_-XSS',
       'SQL_Injection', 'Infilteration', 'Bot'], dtype=object)

In [7]:
data = data.groupby(by='Attack').sample(frac=0.1, random_state=13)

In [8]:
data.groupby(by="Attack").count()

Unnamed: 0_level_0,FLOW_START_MILLISECONDS,FLOW_END_MILLISECONDS,IPV4_SRC_ADDR,IPV4_DST_ADDR,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,...,FTP_COMMAND_RET_CODE,SRC_TO_DST_IAT_MIN,SRC_TO_DST_IAT_MAX,SRC_TO_DST_IAT_AVG,SRC_TO_DST_IAT_STDDEV,DST_TO_SRC_IAT_MIN,DST_TO_SRC_IAT_MAX,DST_TO_SRC_IAT_AVG,DST_TO_SRC_IAT_STDDEV,Label
Attack,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Benign,1751463,1751463,1751463,1751463,1751463,1751463,1751463,1751463,1751463,1751463,...,1751463,1751463,1751463,1751463,1751463,1751463,1751463,1751463,1751463,1751463
Bot,20770,20770,20770,20770,20770,20770,20770,20770,20770,20770,...,20770,20770,20770,20770,20770,20770,20770,20770,20770,20770
Brute_Force_-Web,162,162,162,162,162,162,162,162,162,162,...,162,162,162,162,162,162,162,162,162,162
Brute_Force_-XSS,48,48,48,48,48,48,48,48,48,48,...,48,48,48,48,48,48,48,48,48,48
DDOS_attack-HOIC,103231,103231,103231,103231,103231,103231,103231,103231,103231,103231,...,103231,103231,103231,103231,103231,103231,103231,103231,103231,103231
DDOS_attack-LOIC-UDP,345,345,345,345,345,345,345,345,345,345,...,345,345,345,345,345,345,345,345,345,345
DDoS_attacks-LOIC-HTTP,28859,28859,28859,28859,28859,28859,28859,28859,28859,28859,...,28859,28859,28859,28859,28859,28859,28859,28859,28859,28859
DoS_attacks-GoldenEye,6130,6130,6130,6130,6130,6130,6130,6130,6130,6130,...,6130,6130,6130,6130,6130,6130,6130,6130,6130,6130
DoS_attacks-Hulk,10008,10008,10008,10008,10008,10008,10008,10008,10008,10008,...,10008,10008,10008,10008,10008,10008,10008,10008,10008,10008
DoS_attacks-SlowHTTPTest,10555,10555,10555,10555,10555,10555,10555,10555,10555,10555,...,10555,10555,10555,10555,10555,10555,10555,10555,10555,10555


In [None]:
# X = data.drop(columns=["Attack", "Label"])
X = data.drop(columns=["Attack", "Label", "FLOW_START_MILLISECONDS", "FLOW_END_MILLISECONDS",
                       "SRC_TO_DST_IAT_MIN", "SRC_TO_DST_IAT_MAX", "SRC_TO_DST_IAT_AVG",
                       "SRC_TO_DST_IAT_STDDEV", "DST_TO_SRC_IAT_MIN", "DST_TO_SRC_IAT_MAX",
                       "DST_TO_SRC_IAT_AVG", "DST_TO_SRC_IAT_STDDEV"])
y = data[["Attack", "Label"]]

X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=13, stratify=y)

In [10]:
encoder = ce.TargetEncoder(cols=['TCP_FLAGS','L7_PROTO','PROTOCOL',
                                  'CLIENT_TCP_FLAGS','SERVER_TCP_FLAGS','ICMP_TYPE',
                                  'ICMP_IPV4_TYPE','DNS_QUERY_ID','DNS_QUERY_TYPE',
                                  'FTP_COMMAND_RET_CODE'])
encoder.fit(X_train, y_train.Label)

# Transform on training set
X_train = encoder.transform(X_train)

# Transform on testing set
X_test = encoder.transform(X_test)

In [11]:
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

In [12]:
scaler = Normalizer()
cols_to_norm = list(set(list(X_train.iloc[:, 2:].columns))) # Ignore first two as the represents IP addresses
scaler.fit(X_train[cols_to_norm])

# Transform on training set
X_train[cols_to_norm] = scaler.transform(X_train[cols_to_norm])
X_train['h'] = X_train.iloc[:, 2:].values.tolist()

# Transform on testing set
X_test[cols_to_norm] = scaler.transform(X_test[cols_to_norm])
X_test['h'] = X_test.iloc[:, 2:].values.tolist()

train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [13]:
X_train.head()

Unnamed: 0,IPV4_SRC_ADDR,IPV4_DST_ADDR,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,NUM_PKTS_1024_TO_1514_BYTES,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,h
7702671,172.31.67.54,104.16.228.152,2.36428e-06,6.608158e-06,0.008995,0.000117116,0.011969,0.0001054044,3.209025e-06,2.837709e-06,...,0.0,0.095941,0.341979,1.454227e-07,1.454227e-07,2.307945e-06,2.307945e-06,0.0,1.514288e-06,"[2.364279935057952e-06, 6.6081576426327454e-06..."
1769084,172.31.69.19,172.31.0.2,2.048333e-08,1.993613e-08,7.6e-05,1.413897e-06,9.9e-05,1.413897e-06,2.037054e-08,2.037054e-08,...,0.0,0.0,0.0,1.903185e-07,1.903185e-07,1.221542e-07,1.972363e-08,8.5e-05,1.828142e-07,"[2.0483332776817753e-08, 1.9936131278807104e-0..."
633280,172.31.65.114,169.254.169.254,3.181417e-08,8.892055e-08,5.9e-05,7.879671e-07,0.00011,7.879671e-07,7.013197e-08,6.823303e-08,...,0.0,0.001291,0.002824,2.121297e-08,2.121297e-08,3.105612e-08,3.105612e-08,0.0,2.037653e-08,"[3.1814172215512033e-08, 8.892054707761035e-08..."
11883761,172.31.65.65,172.31.0.2,7.12266e-09,6.932382e-09,5.1e-05,4.916537e-07,0.000114,4.916537e-07,7.08344e-09,7.08344e-09,...,0.0,0.0,0.0,6.617938e-08,6.617938e-08,5.099475e-08,6.858488e-09,2.9e-05,6.35699e-08,"[7.122660159886725e-09, 6.932382027330003e-09,..."
19281635,172.31.64.95,172.31.0.2,1.111836e-08,1.082134e-08,5.2e-05,7.674635e-07,0.000114,7.674635e-07,1.105714e-08,1.105714e-08,...,0.0,0.0,0.0,1.033049e-07,1.033049e-07,7.054865e-08,1.070599e-08,4.6e-05,9.923159e-08,"[1.1118357717184076e-08, 1.0821336618881802e-0..."


In [14]:
lab_enc = preprocessing.LabelEncoder()
lab_enc.fit(data["Attack"])

# Transform on training set
train["Attack"] = lab_enc.transform(train["Attack"])

# Transform on testing set
test["Attack"] = lab_enc.transform(test["Attack"])

In [15]:
# Training graph
train_g = nx.from_pandas_edgelist(train, "IPV4_SRC_ADDR", "IPV4_DST_ADDR",
           ["h", "Label", "Attack"], create_using=nx.MultiGraph())
train_g = train_g.to_directed()
train_g = dgl.from_networkx(train_g, edge_attrs=['h', 'Attack', 'Label'])
nfeat_weight = torch.ones([train_g.number_of_nodes(),
train_g.edata['h'].shape[1]])
train_g.ndata['h'] = nfeat_weight

# Testing graph
test_g = nx.from_pandas_edgelist(test, "IPV4_SRC_ADDR", "IPV4_DST_ADDR",
            ["h", "Label", "Attack"], create_using=nx.MultiGraph())

test_g = test_g.to_directed()
test_g = dgl.from_networkx(test_g, edge_attrs=['h', 'Attack', 'Label'])
nfeat_weight = torch.ones([test_g.number_of_nodes(),
test_g.edata['h'].shape[1]])
test_g.ndata['h'] = nfeat_weight

In [16]:
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
import tqdm
import gc

class SAGELayer(nn.Module):
    def __init__(self, ndim_in, edims, ndim_out, activation):
      super(SAGELayer, self).__init__()
      self.W_apply = nn.Linear(ndim_in + edims , ndim_out)
      self.activation = F.relu
      self.W_edge = nn.Linear(128 * 2, 256)
      self.reset_parameters()

    def reset_parameters(self):
      gain = nn.init.calculate_gain('relu')
      nn.init.xavier_uniform_(self.W_apply.weight, gain=gain)

    def message_func(self, edges):
      return {'m':  edges.data['h']}

    def forward(self, g_dgl, nfeats, efeats):
      with g_dgl.local_scope():
        g = g_dgl
        g.ndata['h'] = nfeats
        g.edata['h'] = efeats
        g.update_all(self.message_func, fn.mean('m', 'h_neigh'))
        g.ndata['h'] = F.relu(self.W_apply(torch.cat([g.ndata['h'], g.ndata['h_neigh']], 2)))

        # Compute edge embeddings
        u, v = g.edges()
        edge = self.W_edge(torch.cat((g.srcdata['h'][u], g.dstdata['h'][v]), 2))
        return g.ndata['h'], edge

In [17]:
class SAGE(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim,  activation):
      super(SAGE, self).__init__()
      self.layers = nn.ModuleList()
      self.layers.append(SAGELayer(ndim_in, edim, 128, F.relu))

    def forward(self, g, nfeats, efeats, corrupt=False):
      if corrupt:
        e_perm = torch.randperm(g.number_of_edges())
        #n_perm = torch.randperm(g.number_of_nodes())
        efeats = efeats[e_perm]
        #nfeats = nfeats[n_perm]
      for i, layer in enumerate(self.layers):
        #nfeats = layer(g, nfeats, efeats)
        nfeats, e_feats = layer(g, nfeats, efeats)
      #return nfeats.sum(1)
      return nfeats.sum(1), e_feats.sum(1)

In [18]:
class Discriminator(nn.Module):
    def __init__(self, n_hidden):
      super(Discriminator, self).__init__()
      self.weight = nn.Parameter(torch.Tensor(n_hidden, n_hidden))
      self.reset_parameters()

    def uniform(self, size, tensor):
      bound = 1.0 / math.sqrt(size)
      if tensor is not None:
        tensor.data.uniform_(-bound, bound)

    def reset_parameters(self):
      size = self.weight.size(0)
      self.uniform(size, self.weight)

    def forward(self, features, summary):
      features = torch.matmul(features, torch.matmul(self.weight, summary))
      return features

In [19]:
class DGI(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim, activation):
      super(DGI, self).__init__()
      self.encoder = SAGE(ndim_in, ndim_out, edim,  F.relu)
      #self.discriminator = Discriminator(128)
      self.discriminator = Discriminator(256)
      self.loss = nn.BCEWithLogitsLoss()

    def forward(self, g, n_features, e_features):
      positive = self.encoder(g, n_features, e_features, corrupt=False)
      negative = self.encoder(g, n_features, e_features, corrupt=True)
      self.loss = nn.BCEWithLogitsLoss()

    def forward(self, g, n_features, e_features):
      positive = self.encoder(g, n_features, e_features, corrupt=False)
      negative = self.encoder(g, n_features, e_features, corrupt=True)

      positive = positive[1]
      negative = negative[1]

      summary = torch.sigmoid(positive.mean(dim=0))

      positive = self.discriminator(positive, summary)
      negative = self.discriminator(negative, summary)

      l1 = self.loss(positive, torch.ones_like(positive))
      l2 = self.loss(negative, torch.zeros_like(negative))

      return l1 + l2

In [20]:
ndim_in = train_g.ndata['h'].shape[1]
hidden_features = 128
ndim_out = 128
num_layers = 1
edim = train_g.edata['h'].shape[1]
learning_rate = 1e-3
epochs = 4000

In [21]:
dgi = DGI(ndim_in,
    ndim_out,
    edim,
    F.relu)

# dgi = dgi.to('cuda')

dgi_optimizer = torch.optim.Adam(dgi.parameters(),
                lr=1e-3,
                weight_decay=0.)

In [22]:
# Format node and edge features for E-GraphSAGE
train_g.ndata['h'] = torch.reshape(train_g.ndata['h'],
                                   (train_g.ndata['h'].shape[0], 1,
                                    train_g.ndata['h'].shape[1]))

train_g.edata['h'] = torch.reshape(train_g.edata['h'],
                                   (train_g.edata['h'].shape[0], 1,
                                    train_g.edata['h'].shape[1]))

In [23]:
# Convert to GPU
# train_g = train_g.to('cuda')

In [24]:
# cnt_wait = 0
# best = 1e9
# best_t = 0
# dur = []
# node_features = train_g.ndata['h'] 
# edge_features = train_g.edata['h']
# torch.cuda.empty_cache()

# for epoch in range(epochs):
#     dgi.train()
#     if epoch >= 3:
#         t0 = time.time()

#     dgi_optimizer.zero_grad()
#     loss = dgi(train_g, node_features, edge_features)
#     loss.backward()
#     dgi_optimizer.step()

#     if loss < best:
#         best = loss
#         best_t = epoch
#         cnt_wait = 0
#         torch.save(dgi.state_dict(), 'best_dgi_CSE_v3.pkl')
#     else:
#         cnt_wait += 1

#   # if cnt_wait == patience:
#   #     print('Early stopping!')
#   #     break

#     if epoch >= 3:
#         dur.append(time.time() - t0)

#     if epoch % 50 == 0:

#         print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | "
#             "ETputs(KTEPS) {:.2f}".format(epoch, np.mean(dur),
#               loss.item(),
#               train_g.num_edges() / np.mean(dur) / 1000))

In [25]:
dgi.load_state_dict(torch.load('best_dgi_CSE_v3.pkl'))

<All keys matched successfully>

In [26]:
training_emb = dgi.encoder(train_g, train_g.ndata['h'], train_g.edata['h'])[1]
training_emb = training_emb.detach().cpu().numpy()

In [27]:
test_g.ndata['h'] = torch.reshape(test_g.ndata['h'],
                                   (test_g.ndata['h'].shape[0], 1,
                                    test_g.ndata['h'].shape[1]))



test_g.edata['h'] = torch.reshape(test_g.edata['h'],
                                   (test_g.edata['h'].shape[0], 1,
                                    test_g.edata['h'].shape[1]))

In [28]:
# Convert to GPU
# test_g = test_g.to('cuda')

In [29]:
testing_emb = dgi.encoder(test_g, test_g.ndata['h'], test_g.edata['h'])[1]
testing_emb = testing_emb.detach().cpu().numpy()

In [30]:
df_train = pd.DataFrame(training_emb, )
df_train["Attack"] = lab_enc.inverse_transform(
        train_g.edata['Attack'].detach().cpu().numpy())
df_train["Label"] = train_g.edata['Label'].detach().cpu().numpy()

df_test = pd.DataFrame(testing_emb, )
df_test["Attack"] = lab_enc.inverse_transform(
        test_g.edata['Attack'].detach().cpu().numpy())
df_test["Label"] = test_g.edata['Label'].detach().cpu().numpy()

In [31]:
df_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,248,249,250,251,252,253,254,255,Attack,Label
0,0.007531,-0.020259,0.018783,0.005637,0.089053,0.017806,0.003693,-0.089690,0.035084,-0.094813,...,-0.038150,-0.051478,0.051154,0.037042,0.042434,-0.036311,-0.084742,0.015793,Benign,0
1,-0.040532,-0.044652,-0.006737,0.038503,0.029205,0.026323,0.019244,-0.028130,-0.005788,-0.111093,...,-0.074950,-0.113045,0.035995,0.052629,0.045247,-0.012881,-0.073286,0.037069,Benign,0
2,-0.016097,-0.071244,0.030629,-0.005677,0.039785,-0.013770,-0.026703,-0.048946,0.016268,-0.120974,...,-0.092802,-0.083286,0.045315,0.092022,0.058614,0.025269,-0.057444,0.001577,Benign,0
3,-0.016097,-0.071244,0.030629,-0.005677,0.039785,-0.013770,-0.026703,-0.048946,0.016268,-0.120974,...,-0.092802,-0.083286,0.045315,0.092022,0.058614,0.025269,-0.057444,0.001577,Benign,0
4,-0.016097,-0.071244,0.030629,-0.005677,0.039785,-0.013770,-0.026703,-0.048946,0.016268,-0.120974,...,-0.092802,-0.083286,0.045315,0.092022,0.058614,0.025269,-0.057444,0.001577,Benign,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2808605,-0.023390,0.025019,-0.015358,-0.002529,0.043663,0.059929,-0.018683,-0.113715,-0.017906,-0.104506,...,-0.115651,-0.061298,0.097790,0.125343,0.013122,-0.046006,-0.064494,0.054646,Benign,0
2808606,-0.050940,0.011565,0.020340,0.069332,0.069402,0.033980,-0.032159,-0.052939,-0.000830,-0.090769,...,-0.057446,-0.176387,0.097421,0.117917,0.093590,-0.004097,-0.053200,0.033025,Benign,0
2808607,-0.052188,0.009527,0.022788,0.064026,0.061622,0.032787,-0.032414,-0.050790,-0.002836,-0.093635,...,-0.052003,-0.175904,0.098811,0.119071,0.099080,0.005192,-0.048313,0.032309,Benign,0
2808608,-0.034494,0.037791,-0.022699,0.007418,0.038260,0.064484,-0.014790,-0.109017,-0.019191,-0.092700,...,-0.125980,-0.073225,0.116176,0.120585,0.009422,-0.068470,-0.071679,0.061564,Benign,0


# Embeddings CBLOF  Embeddings

In [32]:
import torch
import dgl
import numpy as np
import pandas as pd
import torch.optim as optim
import time
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.ensemble import IsolationForest
import gc

from tqdm import tqdm
import itertools

In [33]:
benign_train_samples = df_train[df_train.Label == 0].drop(columns=["Label", "Attack"])
normal_train_samples = df_train.drop(columns=["Label", "Attack"])

train_labels = df_train["Label"]
test_labels = df_test["Label"]

test_samples = df_test.drop(columns=["Label", "Attack"])

In [34]:
test_samples

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
0,-0.027911,-0.019412,0.016638,0.052916,0.105581,0.036157,-0.012026,-0.084199,0.031912,-0.083274,...,-0.012909,-0.031004,-0.050105,-0.051015,0.051873,0.056758,0.015649,-0.068981,-0.115814,0.032739
1,-0.027911,-0.019412,0.016638,0.052916,0.105581,0.036157,-0.012026,-0.084199,0.031912,-0.083274,...,-0.012909,-0.031004,-0.050105,-0.051015,0.051873,0.056758,0.015649,-0.068981,-0.115814,0.032739
2,-0.057264,-0.040439,-0.014924,0.046776,0.029506,0.011349,0.021992,0.013301,0.025913,-0.139302,...,-0.027610,-0.069913,-0.115737,-0.096843,0.048649,0.061553,0.052261,-0.019892,-0.068267,0.032932
3,-0.017759,-0.032197,0.034722,0.004706,0.047751,0.029255,0.000446,-0.074925,0.006789,-0.103674,...,0.000816,-0.002345,-0.040882,-0.076522,0.057519,0.068598,0.055958,-0.007944,-0.082713,0.023024
4,-0.017759,-0.032197,0.034722,0.004706,0.047751,0.029255,0.000446,-0.074925,0.006789,-0.103674,...,0.000816,-0.002345,-0.040882,-0.076522,0.057519,0.068598,0.055958,-0.007944,-0.082713,0.023024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1203630,-0.029910,0.025817,-0.016431,0.002595,0.041463,0.062697,-0.023663,-0.107048,-0.023853,-0.107478,...,0.032124,-0.039382,-0.115163,-0.070650,0.092998,0.122290,0.017345,-0.031408,-0.059937,0.061405
1203631,-0.031719,-0.003941,0.032492,0.039005,0.060259,0.031722,-0.022036,-0.062805,0.014916,-0.083344,...,-0.036074,0.018172,-0.050201,-0.148952,0.095564,0.119883,0.075159,-0.006409,-0.062527,0.022829
1203632,-0.045501,0.034528,-0.037564,0.042303,0.056692,0.056418,-0.013252,-0.103189,-0.025277,-0.076668,...,-0.015406,-0.083250,-0.147768,-0.074784,0.125180,0.110010,0.015077,-0.084775,-0.078181,0.045045
1203633,-0.046385,0.007020,0.019926,0.065604,0.068039,0.028297,-0.036458,-0.048222,0.000296,-0.097017,...,-0.048906,0.014594,-0.062495,-0.177318,0.092364,0.119836,0.095607,0.004322,-0.047522,0.030102


In [35]:
df_raw_train = pd.concat([X_train.drop(columns=["IPV4_SRC_ADDR","IPV4_DST_ADDR", "h"]), y_train], axis=1)
df_raw_test = pd.concat([X_test.drop(columns=["IPV4_SRC_ADDR","IPV4_DST_ADDR", "h"]), y_test], axis=1)

In [36]:
df_raw_train

Unnamed: 0,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,SERVER_TCP_FLAGS,FLOW_DURATION_MILLISECONDS,...,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Attack,Label
7702671,2.364280e-06,6.608158e-06,0.008995,1.171160e-04,0.011969,1.054044e-04,3.209025e-06,2.837709e-06,5.959448e-06,5.397292e-01,...,0.095941,0.341979,1.454227e-07,1.454227e-07,2.307945e-06,2.307945e-06,0.000000,1.514288e-06,Benign,0
1769084,2.048333e-08,1.993613e-08,0.000076,1.413897e-06,0.000099,1.413897e-06,2.037054e-08,2.037054e-08,2.002452e-08,1.413897e-06,...,0.000000,0.000000,1.903185e-07,1.903185e-07,1.221542e-07,1.972363e-08,0.000085,1.828142e-07,Benign,0
633280,3.181417e-08,8.892055e-08,0.000059,7.879671e-07,0.000110,7.879671e-07,7.013197e-08,6.823303e-08,1.883408e-09,1.575934e-07,...,0.001291,0.002824,2.121297e-08,2.121297e-08,3.105612e-08,3.105612e-08,0.000000,2.037653e-08,Benign,0
11883761,7.122660e-09,6.932382e-09,0.000051,4.916537e-07,0.000114,4.916537e-07,7.083440e-09,7.083440e-09,6.963118e-09,4.916537e-07,...,0.000000,0.000000,6.617938e-08,6.617938e-08,5.099475e-08,6.858488e-09,0.000029,6.356990e-08,Benign,0
19281635,1.111836e-08,1.082134e-08,0.000052,7.674635e-07,0.000114,7.674635e-07,1.105714e-08,1.105714e-08,1.086932e-08,7.674635e-07,...,0.000000,0.000000,1.033049e-07,1.033049e-07,7.054865e-08,1.070599e-08,0.000046,9.923159e-08,Benign,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14190676,3.087347e-06,5.607146e-08,0.022573,1.223469e-04,0.028644,1.070535e-04,2.745137e-07,8.082226e-07,1.841944e-07,3.029614e-02,...,0.125283,0.978775,2.058573e-06,2.058573e-06,3.013783e-06,3.013783e-06,0.000000,1.977402e-06,Benign,0
1099287,3.108400e-08,3.025361e-08,0.000159,2.145626e-06,0.000193,2.145626e-06,3.091284e-08,3.091284e-08,3.038775e-08,2.145626e-06,...,0.000000,0.000000,2.888134e-07,2.888134e-07,2.081412e-07,2.993113e-08,0.000129,2.774254e-07,Benign,0
16216474,3.101477e-06,5.632809e-08,0.029559,1.997236e-04,0.011123,1.229068e-04,6.836976e-06,6.651853e-06,6.568899e-08,6.457218e-02,...,0.125857,0.983255,2.067995e-06,2.067995e-06,3.027577e-06,3.027577e-06,0.000000,1.986453e-06,Benign,0
11087976,3.477073e-07,3.384185e-07,0.001632,2.400109e-05,0.003096,2.400109e-05,3.457927e-07,3.457927e-07,3.399189e-07,6.480294e-04,...,0.000000,0.000000,3.230682e-06,3.230682e-06,2.438678e-06,3.375498e-07,0.001440,3.103295e-06,Benign,0


In [37]:
raw_benign_train_samples = df_raw_train[df_raw_train.Label == 0].drop(columns=["Label", "Attack"])
raw_normal_train_samples = df_raw_train.drop(columns=["Label", "Attack"])

raw_train_labels = df_raw_train["Label"]
raw_test_labels = df_raw_test["Label"]

raw_test_samples = df_raw_test.drop(columns=["Label", "Attack"])

In [38]:
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]

In [None]:
from pyod.models.cblof import CBLOF
n_est = [2,3,5,7,9,10]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    clf_if = CBLOF(n_clusters=n_est, contamination=con)
    try:
        clf_if.fit(benign_train_samples)
    except Exception as e:
        continue
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [10:01<00:00, 16.71s/it]


{'n_estimators': 10, 'con': 0.2}
0.519640481510056
              precision    recall  f1-score   support

           0     0.8795    0.7968    0.8361   1047581
           1     0.1638    0.2672    0.2031    156054

    accuracy                         0.7282   1203635
   macro avg     0.5217    0.5320    0.5196   1203635
weighted avg     0.7867    0.7282    0.7541   1203635



In [None]:
n_est = [2,3,5,7,9,10]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    clf_if = CBLOF(n_clusters=n_est, contamination=con)
    try:
        clf_if.fit(normal_train_samples)
    except Exception as e:
        continue
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [12:10<00:00, 20.29s/it]


{'n_estimators': 7, 'con': 0.04}
0.48627928295331835
              precision    recall  f1-score   support

           0     0.8701    0.9573    0.9116   1047581
           1     0.1237    0.0404    0.0609    156054

    accuracy                         0.8384   1203635
   macro avg     0.4969    0.4989    0.4863   1203635
weighted avg     0.7733    0.8384    0.8013   1203635



In [46]:
###  CBLOF RAW

In [47]:
from pyod.models.cblof import CBLOF

n_est = [2,3,5,7,9,10]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    try:
        clf_b = CBLOF(n_clusters=n_est, contamination=con)
        clf_b.fit(raw_benign_train_samples)
    except ValueError as e:
        print(n_est)
        continue  
   
    y_pred = clf_b.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                        "con": con
                }
        bs = test_pred
    del clf_b
    gc.collect()

  

print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

  3%|▎         | 1/36 [00:04<02:29,  4.29s/it]

2


  6%|▌         | 2/36 [00:04<01:12,  2.14s/it]

2


  8%|▊         | 3/36 [00:05<00:48,  1.47s/it]

2


 11%|█         | 4/36 [00:06<00:37,  1.16s/it]

2


 14%|█▍        | 5/36 [00:06<00:30,  1.02it/s]

2


 19%|█▉        | 7/36 [00:11<00:41,  1.44s/it]

3


 22%|██▏       | 8/36 [00:11<00:34,  1.23s/it]

3


 25%|██▌       | 9/36 [00:12<00:28,  1.07s/it]

3


 28%|██▊       | 10/36 [00:13<00:25,  1.04it/s]

3


 31%|███       | 11/36 [00:13<00:22,  1.11it/s]

3


 33%|███▎      | 12/36 [00:14<00:20,  1.18it/s]

3


 50%|█████     | 18/36 [00:27<00:31,  1.73s/it]

5


100%|██████████| 36/36 [01:19<00:00,  2.20s/it]


{'n_estimators': 10, 'con': 0.2}
0.6536850237040513
              precision    recall  f1-score   support

           0     0.9421    0.7990    0.8646    525439
           1     0.3308    0.6692    0.4427     78027

    accuracy                         0.7822    603466
   macro avg     0.6364    0.7341    0.6537    603466
weighted avg     0.8630    0.7822    0.8101    603466



In [48]:
n_est = [2,3,5,7,9,10]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    try:
        clf_if = CBLOF(n_clusters=n_est, contamination=con)
        clf_if.fit(raw_normal_train_samples)
    except ValueError as e:
        print(n_est)
        continue  
    
    y_pred = clf_if.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est
                }
        bs = test_pred
    del clf_if
    gc.collect()

        

print("benign only")
print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

  3%|▎         | 1/36 [00:04<02:43,  4.68s/it]

2


  6%|▌         | 2/36 [00:05<01:21,  2.39s/it]

2


  8%|▊         | 3/36 [00:06<00:54,  1.65s/it]

2


 11%|█         | 4/36 [00:06<00:41,  1.29s/it]

2


 14%|█▍        | 5/36 [00:07<00:33,  1.07s/it]

2


 17%|█▋        | 6/36 [00:08<00:28,  1.05it/s]

2


 19%|█▉        | 7/36 [00:09<00:26,  1.10it/s]

3


 22%|██▏       | 8/36 [00:10<00:24,  1.12it/s]

3


 25%|██▌       | 9/36 [00:10<00:23,  1.15it/s]

3


 28%|██▊       | 10/36 [00:11<00:21,  1.18it/s]

3


 31%|███       | 11/36 [00:12<00:20,  1.19it/s]

3


 33%|███▎      | 12/36 [00:13<00:19,  1.21it/s]

3


100%|██████████| 36/36 [01:23<00:00,  2.33s/it]


benign only
{'n_estimators': 7}
0.6866026039087414
              precision    recall  f1-score   support

           0     0.9366    0.8599    0.8966    525439
           1     0.3919    0.6079    0.4766     78027

    accuracy                         0.8273    603466
   macro avg     0.6642    0.7339    0.6866    603466
weighted avg     0.8662    0.8273    0.8423    603466



In [49]:
# HBOS  Embeddings

In [50]:
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]

In [51]:
from pyod.models.hbos import HBOS

n_est = [5,10,15,20,25,30]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    clf_if = HBOS(n_bins=n_est, contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [20:44<00:00, 34.58s/it]


{'n_estimators': 15, 'con': 0.04}
0.4867579660896399
              precision    recall  f1-score   support

           0     0.8702    0.9579    0.9119   1047581
           1     0.1260    0.0408    0.0616    156054

    accuracy                         0.8390   1203635
   macro avg     0.4981    0.4993    0.4868   1203635
weighted avg     0.7737    0.8390    0.8017   1203635



In [52]:
n_est = [5,10,15,20,25,30]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    clf_if = HBOS(n_bins=n_est, contamination=con)
    clf_if.fit(normal_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [23:39<00:00, 39.43s/it]


{'n_estimators': 25, 'con': 0.04}
0.4840943070129387
              precision    recall  f1-score   support

           0     0.8694    0.9510    0.9084   1047581
           1     0.1107    0.0410    0.0598    156054

    accuracy                         0.8330   1203635
   macro avg     0.4901    0.4960    0.4841   1203635
weighted avg     0.7710    0.8330    0.7983   1203635



In [53]:
##  HBOS  RAw

In [54]:
from pyod.models.cblof import CBLOF

n_est = [5,10,15,20,25,30]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    try:
        clf_b = HBOS(n_bins=n_est, contamination=con)
        clf_b.fit(raw_benign_train_samples)
    except ValueError as e:
        print(n_est)
        continue  
   
    y_pred = clf_b.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                        "con": con
                }
        bs = test_pred
    del clf_b
    gc.collect()

  

print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 36/36 [01:28<00:00,  2.47s/it]


{'n_estimators': 10, 'con': 0.2}
0.5893718087441495
              precision    recall  f1-score   support

           0     0.9074    0.8071    0.8543    525439
           1     0.2552    0.4452    0.3244     78027

    accuracy                         0.7603    603466
   macro avg     0.5813    0.6261    0.5894    603466
weighted avg     0.8230    0.7603    0.7858    603466



In [55]:
n_est = [5,10,15,20,25,30]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    try:
        clf_if = HBOS(n_bins=n_est, contamination=con)
        clf_if.fit(raw_normal_train_samples)
    except ValueError as e:
        print(n_est)
        continue  
    
    y_pred = clf_if.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est
                }
        bs = test_pred
    del clf_if
    gc.collect()

        

print("benign only")
print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 36/36 [01:27<00:00,  2.42s/it]


benign only
{'n_estimators': 10}
0.5698714357716641
              precision    recall  f1-score   support

           0     0.8962    0.8240    0.8586    525439
           1     0.2317    0.3574    0.2811     78027

    accuracy                         0.7637    603466
   macro avg     0.5640    0.5907    0.5699    603466
weighted avg     0.8103    0.7637    0.7839    603466



In [56]:
##  PCA  Emb

In [57]:
from pyod.models.pca import PCA
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = PCA(n_components=n_est, contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [07:29<00:00, 12.49s/it]


{'n_estimators': 5, 'con': 0.05}
0.4821948079252052
              precision    recall  f1-score   support

           0     0.8689    0.9469    0.9062   1047581
           1     0.1023    0.0407    0.0582    156054

    accuracy                         0.8294   1203635
   macro avg     0.4856    0.4938    0.4822   1203635
weighted avg     0.7695    0.8294    0.7962   1203635



In [58]:
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = PCA(n_components=n_est, contamination=con)
    clf_if.fit(normal_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [08:18<00:00, 13.86s/it]


{'n_estimators': 30, 'con': 0.05}
0.4817735478588872
              precision    recall  f1-score   support

           0     0.8687    0.9459    0.9057   1047581
           1     0.1006    0.0406    0.0579    156054

    accuracy                         0.8285   1203635
   macro avg     0.4847    0.4933    0.4818   1203635
weighted avg     0.7691    0.8285    0.7957   1203635



In [59]:
##  PCA  RAw

In [60]:
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = PCA(n_components=n_est, contamination=con)
    clf_if.fit(raw_benign_train_samples)
   
    y_pred = clf_if.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                        "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()

  

print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 36/36 [01:01<00:00,  1.72s/it]


{'n_estimators': 20, 'con': 0.1}
0.51033412405322
              precision    recall  f1-score   support

           0     0.8733    0.9000    0.8864    525439
           1     0.1517    0.1204    0.1342     78027

    accuracy                         0.7992    603466
   macro avg     0.5125    0.5102    0.5103    603466
weighted avg     0.7800    0.7992    0.7892    603466



In [61]:
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = PCA(n_components=n_est, contamination=con)
    clf_if.fit(raw_normal_train_samples)

    y_pred = clf_if.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est
                }
        bs = test_pred
    del clf_if
    gc.collect()

        

print("benign only")
print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 36/36 [01:06<00:00,  1.85s/it]


benign only
{'n_estimators': 10}
0.5006148850729417
              precision    recall  f1-score   support

           0     0.8711    0.9004    0.8855    525439
           1     0.1327    0.1026    0.1157     78027

    accuracy                         0.7972    603466
   macro avg     0.5019    0.5015    0.5006    603466
weighted avg     0.7756    0.7972    0.7860    603466



In [62]:
##  IF  Emb

In [63]:
from sklearn.ensemble import IsolationForest
n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = IsolationForest(n_estimators=n_est, contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 24/24 [05:35<00:00, 13.96s/it]


{'n_estimators': 50, 'con': 0.04}
0.4862899980070696
              precision    recall  f1-score   support

           0     0.8701    0.9571    0.9115   1047581
           1     0.1234    0.0406    0.0611    156054

    accuracy                         0.8382   1203635
   macro avg     0.4967    0.4988    0.4863   1203635
weighted avg     0.7733    0.8382    0.8012   1203635



In [64]:
from sklearn.ensemble import IsolationForest

n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = IsolationForest(n_estimators=n_est, contamination=con)
    clf_if.fit(normal_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 24/24 [06:32<00:00, 16.34s/it]


{'n_estimators': 100, 'con': 0.04}
0.48630372763961804
              precision    recall  f1-score   support

           0     0.8701    0.9573    0.9116   1047581
           1     0.1237    0.0405    0.0610    156054

    accuracy                         0.8384   1203635
   macro avg     0.4969    0.4989    0.4863   1203635
weighted avg     0.7733    0.8384    0.8013   1203635



In [65]:
##  IF  Raw

In [66]:
from sklearn.ensemble import IsolationForest

n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = IsolationForest(n_estimators=n_est, contamination=con)
    clf_if.fit(raw_benign_train_samples.to_numpy())
   
    y_pred = clf_if.predict(raw_test_samples.to_numpy())
    test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                        "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()

  

print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 24/24 [01:55<00:00,  4.81s/it]


{'n_estimators': 150, 'con': 0.1}
0.585237576373461
              precision    recall  f1-score   support

           0     0.8920    0.8996    0.8958    525439
           1     0.2830    0.2668    0.2747     78027

    accuracy                         0.8178    603466
   macro avg     0.5875    0.5832    0.5852    603466
weighted avg     0.8133    0.8178    0.8155    603466



In [67]:
n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = IsolationForest(n_estimators=n_est, contamination=con)
    clf_if.fit(raw_normal_train_samples.to_numpy())

    y_pred = clf_if.predict(raw_test_samples.to_numpy())
    test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est
                }
        bs = test_pred
    del clf_if
    gc.collect()

        

print("benign only")
print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 24/24 [02:07<00:00,  5.31s/it]


benign only
{'n_estimators': 150}
0.5228837500770191
              precision    recall  f1-score   support

           0     0.8802    0.8083    0.8427    525439
           1     0.1670    0.2589    0.2031     78027

    accuracy                         0.7373    603466
   macro avg     0.5236    0.5336    0.5229    603466
weighted avg     0.7880    0.7373    0.7600    603466

