In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn import preprocessing
from dgl.data import DGLDataset
import dgl
import time
import networkx as nx
import category_encoders as ce
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
import torch
import tqdm
import math

from typing import *
from sklearn.preprocessing import StandardScaler, Normalizer
import socket
import struct
import random
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
file_name = "NF-CICIDS2018-v3.parquet"
data = pd.read_parquet(file_name)

In [3]:
##ADDED
file_name_test = "NF-CSE-CIC-IDS2018-v2.parquet"
test_data = pd.read_parquet(file_name_test)
##

In [4]:
data.Label.value_counts()

Label
0    17514626
1     2600903
Name: count, dtype: int64

In [5]:
## ADDED
test_data.Label.value_counts()
##

Label
0    16635567
1     2258141
Name: count, dtype: int64

In [6]:
data.rename(columns=lambda x: x.strip(), inplace=True)
data['IPV4_SRC_ADDR'] = data["IPV4_SRC_ADDR"].apply(str)
data['L4_SRC_PORT'] = data["L4_SRC_PORT"].apply(str)
data['IPV4_DST_ADDR'] = data["IPV4_DST_ADDR"].apply(str)
data['L4_DST_PORT'] = data["L4_DST_PORT"].apply(str)

In [7]:
##ADDED
test_data.rename(columns=lambda x: x.strip(), inplace=True)
test_data['IPV4_SRC_ADDR'] = test_data["IPV4_SRC_ADDR"].apply(str)
test_data['L4_SRC_PORT'] = test_data["L4_SRC_PORT"].apply(str)
test_data['IPV4_DST_ADDR'] = test_data["IPV4_DST_ADDR"].apply(str)
test_data['L4_DST_PORT'] = test_data["L4_DST_PORT"].apply(str)
##

In [8]:
data.drop(columns=["L4_SRC_PORT", "L4_DST_PORT"], inplace=True)

In [9]:
# ADDED
test_data.drop(columns=["L4_SRC_PORT", "L4_DST_PORT"], inplace=True)
##

In [10]:
data.Attack.unique()

array(['Benign', 'FTP-BruteForce', 'SSH-Bruteforce',
       'DoS_attacks-GoldenEye', 'DoS_attacks-Slowloris',
       'DoS_attacks-SlowHTTPTest', 'DoS_attacks-Hulk',
       'DDoS_attacks-LOIC-HTTP', 'DDOS_attack-LOIC-UDP',
       'DDOS_attack-HOIC', 'Brute_Force_-Web', 'Brute_Force_-XSS',
       'SQL_Injection', 'Infilteration', 'Bot'], dtype=object)

In [11]:
##ADDED
test_data.Attack.unique()
##

array(['SSH-Bruteforce', 'Benign', 'DDoS attacks-LOIC-HTTP',
       'DDOS attack-HOIC', 'DoS attacks-Slowloris', 'DoS attacks-Hulk',
       'FTP-BruteForce', 'Infilteration', 'Bot', 'DoS attacks-GoldenEye',
       'Brute Force -Web', 'DoS attacks-SlowHTTPTest', 'SQL Injection',
       'DDOS attack-LOIC-UDP', 'Brute Force -XSS'], dtype=object)

In [12]:
## ADDED
test_data['Attack'] = test_data['Attack'].str.replace(' ', '_')
##

In [13]:
data = data.groupby(by='Attack').sample(frac=0.02, random_state=13)

In [14]:
## ADDED
test_data = test_data.groupby(by='Attack').sample(frac=0.02, random_state=13)
##

In [15]:
data.groupby(by="Attack").count()

Unnamed: 0_level_0,FLOW_START_MILLISECONDS,FLOW_END_MILLISECONDS,IPV4_SRC_ADDR,IPV4_DST_ADDR,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,...,FTP_COMMAND_RET_CODE,SRC_TO_DST_IAT_MIN,SRC_TO_DST_IAT_MAX,SRC_TO_DST_IAT_AVG,SRC_TO_DST_IAT_STDDEV,DST_TO_SRC_IAT_MIN,DST_TO_SRC_IAT_MAX,DST_TO_SRC_IAT_AVG,DST_TO_SRC_IAT_STDDEV,Label
Attack,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Benign,350293,350293,350293,350293,350293,350293,350293,350293,350293,350293,...,350293,350293,350293,350293,350293,350293,350293,350293,350293,350293
Bot,4154,4154,4154,4154,4154,4154,4154,4154,4154,4154,...,4154,4154,4154,4154,4154,4154,4154,4154,4154,4154
Brute_Force_-Web,32,32,32,32,32,32,32,32,32,32,...,32,32,32,32,32,32,32,32,32,32
Brute_Force_-XSS,10,10,10,10,10,10,10,10,10,10,...,10,10,10,10,10,10,10,10,10,10
DDOS_attack-HOIC,20646,20646,20646,20646,20646,20646,20646,20646,20646,20646,...,20646,20646,20646,20646,20646,20646,20646,20646,20646,20646
DDOS_attack-LOIC-UDP,69,69,69,69,69,69,69,69,69,69,...,69,69,69,69,69,69,69,69,69,69
DDoS_attacks-LOIC-HTTP,5772,5772,5772,5772,5772,5772,5772,5772,5772,5772,...,5772,5772,5772,5772,5772,5772,5772,5772,5772,5772
DoS_attacks-GoldenEye,1226,1226,1226,1226,1226,1226,1226,1226,1226,1226,...,1226,1226,1226,1226,1226,1226,1226,1226,1226,1226
DoS_attacks-Hulk,2002,2002,2002,2002,2002,2002,2002,2002,2002,2002,...,2002,2002,2002,2002,2002,2002,2002,2002,2002,2002
DoS_attacks-SlowHTTPTest,2111,2111,2111,2111,2111,2111,2111,2111,2111,2111,...,2111,2111,2111,2111,2111,2111,2111,2111,2111,2111


In [16]:
##ADDED
test_data.groupby(by="Attack").count()
##

Unnamed: 0_level_0,IPV4_SRC_ADDR,IPV4_DST_ADDR,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,NUM_PKTS_1024_TO_1514_BYTES,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label
Attack,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Benign,332711,332711,332711,332711,332711,332711,332711,332711,332711,332711,...,332711,332711,332711,332711,332711,332711,332711,332711,332711,332711
Bot,2862,2862,2862,2862,2862,2862,2862,2862,2862,2862,...,2862,2862,2862,2862,2862,2862,2862,2862,2862,2862
Brute_Force_-Web,43,43,43,43,43,43,43,43,43,43,...,43,43,43,43,43,43,43,43,43,43
Brute_Force_-XSS,19,19,19,19,19,19,19,19,19,19,...,19,19,19,19,19,19,19,19,19,19
DDOS_attack-HOIC,21617,21617,21617,21617,21617,21617,21617,21617,21617,21617,...,21617,21617,21617,21617,21617,21617,21617,21617,21617,21617
DDOS_attack-LOIC-UDP,42,42,42,42,42,42,42,42,42,42,...,42,42,42,42,42,42,42,42,42,42
DDoS_attacks-LOIC-HTTP,6146,6146,6146,6146,6146,6146,6146,6146,6146,6146,...,6146,6146,6146,6146,6146,6146,6146,6146,6146,6146
DoS_attacks-GoldenEye,554,554,554,554,554,554,554,554,554,554,...,554,554,554,554,554,554,554,554,554,554
DoS_attacks-Hulk,8653,8653,8653,8653,8653,8653,8653,8653,8653,8653,...,8653,8653,8653,8653,8653,8653,8653,8653,8653,8653
DoS_attacks-SlowHTTPTest,282,282,282,282,282,282,282,282,282,282,...,282,282,282,282,282,282,282,282,282,282


In [17]:
# X = data.drop(columns=["Attack", "Label"])
X = data.drop(columns=["Attack", "Label", "FLOW_START_MILLISECONDS", "FLOW_END_MILLISECONDS",
                       "SRC_TO_DST_IAT_MIN", "SRC_TO_DST_IAT_MAX", "SRC_TO_DST_IAT_AVG",
                       "SRC_TO_DST_IAT_STDDEV", "DST_TO_SRC_IAT_MIN", "DST_TO_SRC_IAT_MAX",
                       "DST_TO_SRC_IAT_AVG", "DST_TO_SRC_IAT_STDDEV"])
y = data[["Attack", "Label"]]

# X_train, X_test, y_train, y_test = train_test_split(
#         X, y, test_size=0.3, random_state=13, stratify=y)
# ADDED
X_train, y_train = X, y
X_test = test_data.drop(columns=["Attack", "Label"])
y_test = test_data[["Attack", "Label"]]

X_train.shape, X_test.shape, y_train.shape, y_test.shape
##

((402311, 41), (377874, 41), (402311, 2), (377874, 2))

In [18]:
encoder = ce.TargetEncoder(cols=['TCP_FLAGS','L7_PROTO','PROTOCOL',
                                  'CLIENT_TCP_FLAGS','SERVER_TCP_FLAGS','ICMP_TYPE',
                                  'ICMP_IPV4_TYPE','DNS_QUERY_ID','DNS_QUERY_TYPE',
                                  'FTP_COMMAND_RET_CODE'])
encoder.fit(X_train, y_train.Label)

# Transform on training set
X_train = encoder.transform(X_train)

# Transform on testing set
X_test = encoder.transform(X_test)

In [19]:
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

In [20]:
scaler = Normalizer()
cols_to_norm = list(set(list(X_train.iloc[:, 2:].columns))) # Ignore first two as the represents IP addresses
scaler.fit(X_train[cols_to_norm])

# Transform on training set
X_train[cols_to_norm] = scaler.transform(X_train[cols_to_norm])
X_train['h'] = X_train.iloc[:, 2:].values.tolist()

# Transform on testing set
X_test[cols_to_norm] = scaler.transform(X_test[cols_to_norm])
X_test['h'] = X_test.iloc[:, 2:].values.tolist()

train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [21]:
X_train.head()

Unnamed: 0,IPV4_SRC_ADDR,IPV4_DST_ADDR,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,NUM_PKTS_1024_TO_1514_BYTES,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,h
1142795,5.101.40.105,172.31.64.23,3.075184e-06,5.481931e-08,0.021067,0.0001221252,0.026272,0.0001068595,6.576196e-08,2.489842e-06,...,1.5e-05,0.125056,0.977001,2.056008e-06,2.056008e-06,3.001804e-06,3.001804e-06,0.0,1.973818e-06,"[3.0751839227192853e-06, 5.481930634219952e-08..."
14404567,172.31.64.35,172.31.0.2,1.684303e-08,1.640394e-08,7.9e-05,1.171864e-06,9.7e-05,1.171864e-06,1.673918e-08,1.673918e-08,...,0.0,0.0,0.0,1.578289e-07,1.578289e-07,1.300264e-07,1.655126e-08,7e-05,1.515196e-07,"[1.684303268567306e-08, 1.6403940652141906e-08..."
16627420,172.31.69.5,172.31.0.2,1.468786e-08,1.430495e-08,7.1e-05,1.021916e-06,0.000103,1.021916e-06,1.459729e-08,1.459729e-08,...,0.0,0.0,0.0,1.376337e-07,1.376337e-07,1.117219e-07,1.443342e-08,6.1e-05,1.321317e-07,"[1.4687856599708106e-08, 1.4304949260932607e-0..."
6874011,172.31.69.8,172.31.0.2,2.524708e-07,2.458889e-07,0.001142,1.75658e-05,0.005252,1.75658e-05,2.509141e-07,2.509141e-07,...,0.0,0.0,0.0,2.365796e-06,2.365796e-06,1.920397e-06,2.480972e-07,0.001054,2.271222e-06,"[2.524707696959322e-07, 2.4588894409826177e-07..."
16243632,172.31.66.56,172.31.0.2,1.216816e-08,1.185094e-08,5.9e-05,8.466068e-07,0.00011,8.466068e-07,1.209313e-08,1.209313e-08,...,0.0,0.0,0.0,1.140227e-07,1.140227e-07,9.393689e-08,1.195737e-08,4.9e-05,1.094645e-07,"[1.2168162610059382e-08, 1.185094282164601e-08..."


In [22]:
lab_enc = preprocessing.LabelEncoder()
lab_enc.fit(data["Attack"])

# Transform on training set
train["Attack"] = lab_enc.transform(train["Attack"])

# Transform on testing set
test["Attack"] = lab_enc.transform(test["Attack"])

In [23]:
# Training graph
train_g = nx.from_pandas_edgelist(train, "IPV4_SRC_ADDR", "IPV4_DST_ADDR",
           ["h", "Label", "Attack"], create_using=nx.MultiGraph())
train_g = train_g.to_directed()
train_g = dgl.from_networkx(train_g, edge_attrs=['h', 'Attack', 'Label'])
nfeat_weight = torch.ones([train_g.number_of_nodes(),
train_g.edata['h'].shape[1]])
train_g.ndata['h'] = nfeat_weight

# Testing graph
test_g = nx.from_pandas_edgelist(test, "IPV4_SRC_ADDR", "IPV4_DST_ADDR",
            ["h", "Label", "Attack"], create_using=nx.MultiGraph())

test_g = test_g.to_directed()
test_g = dgl.from_networkx(test_g, edge_attrs=['h', 'Attack', 'Label'])
nfeat_weight = torch.ones([test_g.number_of_nodes(),
test_g.edata['h'].shape[1]])
test_g.ndata['h'] = nfeat_weight

In [24]:
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
import tqdm
import gc

class SAGELayer(nn.Module):
    def __init__(self, ndim_in, edims, ndim_out, activation):
      super(SAGELayer, self).__init__()
      self.W_apply = nn.Linear(ndim_in + edims , ndim_out)
      self.activation = F.relu
      self.W_edge = nn.Linear(128 * 2, 256)
      self.reset_parameters()

    def reset_parameters(self):
      gain = nn.init.calculate_gain('relu')
      nn.init.xavier_uniform_(self.W_apply.weight, gain=gain)

    def message_func(self, edges):
      return {'m':  edges.data['h']}

    def forward(self, g_dgl, nfeats, efeats):
      with g_dgl.local_scope():
        g = g_dgl
        g.ndata['h'] = nfeats
        g.edata['h'] = efeats
        g.update_all(self.message_func, fn.mean('m', 'h_neigh'))
        g.ndata['h'] = F.relu(self.W_apply(torch.cat([g.ndata['h'], g.ndata['h_neigh']], 2)))

        # Compute edge embeddings
        u, v = g.edges()
        edge = self.W_edge(torch.cat((g.srcdata['h'][u], g.dstdata['h'][v]), 2))
        return g.ndata['h'], edge

In [25]:
class SAGE(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim,  activation):
      super(SAGE, self).__init__()
      self.layers = nn.ModuleList()
      self.layers.append(SAGELayer(ndim_in, edim, 128, F.relu))

    def forward(self, g, nfeats, efeats, corrupt=False):
      if corrupt:
        e_perm = torch.randperm(g.number_of_edges())
        #n_perm = torch.randperm(g.number_of_nodes())
        efeats = efeats[e_perm]
        #nfeats = nfeats[n_perm]
      for i, layer in enumerate(self.layers):
        #nfeats = layer(g, nfeats, efeats)
        nfeats, e_feats = layer(g, nfeats, efeats)
      #return nfeats.sum(1)
      return nfeats.sum(1), e_feats.sum(1)

In [26]:
class Discriminator(nn.Module):
    def __init__(self, n_hidden):
      super(Discriminator, self).__init__()
      self.weight = nn.Parameter(torch.Tensor(n_hidden, n_hidden))
      self.reset_parameters()

    def uniform(self, size, tensor):
      bound = 1.0 / math.sqrt(size)
      if tensor is not None:
        tensor.data.uniform_(-bound, bound)

    def reset_parameters(self):
      size = self.weight.size(0)
      self.uniform(size, self.weight)

    def forward(self, features, summary):
      features = torch.matmul(features, torch.matmul(self.weight, summary))
      return features

In [27]:
class DGI(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim, activation):
      super(DGI, self).__init__()
      self.encoder = SAGE(ndim_in, ndim_out, edim,  F.relu)
      #self.discriminator = Discriminator(128)
      self.discriminator = Discriminator(256)
      self.loss = nn.BCEWithLogitsLoss()

    def forward(self, g, n_features, e_features):
      positive = self.encoder(g, n_features, e_features, corrupt=False)
      negative = self.encoder(g, n_features, e_features, corrupt=True)
      self.loss = nn.BCEWithLogitsLoss()

    def forward(self, g, n_features, e_features):
      positive = self.encoder(g, n_features, e_features, corrupt=False)
      negative = self.encoder(g, n_features, e_features, corrupt=True)

      positive = positive[1]
      negative = negative[1]

      summary = torch.sigmoid(positive.mean(dim=0))

      positive = self.discriminator(positive, summary)
      negative = self.discriminator(negative, summary)

      l1 = self.loss(positive, torch.ones_like(positive))
      l2 = self.loss(negative, torch.zeros_like(negative))

      return l1 + l2

In [28]:
ndim_in = train_g.ndata['h'].shape[1]
hidden_features = 128
ndim_out = 128
num_layers = 1
edim = train_g.edata['h'].shape[1]
learning_rate = 1e-3
epochs = 4000

In [29]:
dgi = DGI(ndim_in,
    ndim_out,
    edim,
    F.relu)

dgi = dgi.to('cuda')

dgi_optimizer = torch.optim.Adam(dgi.parameters(),
                lr=1e-3,
                weight_decay=0.)

In [30]:
# Format node and edge features for E-GraphSAGE
train_g.ndata['h'] = torch.reshape(train_g.ndata['h'],
                                   (train_g.ndata['h'].shape[0], 1,
                                    train_g.ndata['h'].shape[1]))

train_g.edata['h'] = torch.reshape(train_g.edata['h'],
                                   (train_g.edata['h'].shape[0], 1,
                                    train_g.edata['h'].shape[1]))

In [31]:
# Convert to GPU
train_g = train_g.to('cuda')

In [32]:
# cnt_wait = 0
# best = 1e9
# best_t = 0
# dur = []
# node_features = train_g.ndata['h'] 
# edge_features = train_g.edata['h']
# torch.cuda.empty_cache()

# for epoch in range(epochs):
#     dgi.train()
#     if epoch >= 3:
#         t0 = time.time()

#     dgi_optimizer.zero_grad()
#     loss = dgi(train_g, node_features, edge_features)
#     loss.backward()
#     dgi_optimizer.step()

#     if loss < best:
#         best = loss
#         best_t = epoch
#         cnt_wait = 0
#         torch.save(dgi.state_dict(), 'best_dgi_CSE_v3.pkl')
#     else:
#         cnt_wait += 1

#   # if cnt_wait == patience:
#   #     print('Early stopping!')
#   #     break

#     if epoch >= 3:
#         dur.append(time.time() - t0)

#     if epoch % 50 == 0:

#         print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | "
#             "ETputs(KTEPS) {:.2f}".format(epoch, np.mean(dur),
#               loss.item(),
#               train_g.num_edges() / np.mean(dur) / 1000))

In [33]:
dgi.load_state_dict(torch.load('best_dgi_CSE_v3.pkl'))

  dgi.load_state_dict(torch.load('best_dgi_CSE_v3.pkl'))


<All keys matched successfully>

In [34]:
training_emb = dgi.encoder(train_g, train_g.ndata['h'], train_g.edata['h'])[1]
training_emb = training_emb.detach().cpu().numpy()

In [35]:
test_g.ndata['h'] = torch.reshape(test_g.ndata['h'],
                                   (test_g.ndata['h'].shape[0], 1,
                                    test_g.ndata['h'].shape[1]))



test_g.edata['h'] = torch.reshape(test_g.edata['h'],
                                   (test_g.edata['h'].shape[0], 1,
                                    test_g.edata['h'].shape[1]))

In [36]:
# Convert to GPU
test_g = test_g.to('cuda')

In [37]:
testing_emb = dgi.encoder(test_g, test_g.ndata['h'], test_g.edata['h'])[1]
testing_emb = testing_emb.detach().cpu().numpy()

In [38]:
df_train = pd.DataFrame(training_emb, )
df_train["Attack"] = lab_enc.inverse_transform(
        train_g.edata['Attack'].detach().cpu().numpy())
df_train["Label"] = train_g.edata['Label'].detach().cpu().numpy()

df_test = pd.DataFrame(testing_emb, )
df_test["Attack"] = lab_enc.inverse_transform(
        test_g.edata['Attack'].detach().cpu().numpy())
df_test["Label"] = test_g.edata['Label'].detach().cpu().numpy()

In [39]:
df_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,248,249,250,251,252,253,254,255,Attack,Label
0,0.010044,0.016325,0.093376,-0.008862,0.079747,0.067937,0.162018,0.084112,0.016122,-0.032304,...,-0.015731,-0.033184,0.042320,-0.067537,0.069052,-0.066628,-0.137370,-0.035196,Benign,0
1,0.010044,0.016325,0.093376,-0.008862,0.079747,0.067937,0.162018,0.084112,0.016122,-0.032304,...,-0.015731,-0.033184,0.042320,-0.067537,0.069052,-0.066628,-0.137370,-0.035196,Benign,0
2,0.010044,0.016325,0.093376,-0.008862,0.079747,0.067937,0.162018,0.084112,0.016122,-0.032304,...,-0.015731,-0.033184,0.042320,-0.067537,0.069052,-0.066628,-0.137370,-0.035196,Benign,0
3,0.010044,0.016325,0.093376,-0.008862,0.079747,0.067937,0.162018,0.084112,0.016122,-0.032304,...,-0.015731,-0.033184,0.042320,-0.067537,0.069052,-0.066628,-0.137370,-0.035196,Benign,0
4,0.010044,0.016325,0.093376,-0.008862,0.079747,0.067937,0.162018,0.084112,0.016122,-0.032304,...,-0.015731,-0.033184,0.042320,-0.067537,0.069052,-0.066628,-0.137370,-0.035196,Benign,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
802502,0.005458,0.031087,0.025435,-0.003261,0.017776,0.021832,0.136148,0.065669,-0.002815,-0.045494,...,-0.006325,-0.021873,0.046571,-0.046403,0.062711,-0.003701,-0.091494,-0.044468,SSH-Bruteforce,1
802503,0.005458,0.031087,0.025435,-0.003261,0.017776,0.021832,0.136148,0.065669,-0.002815,-0.045494,...,-0.006325,-0.021873,0.046571,-0.046403,0.062711,-0.003701,-0.091494,-0.044468,SSH-Bruteforce,1
802504,0.005458,0.031087,0.025435,-0.003261,0.017776,0.021832,0.136148,0.065669,-0.002815,-0.045494,...,-0.006325,-0.021873,0.046571,-0.046403,0.062711,-0.003701,-0.091494,-0.044468,SSH-Bruteforce,1
802505,0.005458,0.031087,0.025435,-0.003261,0.017776,0.021832,0.136148,0.065669,-0.002815,-0.045494,...,-0.006325,-0.021873,0.046571,-0.046403,0.062711,-0.003701,-0.091494,-0.044468,SSH-Bruteforce,1


# Embeddings CBLOF  Embeddings

In [40]:
import torch
import dgl
import numpy as np
import pandas as pd
import torch.optim as optim
import time
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.ensemble import IsolationForest
import gc

from tqdm import tqdm
import itertools

In [41]:
benign_train_samples = df_train[df_train.Label == 0].drop(columns=["Label", "Attack"])
normal_train_samples = df_train.drop(columns=["Label", "Attack"])

train_labels = df_train["Label"]
test_labels = df_test["Label"]

test_samples = df_test.drop(columns=["Label", "Attack"])

In [42]:
test_samples

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
0,0.013708,0.015145,0.037452,0.005404,0.012947,0.060107,0.164785,0.041856,0.02629,-0.066434,...,-0.081500,0.012664,-0.023328,0.017443,0.111194,-0.033910,0.071483,-0.002435,-0.122524,-0.038479
1,0.013708,0.015145,0.037452,0.005404,0.012947,0.060107,0.164785,0.041856,0.02629,-0.066434,...,-0.081500,0.012664,-0.023328,0.017443,0.111194,-0.033910,0.071483,-0.002435,-0.122524,-0.038479
2,0.013708,0.015145,0.037452,0.005404,0.012947,0.060107,0.164785,0.041856,0.02629,-0.066434,...,-0.081500,0.012664,-0.023328,0.017443,0.111194,-0.033910,0.071483,-0.002435,-0.122524,-0.038479
3,0.013708,0.015145,0.037452,0.005404,0.012947,0.060107,0.164785,0.041856,0.02629,-0.066434,...,-0.081500,0.012664,-0.023328,0.017443,0.111194,-0.033910,0.071483,-0.002435,-0.122524,-0.038479
4,0.013708,0.015145,0.037452,0.005404,0.012947,0.060107,0.164785,0.041856,0.02629,-0.066434,...,-0.081500,0.012664,-0.023328,0.017443,0.111194,-0.033910,0.071483,-0.002435,-0.122524,-0.038479
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
754753,0.016680,0.056353,-0.003914,-0.010406,-0.069867,0.061574,0.180381,0.065347,0.02229,-0.058257,...,-0.131445,0.030415,-0.017139,0.012956,0.134187,-0.080939,0.062478,-0.040283,-0.118634,-0.054126
754754,0.016680,0.056353,-0.003914,-0.010406,-0.069867,0.061574,0.180381,0.065347,0.02229,-0.058257,...,-0.131445,0.030415,-0.017139,0.012956,0.134187,-0.080939,0.062478,-0.040283,-0.118634,-0.054126
754755,0.016680,0.056353,-0.003914,-0.010406,-0.069867,0.061574,0.180381,0.065347,0.02229,-0.058257,...,-0.131445,0.030415,-0.017139,0.012956,0.134187,-0.080939,0.062478,-0.040283,-0.118634,-0.054126
754756,0.016680,0.056353,-0.003914,-0.010406,-0.069867,0.061574,0.180381,0.065347,0.02229,-0.058257,...,-0.131445,0.030415,-0.017139,0.012956,0.134187,-0.080939,0.062478,-0.040283,-0.118634,-0.054126


In [43]:
df_raw_train = pd.concat([X_train.drop(columns=["IPV4_SRC_ADDR","IPV4_DST_ADDR", "h"]), y_train], axis=1)
df_raw_test = pd.concat([X_test.drop(columns=["IPV4_SRC_ADDR","IPV4_DST_ADDR", "h"]), y_test], axis=1)

In [44]:
df_raw_train

Unnamed: 0,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,SERVER_TCP_FLAGS,FLOW_DURATION_MILLISECONDS,...,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Attack,Label
1142795,3.075184e-06,5.481931e-08,0.021067,1.221252e-04,0.026272,1.068595e-04,6.576196e-08,2.489842e-06,1.869754e-07,2.540204e-02,...,0.125056,0.977001,2.056008e-06,2.056008e-06,3.001804e-06,3.001804e-06,0.000000,1.973818e-06,Benign,0
14404567,1.684303e-08,1.640394e-08,0.000079,1.171864e-06,0.000097,1.171864e-06,1.673918e-08,1.673918e-08,1.621682e-08,1.171864e-06,...,0.000000,0.000000,1.578289e-07,1.578289e-07,1.300264e-07,1.655126e-08,0.000070,1.515196e-07,Benign,0
16627420,1.468786e-08,1.430495e-08,0.000071,1.021916e-06,0.000103,1.021916e-06,1.459729e-08,1.459729e-08,1.414177e-08,1.021916e-06,...,0.000000,0.000000,1.376337e-07,1.376337e-07,1.117219e-07,1.443342e-08,0.000061,1.321317e-07,Benign,0
6874011,2.524708e-07,2.458889e-07,0.001142,1.756580e-05,0.005252,1.756580e-05,2.509141e-07,2.509141e-07,2.430841e-07,7.377635e-04,...,0.000000,0.000000,2.365796e-06,2.365796e-06,1.920397e-06,2.480972e-07,0.001054,2.271222e-06,Benign,0
16243632,1.216816e-08,1.185094e-08,0.000059,8.466068e-07,0.000110,8.466068e-07,1.209313e-08,1.209313e-08,1.171576e-08,8.466068e-07,...,0.000000,0.000000,1.140227e-07,1.140227e-07,9.393689e-08,1.195737e-08,0.000049,1.094645e-07,Benign,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1545168,1.483101e-06,6.150719e-06,0.022941,1.693334e-04,0.028485,1.693334e-04,2.005601e-06,1.765370e-06,3.733765e-06,2.238146e-03,...,0.197921,0.197656,9.915722e-07,9.915722e-07,1.447711e-06,1.447711e-06,0.000000,9.519335e-07,SSH-Bruteforce,1
1696743,1.767323e-06,7.329444e-06,0.027478,2.017845e-04,0.033944,2.017845e-04,2.389954e-06,2.103685e-06,4.449304e-06,3.246099e-03,...,0.235851,0.235535,1.181597e-06,1.181597e-06,1.725151e-06,1.725151e-06,0.000000,1.134362e-06,SSH-Bruteforce,1
1562087,1.764147e-06,7.316273e-06,0.027569,2.014219e-04,0.033883,2.014219e-04,2.385660e-06,2.099905e-06,4.441309e-06,3.240266e-03,...,0.235427,0.235112,1.179474e-06,1.179474e-06,1.722051e-06,1.722051e-06,0.000000,1.132324e-06,SSH-Bruteforce,1
1497956,1.721773e-06,7.140540e-06,0.026906,1.965839e-04,0.033069,1.965839e-04,2.328357e-06,2.049466e-06,4.334631e-06,3.076965e-03,...,0.229772,0.229465,1.151144e-06,1.151144e-06,1.680688e-06,1.680688e-06,0.000000,1.105126e-06,SSH-Bruteforce,1


In [45]:
raw_benign_train_samples = df_raw_train[df_raw_train.Label == 0].drop(columns=["Label", "Attack"])
raw_normal_train_samples = df_raw_train.drop(columns=["Label", "Attack"])

raw_train_labels = df_raw_train["Label"]
raw_test_labels = df_raw_test["Label"]

raw_test_samples = df_raw_test.drop(columns=["Label", "Attack"])

In [46]:
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]

In [47]:
from pyod.models.cblof import CBLOF
n_est = [2,3,5,7,9,10]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    clf_if = CBLOF(n_clusters=n_est, contamination=con)
    try:
        clf_if.fit(benign_train_samples)
    except Exception as e:
        continue
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [02:22<00:00,  3.95s/it]

{'n_estimators': 7, 'con': 0.001}
0.9613458058165368
              precision    recall  f1-score   support

           0     0.9846    0.9979    0.9912    664432
           1     0.9828    0.8852    0.9315     90326

    accuracy                         0.9844    754758
   macro avg     0.9837    0.9416    0.9613    754758
weighted avg     0.9844    0.9844    0.9841    754758






In [48]:
n_est = [2,3,5,7,9,10]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    clf_if = CBLOF(n_clusters=n_est, contamination=con)
    try:
        clf_if.fit(normal_train_samples)
    except Exception as e:
        continue
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [02:33<00:00,  4.26s/it]

{'n_estimators': 5, 'con': 0.001}
0.9644481243717342
              precision    recall  f1-score   support

           0     0.9846    0.9994    0.9920    664432
           1     0.9950    0.8852    0.9369     90326

    accuracy                         0.9857    754758
   macro avg     0.9898    0.9423    0.9644    754758
weighted avg     0.9859    0.9857    0.9854    754758






In [49]:
###  CBLOF RAW

In [50]:
from pyod.models.cblof import CBLOF

n_est = [2,3,5,7,9,10]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    try:
        clf_b = CBLOF(n_clusters=n_est, contamination=con)
        clf_b.fit(raw_benign_train_samples)
    except ValueError as e:
        print(n_est)
        continue  
   
    y_pred = clf_b.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                        "con": con
                }
        bs = test_pred
    del clf_b
    gc.collect()

  

print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

  0%|          | 0/36 [00:00<?, ?it/s]

  3%|▎         | 1/36 [00:00<00:08,  3.91it/s]

2


  8%|▊         | 3/36 [00:01<00:19,  1.69it/s]

2


 11%|█         | 4/36 [00:02<00:14,  2.21it/s]

2


 14%|█▍        | 5/36 [00:02<00:11,  2.61it/s]

2


 17%|█▋        | 6/36 [00:02<00:10,  2.94it/s]

2


 19%|█▉        | 7/36 [00:02<00:09,  3.15it/s]

3


 22%|██▏       | 8/36 [00:03<00:08,  3.30it/s]

3


 25%|██▌       | 9/36 [00:03<00:07,  3.40it/s]

3


 31%|███       | 11/36 [00:04<00:10,  2.47it/s]

3


 33%|███▎      | 12/36 [00:04<00:08,  2.75it/s]

3


 39%|███▉      | 14/36 [00:05<00:09,  2.22it/s]

5


 50%|█████     | 18/36 [00:08<00:10,  1.68it/s]

5


100%|██████████| 36/36 [00:26<00:00,  1.34it/s]

{'n_estimators': 5, 'con': 0.1}
0.8910565039633965
              precision    recall  f1-score   support

           0     0.9750    0.9724    0.9737    332711
           1     0.8008    0.8162    0.8084     45163

    accuracy                         0.9538    377874
   macro avg     0.8879    0.8943    0.8911    377874
weighted avg     0.9542    0.9538    0.9540    377874






In [51]:
n_est = [2,3,5,7,9,10]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    try:
        clf_if = CBLOF(n_clusters=n_est, contamination=con)
        clf_if.fit(raw_normal_train_samples)
    except ValueError as e:
        print(n_est)
        continue  
    
    y_pred = clf_if.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est
                }
        bs = test_pred
    del clf_if
    gc.collect()

        

print("benign only")
print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

  3%|▎         | 1/36 [00:00<00:09,  3.61it/s]

2


  6%|▌         | 2/36 [00:00<00:09,  3.57it/s]

2


  8%|▊         | 3/36 [00:00<00:09,  3.63it/s]

2


 14%|█▍        | 5/36 [00:01<00:13,  2.34it/s]

2


 17%|█▋        | 6/36 [00:02<00:11,  2.66it/s]

2


 19%|█▉        | 7/36 [00:02<00:10,  2.88it/s]

3


 22%|██▏       | 8/36 [00:02<00:09,  3.06it/s]

3


 25%|██▌       | 9/36 [00:03<00:08,  3.18it/s]

3


 28%|██▊       | 10/36 [00:03<00:07,  3.29it/s]

3


 31%|███       | 11/36 [00:03<00:07,  3.34it/s]

3


 33%|███▎      | 12/36 [00:03<00:07,  3.42it/s]

3


100%|██████████| 36/36 [00:28<00:00,  1.27it/s]

benign only
{'n_estimators': 9}
0.8910264566707677
              precision    recall  f1-score   support

           0     0.9750    0.9725    0.9737    332711
           1     0.8008    0.8160    0.8083     45163

    accuracy                         0.9538    377874
   macro avg     0.8879    0.8942    0.8910    377874
weighted avg     0.9541    0.9538    0.9539    377874






In [52]:
# HBOS  Embeddings

In [53]:
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]

In [54]:
from pyod.models.hbos import HBOS

n_est = [5,10,15,20,25,30]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    clf_if = HBOS(n_bins=n_est, contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [07:06<00:00, 11.84s/it]

{'n_estimators': 10, 'con': 0.01}
0.9600433567181486
              precision    recall  f1-score   support

           0     0.9846    0.9972    0.9909    664432
           1     0.9776    0.8854    0.9292     90326

    accuracy                         0.9839    754758
   macro avg     0.9811    0.9413    0.9600    754758
weighted avg     0.9838    0.9839    0.9835    754758






In [55]:
n_est = [5,10,15,20,25,30]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    clf_if = HBOS(n_bins=n_est, contamination=con)
    clf_if.fit(normal_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [07:36<00:00, 12.69s/it]

{'n_estimators': 10, 'con': 0.01}
0.9597662004989305
              precision    recall  f1-score   support

           0     0.9846    0.9971    0.9908    664432
           1     0.9765    0.8854    0.9287     90326

    accuracy                         0.9837    754758
   macro avg     0.9806    0.9412    0.9598    754758
weighted avg     0.9836    0.9837    0.9834    754758






In [56]:
##  HBOS  RAw

In [57]:
from pyod.models.cblof import CBLOF

n_est = [5,10,15,20,25,30]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    try:
        clf_b = HBOS(n_bins=n_est, contamination=con)
        clf_b.fit(raw_benign_train_samples)
    except ValueError as e:
        print(n_est)
        continue  
   
    y_pred = clf_b.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                        "con": con
                }
        bs = test_pred
    del clf_b
    gc.collect()

  

print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

  0%|          | 0/36 [00:00<?, ?it/s]

100%|██████████| 36/36 [00:31<00:00,  1.15it/s]

{'n_estimators': 5, 'con': 0.1}
0.8489311773555535
              precision    recall  f1-score   support

           0     0.9581    0.9744    0.9662    332711
           1     0.7843    0.6857    0.7317     45163

    accuracy                         0.9399    377874
   macro avg     0.8712    0.8301    0.8489    377874
weighted avg     0.9373    0.9399    0.9381    377874






In [58]:
n_est = [5,10,15,20,25,30]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    try:
        clf_if = HBOS(n_bins=n_est, contamination=con)
        clf_if.fit(raw_normal_train_samples)
    except ValueError as e:
        print(n_est)
        continue  
    
    y_pred = clf_if.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est
                }
        bs = test_pred
    del clf_if
    gc.collect()

        

print("benign only")
print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 36/36 [00:32<00:00,  1.12it/s]

benign only
{'n_estimators': 5}
0.8526731297510903
              precision    recall  f1-score   support

           0     0.9621    0.9695    0.9658    332711
           1     0.7618    0.7186    0.7396     45163

    accuracy                         0.9395    377874
   macro avg     0.8620    0.8440    0.8527    377874
weighted avg     0.9382    0.9395    0.9387    377874






In [59]:
##  PCA  Emb

In [60]:
from pyod.models.pca import PCA
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = PCA(n_components=n_est, contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

  0%|          | 0/36 [00:00<?, ?it/s]

100%|██████████| 36/36 [02:44<00:00,  4.56s/it]

{'n_estimators': 15, 'con': 0.01}
0.9612963725493711
              precision    recall  f1-score   support

           0     0.9846    0.9979    0.9912    664432
           1     0.9826    0.8852    0.9314     90326

    accuracy                         0.9844    754758
   macro avg     0.9836    0.9416    0.9613    754758
weighted avg     0.9844    0.9844    0.9840    754758






In [61]:
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = PCA(n_components=n_est, contamination=con)
    clf_if.fit(normal_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [02:57<00:00,  4.94s/it]

{'n_estimators': 15, 'con': 0.01}
0.9613736155210402
              precision    recall  f1-score   support

           0     0.9846    0.9979    0.9912    664432
           1     0.9829    0.8852    0.9315     90326

    accuracy                         0.9844    754758
   macro avg     0.9838    0.9416    0.9614    754758
weighted avg     0.9844    0.9844    0.9841    754758






In [62]:
##  PCA  RAw

In [63]:
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = PCA(n_components=n_est, contamination=con)
    clf_if.fit(raw_benign_train_samples)
   
    y_pred = clf_if.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                        "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()

  

print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

  0%|          | 0/36 [00:00<?, ?it/s]

100%|██████████| 36/36 [00:24<00:00,  1.49it/s]

{'n_estimators': 10, 'con': 0.2}
0.8541803932624754
              precision    recall  f1-score   support

           0     0.9646    0.9661    0.9653    332711
           1     0.7472    0.7389    0.7430     45163

    accuracy                         0.9389    377874
   macro avg     0.8559    0.8525    0.8542    377874
weighted avg     0.9386    0.9389    0.9388    377874






In [64]:
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = PCA(n_components=n_est, contamination=con)
    clf_if.fit(raw_normal_train_samples)

    y_pred = clf_if.predict(raw_test_samples)
    test_pred = y_pred

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est
                }
        bs = test_pred
    del clf_if
    gc.collect()

        

print("benign only")
print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 36/36 [00:25<00:00,  1.41it/s]

benign only
{'n_estimators': 20}
0.8542386487089035
              precision    recall  f1-score   support

           0     0.9646    0.9661    0.9654    332711
           1     0.7474    0.7389    0.7431     45163

    accuracy                         0.9389    377874
   macro avg     0.8560    0.8525    0.8542    377874
weighted avg     0.9387    0.9389    0.9388    377874






In [65]:
##  IF  Emb

In [66]:
from sklearn.ensemble import IsolationForest
n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = IsolationForest(n_estimators=n_est, contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 24/24 [02:08<00:00,  5.34s/it]

{'n_estimators': 20, 'con': 0.001}
0.9482731112321782
              precision    recall  f1-score   support

           0     0.9779    0.9996    0.9886    664432
           1     0.9962    0.8340    0.9079     90326

    accuracy                         0.9798    754758
   macro avg     0.9870    0.9168    0.9483    754758
weighted avg     0.9801    0.9798    0.9790    754758






In [67]:
from sklearn.ensemble import IsolationForest

n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = IsolationForest(n_estimators=n_est, contamination=con)
    clf_if.fit(normal_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 24/24 [02:14<00:00,  5.61s/it]

{'n_estimators': 50, 'con': 0.04}
0.9405209174314567
              precision    recall  f1-score   support

           0     0.9845    0.9874    0.9859    664432
           1     0.9050    0.8855    0.8951     90326

    accuracy                         0.9752    754758
   macro avg     0.9447    0.9364    0.9405    754758
weighted avg     0.9750    0.9752    0.9751    754758






In [68]:
##  IF  Raw

In [69]:
from sklearn.ensemble import IsolationForest

n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = IsolationForest(n_estimators=n_est, contamination=con)
    clf_if.fit(raw_benign_train_samples.to_numpy())
   
    y_pred = clf_if.predict(raw_test_samples.to_numpy())
    test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                        "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()

  

print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 24/24 [00:35<00:00,  1.47s/it]

{'n_estimators': 50, 'con': 0.2}
0.7795375391183983
              precision    recall  f1-score   support

           0     0.9320    0.9871    0.9588    332711
           1     0.8319    0.4696    0.6003     45163

    accuracy                         0.9253    377874
   macro avg     0.8820    0.7283    0.7795    377874
weighted avg     0.9201    0.9253    0.9159    377874






In [70]:
n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = IsolationForest(n_estimators=n_est, contamination=con)
    clf_if.fit(raw_normal_train_samples.to_numpy())

    y_pred = clf_if.predict(raw_test_samples.to_numpy())
    test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

    f1 = f1_score(raw_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est
                }
        bs = test_pred
    del clf_if
    gc.collect()

        

print("benign only")
print(best_params)
print(score)
print(classification_report(raw_test_labels, bs, digits=4))

100%|██████████| 24/24 [00:37<00:00,  1.54s/it]

benign only
{'n_estimators': 100}
0.597095716438742
              precision    recall  f1-score   support

           0     0.8958    0.9981    0.9442    332711
           1     0.9119    0.1449    0.2500     45163

    accuracy                         0.8961    377874
   macro avg     0.9039    0.5715    0.5971    377874
weighted avg     0.8977    0.8961    0.8612    377874




