In [54]:
import pandas as pd
import numpy as np
import torch
from sklearn import preprocessing
from dgl.data import DGLDataset
import dgl
import time
import networkx as nx
import category_encoders as ce
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
import torch
import tqdm
import math

from typing import *
from sklearn.preprocessing import StandardScaler, Normalizer
import socket
import struct
import random
from sklearn.model_selection import train_test_split

In [55]:
file_name = "NF-CSE-CIC-IDS2018-v2.csv"
# file_name = "NF-UNSW-NB15-v2.csv"
data = pd.read_csv(file_name)

In [56]:
data.Label.value_counts()

Label
0.0    4084171
1.0     554361
Name: count, dtype: int64

In [57]:
data.rename(columns=lambda x: x.strip(), inplace=True)
data['IPV4_SRC_ADDR'] = data["IPV4_SRC_ADDR"].apply(str)
data['L4_SRC_PORT'] = data["L4_SRC_PORT"].apply(str)
data['IPV4_DST_ADDR'] = data["IPV4_DST_ADDR"].apply(str)
data['L4_DST_PORT'] = data["L4_DST_PORT"].apply(str)

In [58]:
data.drop(columns=["L4_SRC_PORT", "L4_DST_PORT"], inplace=True)

In [59]:
data.Attack.unique()

array(['SSH-Bruteforce', 'Benign', 'DDoS attacks-LOIC-HTTP',
       'DDOS attack-HOIC', 'DoS attacks-Slowloris', 'DoS attacks-Hulk',
       'FTP-BruteForce', 'Infilteration', 'Bot', 'DoS attacks-GoldenEye',
       'Brute Force -Web', 'DoS attacks-SlowHTTPTest', 'SQL Injection',
       'DDOS attack-LOIC-UDP', 'Brute Force -XSS', nan], dtype=object)

In [60]:
data = data.groupby(by='Attack').sample(frac=0.1, random_state=13)

In [61]:
data.groupby(by="Attack").count()

Unnamed: 0_level_0,IPV4_SRC_ADDR,IPV4_DST_ADDR,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,NUM_PKTS_1024_TO_1514_BYTES,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label
Attack,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Benign,408417,408417,408417,408417,408417,408417,408417,408417,408417,408417,...,408417,408417,408417,408417,408417,408417,408417,408417,408417,408417
Bot,3521,3521,3521,3521,3521,3521,3521,3521,3521,3521,...,3521,3521,3521,3521,3521,3521,3521,3521,3521,3521
Brute Force -Web,55,55,55,55,55,55,55,55,55,55,...,55,55,55,55,55,55,55,55,55,55
Brute Force -XSS,23,23,23,23,23,23,23,23,23,23,...,23,23,23,23,23,23,23,23,23,23
DDOS attack-HOIC,26496,26496,26496,26496,26496,26496,26496,26496,26496,26496,...,26496,26496,26496,26496,26496,26496,26496,26496,26496,26496
DDOS attack-LOIC-UDP,48,48,48,48,48,48,48,48,48,48,...,48,48,48,48,48,48,48,48,48,48
DDoS attacks-LOIC-HTTP,7564,7564,7564,7564,7564,7564,7564,7564,7564,7564,...,7564,7564,7564,7564,7564,7564,7564,7564,7564,7564
DoS attacks-GoldenEye,694,694,694,694,694,694,694,694,694,694,...,694,694,694,694,694,694,694,694,694,694
DoS attacks-Hulk,10618,10618,10618,10618,10618,10618,10618,10618,10618,10618,...,10618,10618,10618,10618,10618,10618,10618,10618,10618,10618
DoS attacks-SlowHTTPTest,348,348,348,348,348,348,348,348,348,348,...,348,348,348,348,348,348,348,348,348,348


In [62]:
X = data.drop(columns=["Attack", "Label"])
y = data[["Attack", "Label"]]

X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=13, stratify=y)

In [63]:
encoder = ce.TargetEncoder(cols=['TCP_FLAGS','L7_PROTO','PROTOCOL',
                                  'CLIENT_TCP_FLAGS','SERVER_TCP_FLAGS','ICMP_TYPE',
                                  'ICMP_IPV4_TYPE','DNS_QUERY_ID','DNS_QUERY_TYPE',
                                  'FTP_COMMAND_RET_CODE'])
encoder.fit(X_train, y_train.Label)

# Transform on training set
X_train = encoder.transform(X_train)

# Transform on testing set
X_test = encoder.transform(X_test)

In [64]:
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

In [65]:
scaler = Normalizer()
cols_to_norm = list(set(list(X_train.iloc[:, 2:].columns))) # Ignore first two as the represents IP addresses
scaler.fit(X_train[cols_to_norm])

# Transform on training set
X_train[cols_to_norm] = scaler.transform(X_train[cols_to_norm])
X_train['h'] = X_train.iloc[:, 2:].values.tolist()

# Transform on testing set
X_test[cols_to_norm] = scaler.transform(X_test[cols_to_norm])
X_test['h'] = X_test.iloc[:, 2:].values.tolist()

train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [66]:
X_train.head()

Unnamed: 0,IPV4_SRC_ADDR,IPV4_DST_ADDR,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,NUM_PKTS_1024_TO_1514_BYTES,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,h
3955721,172.31.67.42,172.31.0.2,6.46765e-09,5.167049e-09,4.2e-05,5.744228e-07,0.000118,5.744228e-07,6.493897e-09,6.493897e-09,...,0.0,0.0,0.0,6.789944e-08,6.789944e-08,5.804603e-08,5.040895e-09,3.4e-05,6.865008e-08,"[6.467649681641264e-09, 5.167049175319861e-09,..."
2325016,66.17.123.196,172.31.65.27,1.033723e-08,4.729764e-10,7.7e-05,4.206477e-07,9.8e-05,3.680667e-07,1.452982e-10,7.759581e-09,...,5.258096e-08,0.000431,0.003365,6.215314e-09,6.215314e-09,1.032167e-08,1.032167e-08,0.0,6.284025e-09,"[1.0337231687413666e-08, 4.729763781866915e-10..."
1764450,172.31.64.121,23.45.134.221,3.585086e-08,1.640343e-09,0.000764,4.923654e-06,0.001081,4.194224e-06,1.363043e-08,3.653466e-09,...,3.647151e-07,0.001494,0.005325,8.075172e-10,8.075172e-10,3.579688e-08,3.579688e-08,0.0,2.179381e-08,"[3.5850855809563285e-08, 1.6403432222909793e-0..."
3566283,172.31.66.66,23.217.40.12,3.471685e-09,1.588457e-10,1.4e-05,1.765894e-07,0.000124,1.765894e-07,2.744133e-09,8.087445e-11,...,7.063574e-08,0.000145,0.000516,2.087368e-09,2.087368e-09,3.466457e-09,3.466457e-09,0.0,2.110445e-09,"[3.4716846101506993e-09, 1.58845700935081e-10,..."
3814722,178.137.217.167,172.31.66.97,1.316328e-07,1.778572e-09,8.8e-05,2.008674e-06,8.8e-05,2.008674e-06,1.566218e-10,1.531469e-10,...,0.0,0.010997,0.005485,7.914488e-08,7.914488e-08,1.314346e-07,1.314346e-07,0.0,8.001984e-08,"[1.3163276546494432e-07, 1.778571631316055e-09..."


In [67]:
lab_enc = preprocessing.LabelEncoder()
lab_enc.fit(data["Attack"])

# Transform on training set
train["Attack"] = lab_enc.transform(train["Attack"])

# Transform on testing set
test["Attack"] = lab_enc.transform(test["Attack"])

In [68]:
# Training graph

train_g = nx.from_pandas_edgelist(train, "IPV4_SRC_ADDR", "IPV4_DST_ADDR",
            ["h", "Label", "Attack"], create_using=nx.MultiGraph())

train_g = train_g.to_directed()
train_g = dgl.from_networkx(train_g, edge_attrs=['h', 'Attack', 'Label'])
nfeat_weight = torch.ones([train_g.number_of_nodes(),
train_g.edata['h'].shape[1]])
train_g.ndata['h'] = nfeat_weight

# Testing graph
test_g = nx.from_pandas_edgelist(test, "IPV4_SRC_ADDR", "IPV4_DST_ADDR",
            ["h", "Label", "Attack"], create_using=nx.MultiGraph())

test_g = test_g.to_directed()
test_g = dgl.from_networkx(test_g, edge_attrs=['h', 'Attack', 'Label'])
nfeat_weight = torch.ones([test_g.number_of_nodes(),
test_g.edata['h'].shape[1]])
test_g.ndata['h'] = nfeat_weight

In [69]:
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
import tqdm
import gc

class SAGELayer(nn.Module):
    def __init__(self, ndim_in, edims, ndim_out, activation):
      super(SAGELayer, self).__init__()
      self.W_apply = nn.Linear(ndim_in + edims , ndim_out)
      self.activation = F.relu
      # self.W_edge = nn.Linear(128 * 2, 256)
      self.W_edge = nn.Linear(256 * 2, 512)
      self.reset_parameters()

    def reset_parameters(self):
      gain = nn.init.calculate_gain('relu')
      nn.init.xavier_uniform_(self.W_apply.weight, gain=gain)

    def message_func(self, edges):
      return {'m':  edges.data['h']}

    def forward(self, g_dgl, nfeats, efeats):
      with g_dgl.local_scope():
        g = g_dgl
        g.ndata['h'] = nfeats
        g.edata['h'] = efeats
        g.update_all(self.message_func, fn.mean('m', 'h_neigh'))
        g.ndata['h'] = F.relu(self.W_apply(torch.cat([g.ndata['h'], g.ndata['h_neigh']], 2)))

        # Compute edge embeddings
        u, v = g.edges()
        edge = self.W_edge(torch.cat((g.srcdata['h'][u], g.dstdata['h'][v]), 2))
        return g.ndata['h'], edge

In [70]:
class SAGE(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim,  activation):
      super(SAGE, self).__init__()
      self.layers = nn.ModuleList()
      # self.layers.append(SAGELayer(ndim_in, edim, 128, F.relu))
      self.layers.append(SAGELayer(ndim_in, edim, 256, F.relu))

    def forward(self, g, nfeats, efeats, corrupt=False):
      if corrupt:
        e_perm = torch.randperm(g.number_of_edges())
        #n_perm = torch.randperm(g.number_of_nodes())
        efeats = efeats[e_perm]
        #nfeats = nfeats[n_perm]
      for i, layer in enumerate(self.layers):
        #nfeats = layer(g, nfeats, efeats)
        nfeats, e_feats = layer(g, nfeats, efeats)
      #return nfeats.sum(1)
      return nfeats.sum(1), e_feats.sum(1)

In [71]:
class Discriminator(nn.Module):
    def __init__(self, n_hidden):
      super(Discriminator, self).__init__()
      self.weight = nn.Parameter(torch.Tensor(n_hidden, n_hidden))
      self.reset_parameters()

    def uniform(self, size, tensor):
      bound = 1.0 / math.sqrt(size)
      if tensor is not None:
        tensor.data.uniform_(-bound, bound)

    def reset_parameters(self):
      size = self.weight.size(0)
      self.uniform(size, self.weight)

    def forward(self, features, summary):
      features = torch.matmul(features, torch.matmul(self.weight, summary))
      return features

In [72]:
class DGI(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim, activation):
      super(DGI, self).__init__()
      self.encoder = SAGE(ndim_in, ndim_out, edim,  F.relu)
      #self.discriminator = Discriminator(128)
      # self.discriminator = Discriminator(256)
      self.discriminator = Discriminator(512)
      self.loss = nn.BCEWithLogitsLoss()

    def forward(self, g, n_features, e_features):
      positive = self.encoder(g, n_features, e_features, corrupt=False)
      negative = self.encoder(g, n_features, e_features, corrupt=True)
      self.loss = nn.BCEWithLogitsLoss()

    def forward(self, g, n_features, e_features):
      positive = self.encoder(g, n_features, e_features, corrupt=False)
      negative = self.encoder(g, n_features, e_features, corrupt=True)

      positive = positive[1]
      negative = negative[1]

      summary = torch.sigmoid(positive.mean(dim=0))

      positive = self.discriminator(positive, summary)
      negative = self.discriminator(negative, summary)

      l1 = self.loss(positive, torch.ones_like(positive))
      l2 = self.loss(negative, torch.zeros_like(negative))

      return l1 + l2

In [73]:
ndim_in = train_g.ndata['h'].shape[1]
# hidden_features = 128
hidden_features = 256
# ndim_out = 128
ndim_out = 256
num_layers = 1
edim = train_g.edata['h'].shape[1]
learning_rate = 1e-3
epochs = 4000

In [75]:
# Format node and edge features for E-GraphSAGE
train_g.ndata['h'] = torch.reshape(train_g.ndata['h'],
                                   (train_g.ndata['h'].shape[0], 1,
                                    train_g.ndata['h'].shape[1]))

train_g.edata['h'] = torch.reshape(train_g.edata['h'],
                                   (train_g.edata['h'].shape[0], 1,
                                    train_g.edata['h'].shape[1]))

In [76]:
# Convert to GPU
train_g = train_g.to('cuda')

In [77]:
# cnt_wait = 0
# best = 1e9
# best_t = 0
# dur = []
# node_features = train_g.ndata['h'] 
# edge_features = train_g.edata['h']

# for epoch in range(epochs):
#     dgi.train()
#     if epoch >= 3:
#         t0 = time.time()

#     dgi_optimizer.zero_grad()
#     loss = dgi(train_g, node_features, edge_features)
#     loss.backward()
#     dgi_optimizer.step()

#     if loss < best:
#         best = loss
#         best_t = epoch
#         cnt_wait = 0
#         torch.save(dgi.state_dict(), 'best_dgi_CSE_256.pkl')
#     else:
#         cnt_wait += 1

#   # if cnt_wait == patience:
#   #     print('Early stopping!')
#   #     break

#     if epoch >= 3:
#         dur.append(time.time() - t0)

#     if epoch % 50 == 0:

#         print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | "
#             "ETputs(KTEPS) {:.2f}".format(epoch, np.mean(dur),
#               loss.item(),
#               train_g.num_edges() / np.mean(dur) / 1000))

In [78]:
dgi.load_state_dict(torch.load('best_dgi_CSE_256.pkl'))

<All keys matched successfully>

In [79]:
training_emb = dgi.encoder(train_g, train_g.ndata['h'], train_g.edata['h'])[1]
training_emb = training_emb.detach().cpu().numpy()

In [80]:
test_g.ndata['h'] = torch.reshape(test_g.ndata['h'],
                                   (test_g.ndata['h'].shape[0], 1,
                                    test_g.ndata['h'].shape[1]))



test_g.edata['h'] = torch.reshape(test_g.edata['h'],
                                   (test_g.edata['h'].shape[0], 1,
                                    test_g.edata['h'].shape[1]))

In [81]:
# Convert to GPU
test_g = test_g.to('cuda')

In [82]:
testing_emb = dgi.encoder(test_g, test_g.ndata['h'], test_g.edata['h'])[1]
testing_emb = testing_emb.detach().cpu().numpy()

In [83]:
df_train = pd.DataFrame(training_emb, )
df_train["Attack"] = lab_enc.inverse_transform(
        train_g.edata['Attack'].detach().cpu().numpy())
df_train["Label"] = train_g.edata['Label'].detach().cpu().numpy()

df_test = pd.DataFrame(testing_emb, )
df_test["Attack"] = lab_enc.inverse_transform(
        test_g.edata['Attack'].detach().cpu().numpy())
df_test["Label"] = test_g.edata['Label'].detach().cpu().numpy()

In [84]:
df_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,504,505,506,507,508,509,510,511,Attack,Label
0,-0.029625,0.022520,0.020715,-0.007495,0.022959,0.024238,0.020899,-0.004040,-0.036133,-0.010040,...,0.002338,0.016757,-0.054432,-0.013585,0.014985,-0.032736,0.024949,0.004614,Benign,0.0
1,-0.029625,0.022520,0.020715,-0.007495,0.022959,0.024238,0.020899,-0.004040,-0.036133,-0.010040,...,0.002338,0.016757,-0.054432,-0.013585,0.014985,-0.032736,0.024949,0.004614,Benign,0.0
2,-0.029625,0.022520,0.020715,-0.007495,0.022959,0.024238,0.020899,-0.004040,-0.036133,-0.010040,...,0.002338,0.016757,-0.054432,-0.013585,0.014985,-0.032736,0.024949,0.004614,Benign,0.0
3,-0.029625,0.022520,0.020715,-0.007495,0.022959,0.024238,0.020899,-0.004040,-0.036133,-0.010040,...,0.002338,0.016757,-0.054432,-0.013585,0.014985,-0.032736,0.024949,0.004614,Benign,0.0
4,-0.029625,0.022520,0.020715,-0.007495,0.022959,0.024238,0.020899,-0.004040,-0.036133,-0.010040,...,0.002338,0.016757,-0.054432,-0.013585,0.014985,-0.032736,0.024949,0.004614,Benign,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
648523,-0.027430,0.023259,0.025607,0.004301,0.022323,0.023649,0.021689,-0.002786,-0.027523,-0.010992,...,0.002821,0.010980,-0.048134,-0.015805,0.013831,-0.042795,0.020903,-0.000556,Benign,0.0
648524,-0.019095,0.024805,0.038940,0.025595,0.042218,0.042471,0.014964,0.030470,-0.020955,-0.013074,...,0.007159,-0.007553,-0.023525,-0.047482,0.009466,-0.037597,-0.008835,0.011760,Benign,0.0
648525,-0.025361,0.022368,0.029624,0.010737,0.023206,0.025833,0.023007,0.000869,-0.026398,-0.013008,...,0.005596,0.008988,-0.044136,-0.021254,0.011232,-0.042439,0.015082,0.001843,Benign,0.0
648526,-0.023725,0.022673,0.032043,0.016234,0.024704,0.027166,0.023036,0.004915,-0.023127,-0.014779,...,0.006795,0.006318,-0.040506,-0.025454,0.008896,-0.041490,0.010445,0.002558,Benign,0.0


# Embeddings CBLOF  Embeddings

In [85]:
import torch
import dgl
import numpy as np
import pandas as pd
import torch.optim as optim
import time
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.ensemble import IsolationForest
import gc

from tqdm import tqdm
import itertools

In [86]:
benign_train_samples = df_train[df_train.Label == 0].drop(columns=["Label", "Attack"])
normal_train_samples = df_train.drop(columns=["Label", "Attack"])

train_labels = df_train["Label"]
test_labels = df_test["Label"]

test_samples = df_test.drop(columns=["Label", "Attack"])

In [87]:
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]

In [88]:
from pyod.models.cblof import CBLOF
n_est = [2,3,5,7,9,10]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    clf_if = CBLOF(n_clusters=n_est, contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [08:03<00:00, 13.43s/it]


{'n_estimators': 9, 'con': 0.001}
0.9456596279480542
              precision    recall  f1-score   support

         0.0     0.9775    0.9988    0.9880    244664
         1.0     0.9898    0.8307    0.9033     33262

    accuracy                         0.9787    277926
   macro avg     0.9836    0.9148    0.9457    277926
weighted avg     0.9789    0.9787    0.9779    277926



In [89]:
from pyod.models.cblof import CBLOF
n_est = [2,3,5,7,9,10]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    clf_if = CBLOF(n_clusters=n_est, contamination=con)
    clf_if.fit(normal_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [08:49<00:00, 14.72s/it]


{'n_estimators': 2, 'con': 0.1}
0.9457747132523189
              precision    recall  f1-score   support

         0.0     0.9772    0.9992    0.9881    244664
         1.0     0.9933    0.8285    0.9035     33262

    accuracy                         0.9788    277926
   macro avg     0.9852    0.9139    0.9458    277926
weighted avg     0.9791    0.9788    0.9780    277926



In [95]:
# HBOS  Embeddings

In [96]:
benign_train_samples = df_train[df_train.Label == 0].drop(columns=["Label", "Attack"])
normal_train_samples = df_train.drop(columns=["Label", "Attack"])

train_labels = df_train["Label"]
test_labels = df_test["Label"]

test_samples = df_test.drop(columns=["Label", "Attack"])

In [97]:
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]

In [98]:
from pyod.models.hbos import HBOS

n_est = [5,10,15,20,25,30]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    clf_if = HBOS(n_bins=n_est, contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

  0%|          | 0/36 [00:00<?, ?it/s]

100%|██████████| 36/36 [18:02<00:00, 30.06s/it]


{'n_estimators': 5, 'con': 0.001}
0.9455755860758053
              precision    recall  f1-score   support

         0.0     0.9775    0.9988    0.9880    244664
         1.0     0.9894    0.8307    0.9031     33262

    accuracy                         0.9787    277926
   macro avg     0.9835    0.9147    0.9456    277926
weighted avg     0.9789    0.9787    0.9779    277926



In [99]:
n_est = [5,10,15,20,25,30]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
for n_est, con in tqdm(params):
    
    clf_if = HBOS(n_bins=n_est, contamination=con)
    clf_if.fit(normal_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [20:11<00:00, 33.64s/it]


{'n_estimators': 5, 'con': 0.1}
0.9144373209493302
              precision    recall  f1-score   support

         0.0     0.9690    0.9943    0.9815    244664
         1.0     0.9480    0.7661    0.8474     33262

    accuracy                         0.9670    277926
   macro avg     0.9585    0.8802    0.9144    277926
weighted avg     0.9665    0.9670    0.9654    277926



In [103]:
##  PCA  Emb

In [104]:
from pyod.models.pca import PCA
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = PCA(n_components=n_est, contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

  0%|          | 0/36 [00:00<?, ?it/s]

100%|██████████| 36/36 [07:55<00:00, 13.22s/it]


{'n_estimators': 10, 'con': 0.001}
0.9448943465029236
              precision    recall  f1-score   support

         0.0     0.9772    0.9988    0.9879    244664
         1.0     0.9895    0.8286    0.9019     33262

    accuracy                         0.9784    277926
   macro avg     0.9833    0.9137    0.9449    277926
weighted avg     0.9787    0.9784    0.9776    277926



In [105]:
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = PCA(n_components=n_est, contamination=con)
    clf_if.fit(normal_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = y_pred

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 36/36 [08:47<00:00, 14.67s/it]


{'n_estimators': 5, 'con': 0.1}
0.9034035275978365
              precision    recall  f1-score   support

         0.0     0.9686    0.9889    0.9787    244664
         1.0     0.9036    0.7643    0.8281     33262

    accuracy                         0.9620    277926
   macro avg     0.9361    0.8766    0.9034    277926
weighted avg     0.9608    0.9620    0.9606    277926



In [109]:
##  IF  Emb

In [110]:
from sklearn.ensemble import IsolationForest
n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = IsolationForest(n_estimators=n_est, contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 24/24 [02:31<00:00,  6.31s/it]


{'n_estimators': 50, 'con': 0.001}
0.9468256894474336
              precision    recall  f1-score   support

         0.0     0.9775    0.9994    0.9883    244664
         1.0     0.9949    0.8306    0.9053     33262

    accuracy                         0.9792    277926
   macro avg     0.9862    0.9150    0.9468    277926
weighted avg     0.9796    0.9792    0.9784    277926



In [111]:
from sklearn.ensemble import IsolationForest

n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

for n_est, con in tqdm(params):
    clf_if = IsolationForest(n_estimators=n_est, contamination=con)
    clf_if.fit(normal_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|██████████| 24/24 [02:41<00:00,  6.72s/it]


{'n_estimators': 20, 'con': 0.2}
0.8092077501266562
              precision    recall  f1-score   support

         0.0     0.9889    0.8918    0.9378    244664
         1.0     0.5379    0.9263    0.6806     33262

    accuracy                         0.8959    277926
   macro avg     0.7634    0.9091    0.8092    277926
weighted avg     0.9349    0.8959    0.9071    277926

