In [6]:
import pandas as pd
import numpy as np
import torch
from sklearn import preprocessing
from dgl.data import DGLDataset
import dgl
import time
import networkx as nx
import category_encoders as ce
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
import torch
import tqdm
import math

from typing import *
from sklearn.preprocessing import StandardScaler, Normalizer
import socket
import struct
import random
from sklearn.model_selection import train_test_split

In [7]:
# Source flow records; the path is relative to the notebook's working directory.
file_name = "Self-Collected-NF-CSE-CICIDS.parquet"
data = pd.read_parquet(path=file_name)

In [8]:
# Class balance of the binary Label column before any resampling.
data["Label"].value_counts()

Label
0    636608
1    349522
Name: count, dtype: int64

In [9]:
# Strip stray whitespace from every column header, then store the flow
# endpoints (addresses and ports) as strings.
data.rename(columns=str.strip, inplace=True)
for endpoint_col in ('IPV4_SRC_ADDR', 'L4_SRC_PORT', 'IPV4_DST_ADDR', 'L4_DST_PORT'):
    data[endpoint_col] = data[endpoint_col].map(str)

In [10]:
# The port columns are not used further in this notebook; remove them in place.
data.drop(["L4_SRC_PORT", "L4_DST_PORT"], axis=1, inplace=True)

In [11]:
# Distinct attack categories present in the raw data.
data["Attack"].unique()

array(['DoS attacks-GoldenEye', 'Benign', 'FTP-BruteForce',
       'DoS attacks-Hulk', 'Infilteration', 'DoS attacks-SlowHTTPTest',
       'DoS attacks-Slowloris', 'SQL Injection', 'SSH-Bruteforce',
       'Brute Force -Web', 'Brute Force -XSS'], dtype=object)

In [None]:
# Resize the dataset to fit the machine (ideally ~500k rows): keep 20% of the
# attack flows and all benign flows, then keep a shuffled half of the result.
attack_rows = data['Label'] == 1
benign_rows = data['Label'] == 0
data_attack = data[attack_rows].sample(frac=0.2, random_state=13)
data_benign = data[benign_rows].sample(frac=1, random_state=13)
data = pd.concat([data_attack, data_benign])
data = data.sample(frac=0.5, random_state=13).reset_index(drop=True)
data["Label"].value_counts()

In [None]:
# Row count per attack category after resampling.
data.groupby("Attack").count()

Unnamed: 0_level_0,IPV4_SRC_ADDR,IPV4_DST_ADDR,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,NUM_PKTS_1024_TO_1514_BYTES,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label
Attack,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Benign,332711,332711,332711,332711,332711,332711,332711,332711,332711,332711,...,332711,332711,332711,332711,332711,332711,332711,332711,332711,332711
Bot,2862,2862,2862,2862,2862,2862,2862,2862,2862,2862,...,2862,2862,2862,2862,2862,2862,2862,2862,2862,2862
Brute Force -Web,43,43,43,43,43,43,43,43,43,43,...,43,43,43,43,43,43,43,43,43,43
Brute Force -XSS,19,19,19,19,19,19,19,19,19,19,...,19,19,19,19,19,19,19,19,19,19
DDOS attack-HOIC,21617,21617,21617,21617,21617,21617,21617,21617,21617,21617,...,21617,21617,21617,21617,21617,21617,21617,21617,21617,21617
DDOS attack-LOIC-UDP,42,42,42,42,42,42,42,42,42,42,...,42,42,42,42,42,42,42,42,42,42
DDoS attacks-LOIC-HTTP,6146,6146,6146,6146,6146,6146,6146,6146,6146,6146,...,6146,6146,6146,6146,6146,6146,6146,6146,6146,6146
DoS attacks-GoldenEye,554,554,554,554,554,554,554,554,554,554,...,554,554,554,554,554,554,554,554,554,554
DoS attacks-Hulk,8653,8653,8653,8653,8653,8653,8653,8653,8653,8653,...,8653,8653,8653,8653,8653,8653,8653,8653,8653,8653
DoS attacks-SlowHTTPTest,282,282,282,282,282,282,282,282,282,282,...,282,282,282,282,282,282,282,282,282,282


In [None]:
# Separate features from the two target columns, then make a 70/30 split that
# is stratified on the (Attack, Label) pair.
target_cols = ["Attack", "Label"]
y = data[target_cols]
X = data.drop(columns=target_cols)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=13,
    stratify=y,
)

In [None]:
# Columns treated as categorical and replaced by target-encoded values.
categorical_cols = [
    'TCP_FLAGS', 'L7_PROTO', 'PROTOCOL',
    'CLIENT_TCP_FLAGS', 'SERVER_TCP_FLAGS', 'ICMP_TYPE',
    'ICMP_IPV4_TYPE', 'DNS_QUERY_ID', 'DNS_QUERY_TYPE',
    'FTP_COMMAND_RET_CODE',
]
encoder = ce.TargetEncoder(cols=categorical_cols)

# Fit on the training split only (against the binary label), so the test
# split is encoded with statistics it did not contribute to.
encoder.fit(X_train, y_train["Label"])

X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

In [None]:
# Turn +/-inf into NaN and then zero-fill every NaN, in place, on both splits.
for split_frame in (X_train, X_test):
    split_frame.replace([np.inf, -np.inf], np.nan, inplace=True)
for split_frame in (X_train, X_test):
    split_frame.fillna(0, inplace=True)

In [None]:
# (Modified)
# Normalise every feature column and pack each row's features into a list
# column `h`, which later becomes the edge feature vector of the graph.
scaler = Normalizer()

# The first two columns are the IP addresses and are not normalised.
# Order-preserving de-duplication: the previous list(set(...)) produced a
# hash-dependent column order that could differ between kernel sessions.
cols_to_norm = list(dict.fromkeys(X_train.columns[2:]))
scaler.fit(X_train[cols_to_norm])

# Transform on training set
X_train[cols_to_norm] = scaler.transform(X_train[cols_to_norm])
X_train['h'] = X_train.iloc[:, 2:].values.tolist()
# Keep the row index as an explicit column so graph edges can be joined back
# to their originating rows.
X_train['id'] = X_train.index

# Transform on testing set (same fitted object, nothing is re-fitted)
X_test[cols_to_norm] = scaler.transform(X_test[cols_to_norm])
X_test['h'] = X_test.iloc[:, 2:].values.tolist()
X_test['id'] = X_test.index

# Re-attach the targets (Attack, Label) to each split.
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [None]:
# Preview the first five normalised training rows (including `h` and `id`).
X_train.head(n=5)

Unnamed: 0,IPV4_SRC_ADDR,IPV4_DST_ADDR,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,h,id
9910620,172.31.65.112,172.217.15.65,6.210352e-09,1.019912e-10,4.3e-05,5.354863e-07,0.000117,4.094895e-07,1.62441e-08,1.157705e-08,...,0.000258,0.001348,3.704528e-09,3.704528e-09,6.19055e-09,6.19055e-09,0.0,3.764741e-09,"[6.210351872620525e-09, 1.019912495558199e-10,...",9910620
17557027,18.218.11.51,172.31.69.28,4.579454e-08,1.543672e-07,0.000122,1.16136e-06,0.000266,1.16136e-06,1.197824e-07,8.536806e-08,...,0.015222,0.006244,2.731684e-08,2.731684e-08,4.564852e-08,4.564852e-08,0.0,2.776084e-08,"[4.57945440200515e-08, 1.5436721751920067e-07,...",17557027
4328285,190.38.79.218,172.31.64.101,4.0794e-08,1.77817e-09,0.000103,1.034545e-06,7.1e-05,8.276362e-07,1.644264e-09,2.215507e-09,...,0.003398,0.001695,2.433397e-08,2.433397e-08,4.066392e-08,4.066392e-08,0.0,2.472949e-08,"[4.079399803822932e-08, 1.778169602167593e-09,...",4328285
1287389,172.31.68.15,172.31.0.2,6.533832e-09,5.154645e-09,3.8e-05,5.997981e-07,6.7e-05,5.997981e-07,6.565366e-09,6.565366e-09,...,0.0,0.0,7.05405e-08,7.05405e-08,6.235997e-08,5.346222e-09,3.6e-05,7.168706e-08,"[6.5338315603211064e-09, 5.154645331031908e-09...",1287389
4527816,172.31.66.21,172.31.0.2,1.168657e-08,9.219721e-09,7.9e-05,1.072813e-06,9.7e-05,1.072813e-06,1.174297e-08,1.174297e-08,...,0.0,0.0,1.261704e-07,1.261704e-07,1.115385e-07,9.562381e-09,6.4e-05,1.282212e-07,"[1.1686566408769841e-08, 9.219721141969458e-09...",4527816


In [None]:
# Map attack names to integer codes. The encoder is fitted on the full
# (resampled) dataset so both splits share one code table.
lab_enc = preprocessing.LabelEncoder()
lab_enc.fit(data["Attack"])

# Apply the same mapping to the training split, then the testing split.
for split_frame in (train, test):
    split_frame["Attack"] = lab_enc.transform(split_frame["Attack"])

In [None]:
# Training / testing graphs (Modified)

def build_flow_graph(flows):
    """Build a DGL graph from a frame of flows.

    Nodes are the distinct values of IPV4_SRC_ADDR / IPV4_DST_ADDR; every row
    becomes an edge carrying `h`, `Label`, `Attack` and `id`. The networkx
    multigraph is converted with `to_directed()` before being handed to DGL.
    Node features are initialised to all-ones with the same width as the edge
    feature vector.

    Parameters
    ----------
    flows : pandas.DataFrame
        Must contain IPV4_SRC_ADDR, IPV4_DST_ADDR, h, Label, Attack and id.

    Returns
    -------
    dgl.DGLGraph
        Graph with ndata['h'] and edata['h' | 'Attack' | 'Label' | 'id'].
    """
    nx_graph = nx.from_pandas_edgelist(
        flows, "IPV4_SRC_ADDR", "IPV4_DST_ADDR",
        ["h", "Label", "Attack", "id"], create_using=nx.MultiGraph())
    nx_graph = nx_graph.to_directed()
    graph = dgl.from_networkx(nx_graph, edge_attrs=['h', 'Attack', 'Label', "id"])
    # Placeholder node features: one row of ones per node.
    graph.ndata['h'] = torch.ones([graph.number_of_nodes(),
                                   graph.edata['h'].shape[1]])
    return graph


# `id` mirrors the row index so edge-level outputs can be joined back later.
train['id'] = train.index
train_g = build_flow_graph(train)

test['id'] = test.index
test_g = build_flow_graph(test)

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
import tqdm
import gc

class SAGELayer(nn.Module):
    """Message-passing layer that aggregates edge features into nodes.

    Each node receives the mean of its incoming edges' `h` vectors, which is
    concatenated with the node's own features and passed through `W_apply`
    and the activation. An edge embedding is then produced by applying
    `W_edge` to the concatenated embeddings of the edge's two endpoints.

    Parameters
    ----------
    ndim_in : int
        Width of the input node features.
    edims : int
        Width of the input edge features.
    ndim_out : int
        Width of the output node embeddings.
    activation : callable or None
        Non-linearity applied to node embeddings; falls back to ``F.relu``
        when None. (Previously this argument was ignored.)
    edge_out_dim : int, optional
        Width of the edge embeddings (default 256, the previous fixed value).
    """

    def __init__(self, ndim_in, edims, ndim_out, activation, edge_out_dim=256):
      super(SAGELayer, self).__init__()
      self.W_apply = nn.Linear(ndim_in + edims , ndim_out)
      self.activation = activation if activation is not None else F.relu
      # Input is the concatenation of two node embeddings, so its width is
      # tied to ndim_out rather than hard-coded to 128 * 2.
      self.W_edge = nn.Linear(ndim_out * 2, edge_out_dim)
      self.reset_parameters()

    def reset_parameters(self):
      """Xavier-initialise W_apply with the gain for ReLU."""
      gain = nn.init.calculate_gain('relu')
      nn.init.xavier_uniform_(self.W_apply.weight, gain=gain)

    def message_func(self, edges):
      """Send each edge's feature vector as the message `m`."""
      return {'m':  edges.data['h']}

    def forward(self, g_dgl, nfeats, efeats):
      """Return (node embeddings, edge embeddings) for graph `g_dgl`.

      Features are concatenated along dim 2, so `nfeats` and `efeats` are
      expected to be 3-D tensors. Graph data is written inside
      `local_scope()`, so the caller's graph is left unmodified.
      """
      with g_dgl.local_scope():
        g = g_dgl
        g.ndata['h'] = nfeats
        g.edata['h'] = efeats
        # Mean of incoming edge features per node -> ndata['h_neigh'].
        g.update_all(self.message_func, fn.mean('m', 'h_neigh'))
        g.ndata['h'] = self.activation(
            self.W_apply(torch.cat([g.ndata['h'], g.ndata['h_neigh']], 2)))

        # Compute edge embeddings
        u, v = g.edges()
        edge = self.W_edge(torch.cat((g.srcdata['h'][u], g.dstdata['h'][v]), 2))
        return g.ndata['h'], edge

In [None]:
class SAGE(nn.Module):
    """Encoder made of a single SAGELayer with 128-wide node embeddings.

    `ndim_out` and `activation` are accepted for interface compatibility but
    the layer is always built with width 128 and ``F.relu``.
    """

    def __init__(self, ndim_in, ndim_out, edim,  activation):
      super().__init__()
      self.layers = nn.ModuleList([SAGELayer(ndim_in, edim, 128, F.relu)])

    def forward(self, g, nfeats, efeats, corrupt=False):
      """Return (node embeddings, edge embeddings), each summed over dim 1.

      With ``corrupt=True`` the edge features are shuffled across edges by a
      random permutation before encoding; node features are left as-is.
      """
      if corrupt:
        efeats = efeats[torch.randperm(g.number_of_edges())]
      for layer in self.layers:
        nfeats, e_feats = layer(g, nfeats, efeats)
      return nfeats.sum(1), e_feats.sum(1)

In [None]:
class Discriminator(nn.Module):
    """Bilinear scorer: ``features @ (weight @ summary)``.

    Parameters
    ----------
    n_hidden : int
        Width of both the feature rows and the summary vector; `weight` is a
        square (n_hidden, n_hidden) parameter.
    """

    def __init__(self, n_hidden):
      super().__init__()
      self.weight = nn.Parameter(torch.Tensor(n_hidden, n_hidden))
      self.reset_parameters()

    def uniform(self, size, tensor):
      """Fill `tensor` in place from U(-1/sqrt(size), 1/sqrt(size)); no-op for None."""
      limit = 1.0 / math.sqrt(size)
      if tensor is None:
        return
      tensor.data.uniform_(-limit, limit)

    def reset_parameters(self):
      """Re-draw `weight`, with the bound derived from its first dimension."""
      self.uniform(self.weight.size(0), self.weight)

    def forward(self, features, summary):
      """Score every row of `features` against `summary`."""
      projected_summary = torch.matmul(self.weight, summary)
      return torch.matmul(features, projected_summary)

In [None]:
class DGI(nn.Module):
    """Contrastive objective over edge embeddings.

    The encoder is run twice: once on the real edge features and once with
    the edge features shuffled (``corrupt=True``). A summary vector is built
    from the real edge embeddings, and the discriminator is trained to score
    real embeddings as 1 and corrupted ones as 0.

    Parameters
    ----------
    ndim_in, ndim_out, edim : int
        Forwarded to the SAGE encoder.
    activation : callable
        Accepted for interface compatibility; the encoder is built with
        ``F.relu``.
    """

    def __init__(self, ndim_in, ndim_out, edim, activation):
      super(DGI, self).__init__()
      self.encoder = SAGE(ndim_in, ndim_out, edim,  F.relu)
      # 256 matches the width of the edge embeddings the discriminator scores.
      self.discriminator = Discriminator(256)
      self.loss = nn.BCEWithLogitsLoss()

    # NOTE: an earlier, incomplete duplicate definition of forward() used to
    # sit here; it was silently shadowed by this one and has been removed.
    def forward(self, g, n_features, e_features):
      """Return the summed BCE loss of the real and corrupted passes."""
      # The encoder returns (node_emb, edge_emb); only edge_emb is used.
      positive = self.encoder(g, n_features, e_features, corrupt=False)[1]
      negative = self.encoder(g, n_features, e_features, corrupt=True)[1]

      # Summary: sigmoid of the mean real edge embedding.
      summary = torch.sigmoid(positive.mean(dim=0))

      positive = self.discriminator(positive, summary)
      negative = self.discriminator(negative, summary)

      l1 = self.loss(positive, torch.ones_like(positive))
      l2 = self.loss(negative, torch.zeros_like(negative))

      return l1 + l2

In [None]:
# Feature widths are read off the training graph.
ndim_in, edim = train_g.ndata['h'].shape[1], train_g.edata['h'].shape[1]

# Model / optimisation settings.
hidden_features = ndim_out = 128
num_layers = 1
learning_rate = 1e-3
epochs = 4000

In [None]:
# Instantiate the model and move it to the GPU.
dgi = DGI(ndim_in,
    ndim_out,
    edim,
    F.relu)

dgi = dgi.to('cuda')

# Use the `learning_rate` defined with the other hyper-parameters instead of
# a second hard-coded copy of the same value.
dgi_optimizer = torch.optim.Adam(dgi.parameters(),
                lr=learning_rate,
                weight_decay=0.)

In [None]:
# Format node and edge features for E-GraphSAGE: insert a singleton middle
# axis so each feature tensor goes from (N, D) to (N, 1, D).
for feat_store in (train_g.ndata, train_g.edata):
    flat = feat_store['h']
    feat_store['h'] = torch.reshape(flat, (flat.shape[0], 1, flat.shape[1]))

In [None]:
# Move the training graph (structure and features) onto the CUDA device.
train_g = train_g.to(torch.device('cuda'))

In [None]:
# Restore the pre-trained DGI weights. `weights_only=True` restricts
# unpickling to tensors/containers, so a tampered checkpoint cannot execute
# arbitrary code. NOTE(review): assumes the file holds a plain state_dict.
dgi.load_state_dict(torch.load('best_dgi_CSE_self_collected.pkl', weights_only=True))

  dgi.load_state_dict(torch.load('best_dgi_CSE_v2.pkl'))


<All keys matched successfully>

In [None]:
# Edge embeddings for the training graph (index 1 of the encoder output).
# Inference only: disable autograd so no computation graph is kept in memory.
with torch.no_grad():
    training_emb = dgi.encoder(train_g, train_g.ndata['h'], train_g.edata['h'])[1]
training_emb = training_emb.detach().cpu().numpy()

In [None]:
# Same formatting as the training graph: (N, D) -> (N, 1, D) for both the
# node and the edge feature tensors.
for feat_store in (test_g.ndata, test_g.edata):
    flat = feat_store['h']
    feat_store['h'] = torch.reshape(flat, (flat.shape[0], 1, flat.shape[1]))

In [None]:
# Move the testing graph (structure and features) onto the CUDA device.
test_g = test_g.to(torch.device('cuda'))

In [None]:
# Edge embeddings for the testing graph (index 1 of the encoder output).
# Inference only: disable autograd so no computation graph is kept in memory.
with torch.no_grad():
    testing_emb = dgi.encoder(test_g, test_g.ndata['h'], test_g.edata['h'])[1]
testing_emb = testing_emb.detach().cpu().numpy()

In [None]:
# Multimodal (Fusion) Learning
# Each edge embedding is joined, via the edge's `id`, with the raw
# (normalised) feature row it came from. The left join keeps edge order, so
# the edge-level Attack / Label tensors can be attached positionally.
non_feature_cols = ["IPV4_SRC_ADDR", "IPV4_DST_ADDR", "h"]

# --- training split ---
df_train = pd.DataFrame(training_emb)
df_train['id'] = train_g.edata['id'].detach().cpu().numpy()  # edge -> source row

df_raw_train = pd.DataFrame(X_train.drop(columns=non_feature_cols))
df_fuse_train = df_train.merge(df_raw_train, on='id', how='left')
df_fuse_train = df_fuse_train.drop(columns="id")
df_fuse_train["Attacks"] = train_g.edata['Attack'].detach().cpu().numpy()
df_fuse_train["Label"] = train_g.edata['Label'].detach().cpu().numpy()

# --- testing split ---
df_test = pd.DataFrame(testing_emb)
df_test['id'] = test_g.edata['id'].detach().cpu().numpy()  # edge -> source row

df_raw_test = pd.DataFrame(X_test.drop(columns=non_feature_cols))
df_raw_test = df_test.merge(df_raw_test, on='id', how='left')
df_fuse_test = df_raw_test.drop(columns="id")
df_fuse_test["Attacks"] = test_g.edata['Attack'].detach().cpu().numpy()
df_fuse_test["Label"] = test_g.edata['Label'].detach().cpu().numpy()

# CBLOF  Embeddings+Raw (Multimodal)

In [None]:
import torch
import dgl
import numpy as np
import pandas as pd
import torch.optim as optim
import time
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.ensemble import IsolationForest
import gc

from tqdm import tqdm
import itertools

In [None]:
# Columns that carry targets rather than features.
target_cols = ["Label", "Attacks"]

# Binary ground truth for both splits.
fuse_train_labels = df_fuse_train["Label"]
fuse_test_labels = df_fuse_test["Label"]

# Two training views: benign-only rows (Label == 0) and every row.
benign_fuse_train_samples = df_fuse_train.loc[df_fuse_train["Label"] == 0].drop(columns=target_cols)
normal_fuse_train_samples = df_fuse_train.drop(columns=target_cols)

fuse_test_samples = df_fuse_test.drop(columns=target_cols)

In [None]:
# Display the fused test matrix: embedding columns (integer labels) followed
# by the raw feature columns (string labels).
fuse_test_samples

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,NUM_PKTS_512_TO_1024_BYTES,NUM_PKTS_1024_TO_1514_BYTES,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE
0,0.056001,0.003425,0.018792,0.026898,-0.019888,-0.009387,-0.017509,0.057528,0.006249,0.031639,...,5.195587e-08,5.195587e-08,0.000853,0.003325,6.110378e-09,6.110378e-09,1.021091e-08,1.021091e-08,0.000000,6.209696e-09
1,0.055526,0.003022,0.018597,0.028253,-0.020154,-0.009405,-0.017623,0.057264,0.006710,0.031146,...,5.249924e-08,5.249924e-08,0.000862,0.003360,6.174283e-09,6.174283e-09,1.031770e-08,1.031770e-08,0.000000,6.274639e-09
2,0.054998,0.003262,0.018682,0.028354,-0.020032,-0.008754,-0.017805,0.057765,0.005981,0.030930,...,5.676083e-08,5.676083e-08,0.000932,0.003633,6.675476e-09,6.675476e-09,1.115523e-08,1.115523e-08,0.000000,6.783979e-09
3,0.057571,0.001114,0.017634,0.024744,-0.019200,-0.013060,-0.019311,0.057819,0.006548,0.034403,...,5.195587e-08,5.195587e-08,0.000853,0.003325,6.110378e-09,6.110378e-09,1.021091e-08,1.021091e-08,0.000000,6.209696e-09
4,0.056988,0.002533,0.016816,0.026572,-0.020285,-0.012216,-0.019869,0.058244,0.006298,0.034181,...,0.000000e+00,0.000000e+00,0.000000,0.000000,6.070549e-08,6.070549e-08,6.169219e-08,4.600833e-09,0.000031,6.169219e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226421,0.053617,0.001802,0.025390,0.030215,-0.014754,0.000219,-0.011493,0.058269,0.002775,0.031937,...,0.000000e+00,0.000000e+00,0.001509,0.001509,2.166202e-08,2.166202e-08,3.619889e-08,3.619889e-08,0.000000,2.201411e-08
226422,0.057234,0.004469,0.018136,0.025600,-0.019923,-0.009846,-0.017709,0.057610,0.006469,0.032421,...,6.783768e-08,0.000000e+00,0.000556,0.002902,1.892529e-10,1.892529e-10,1.333217e-08,1.333217e-08,0.000000,8.107869e-09
226423,0.054385,0.000323,0.027328,0.029952,-0.010274,-0.010299,-0.011491,0.056325,0.005336,0.037315,...,0.000000e+00,0.000000e+00,0.134820,0.000000,3.310606e-07,3.310606e-07,5.532276e-07,5.532276e-07,0.000000,3.364416e-07
226424,0.052916,0.001902,0.024080,0.031328,-0.016697,-0.000394,-0.013337,0.057979,0.003016,0.030761,...,0.000000e+00,0.000000e+00,0.003433,0.001695,2.433397e-08,2.433397e-08,4.066392e-08,4.066392e-08,0.000000,2.472949e-08


In [None]:
# Candidate `contamination` values tried by the detector grid searches below.
contamination = [
    0.001, 0.01, 0.04,
    0.05, 0.1, 0.2,
]

In [None]:
from pyod.models.cblof import CBLOF
# Grid-search CBLOF (cluster count x contamination), fitted on the
# benign-only fused training rows.
# NOTE(review): the winning setting is picked by macro-F1 on the test split,
# so the reported score is an optimistic estimate.
n_est = [5,6,7,9,10] # cluster counts; original note: "cant be lower than 5 or 4"
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
best_params = None  # defined up front so the prints below never hit an unbound name
# Distinct loop names: the loop no longer overwrites the `n_est` list.
for n_clusters, con in tqdm(params):

    # Fixed seed makes the clustering, and therefore the scores, repeatable.
    clf_if = CBLOF(n_clusters=n_clusters, contamination=con, random_state=13)
    clf_if.fit(benign_fuse_train_samples)
    y_pred = clf_if.predict(fuse_test_samples)
    test_pred = y_pred

    f1 = f1_score(fuse_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        # The 'n_estimators' key is kept for output continuity; it holds n_clusters.
        best_params = {'n_estimators': n_clusters,
                       "con": con
                }
        bs = test_pred
    # Free the fitted model before the next configuration.
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(fuse_test_labels, bs, digits=4))

100%|██████████| 30/30 [03:57<00:00,  7.91s/it]

{'n_estimators': 9, 'con': 0.01}
0.9424623298192264
              precision    recall  f1-score   support

           0     0.9846    0.9882    0.9864    199328
           1     0.9108    0.8866    0.8985     27098

    accuracy                         0.9760    226426
   macro avg     0.9477    0.9374    0.9425    226426
weighted avg     0.9758    0.9760    0.9759    226426






In [None]:
# Grid-search CBLOF (cluster count x contamination), fitted on all fused
# training rows (benign and attack).
# NOTE(review): the winning setting is picked by macro-F1 on the test split,
# so the reported score is an optimistic estimate.
n_est = [5,6,7,9,10]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
best_params = None  # defined up front so the prints below never hit an unbound name
# Distinct loop names: the loop no longer overwrites the `n_est` list.
for n_clusters, con in tqdm(params):

    # Fixed seed makes the clustering, and therefore the scores, repeatable.
    clf_if = CBLOF(n_clusters=n_clusters, contamination=con, random_state=13)
    clf_if.fit(normal_fuse_train_samples)
    y_pred = clf_if.predict(fuse_test_samples)
    test_pred = y_pred

    f1 = f1_score(fuse_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        # The 'n_estimators' key is kept for output continuity; it holds n_clusters.
        best_params = {'n_estimators': n_clusters,
                       "con": con
                }
        bs = test_pred
    # Free the fitted model before the next configuration.
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(fuse_test_labels, bs, digits=4))

100%|██████████| 30/30 [03:32<00:00,  7.10s/it]

{'n_estimators': 9, 'con': 0.1}
0.7693711792848211
              precision    recall  f1-score   support

           0     0.9394    0.9587    0.9490    199328
           1     0.6421    0.5453    0.5898     27098

    accuracy                         0.9092    226426
   macro avg     0.7908    0.7520    0.7694    226426
weighted avg     0.9039    0.9092    0.9060    226426






In [None]:
# HBOS  Embeddings+Raw (Multimodal)

In [None]:
# Candidate `contamination` values for the HBOS grid search.
contamination = [
    0.001, 0.01, 0.04,
    0.05, 0.1, 0.2,
]

In [None]:
from pyod.models.hbos import HBOS

# Grid-search HBOS (bin count x contamination), fitted on the benign-only
# fused training rows; the best macro-F1 on the test split is kept.
n_est = [5, 10, 15, 20, 25, 30]
params = [(bins, frac) for bins in n_est for frac in contamination]
bs = None
score = -1
for n_est, con in tqdm(params):

    clf_if = HBOS(contamination=con, n_bins=n_est)
    clf_if.fit(benign_fuse_train_samples)
    test_pred = clf_if.predict(fuse_test_samples)
    y_pred = test_pred

    f1 = f1_score(y_true=fuse_test_labels, y_pred=test_pred, average='macro')

    if score < f1:
        score, bs = f1, test_pred
        best_params = dict(n_estimators=n_est, con=con)
    # Release the fitted model before trying the next configuration.
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(fuse_test_labels, bs, digits=4))

100%|██████████| 36/36 [03:00<00:00,  5.02s/it]

{'n_estimators': 15, 'con': 0.001}
0.9441422903671992
              precision    recall  f1-score   support

           0     0.9769    0.9988    0.9877    199328
           1     0.9890    0.8266    0.9006     27098

    accuracy                         0.9782    226426
   macro avg     0.9830    0.9127    0.9441    226426
weighted avg     0.9784    0.9782    0.9773    226426






In [None]:
# Grid-search HBOS (bin count x contamination), fitted on all fused training
# rows; the best macro-F1 on the test split is kept.
n_est = [5, 10, 15, 20, 25, 30]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = [(bins, frac) for bins in n_est for frac in contamination]
bs = None
score = -1
for n_est, con in tqdm(params):

    clf_if = HBOS(contamination=con, n_bins=n_est)
    clf_if.fit(normal_fuse_train_samples)
    test_pred = clf_if.predict(fuse_test_samples)
    y_pred = test_pred

    f1 = f1_score(y_true=fuse_test_labels, y_pred=test_pred, average='macro')

    if score < f1:
        score, bs = f1, test_pred
        best_params = dict(n_estimators=n_est, con=con)
    # Release the fitted model before trying the next configuration.
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(fuse_test_labels, bs, digits=4))

100%|██████████| 36/36 [03:22<00:00,  5.62s/it]

{'n_estimators': 30, 'con': 0.1}
0.9274905532400305
              precision    recall  f1-score   support

           0     0.9844    0.9804    0.9824    199328
           1     0.8601    0.8855    0.8726     27098

    accuracy                         0.9691    226426
   macro avg     0.9222    0.9330    0.9275    226426
weighted avg     0.9695    0.9691    0.9692    226426






In [None]:
##  PCA  Emb+Raw (Multimodal/Fusion) Learning

In [None]:
from pyod.models.pca import PCA
# Grid-search the PyOD PCA detector (component count x contamination),
# fitted on the benign-only fused training rows.
# NOTE(review): the winning setting is picked by macro-F1 on the test split,
# so the reported score is an optimistic estimate.
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None
best_params = None  # defined up front so the prints below never hit an unbound name

# Distinct loop names: the loop no longer overwrites the `n_est` list.
for n_components, con in tqdm(params):
    # Fixed seed keeps results repeatable if a randomised SVD solver is chosen.
    clf_if = PCA(n_components=n_components, contamination=con, random_state=13)
    clf_if.fit(benign_fuse_train_samples)
    y_pred = clf_if.predict(fuse_test_samples)
    test_pred = y_pred

    f1 = f1_score(fuse_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        # The 'n_estimators' key is kept for output continuity; it holds n_components.
        best_params = {'n_estimators': n_components,
                       "con": con
                }
        bs = test_pred
    # Free the fitted model before the next configuration.
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(fuse_test_labels, bs, digits=4))

  0%|          | 0/36 [00:00<?, ?it/s]

100%|██████████| 36/36 [03:17<00:00,  5.48s/it]

{'n_estimators': 5, 'con': 0.001}
0.9440471241661282
              precision    recall  f1-score   support

           0     0.9770    0.9987    0.9877    199328
           1     0.9884    0.8268    0.9004     27098

    accuracy                         0.9781    226426
   macro avg     0.9827    0.9127    0.9440    226426
weighted avg     0.9783    0.9781    0.9773    226426






In [None]:
# Grid-search the PyOD PCA detector (component count x contamination),
# fitted on all fused training rows (benign and attack).
# NOTE(review): the winning setting is picked by macro-F1 on the test split,
# so the reported score is an optimistic estimate.
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None
best_params = None  # defined up front so the prints below never hit an unbound name

# Distinct loop names: the loop no longer overwrites the `n_est` list.
for n_components, con in tqdm(params):
    # Fixed seed keeps results repeatable if a randomised SVD solver is chosen.
    clf_if = PCA(n_components=n_components, contamination=con, random_state=13)
    clf_if.fit(normal_fuse_train_samples)
    y_pred = clf_if.predict(fuse_test_samples)
    test_pred = y_pred

    f1 = f1_score(fuse_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        # The 'n_estimators' key is kept for output continuity; it holds n_components.
        best_params = {'n_estimators': n_components,
                       "con": con
                }
        bs = test_pred
    # Free the fitted model before the next configuration.
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(fuse_test_labels, bs, digits=4))

100%|██████████| 36/36 [03:15<00:00,  5.42s/it]

{'n_estimators': 10, 'con': 0.1}
0.9210641356112492
              precision    recall  f1-score   support

           0     0.9721    0.9934    0.9826    199328
           1     0.9421    0.7902    0.8595     27098

    accuracy                         0.9691    226426
   macro avg     0.9571    0.8918    0.9211    226426
weighted avg     0.9685    0.9691    0.9679    226426






In [None]:
##  IF  Emb+Raw

In [None]:
# Give every column a uniform string name ("feature 0", "feature 1", ...).
# The fused frames mix integer labels (embedding columns) with string labels
# (raw features); presumably this is needed so scikit-learn accepts them.
# Names are assigned by position in one pass, which is linear and is not
# affected by duplicate labels, unlike renaming one label at a time.
for frame in (benign_fuse_train_samples, normal_fuse_train_samples, fuse_test_samples):
    frame.columns = [f"feature {i}" for i in range(frame.shape[1])]

In [None]:
from sklearn.ensemble import IsolationForest
# Grid-search IsolationForest (tree count x contamination), fitted on the
# benign-only fused training rows.
# NOTE(review): the winning setting is picked by macro-F1 on the test split,
# so the reported score is an optimistic estimate.
n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None
best_params = None  # defined up front so the prints below never hit an unbound name

# Distinct loop names: the loop no longer overwrites the `n_est` list.
for n_trees, con in tqdm(params):
    # Fixed seed makes the forest, and therefore the scores, repeatable.
    clf_if = IsolationForest(n_estimators=n_trees, contamination=con, random_state=13)
    # Fit and predict on the same container type (DataFrame); fitting on a
    # bare ndarray and predicting on a DataFrame mismatches feature names.
    clf_if.fit(benign_fuse_train_samples)
    y_pred = clf_if.predict(fuse_test_samples)
    # Map the detector output to the dataset's labels: 1 -> 0, anything else -> 1.
    test_pred = np.where(y_pred == 1, 0, 1)

    f1 = f1_score(fuse_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_trees,
                       "con": con
                }
        bs = test_pred
    # Free the fitted model before the next configuration.
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(fuse_test_labels, bs, digits=4))

  0%|          | 0/24 [00:00<?, ?it/s]

100%|██████████| 24/24 [00:52<00:00,  2.17s/it]

{'n_estimators': 150, 'con': 0.001}
0.9438914900526983
              precision    recall  f1-score   support

           0     0.9769    0.9987    0.9877    199328
           1     0.9883    0.8264    0.9001     27098

    accuracy                         0.9781    226426
   macro avg     0.9826    0.9125    0.9439    226426
weighted avg     0.9783    0.9781    0.9772    226426






In [None]:
from sklearn.ensemble import IsolationForest

# Grid-search IsolationForest (tree count x contamination), fitted on all
# fused training rows (benign and attack).
# NOTE(review): the winning setting is picked by macro-F1 on the test split,
# so the reported score is an optimistic estimate.
n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None
best_params = None  # defined up front so the prints below never hit an unbound name

# Distinct loop names: the loop no longer overwrites the `n_est` list.
for n_trees, con in tqdm(params):
    # Fixed seed makes the forest, and therefore the scores, repeatable.
    clf_if = IsolationForest(n_estimators=n_trees, contamination=con, random_state=13)
    clf_if.fit(normal_fuse_train_samples)
    y_pred = clf_if.predict(fuse_test_samples)
    # Map the detector output to the dataset's labels: 1 -> 0, anything else -> 1.
    test_pred = np.where(y_pred == 1, 0, 1)

    f1 = f1_score(fuse_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_trees,
                       "con": con
                }
        bs = test_pred
    # Free the fitted model before the next configuration.
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(fuse_test_labels, bs, digits=4))

100%|██████████| 24/24 [00:53<00:00,  2.23s/it]

{'n_estimators': 100, 'con': 0.1}
0.9371092613541756
              precision    recall  f1-score   support

           0     0.9844    0.9856    0.9850    199328
           1     0.8934    0.8851    0.8892     27098

    accuracy                         0.9736    226426
   macro avg     0.9389    0.9354    0.9371    226426
weighted avg     0.9735    0.9736    0.9735    226426




