In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn import preprocessing
from dgl.data import DGLDataset
import dgl
import time
import networkx as nx
import category_encoders as ce
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
import torch
import tqdm
import math

from typing import *
from sklearn.preprocessing import StandardScaler, Normalizer
import socket
import struct
import random
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
file_name = "NF-CSE-CIC-IDS2018-v3.parquet"
# file_name = "final.csv"
data = pd.read_parquet(file_name)

In [6]:
data.Label.value_counts()

Label
0    17514626
1     2600903
Name: count, dtype: int64

In [7]:
# Strip stray whitespace from the column names, then make sure both endpoint
# addresses are plain strings (they are used as graph node keys further down).
data.rename(columns=lambda x: x.strip(), inplace=True)
for addr_col in ("IPV4_SRC_ADDR", "IPV4_DST_ADDR"):
    data[addr_col] = data[addr_col].astype(str)
# L4_SRC_PORT / L4_DST_PORT are deliberately not converted: the following cell
# drops both columns, so casting them first was wasted work on the full frame.

In [8]:
data.drop(columns=["L4_SRC_PORT", "L4_DST_PORT"], inplace=True)

In [9]:
data.Attack.unique()

array(['Benign', 'FTP-BruteForce', 'SSH-Bruteforce',
       'DoS_attacks-GoldenEye', 'DoS_attacks-Slowloris',
       'DoS_attacks-SlowHTTPTest', 'DoS_attacks-Hulk',
       'DDoS_attacks-LOIC-HTTP', 'DDOS_attack-LOIC-UDP',
       'DDOS_attack-HOIC', 'Brute_Force_-Web', 'Brute_Force_-XSS',
       'SQL_Injection', 'Infilteration', 'Bot'], dtype=object)

In [None]:
# Down-sample to whatever the machine can handle (ideally ~500k rows):
# take every attack flow, thin the benign flows to 10%, then shuffle the
# recombined frame and keep 10% of it.
#
# Alternative (per-class sampling):
# data = data.groupby(by='Attack').sample(frac=0.02, random_state=13)
data_attack = data.loc[data['Label'].eq(1)]
data_benign = data.loc[data['Label'].eq(0)].sample(frac=0.1, random_state=13)
data = (
    pd.concat([data_attack, data_benign], axis=0)
    .sample(frac=0.1, random_state=13)
    .reset_index(drop=True)
)
data.Label.value_counts()

Label
1    259694
0    175543
Name: count, dtype: int64

In [11]:
data.groupby(by="Attack").count()

Unnamed: 0_level_0,FLOW_START_MILLISECONDS,FLOW_END_MILLISECONDS,IPV4_SRC_ADDR,IPV4_DST_ADDR,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,...,FTP_COMMAND_RET_CODE,SRC_TO_DST_IAT_MIN,SRC_TO_DST_IAT_MAX,SRC_TO_DST_IAT_AVG,SRC_TO_DST_IAT_STDDEV,DST_TO_SRC_IAT_MIN,DST_TO_SRC_IAT_MAX,DST_TO_SRC_IAT_AVG,DST_TO_SRC_IAT_STDDEV,Label
Attack,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Benign,175543,175543,175543,175543,175543,175543,175543,175543,175543,175543,...,175543,175543,175543,175543,175543,175543,175543,175543,175543,175543
Bot,20730,20730,20730,20730,20730,20730,20730,20730,20730,20730,...,20730,20730,20730,20730,20730,20730,20730,20730,20730,20730
Brute_Force_-Web,169,169,169,169,169,169,169,169,169,169,...,169,169,169,169,169,169,169,169,169,169
Brute_Force_-XSS,56,56,56,56,56,56,56,56,56,56,...,56,56,56,56,56,56,56,56,56,56
DDOS_attack-HOIC,102984,102984,102984,102984,102984,102984,102984,102984,102984,102984,...,102984,102984,102984,102984,102984,102984,102984,102984,102984,102984
DDOS_attack-LOIC-UDP,313,313,313,313,313,313,313,313,313,313,...,313,313,313,313,313,313,313,313,313,313
DDoS_attacks-LOIC-HTTP,28935,28935,28935,28935,28935,28935,28935,28935,28935,28935,...,28935,28935,28935,28935,28935,28935,28935,28935,28935,28935
DoS_attacks-GoldenEye,6053,6053,6053,6053,6053,6053,6053,6053,6053,6053,...,6053,6053,6053,6053,6053,6053,6053,6053,6053,6053
DoS_attacks-Hulk,9818,9818,9818,9818,9818,9818,9818,9818,9818,9818,...,9818,9818,9818,9818,9818,9818,9818,9818,9818,9818
DoS_attacks-SlowHTTPTest,10535,10535,10535,10535,10535,10535,10535,10535,10535,10535,...,10535,10535,10535,10535,10535,10535,10535,10535,10535,10535


In [12]:
# Columns kept out of the feature matrix: both targets, the flow timestamps
# and the inter-arrival-time statistics.
non_feature_cols = [
    "Attack", "Label",
    "FLOW_START_MILLISECONDS", "FLOW_END_MILLISECONDS",
    "SRC_TO_DST_IAT_MIN", "SRC_TO_DST_IAT_MAX",
    "SRC_TO_DST_IAT_AVG", "SRC_TO_DST_IAT_STDDEV",
    "DST_TO_SRC_IAT_MIN", "DST_TO_SRC_IAT_MAX",
    "DST_TO_SRC_IAT_AVG", "DST_TO_SRC_IAT_STDDEV",
]
X = data.drop(columns=non_feature_cols)
y = data[["Attack", "Label"]]

# 70/30 split, stratified on the (Attack, Label) pair.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=13, stratify=y
)

In [13]:
# Categorical flow fields that are target-encoded against the binary Label.
categorical_cols = [
    'TCP_FLAGS', 'L7_PROTO', 'PROTOCOL',
    'CLIENT_TCP_FLAGS', 'SERVER_TCP_FLAGS', 'ICMP_TYPE',
    'ICMP_IPV4_TYPE', 'DNS_QUERY_ID', 'DNS_QUERY_TYPE',
    'FTP_COMMAND_RET_CODE',
]
encoder = ce.TargetEncoder(cols=categorical_cols)

# The encoder only ever sees the training split when it is fitted ...
encoder.fit(X_train, y_train["Label"])

# ... and is then applied unchanged to both splits.
X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

In [14]:
# Treat +/-inf as missing, then zero-fill every missing value in both splits.
X_train = X_train.replace([np.inf, -np.inf], np.nan).fillna(0)
X_test = X_test.replace([np.inf, -np.inf], np.nan).fillna(0)

In [15]:
# (Modified) Normalise the flow features and pack them into one edge-feature
# vector `h` per flow.
scaler = Normalizer()

# Feature columns are selected by name and kept in DataFrame order. The two
# address columns identify graph nodes and are never scaled; `h` / `id` are
# excluded as well so that re-running this cell does not sweep them in.
# (The previous list(set(...)) round-trip produced an arbitrary column order.)
non_feature_cols = {"IPV4_SRC_ADDR", "IPV4_DST_ADDR", "h", "id"}
cols_to_norm = [col for col in X_train.columns if col not in non_feature_cols]
scaler.fit(X_train[cols_to_norm])

# Same transformation for the training and the testing split.
for split in (X_train, X_test):
    split[cols_to_norm] = scaler.transform(split[cols_to_norm])
    split['h'] = split[cols_to_norm].values.tolist()  # one feature list per flow
    split['id'] = split.index                         # original row id, used to re-join later

train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [16]:
X_train.head()

Unnamed: 0,IPV4_SRC_ADDR,IPV4_DST_ADDR,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,h,id
237065,172.31.66.5,172.31.0.2,4e-06,3.70965e-06,0.001821,2.9e-05,0.004522,2.9e-05,4e-06,4e-06,...,0.0,0.0,1.8e-05,1.8e-05,1.5e-05,4e-06,0.000558,1.8e-05,"[3.7774454035072763e-06, 3.709650241148367e-06...",237065
15205,172.31.66.40,172.31.0.2,1.5e-05,1.517947e-05,0.008051,0.00012,0.01538,0.00012,1.5e-05,1.5e-05,...,0.0,0.0,7.3e-05,7.3e-05,6.2e-05,1.5e-05,0.007209,7.2e-05,"[1.545688427322976e-05, 1.5179474048347746e-05...",15205
312483,18.221.219.4,172.31.69.25,1e-06,1.730706e-06,0.000104,2e-06,6.9e-05,2e-06,2e-06,2e-06,...,0.04655,0.0,1e-06,1e-06,1e-06,1e-06,0.0,1e-06,"[1.2394324546138451e-06, 1.7307060387105932e-0...",312483
277386,82.212.17.186,172.31.64.68,1.1e-05,5.376285e-07,0.002645,4.6e-05,0.001407,3.1e-05,2e-06,3e-06,...,0.12524,0.978439,9e-06,9e-06,1.1e-05,1.1e-05,0.0,9e-06,"[1.094307220318909e-05, 5.3762845853957e-07, 0...",277386
19084,172.31.69.13,172.31.69.7,2e-06,9.448487e-07,9.2e-05,2e-06,8.4e-05,2e-06,2e-06,2e-06,...,0.002153,0.0,1e-06,1e-06,1e-06,1e-06,0.0,1e-06,"[1.5046602182805569e-06, 9.448487146716061e-07...",19084


In [17]:
# Integer-encode the attack names. The encoder is fitted on the whole sampled
# frame so that both splits share one class <-> integer mapping.
lab_enc = preprocessing.LabelEncoder().fit(data["Attack"])

# Apply the shared mapping to the training and the testing split.
train["Attack"] = lab_enc.transform(train["Attack"])
test["Attack"] = lab_enc.transform(test["Attack"])

In [18]:
# Training / testing graphs (Modified)

def _flows_to_dgl_graph(flows):
    """Build a DGL graph whose edges are the flow records in `flows`.

    Nodes are the distinct source/destination addresses. Every flow becomes an
    edge of a networkx MultiGraph carrying `h`, `Label`, `Attack` and `id`;
    the graph is then converted with `to_directed()` before being handed to
    DGL. Node features are initialised to all-ones with the same width as the
    edge feature vectors.

    Args:
        flows: DataFrame with the columns IPV4_SRC_ADDR, IPV4_DST_ADDR, h,
            Label, Attack and id.

    Returns:
        The DGL graph with edata `h`, `Attack`, `Label`, `id` and ndata `h`.
    """
    nx_graph = nx.from_pandas_edgelist(
        flows, "IPV4_SRC_ADDR", "IPV4_DST_ADDR",
        ["h", "Label", "Attack", "id"], create_using=nx.MultiGraph())
    nx_graph = nx_graph.to_directed()
    graph = dgl.from_networkx(nx_graph, edge_attrs=['h', 'Attack', 'Label', "id"])
    # Nodes carry no information of their own: constant all-ones features.
    graph.ndata['h'] = torch.ones([graph.num_nodes(), graph.edata['h'].shape[1]])
    return graph


# `id` keeps the DataFrame row index on every edge so that embeddings can be
# joined back to the raw features later.
train['id'] = train.index
train_g = _flows_to_dgl_graph(train)

test['id'] = test.index
test_g = _flows_to_dgl_graph(test)

In [19]:
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
import tqdm
import gc

class SAGELayer(nn.Module):
    """One E-GraphSAGE layer: aggregates edge features into nodes, then
    derives an embedding for every edge from its two endpoint nodes.

    Args:
        ndim_in: width of the incoming node features.
        edims: width of the incoming edge features.
        ndim_out: width of the node embeddings produced by this layer.
        activation: callable applied to the updated node features; ``None``
            falls back to ``F.relu`` (the behaviour previously hard-coded).

    The edge projection always outputs 256 features.
    """

    def __init__(self, ndim_in, edims, ndim_out, activation):
        super(SAGELayer, self).__init__()
        self.W_apply = nn.Linear(ndim_in + edims, ndim_out)
        # Honour the caller's activation instead of silently ignoring it.
        self.activation = activation if activation is not None else F.relu
        # Input is the concatenation of two node embeddings, so its width must
        # follow ndim_out (it used to be fixed at 128 * 2).
        self.W_edge = nn.Linear(ndim_out * 2, 256)
        self.reset_parameters()

    def reset_parameters(self):
        """Xavier-initialise the node-update weights with the ReLU gain."""
        gain = nn.init.calculate_gain('relu')
        nn.init.xavier_uniform_(self.W_apply.weight, gain=gain)

    def message_func(self, edges):
        """Message sent along each edge: the edge's own feature vector."""
        return {'m': edges.data['h']}

    def forward(self, g_dgl, nfeats, efeats):
        """Run the layer on graph `g_dgl`.

        `nfeats` / `efeats` are concatenated along dim 2, so both are expected
        to be 3-D tensors (the notebook reshapes them to (rows, 1, dim)).

        Returns:
            (node_embeddings, edge_embeddings)
        """
        # local_scope: feature assignments below do not leak into the caller's graph.
        with g_dgl.local_scope():
            g = g_dgl
            g.ndata['h'] = nfeats
            g.edata['h'] = efeats
            # Mean of the incoming edge features per node.
            g.update_all(self.message_func, fn.mean('m', 'h_neigh'))
            g.ndata['h'] = self.activation(
                self.W_apply(torch.cat([g.ndata['h'], g.ndata['h_neigh']], 2)))

            # Compute edge embeddings from the updated endpoint embeddings
            u, v = g.edges()
            edge = self.W_edge(torch.cat((g.srcdata['h'][u], g.dstdata['h'][v]), 2))
            return g.ndata['h'], edge

In [20]:
class SAGE(nn.Module):
    """E-GraphSAGE encoder made of a single SAGELayer.

    Note: `ndim_out` and `activation` are accepted but not used here; the
    layer is always built with 128 output features and ``F.relu``.
    """

    def __init__(self, ndim_in, ndim_out, edim,  activation):
        super().__init__()
        self.layers = nn.ModuleList([SAGELayer(ndim_in, edim, 128, F.relu)])

    def forward(self, g, nfeats, efeats, corrupt=False):
        """Encode the graph.

        With `corrupt=True` the edge features are shuffled across edges first
        (node features are left in place).

        Returns:
            (node_embeddings, edge_embeddings), each summed over dim 1.
        """
        if corrupt:
            shuffled_order = torch.randperm(g.number_of_edges())
            efeats = efeats[shuffled_order]
        for layer in self.layers:
            nfeats, e_feats = layer(g, nfeats, efeats)
        return nfeats.sum(1), e_feats.sum(1)

In [21]:
class Discriminator(nn.Module):
    """Bilinear scorer: rates each embedding against a summary vector through
    a learnable square weight matrix."""

    def __init__(self, n_hidden):
        super().__init__()
        self.weight = nn.Parameter(torch.empty(n_hidden, n_hidden))
        self.reset_parameters()

    def uniform(self, size, tensor):
        """Fill `tensor` in place from U(-1/sqrt(size), 1/sqrt(size)).

        Does nothing when `tensor` is None.
        """
        bound = 1.0 / math.sqrt(size)
        if tensor is None:
            return
        nn.init.uniform_(tensor, -bound, bound)

    def reset_parameters(self):
        """Re-draw the weight matrix, scaled by its first dimension."""
        self.uniform(self.weight.size(0), self.weight)

    def forward(self, features, summary):
        """Return ``features @ (weight @ summary)``."""
        projected_summary = self.weight @ summary
        return features @ projected_summary

In [22]:
class DGI(nn.Module):
    """Deep Graph Infomax objective over edge embeddings.

    The encoder is run twice, once on the real graph and once with shuffled
    edge features, and the discriminator is trained to tell the real edge
    embeddings from the corrupted ones.

    The discriminator width (256) matches the edge-embedding width produced
    by the encoder's layer.
    """

    def __init__(self, ndim_in, ndim_out, edim, activation):
        super(DGI, self).__init__()
        self.encoder = SAGE(ndim_in, ndim_out, edim,  F.relu)
        self.discriminator = Discriminator(256)
        self.loss = nn.BCEWithLogitsLoss()

    # A second, truncated copy of `forward` used to precede this one; it
    # returned None and was silently overridden, so it has been removed.
    def forward(self, g, n_features, e_features):
        """Return the DGI loss for graph `g`.

        Args:
            g: the DGL graph.
            n_features: node features passed to the encoder.
            e_features: edge features passed to the encoder.

        Returns:
            Scalar tensor: BCE-with-logits loss on real edges (target 1) plus
            the loss on corrupted edges (target 0).
        """
        # The encoder returns (node_embeddings, edge_embeddings); only the
        # edge embeddings take part in the objective.
        _, positive = self.encoder(g, n_features, e_features, corrupt=False)
        _, negative = self.encoder(g, n_features, e_features, corrupt=True)

        # Global summary: squashed mean of the real edge embeddings.
        summary = torch.sigmoid(positive.mean(dim=0))

        positive = self.discriminator(positive, summary)
        negative = self.discriminator(negative, summary)

        l1 = self.loss(positive, torch.ones_like(positive))
        l2 = self.loss(negative, torch.zeros_like(negative))

        return l1 + l2

In [23]:
# Dimensions read off the training graph.
ndim_in = train_g.ndata['h'].shape[1]
edim = train_g.edata['h'].shape[1]

# Model size and optimisation settings.
hidden_features, ndim_out = 128, 128
num_layers = 1
learning_rate = 1e-3
epochs = 4000

In [24]:
# Build the DGI model and move it to the GPU.
dgi = DGI(ndim_in,
    ndim_out,
    edim,
    F.relu)

dgi = dgi.to('cuda')

# Use the configured `learning_rate` rather than a second hard-coded 1e-3,
# so that changing the setting actually changes the optimiser.
dgi_optimizer = torch.optim.Adam(dgi.parameters(),
                lr=learning_rate,
                weight_decay=0.)

In [25]:
# Format node and edge features for E-GraphSAGE: insert a singleton middle
# axis so that (rows, dim) becomes (rows, 1, dim).
node_h = train_g.ndata['h']
train_g.ndata['h'] = torch.reshape(node_h, (node_h.shape[0], 1, node_h.shape[1]))

edge_h = train_g.edata['h']
train_g.edata['h'] = torch.reshape(edge_h, (edge_h.shape[0], 1, edge_h.shape[1]))

In [26]:
# Convert to GPU
train_g = train_g.to('cuda')

In [27]:
# Self-supervised DGI training: one full-graph optimisation step per epoch.
cnt_wait = 0   # epochs since the last improvement (early stopping itself is disabled)
best = 1e9     # lowest loss seen so far, kept as a plain float
best_t = 0     # epoch at which `best` was reached
dur = []       # per-epoch wall-clock times; the first three epochs are not timed
node_features = train_g.ndata['h']
edge_features = train_g.edata['h']

for epoch in range(epochs):
    dgi.train()
    if epoch >= 3:
        t0 = time.time()

    dgi_optimizer.zero_grad()
    loss = dgi(train_g, node_features, edge_features)
    loss.backward()
    dgi_optimizer.step()

    # Compare and store a Python float: keeping the loss tensor itself in
    # `best` would hold on to a GPU tensor together with its autograd history.
    loss_value = loss.item()
    if loss_value < best:
        best = loss_value
        best_t = epoch
        cnt_wait = 0
        # Checkpoint the best model so far; it is reloaded after training.
        torch.save(dgi.state_dict(), 'best_dgi_CSE_v3.pkl')
    else:
        cnt_wait += 1

    # Early stopping is currently switched off:
    # if cnt_wait == patience:
    #     print('Early stopping!')
    #     break

    if epoch >= 3:
        dur.append(time.time() - t0)

    if epoch % 50 == 0:
        # No timings exist before epoch 3; report NaN explicitly instead of
        # letting np.mean([]) emit a RuntimeWarning.
        mean_dur = np.mean(dur) if dur else float('nan')
        print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | "
            "ETputs(KTEPS) {:.2f}".format(epoch, mean_dur,
              loss_value,
              train_g.num_edges() / mean_dur / 1000))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Epoch 00000 | Time(s) nan | Loss 3.0305 | ETputs(KTEPS) nan
Epoch 00050 | Time(s) 0.3231 | Loss 1.3469 | ETputs(KTEPS) 1883.64
Epoch 00100 | Time(s) 0.3237 | Loss 0.8218 | ETputs(KTEPS) 1880.04
Epoch 00150 | Time(s) 0.3237 | Loss 0.3332 | ETputs(KTEPS) 1880.10
Epoch 00200 | Time(s) 0.3239 | Loss 0.2518 | ETputs(KTEPS) 1879.08
Epoch 00250 | Time(s) 0.3240 | Loss 0.2208 | ETputs(KTEPS) 1878.13
Epoch 00300 | Time(s) 0.3241 | Loss 0.2058 | ETputs(KTEPS) 1877.63
Epoch 00350 | Time(s) 0.3241 | Loss 0.2063 | ETputs(KTEPS) 1877.55
Epoch 00400 | Time(s) 0.3243 | Loss 0.1807 | ETputs(KTEPS) 1876.78
Epoch 00450 | Time(s) 0.3242 | Loss 1.1748 | ETputs(KTEPS) 1876.82
Epoch 00500 | Time(s) 0.3242 | Loss 0.8848 | ETputs(KTEPS) 1877.16
Epoch 00550 | Time(s) 0.3241 | Loss 0.2972 | ETputs(KTEPS) 1877.43
Epoch 00600 | Time(s) 0.3241 | Loss 0.2497 | ETputs(KTEPS) 1877.52
Epoch 00650 | Time(s) 0.3241 | Loss 0.2239 | ETputs(KTEPS) 1877.65
Epoch 00700 | Time(s) 0.3241 | Loss 0.2108 | ETputs(KTEPS) 1877.78
Ep

In [28]:
# Restore the best checkpoint written during training. The file only holds a
# state_dict, so restrict unpickling to tensors/containers (weights_only=True)
# rather than executing arbitrary pickled objects.
dgi.load_state_dict(torch.load('best_dgi_CSE_v3.pkl', weights_only=True))

  dgi.load_state_dict(torch.load('best_dgi_CSE_v3.pkl'))


<All keys matched successfully>

In [29]:
# Edge embeddings of the training graph (second element of the encoder output).
# Pure inference: disable autograd so no graph is recorded for every edge.
with torch.no_grad():
    training_emb = dgi.encoder(train_g, train_g.ndata['h'], train_g.edata['h'])[1]
training_emb = training_emb.detach().cpu().numpy()

In [30]:
# Same layout as the training graph: (rows, dim) -> (rows, 1, dim).
node_h = test_g.ndata['h']
test_g.ndata['h'] = torch.reshape(node_h, (node_h.shape[0], 1, node_h.shape[1]))

edge_h = test_g.edata['h']
test_g.edata['h'] = torch.reshape(edge_h, (edge_h.shape[0], 1, edge_h.shape[1]))

In [31]:
# Convert to GPU
test_g = test_g.to('cuda')

In [32]:
# Edge embeddings of the testing graph (second element of the encoder output).
# Pure inference: disable autograd so no graph is recorded for every edge.
with torch.no_grad():
    testing_emb = dgi.encoder(test_g, test_g.ndata['h'], test_g.edata['h'])[1]
testing_emb = testing_emb.detach().cpu().numpy()

In [33]:
# Multimodal (Fusion) Learning
# Every graph edge gets its learned embedding placed side by side with the
# preprocessed flow features of the record it came from, matched on `id`.

# ---- training split ----
df_train = pd.DataFrame(training_emb)
# edge -> original row id, used as the join key
df_train['id'] = train_g.edata['id'].detach().cpu().numpy()

df_raw_train = pd.DataFrame(X_train.drop(columns=["IPV4_SRC_ADDR", "IPV4_DST_ADDR", "h"]))
df_fuse_train = (
    df_train
    .merge(df_raw_train, on='id', how='left')
    .drop(columns=["id"])
)
df_fuse_train["Attacks"] = train_g.edata['Attack'].detach().cpu().numpy()
df_fuse_train["Label"] = train_g.edata['Label'].detach().cpu().numpy()

# ---- testing split ----
df_test = pd.DataFrame(testing_emb)
# edge -> original row id, used as the join key
df_test['id'] = test_g.edata['id'].detach().cpu().numpy()

df_raw_test = pd.DataFrame(X_test.drop(columns=["IPV4_SRC_ADDR", "IPV4_DST_ADDR", "h"]))
# Note: from here on `df_raw_test` holds the merged (embedding + raw) frame.
df_raw_test = df_test.merge(df_raw_test, on='id', how='left')
df_fuse_test = df_raw_test.drop(columns=["id"])
df_fuse_test["Attacks"] = test_g.edata['Attack'].detach().cpu().numpy()
df_fuse_test["Label"] = test_g.edata['Label'].detach().cpu().numpy()

# Next: CBLOF on the fused (embedding + raw) features

In [34]:
import torch
import dgl
import numpy as np
import pandas as pd
import torch.optim as optim
import time
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.ensemble import IsolationForest
from pyod.models.cblof import CBLOF
import gc

from tqdm import tqdm
import itertools

In [35]:
# Binary ground truth for both splits.
fuse_train_labels = df_fuse_train["Label"]
fuse_test_labels = df_fuse_test["Label"]

# Detector inputs (targets removed):
#   normal_* -> every training sample, benign_* -> only the Label == 0 rows.
normal_fuse_train_samples = df_fuse_train.drop(columns=["Label", "Attacks"])
benign_fuse_train_samples = (
    df_fuse_train.loc[df_fuse_train["Label"] == 0]
    .drop(columns=["Label", "Attacks"])
)
fuse_test_samples = df_fuse_test.drop(columns=["Label", "Attacks"])

In [None]:
fuse_test_samples

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,NUM_PKTS_512_TO_1024_BYTES,NUM_PKTS_1024_TO_1514_BYTES,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE
0,0.012272,0.049754,0.037376,-0.057863,0.045666,0.052595,0.128874,0.029061,0.023510,0.029213,...,0.000010,0.000010,0.039284,0.205148,6.460231e-07,6.460231e-07,9.441403e-07,9.441403e-07,0.000000,6.200296e-07
1,0.021169,0.018282,0.034445,-0.073610,0.018512,0.034474,0.132494,0.009300,0.019381,-0.017057,...,0.000000,0.000000,0.038075,0.026982,2.019403e-05,2.019403e-05,2.951287e-05,2.951287e-05,0.000000,1.938150e-05
2,0.022352,0.026855,-0.063682,-0.022877,0.072588,0.021886,0.129679,0.028414,-0.064747,0.029395,...,0.000015,0.000015,0.124720,0.974379,2.051016e-06,2.051016e-06,2.997489e-06,2.997489e-06,0.000000,1.968491e-06
3,0.022352,0.026855,-0.063682,-0.022877,0.072588,0.021886,0.129679,0.028414,-0.064747,0.029395,...,0.000011,0.000011,0.702095,0.699472,1.472351e-06,1.472351e-06,2.151790e-06,2.151790e-06,0.000000,1.413109e-06
4,0.052005,-0.009472,0.013403,-0.035060,0.066267,0.037680,0.095307,0.021834,-0.001620,-0.034225,...,0.000000,0.000000,0.000000,0.000000,1.919235e-07,1.919235e-07,1.580721e-07,2.044919e-08,0.000085,1.842012e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240747,0.011691,-0.048981,0.082830,-0.106598,0.033035,0.078462,0.092463,0.061608,-0.015809,-0.014878,...,0.000000,0.000123,0.502245,0.502245,8.259376e-06,8.259376e-06,1.207079e-05,1.207079e-05,0.000000,7.927051e-06
240748,0.058499,0.035029,-0.018861,-0.028614,-0.007317,0.008847,0.151085,-0.060317,-0.055841,-0.036982,...,0.000000,0.000000,0.146438,0.000000,3.785920e-07,3.785920e-07,5.532990e-07,5.532990e-07,0.000000,3.633589e-07
240749,0.022840,-0.067292,0.068563,-0.088919,0.012151,0.100710,0.078976,0.043141,-0.040467,-0.016786,...,0.000000,0.000000,0.583955,0.291249,4.789559e-06,4.789559e-06,6.999774e-06,6.999774e-06,0.000000,4.596846e-06
240750,0.054009,-0.040630,0.086194,-0.087538,0.017969,0.065003,0.106513,0.036058,-0.100818,-0.016475,...,0.000014,0.000014,0.447120,0.873094,1.837817e-06,1.837817e-06,2.685905e-06,2.685905e-06,0.000000,1.763870e-06


In [34]:
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]

In [35]:
# CBLOF grid search, fitted on the benign-only fused training samples.
# NOTE(review): configurations are ranked by macro-F1 on the test split, so
# the reported score is an optimistic estimate.
n_est = [5,6,7,9,10] # can't be lower than 5 or 4
params = list(itertools.product(n_est, contamination))
score = -1
best_params = None  # reset so a stale value from an earlier cell is never reported
bs = None
# Distinct loop names: the grid list `n_est` is no longer overwritten by an int.
for n_clusters, con in tqdm(params):
    try:
        # Fixed seed so the clustering (and therefore the ranking) is repeatable.
        clf_if = CBLOF(n_clusters=n_clusters, contamination=con, random_state=13)
        clf_if.fit(benign_fuse_train_samples)
    except Exception as e:
        # Skip configurations CBLOF cannot fit, but say which one failed and why.
        print(f"CBLOF failed for n_clusters={n_clusters}, contamination={con}: {e}")
        continue
    test_pred = clf_if.predict(fuse_test_samples)

    f1 = f1_score(fuse_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        # Key name kept for parity with the other grids; here it holds n_clusters.
        best_params = {'n_estimators': n_clusters,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()

if bs is None:
    raise RuntimeError("CBLOF could not be fitted for any configuration in the grid")

print(best_params)
print(score)
print(classification_report(fuse_test_labels, bs, digits=4))

 20%|██        | 6/30 [00:19<01:04,  2.68s/it]

5


100%|██████████| 30/30 [01:45<00:00,  3.50s/it]

{'n_estimators': 7, 'con': 0.2}
0.6936377703621189
              precision    recall  f1-score   support

           0     0.5906    0.7880    0.6752    104997
           1     0.8156    0.6319    0.7121    155818

    accuracy                         0.6948    260815
   macro avg     0.7031    0.7100    0.6936    260815
weighted avg     0.7250    0.6948    0.6972    260815






In [36]:
# CBLOF grid search, fitted on all fused training samples (benign and attack).
# NOTE(review): configurations are ranked by macro-F1 on the test split, so
# the reported score is an optimistic estimate.
n_est = [5,6,7,9,10]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
best_params = None  # reset so a stale value from an earlier cell is never reported
bs = None
# Distinct loop names: the grid list `n_est` is no longer overwritten by an int.
for n_clusters, con in tqdm(params):
    try:
        # Fixed seed so the clustering (and therefore the ranking) is repeatable.
        clf_if = CBLOF(n_clusters=n_clusters, contamination=con, random_state=13)
        clf_if.fit(normal_fuse_train_samples)
    except Exception as e:
        # Skip configurations CBLOF cannot fit, but say which one failed and why.
        print(f"CBLOF failed for n_clusters={n_clusters}, contamination={con}: {e}")
        continue
    test_pred = clf_if.predict(fuse_test_samples)

    f1 = f1_score(fuse_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        # Key name kept for parity with the other grids; here it holds n_clusters.
        best_params = {'n_estimators': n_clusters,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()

if bs is None:
    raise RuntimeError("CBLOF could not be fitted for any configuration in the grid")

print(best_params)
print(score)
print(classification_report(fuse_test_labels, bs, digits=4))

  7%|▋         | 2/30 [00:11<02:30,  5.38s/it]

5


 13%|█▎        | 4/30 [00:21<02:02,  4.72s/it]

5


 20%|██        | 6/30 [00:29<01:43,  4.32s/it]

5


100%|██████████| 30/30 [03:14<00:00,  6.47s/it]

{'n_estimators': 9, 'con': 0.1}
0.37825055031847954
              precision    recall  f1-score   support

           0     0.4095    0.9144    0.5657    104997
           1     0.6592    0.1116    0.1908    155818

    accuracy                         0.4348    260815
   macro avg     0.5343    0.5130    0.3783    260815
weighted avg     0.5587    0.4348    0.3417    260815






In [None]:
# HBOS on fused features: embeddings + raw (multimodal)

In [None]:
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]

In [None]:
from pyod.models.hbos import HBOS

# HBOS grid search, fitted on the benign-only fused training samples.
# NOTE(review): configurations are ranked by macro-F1 on the test split.
n_est = [5,10,15,20,25,30]
params = list(itertools.product(n_est, contamination))
score = -1
best_params = None  # reset so a stale value from an earlier cell is never reported
bs = None
# Distinct loop names: the grid list `n_est` is no longer overwritten by an int.
for n_bins, con in tqdm(params):

    clf_if = HBOS(n_bins=n_bins, contamination=con)
    clf_if.fit(benign_fuse_train_samples)
    test_pred = clf_if.predict(fuse_test_samples)

    f1 = f1_score(fuse_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        # Key name kept for parity with the other grids; here it holds n_bins.
        best_params = {'n_estimators': n_bins,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(fuse_test_labels, bs, digits=4))

100%|██████████| 36/36 [05:45<00:00,  9.60s/it]

{'n_estimators': 25, 'con': 0.05}
0.8471039149382792
              precision    recall  f1-score   support

           0     1.0000    0.9177    0.9571    374702
           1     0.5837    1.0000    0.7371     43236

    accuracy                         0.9262    417938
   macro avg     0.7918    0.9588    0.8471    417938
weighted avg     0.9569    0.9262    0.9343    417938






In [None]:
# HBOS grid search, fitted on all fused training samples (benign and attack).
# NOTE(review): configurations are ranked by macro-F1 on the test split.
n_est = [5,10,15,20,25,30]
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, contamination))
score = -1
best_params = None  # reset so a stale value from an earlier cell is never reported
bs = None
# Distinct loop names: the grid list `n_est` is no longer overwritten by an int.
for n_bins, con in tqdm(params):

    clf_if = HBOS(n_bins=n_bins, contamination=con)
    clf_if.fit(normal_fuse_train_samples)
    test_pred = clf_if.predict(fuse_test_samples)

    f1 = f1_score(fuse_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        # Key name kept for parity with the other grids; here it holds n_bins.
        best_params = {'n_estimators': n_bins,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(fuse_test_labels, bs, digits=4))

100%|██████████| 36/36 [06:05<00:00, 10.16s/it]

{'n_estimators': 25, 'con': 0.1}
0.8471039149382792
              precision    recall  f1-score   support

           0     1.0000    0.9177    0.9571    374702
           1     0.5837    1.0000    0.7371     43236

    accuracy                         0.9262    417938
   macro avg     0.7918    0.9588    0.8471    417938
weighted avg     0.9569    0.9262    0.9343    417938






In [None]:
## PCA on fused features: embeddings + raw (multimodal / fusion learning)

In [None]:
from pyod.models.pca import PCA
# PCA-detector grid search, fitted on the benign-only fused training samples.
# NOTE(review): configurations are ranked by macro-F1 on the test split.
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
best_params = None  # reset so a stale value from an earlier cell is never reported
bs = None

# Distinct loop names: the grid list `n_est` is no longer overwritten by an int.
for n_components, con in tqdm(params):
    clf_if = PCA(n_components=n_components, contamination=con)
    clf_if.fit(benign_fuse_train_samples)
    test_pred = clf_if.predict(fuse_test_samples)

    f1 = f1_score(fuse_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        # Key name kept for parity with the other grids; here it holds n_components.
        best_params = {'n_estimators': n_components,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(fuse_test_labels, bs, digits=4))

100%|██████████| 36/36 [05:07<00:00,  8.55s/it]

{'n_estimators': 5, 'con': 0.1}
0.8207887532486517
              precision    recall  f1-score   support

           0     1.0000    0.8987    0.9466    374702
           1     0.5325    1.0000    0.6949     43236

    accuracy                         0.9092    417938
   macro avg     0.7662    0.9493    0.8208    417938
weighted avg     0.9516    0.9092    0.9206    417938






In [None]:
# PCA-detector grid search, fitted on all fused training samples (benign and attack).
# NOTE(review): configurations are ranked by macro-F1 on the test split.
n_est = [5,10,15,20,25,30]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
best_params = None  # reset so a stale value from an earlier cell is never reported
bs = None

# Distinct loop names: the grid list `n_est` is no longer overwritten by an int.
for n_components, con in tqdm(params):
    clf_if = PCA(n_components=n_components, contamination=con)
    clf_if.fit(normal_fuse_train_samples)
    test_pred = clf_if.predict(fuse_test_samples)

    f1 = f1_score(fuse_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        # Key name kept for parity with the other grids; here it holds n_components.
        best_params = {'n_estimators': n_components,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(fuse_test_labels, bs, digits=4))

100%|██████████| 36/36 [05:49<00:00,  9.69s/it]

{'n_estimators': 15, 'con': 0.2}
0.8085873162698853
              precision    recall  f1-score   support

           0     1.0000    0.8893    0.9414    374702
           1     0.5103    1.0000    0.6758     43236

    accuracy                         0.9007    417938
   macro avg     0.7552    0.9446    0.8086    417938
weighted avg     0.9493    0.9007    0.9139    417938






In [None]:
## Isolation Forest (IF) on fused features: embeddings + raw

In [None]:
# Give all three detector frames uniform, purely string column names
# ("feature 0", "feature 1", ...). The fused frames otherwise mix integer
# embedding columns with string feature names.
# One assignment per frame replaces the per-column rename loop: it is linear
# instead of quadratic and renames strictly by position, so duplicate column
# names cannot be mis-renamed.
for frame in (benign_fuse_train_samples, normal_fuse_train_samples, fuse_test_samples):
    frame.columns = [f"feature {i}" for i in range(frame.shape[1])]

In [None]:
from sklearn.ensemble import IsolationForest
# Isolation Forest grid search, fitted on the benign-only fused training samples.
# NOTE(review): configurations are ranked by macro-F1 on the test split.
n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
best_params = None  # reset so a stale value from an earlier cell is never reported
bs = None

# Fit and predict on plain arrays. Fitting on an array but predicting on a
# DataFrame makes scikit-learn warn about mismatching feature names.
train_matrix = benign_fuse_train_samples.to_numpy()
test_matrix = fuse_test_samples.to_numpy()

# Distinct loop names: the grid list `n_est` is no longer overwritten by an int.
for n_trees, con in tqdm(params):
    # Fixed seed so the forest (and therefore the ranking) is repeatable.
    clf_if = IsolationForest(n_estimators=n_trees, contamination=con, random_state=13)
    clf_if.fit(train_matrix)
    y_pred = clf_if.predict(test_matrix)
    # Predictions equal to 1 are mapped to label 0, everything else to label 1.
    test_pred = np.where(y_pred == 1, 0, 1)

    f1 = f1_score(fuse_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_trees,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(fuse_test_labels, bs, digits=4))

100%|██████████| 24/24 [01:44<00:00,  4.35s/it]

{'n_estimators': 50, 'con': 0.1}
0.820368662501618
              precision    recall  f1-score   support

           0     1.0000    0.8984    0.9465    374702
           1     0.5317    1.0000    0.6943     43236

    accuracy                         0.9089    417938
   macro avg     0.7659    0.9492    0.8204    417938
weighted avg     0.9516    0.9089    0.9204    417938






In [None]:
from sklearn.ensemble import IsolationForest

# Isolation Forest grid search, fitted on all fused training samples (benign and attack).
# NOTE(review): configurations are ranked by macro-F1 on the test split.
n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
best_params = None  # reset so a stale value from an earlier cell is never reported
bs = None

# Distinct loop names: the grid list `n_est` is no longer overwritten by an int.
for n_trees, con in tqdm(params):
    # Fixed seed so the forest (and therefore the ranking) is repeatable.
    clf_if = IsolationForest(n_estimators=n_trees, contamination=con, random_state=13)
    clf_if.fit(normal_fuse_train_samples)
    y_pred = clf_if.predict(fuse_test_samples)
    # Predictions equal to 1 are mapped to label 0, everything else to label 1.
    test_pred = np.where(y_pred == 1, 0, 1)

    f1 = f1_score(fuse_test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_trees,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(fuse_test_labels, bs, digits=4))

100%|██████████| 24/24 [01:52<00:00,  4.70s/it]

{'n_estimators': 150, 'con': 0.2}
0.802806141381684
              precision    recall  f1-score   support

           0     0.9991    0.8861    0.9392    374702
           1     0.5015    0.9928    0.6664     43236

    accuracy                         0.8972    417938
   macro avg     0.7503    0.9395    0.8028    417938
weighted avg     0.9476    0.8972    0.9110    417938






## Supervised attack classification (Fusion)

We now train a supervised classifier on the fused features to predict multi-class attack labels:
- Features: embeddings + raw numeric features from `df_fuse_train`/`df_fuse_test` (without `Label`, `Attacks`).
- Target: `Attacks` (encoded integer classes from earlier `LabelEncoder`).
- Model: HistGradientBoostingClassifier (fast, strong on tabular data). Class imbalance handled via per-sample weights.
- Metrics: macro F1, per-class report, and confusion matrix.

In [36]:
# Prepare supervised train/test for attack classification
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, f1_score, confusion_matrix
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.utils import class_weight
import pickle

# Multi-class targets (integer-encoded attack classes) for both splits.
y_sup_train = df_fuse_train["Attacks"].copy()
y_sup_test = df_fuse_test["Attacks"].copy()

# Feature matrices: the fused frames without either target column.
X_sup_train = df_fuse_train.drop(columns=["Label", "Attacks"]).copy()
X_sup_test = df_fuse_test.drop(columns=["Label", "Attacks"]).copy()

# "balanced" class weights from the training targets, expanded to one weight
# per training sample to counter class imbalance.
classes = np.unique(y_sup_train)
class_w = class_weight.compute_class_weight(
    class_weight="balanced", classes=classes, y=y_sup_train
)
class_to_w = dict(zip(classes, class_w))
sample_weight = y_sup_train.map(class_to_w).values

# All feature names as str (the embedding columns are integers).
X_sup_train.columns = X_sup_train.columns.astype(str)
X_sup_test.columns = X_sup_test.columns.astype(str)

## Classification Methods (Fused Features)

Now we'll compare multiple classification algorithms on the fused features:

In [34]:
# Random Forest on the fused features: sweep a small (n_estimators, max_depth)
# grid and keep the configuration with the highest macro F1.
# NOTE(review): the winner is chosen by its score on X_sup_test / y_sup_test,
# so the reported "best" macro F1 is measured on the data used for selection;
# consider a separate validation split.
from sklearn.ensemble import RandomForestClassifier
import pickle

print("="*60, "RANDOM FOREST CLASSIFIER (Fused Features)", "="*60, sep="\n")

try:
    n_estimators_grid = [100, 200, 300]
    max_depth_grid = [10, 20, 30]
    # Cartesian product, n_estimators varying slowest (same order as
    # itertools.product).
    params = [(n, d) for n in n_estimators_grid for d in max_depth_grid]

    score = -1
    best_params = None
    best_model = None

    print(f"Testing {len(params)} configurations...")

    for n_est, depth in tqdm(params):
        try:
            rf_clf = RandomForestClassifier(
                class_weight='balanced',
                n_estimators=n_est,
                max_depth=depth,
                n_jobs=-1,
                random_state=13,
            )
            rf_clf.fit(X_sup_train, y_sup_train)

            y_pred_rf = rf_clf.predict(X_sup_test)
            rf_f1 = f1_score(y_sup_test, y_pred_rf, average='macro')

            # Only a strict improvement replaces the incumbent.
            if not rf_f1 > score:
                continue

            score, best_model = rf_f1, rf_clf
            best_params = dict(n_estimators=n_est, max_depth=depth)
            # Persist the current best model (overwrites any earlier winner).
            with open('best_rf_classifier_fused.pkl', 'wb') as f:
                pickle.dump(rf_clf, f)
        except Exception as e:
            # Best-effort sweep: report the failing configuration and move on.
            print(f"\nError with params (n={n_est}, d={depth}): {str(e)}")

    if best_model is None:
        print("\nAll configurations failed.")
    else:
        print("\nBest Parameters:", best_params)
        print(f"Best Macro F1: {score:.4f}")
        print("Model saved to: best_rf_classifier_fused.pkl\n")
        print(classification_report(y_sup_test, best_model.predict(X_sup_test), digits=4))

except Exception as e:
    print(f"Unexpected error in Random Forest: {str(e)}")

RANDOM FOREST CLASSIFIER (Fused Features)
Testing 9 configurations...


100%|██████████| 9/9 [08:49<00:00, 58.82s/it]



Best Parameters: {'n_estimators': 200, 'max_depth': 10}
Best Macro F1: 0.6359
Model saved to: best_rf_classifier_fused.pkl

              precision    recall  f1-score   support

           0     0.9000    0.9617    0.9298    104997
           1     0.9997    0.9521    0.9753     12438
           2     0.8539    0.7451    0.7958       102
           3     0.5455    0.3529    0.4286        34
           4     1.0000    1.0000    1.0000     61790
           5     1.0000    1.0000    1.0000       188
           6     1.0000    0.9544    0.9767     17362
           7     1.0000    0.5000    0.6667      3632
           8     0.0000    0.0000    0.0000      5890
           9     0.0000    0.0000    0.0000      6320
          10     0.0000    0.0000    0.0000      2164
          11     0.6477    0.5000    0.5644     23242
          12     0.7015    0.9516    0.8076     11314
          13     0.2667    0.7500    0.3934        32
          14     1.0000    0.9998    0.9999     11310

    accur

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# XGBoost on the fused features: sweep (n_estimators, max_depth,
# learning_rate) and keep the configuration with the highest macro F1.
# Class imbalance is handled through the precomputed `sample_weight`.
# NOTE(review): selection uses X_sup_test / y_sup_test, so the reported best
# score is measured on the data used for selection.
try:
    from xgboost import XGBClassifier
    import pickle

    print("="*60, "XGBOOST CLASSIFIER (Fused Features)", "="*60, sep="\n")

    n_estimators_grid = [100, 200, 300]
    max_depth_grid = [6, 8, 10]
    learning_rate_grid = [0.01, 0.05, 0.1]
    params = list(
        itertools.product(n_estimators_grid, max_depth_grid, learning_rate_grid)
    )

    score = -1
    best_params = None
    best_model = None

    print(f"Testing {len(params)} configurations...")

    for n_est, depth, lr in tqdm(params):
        try:
            xgb_clf = XGBClassifier(
                tree_method='hist',
                device="cuda",
                n_estimators=n_est,
                max_depth=depth,
                learning_rate=lr,
                random_state=13,
            )
            xgb_clf.fit(X_sup_train, y_sup_train, sample_weight=sample_weight)

            y_pred_xgb = xgb_clf.predict(X_sup_test)
            xgb_f1 = f1_score(y_sup_test, y_pred_xgb, average='macro')

            # Only a strict improvement replaces the incumbent.
            if not xgb_f1 > score:
                continue

            score, best_model = xgb_f1, xgb_clf
            best_params = dict(n_estimators=n_est, max_depth=depth, learning_rate=lr)
            # Persist the current best model (overwrites any earlier winner).
            xgb_clf.save_model('best_xgb_classifier_fused.json')
        except Exception as e:
            # Best-effort sweep: report the failing configuration and move on.
            print(f"\nError with params (n={n_est}, d={depth}, lr={lr}): {str(e)}")

    if best_model is None:
        print("\nAll configurations failed. XGBoost might not support your GPU.")
    else:
        print("\nBest Parameters:", best_params)
        print(f"Best Macro F1: {score:.4f}")
        print("Model saved to: best_xgb_classifier_fused.json\n")
        print(classification_report(y_sup_test, best_model.predict(X_sup_test), digits=4))

except ImportError:
    print("XGBoost not installed. Install with: pip install xgboost")
except Exception as e:
    print(f"Unexpected error: {str(e)}")

XGBOOST CLASSIFIER (Fused Features)
Testing 27 configurations...


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


100%|██████████| 27/27 [29:03<00:00, 64.58s/it]



Best Parameters: {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.1}
Best Macro F1: 0.7945
Model saved to: best_xgb_classifier_fused.json

              precision    recall  f1-score   support

           0     0.9368    0.9688    0.9525     62919
           1     1.0000    1.0000    1.0000     12304
           2     0.8706    0.7708    0.8177        96
           3     0.3830    0.5625    0.4557        32
           4     1.0000    1.0000    1.0000     62294
           5     1.0000    1.0000    1.0000       200
           6     1.0000    0.9443    0.9714     17396
           7     1.0000    0.5000    0.6667      3620
           8     1.0000    0.9998    0.9999      5930
           9     0.2144    0.5000    0.3001      6334
          10     1.0000    0.9830    0.9914      2116
          11     0.7856    0.5000    0.6111     23204
          12     0.8363    0.8847    0.8598     11248
          13     0.2581    0.3333    0.2909        24
          14     1.0000    1.0000    1.00

: 

In [None]:
# CatBoost on the fused features: sweep (iterations, depth, learning_rate) and
# keep the configuration with the highest macro F1. Class imbalance is handled
# by CatBoost itself via auto_class_weights='Balanced'.
# NOTE(review): selection uses X_sup_test / y_sup_test, so the reported best
# score is measured on the data used for selection.
try:
    from catboost import CatBoostClassifier
    
    print("Starting CatBoost Grid Search (Expanded)...")
    
    # Expanded hyperparameter grid
    iterations_grid = [200, 300, 500]
    depth_grid = [4, 8, 10]
    learning_rate_grid = [0.01, 0.05, 0.1]
    
    params = list(itertools.product(iterations_grid, depth_grid, learning_rate_grid))
    print(f"Testing {len(params)} configurations...")
    
    best_score = -1
    best_params = None
    best_model = None
    best_model_path = None
    
    for iterations, depth, lr in tqdm(params):
        try:
            cat_clf = CatBoostClassifier(
                iterations=iterations,
                depth=depth,
                learning_rate=lr,
                auto_class_weights='Balanced',
                random_seed=13,
                verbose=False,
                task_type='GPU'  # Use 'CPU' if GPU not available
            )
            
            cat_clf.fit(X_sup_train, y_sup_train)
            y_pred_cat = cat_clf.predict(X_sup_test)
            cat_f1 = f1_score(y_sup_test, y_pred_cat, average='macro')
            
            if cat_f1 > best_score:
                best_score = cat_f1
                best_params = {
                    'iterations': iterations,
                    'depth': depth,
                    'learning_rate': lr
                }
                best_model = cat_clf
                # Save the best model with unique name for fused features
                best_model_path = "best_catboost_classifier_fused.cbm"
                best_model.save_model(best_model_path)
        except Exception as e:
            # Best-effort sweep: report the failing configuration and move on.
            print(f"\nError with params (it={iterations}, d={depth}, lr={lr}): {str(e)}")
            continue
    
    if best_model is None:
        # Every configuration raised (for example no usable GPU for
        # task_type='GPU'); there is nothing to report or evaluate, and
        # calling best_model.predict would fail with AttributeError.
        print("\nAll configurations failed. CatBoost might not support your GPU (try task_type='CPU').")
    else:
        print("\n" + "="*60)
        print("BEST CATBOOST HYPERPARAMETERS (Fused Features):")
        print(best_params)
        print(f"Best Macro F1: {best_score:.4f}")
        print("="*60)
        if best_model_path:
            print(f"\nBest CatBoost model saved to: {best_model_path}")
        
        # Final evaluation
        y_pred_best = best_model.predict(X_sup_test)
        print("\n" + "="*60)
        print("FINAL CLASSIFICATION REPORT (Best CatBoost - Fused):")
        print("="*60)
        print(classification_report(y_sup_test, y_pred_best, digits=4))
    
except ImportError:
    print("CatBoost not installed. Install with: pip install catboost")

Starting CatBoost Grid Search (Expanded)...
Testing 27 configurations...




In [37]:
# Extra Trees on the fused features: sweep a small (n_estimators, max_depth)
# grid and keep the configuration with the highest macro F1.
# NOTE(review): the winner is chosen by its score on X_sup_test / y_sup_test,
# so the reported "best" macro F1 is measured on the data used for selection.
from sklearn.ensemble import ExtraTreesClassifier
import pickle

print("="*60, "EXTRA TREES CLASSIFIER (Fused Features)", "="*60, sep="\n")

try:
    n_estimators_grid = [100, 200, 300]
    max_depth_grid = [10, 20, 30]
    # Cartesian product, n_estimators varying slowest (same order as
    # itertools.product).
    params = [(n, d) for n in n_estimators_grid for d in max_depth_grid]

    score = -1
    best_params = None
    best_model = None

    print(f"Testing {len(params)} configurations...")

    for n_est, depth in tqdm(params):
        try:
            et_clf = ExtraTreesClassifier(
                class_weight='balanced',
                n_estimators=n_est,
                max_depth=depth,
                n_jobs=-1,
                random_state=13,
            )
            et_clf.fit(X_sup_train, y_sup_train)

            y_pred_et = et_clf.predict(X_sup_test)
            et_f1 = f1_score(y_sup_test, y_pred_et, average='macro')

            # Only a strict improvement replaces the incumbent.
            if not et_f1 > score:
                continue

            score, best_model = et_f1, et_clf
            best_params = dict(n_estimators=n_est, max_depth=depth)
            # Persist the current best model (overwrites any earlier winner).
            with open('best_et_classifier_fused.pkl', 'wb') as f:
                pickle.dump(et_clf, f)
        except Exception as e:
            # Best-effort sweep: report the failing configuration and move on.
            print(f"\nError with params (n={n_est}, d={depth}): {str(e)}")

    if best_model is None:
        print("\nAll configurations failed.")
    else:
        print("\nBest Parameters:", best_params)
        print(f"Best Macro F1: {score:.4f}")
        print("Model saved to: best_et_classifier_fused.pkl\n")
        print(classification_report(y_sup_test, best_model.predict(X_sup_test), digits=4))

except Exception as e:
    print(f"Unexpected error in Extra Trees: {str(e)}")

EXTRA TREES CLASSIFIER (Fused Features)
Testing 9 configurations...


100%|██████████| 9/9 [06:13<00:00, 41.54s/it]



Best Parameters: {'n_estimators': 300, 'max_depth': 10}
Best Macro F1: 0.8262
Model saved to: best_et_classifier_fused.pkl

              precision    recall  f1-score   support

           0     0.9986    0.9531    0.9753    104997
           1     0.9981    0.9992    0.9986     12438
           2     0.8636    0.7451    0.8000       102
           3     0.6000    0.3529    0.4444        34
           4     1.0000    1.0000    1.0000     61790
           5     1.0000    0.9894    0.9947       188
           6     1.0000    1.0000    1.0000     17362
           7     1.0000    1.0000    1.0000      3632
           8     1.0000    1.0000    1.0000      5890
           9     0.0000    0.0000    0.0000      6320
          10     1.0000    1.0000    1.0000      2164
          11     0.7862    1.0000    0.8803     23242
          12     0.6957    0.9868    0.8161     11314
          13     0.3261    0.9375    0.4839        32
          14     1.0000    1.0000    1.0000     11310

    accur

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Raw Features Classification (Comparison Baseline)

Now we'll train classifiers on **raw features only** (without graph embeddings) to compare the benefit of multimodal fusion:

In [38]:
# Baseline inputs without graph embeddings: X_train / X_test minus the
# address columns and the "h" / "id" columns, with targets taken from the
# "Attack" column and integer-encoded by the already-fitted `lab_enc`.

# Targets (string labels, then their encoded form).
y_raw_train = y_train["Attack"].copy()
y_raw_test = y_test["Attack"].copy()

y_raw_train_encoded = lab_enc.transform(y_train["Attack"])
y_raw_test_encoded = lab_enc.transform(y_test["Attack"])

# Features, with column names coerced to str.
X_raw_train = X_train.drop(columns=["IPV4_SRC_ADDR", "IPV4_DST_ADDR", "h", "id"]).copy()
X_raw_train.columns = X_raw_train.columns.map(str)

X_raw_test = X_test.drop(columns=["IPV4_SRC_ADDR", "IPV4_DST_ADDR", "h", "id"]).copy()
X_raw_test.columns = X_raw_test.columns.map(str)

# "balanced" class weights computed on the training targets only, then
# broadcast to one weight per training row.
classes_raw = np.unique(y_raw_train_encoded)
class_w_raw = class_weight.compute_class_weight(
    class_weight="balanced",
    classes=classes_raw,
    y=y_raw_train_encoded,
)
class_to_w_raw = dict(zip(classes_raw, class_w_raw))
sample_weight_raw = pd.Series(y_raw_train_encoded).map(class_to_w_raw).values

print(f"Raw feature shape - Train: {X_raw_train.shape}, Test: {X_raw_test.shape}")
print(f"Number of classes: {len(np.unique(y_raw_train_encoded))}")

Raw feature shape - Train: (304665, 39), Test: (130572, 39)
Number of classes: 15


In [39]:
# Raw Features - Random Forest
# Sweep a small (n_estimators, max_depth) grid on the raw-feature baseline and
# keep the configuration with the highest macro F1. Failures are not caught
# here: any exception aborts the cell.
# NOTE(review): the winner is chosen by its score on X_raw_test /
# y_raw_test_encoded, so the reported "best" macro F1 is measured on the data
# used for selection; consider a separate validation split.
from sklearn.ensemble import RandomForestClassifier
import pickle

print("="*60)
print("RANDOM FOREST (Raw Features Only)")
# Close the title banner, matching the other classifier cells.
print("="*60)

n_estimators_grid = [100, 200, 300]
max_depth_grid = [10, 20, 30]
params = list(itertools.product(n_estimators_grid, max_depth_grid))

score = -1
best_params = None
best_model = None

print(f"Testing {len(params)} configurations...")

for n_est, depth in tqdm(params):
    rf_clf = RandomForestClassifier(
        n_estimators=n_est,
        max_depth=depth,
        class_weight='balanced',
        random_state=13,
        n_jobs=-1
    )
    
    rf_clf.fit(X_raw_train, y_raw_train_encoded)
    y_pred = rf_clf.predict(X_raw_test)
    f1 = f1_score(y_raw_test_encoded, y_pred, average='macro')
    
    # Only a strict improvement replaces the incumbent.
    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est, 'max_depth': depth}
        best_model = rf_clf
        # Save best model (overwrites any earlier winner)
        with open('best_rf_classifier_raw.pkl', 'wb') as f:
            pickle.dump(rf_clf, f)

print("\nBest Parameters:", best_params)
print(f"Best Macro F1: {score:.4f}")
print("Model saved to: best_rf_classifier_raw.pkl\n")
print(classification_report(y_raw_test_encoded, best_model.predict(X_raw_test), digits=4))

RANDOM FOREST (Raw Features Only)
Testing 9 configurations...


100%|██████████| 9/9 [04:12<00:00, 28.11s/it]



Best Parameters: {'n_estimators': 100, 'max_depth': 30}
Best Macro F1: 0.6979
Model saved to: best_rf_classifier_raw.pkl

              precision    recall  f1-score   support

           0     0.9544    0.7950    0.8675     52663
           1     1.0000    0.9997    0.9998      6219
           2     0.1386    0.4510    0.2120        51
           3     0.3846    0.8824    0.5357        17
           4     0.9999    1.0000    1.0000     30895
           5     0.8440    0.9787    0.9064        94
           6     0.9988    0.9980    0.9984      8681
           7     0.9994    1.0000    0.9997      1816
           8     1.0000    0.9997    0.9998      2945
           9     0.2138    1.0000    0.3523      3160
          10     1.0000    0.9982    0.9991      1082
          11     0.0000    0.0000    0.0000     11621
          12     0.2557    0.6470    0.3665      5657
          13     0.3000    0.1875    0.2308        16
          14     1.0000    1.0000    1.0000      5655

    accurac

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [40]:
# XGBoost on the raw-feature baseline: sweep (n_estimators, max_depth,
# learning_rate) and keep the configuration with the highest macro F1.
# Class imbalance is handled through the precomputed `sample_weight_raw`.
# NOTE(review): selection uses X_raw_test / y_raw_test_encoded, so the
# reported best score is measured on the data used for selection.
try:
    from xgboost import XGBClassifier

    print("="*60, "XGBOOST (Raw Features Only)", "="*60, sep="\n")

    n_estimators_grid = [100, 200, 300]
    max_depth_grid = [6, 8, 10]
    learning_rate_grid = [0.01, 0.05, 0.1]
    params = list(
        itertools.product(n_estimators_grid, max_depth_grid, learning_rate_grid)
    )

    score = -1
    best_params = None
    best_model = None

    print(f"Testing {len(params)} configurations...")

    for n_est, depth, lr in tqdm(params):
        xgb_clf = XGBClassifier(
            tree_method='hist',
            device='cuda',
            n_estimators=n_est,
            max_depth=depth,
            learning_rate=lr,
            random_state=13,
        )
        xgb_clf.fit(X_raw_train, y_raw_train_encoded, sample_weight=sample_weight_raw)

        y_pred = xgb_clf.predict(X_raw_test)
        f1 = f1_score(y_raw_test_encoded, y_pred, average='macro')

        # Only a strict improvement replaces the incumbent.
        if not f1 > score:
            continue

        score, best_model = f1, xgb_clf
        best_params = dict(n_estimators=n_est, max_depth=depth, learning_rate=lr)
        # Persist the current best model (overwrites any earlier winner).
        xgb_clf.save_model('best_xgb_classifier_raw.json')

    print("\nBest Parameters:", best_params)
    print(f"Best Macro F1: {score:.4f}")
    print("Model saved to: best_xgb_classifier_raw.json\n")
    print(classification_report(y_raw_test_encoded, best_model.predict(X_raw_test), digits=4))

except ImportError:
    print("XGBoost not installed.")

XGBOOST (Raw Features Only)
Testing 27 configurations...


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


100%|██████████| 27/27 [05:20<00:00, 11.88s/it]


Best Parameters: {'n_estimators': 200, 'max_depth': 8, 'learning_rate': 0.1}
Best Macro F1: 0.7387
Model saved to: best_xgb_classifier_raw.json

              precision    recall  f1-score   support

           0     0.9551    0.7654    0.8498     52663
           1     0.9994    1.0000    0.9997      6219
           2     0.2500    0.5882    0.3509        51
           3     0.3611    0.7647    0.4906        17
           4     0.9998    1.0000    0.9999     30895
           5     0.8426    0.9681    0.9010        94
           6     0.9988    0.9978    0.9983      8681
           7     0.9989    1.0000    0.9994      1816
           8     1.0000    1.0000    1.0000      2945
           9     0.0000    0.0000    0.0000      3160
          10     0.9972    0.9991    0.9982      1082
          11     0.7862    1.0000    0.8803     11621
          12     0.2341    0.6627    0.3460      5657
          13     0.2857    0.2500    0.2667        16
          14     1.0000    1.0000    1.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [41]:
# Raw Features - CatBoost (Expanded grid, no l2_leaf_reg/border_count)
# Sweep (iterations, depth, learning_rate) on the raw-feature baseline and
# keep the configuration with the highest macro F1. Class imbalance is handled
# by CatBoost itself via auto_class_weights='Balanced'.
# NOTE(review): selection uses X_raw_test / y_raw_test_encoded, so the
# reported best score is measured on the data used for selection.
try:
    from catboost import CatBoostClassifier
    
    print("="*60)
    print("CATBOOST (Raw Features Only, Expanded)")
    print("="*60)
    
    iterations_grid = [200, 300, 500]
    depth_grid = [4, 8, 10]
    learning_rate_grid = [0.01, 0.05, 0.1]
    
    params = list(itertools.product(iterations_grid, depth_grid, learning_rate_grid))
    
    best_score = -1
    best_params = None
    best_model = None
    best_model_path = None
    
    print(f"Testing {len(params)} configurations...")
    
    for iterations, depth, lr in tqdm(params):
        try:
            cat_clf = CatBoostClassifier(
                iterations=iterations,
                depth=depth,
                learning_rate=lr,
                auto_class_weights='Balanced',
                random_seed=13,
                verbose=False,
                task_type='GPU'
            )
            
            cat_clf.fit(X_raw_train, y_raw_train_encoded)
            y_pred = cat_clf.predict(X_raw_test)
            f1 = f1_score(y_raw_test_encoded, y_pred, average='macro')
            
            if f1 > best_score:
                best_score = f1
                best_params = {'iterations': iterations, 'depth': depth, 'learning_rate': lr}
                best_model = cat_clf
                # Save the best model with unique name for raw features
                best_model_path = "best_catboost_classifier_raw.cbm"
                best_model.save_model(best_model_path)
        except Exception as e:
            # Best-effort sweep: report the failing configuration and move on.
            print(f"\nError with params (it={iterations}, d={depth}, lr={lr}): {str(e)}")
            continue
    
    if best_model is None:
        # Every configuration raised (for example no usable GPU for
        # task_type='GPU'); there is nothing to report or evaluate, and
        # calling best_model.predict would fail with AttributeError.
        print("\nAll configurations failed. CatBoost might not support your GPU (try task_type='CPU').")
    else:
        print("\nBest Parameters:", best_params)
        print(f"Best Macro F1: {best_score:.4f}\n")
        if best_model_path:
            print(f"Best CatBoost model saved to: {best_model_path}")
        print(classification_report(y_raw_test_encoded, best_model.predict(X_raw_test), digits=4))
    
except ImportError:
    print("CatBoost not installed.")

CATBOOST (Raw Features Only, Expanded)
Testing 27 configurations...


100%|██████████| 27/27 [05:12<00:00, 11.57s/it]



Best Parameters: {'iterations': 500, 'depth': 10, 'learning_rate': 0.1}
Best Macro F1: 0.7170

Best CatBoost model saved to: best_catboost_classifier_raw.cbm
              precision    recall  f1-score   support

           0     0.9556    0.7538    0.8428     52663
           1     0.9974    1.0000    0.9987      6219
           2     0.1571    0.6471    0.2529        51
           3     0.2456    0.8235    0.3784        17
           4     0.9997    1.0000    0.9999     30895
           5     0.8174    1.0000    0.8995        94
           6     0.9982    0.9976    0.9979      8681
           7     0.9983    0.9989    0.9986      1816
           8     0.9990    0.9980    0.9985      2945
           9     0.0000    0.0000    0.0000      3160
          10     0.9854    1.0000    0.9927      1082
          11     0.7862    1.0000    0.8803     11621
          12     0.2289    0.6689    0.3410      5657
          13     0.2857    0.1250    0.1739        16
          14     1.0000    1.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [42]:
# Extra Trees on the raw-feature baseline: sweep a small
# (n_estimators, max_depth) grid and keep the configuration with the highest
# macro F1. Failures are not caught here: any exception aborts the cell.
# NOTE(review): the winner is chosen by its score on X_raw_test /
# y_raw_test_encoded, so the reported "best" macro F1 is measured on the data
# used for selection.
from sklearn.ensemble import ExtraTreesClassifier
import pickle

print("="*60, "EXTRA TREES (Raw Features Only)", "="*60, sep="\n")

n_estimators_grid = [100, 200, 300]
max_depth_grid = [10, 20, 30]
# Cartesian product, n_estimators varying slowest (same order as
# itertools.product).
params = [(n, d) for n in n_estimators_grid for d in max_depth_grid]

score = -1
best_params = None
best_model = None

print(f"Testing {len(params)} configurations...")

for n_est, depth in tqdm(params):
    et_clf = ExtraTreesClassifier(
        class_weight='balanced',
        n_estimators=n_est,
        max_depth=depth,
        n_jobs=-1,
        random_state=13,
    )
    et_clf.fit(X_raw_train, y_raw_train_encoded)

    y_pred = et_clf.predict(X_raw_test)
    f1 = f1_score(y_raw_test_encoded, y_pred, average='macro')

    # Only a strict improvement replaces the incumbent.
    if not f1 > score:
        continue

    score, best_model = f1, et_clf
    best_params = dict(n_estimators=n_est, max_depth=depth)
    # Persist the current best model (overwrites any earlier winner).
    with open('best_et_classifier_raw.pkl', 'wb') as f:
        pickle.dump(et_clf, f)

print("\nBest Parameters:", best_params)
print(f"Best Macro F1: {score:.4f}")
print("Model saved to: best_et_classifier_raw.pkl\n")
print(classification_report(y_raw_test_encoded, best_model.predict(X_raw_test), digits=4))

EXTRA TREES (Raw Features Only)
Testing 9 configurations...


100%|██████████| 9/9 [02:50<00:00, 18.99s/it]



Best Parameters: {'n_estimators': 300, 'max_depth': 30}
Best Macro F1: 0.7311
Model saved to: best_et_classifier_raw.pkl

              precision    recall  f1-score   support

           0     0.9492    0.8420    0.8924     52663
           1     0.9997    0.9998    0.9998      6219
           2     0.1557    0.3725    0.2197        51
           3     0.3684    0.8235    0.5091        17
           4     1.0000    1.0000    1.0000     30895
           5     0.8542    0.8723    0.8632        94
           6     0.9977    0.9984    0.9980      8681
           7     1.0000    1.0000    1.0000      1816
           8     1.0000    1.0000    1.0000      2945
           9     0.0000    0.0000    0.0000      3160
          10     0.9991    0.9991    0.9991      1082
          11     0.7862    1.0000    0.8803     11621
          12     0.2858    0.5816    0.3833      5657
          13     0.2727    0.1875    0.2222        16
          14     1.0000    1.0000    1.0000      5655

    accurac

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Embeddings Only Classification (Graph Features)

Now we'll train classifiers on **embeddings only** (graph features without raw features) to isolate the value of graph-based learning:

In [37]:
# Prepare embeddings-only features (graph features without raw data)
# The embedding block is taken POSITIONALLY: the first `num_embedding_dims`
# columns of each fused frame.
# NOTE(review): this assumes df_fuse_train / df_fuse_test are laid out as
# [0-255] = embeddings, [256+] = raw features -- confirm against the cell that
# builds the fused frames.
num_embedding_dims = 256  # Based on the DGI encoder output dimension

X_emb_train = df_fuse_train.iloc[:, :num_embedding_dims].copy()
y_emb_train = df_fuse_train["Attacks"].copy()

X_emb_test = df_fuse_test.iloc[:, :num_embedding_dims].copy()
y_emb_test = df_fuse_test["Attacks"].copy()

# Feature names to str
X_emb_train.columns = X_emb_train.columns.map(str)
X_emb_test.columns = X_emb_test.columns.map(str)

# Sanity checks on the positional slice: iloc silently returns fewer columns
# when the frame is narrower than expected, and a label column inside the
# slice would leak the target into the features.
assert X_emb_train.shape[1] == num_embedding_dims and X_emb_test.shape[1] == num_embedding_dims, \
    "fused frames have fewer columns than num_embedding_dims"
assert {"Label", "Attacks"}.isdisjoint(X_emb_train.columns) and \
    {"Label", "Attacks"}.isdisjoint(X_emb_test.columns), \
    "label columns found inside the embedding slice"

# Compute sample weights ("balanced" class weights from the training targets,
# broadcast to one weight per training row)
classes_emb = np.unique(y_emb_train)
class_w_emb = class_weight.compute_class_weight(
    class_weight="balanced", classes=classes_emb, y=y_emb_train
)
class_to_w_emb = {c: w for c, w in zip(classes_emb, class_w_emb)}
sample_weight_emb = y_emb_train.map(class_to_w_emb).values

print(f"Embeddings shape - Train: {X_emb_train.shape}, Test: {X_emb_test.shape}")
print(f"Number of classes: {len(np.unique(y_emb_train))}")
print("Using only graph embeddings (no raw features)")

Embeddings shape - Train: (608554, 256), Test: (260815, 256)
Number of classes: 15
Using only graph embeddings (no raw features)


In [44]:
# Random Forest on the embedding-only features: sweep a small
# (n_estimators, max_depth) grid and keep the configuration with the highest
# macro F1. Failures are not caught here: any exception aborts the cell.
# NOTE(review): the winner is chosen by its score on X_emb_test / y_emb_test,
# so the reported "best" macro F1 is measured on the data used for selection.
from sklearn.ensemble import RandomForestClassifier
import pickle

print("="*60, "RANDOM FOREST (Embeddings Only)", "="*60, sep="\n")

n_estimators_grid = [100, 200, 300]
max_depth_grid = [10, 20, 30]
# Cartesian product, n_estimators varying slowest (same order as
# itertools.product).
params = [(n, d) for n in n_estimators_grid for d in max_depth_grid]

score = -1
best_params = None
best_model = None

print(f"Testing {len(params)} configurations...")

for n_est, depth in tqdm(params):
    rf_clf = RandomForestClassifier(
        class_weight='balanced',
        n_estimators=n_est,
        max_depth=depth,
        n_jobs=-1,
        random_state=13,
    )
    rf_clf.fit(X_emb_train, y_emb_train)

    y_pred = rf_clf.predict(X_emb_test)
    f1 = f1_score(y_emb_test, y_pred, average='macro')

    # Only a strict improvement replaces the incumbent.
    if not f1 > score:
        continue

    score, best_model = f1, rf_clf
    best_params = dict(n_estimators=n_est, max_depth=depth)
    # Persist the current best model (overwrites any earlier winner).
    with open('best_rf_classifier_embeddings.pkl', 'wb') as f:
        pickle.dump(rf_clf, f)

print("\nBest Parameters:", best_params)
print(f"Best Macro F1: {score:.4f}")
print("Model saved to: best_rf_classifier_embeddings.pkl\n")
print(classification_report(y_emb_test, best_model.predict(X_emb_test), digits=4))

RANDOM FOREST (Embeddings Only)
Testing 9 configurations...


100%|██████████| 9/9 [15:00<00:00, 100.06s/it]



Best Parameters: {'n_estimators': 200, 'max_depth': 10}
Best Macro F1: 0.4367
Model saved to: best_rf_classifier_embeddings.pkl

              precision    recall  f1-score   support

           0     0.7489    0.9592    0.8411    104997
           1     0.9998    0.9522    0.9755     12438
           2     0.8043    0.7255    0.7629       102
           3     0.8000    0.1176    0.2051        34
           4     0.9970    0.5491    0.7082     61790
           5     0.0035    0.4096    0.0070       188
           6     1.0000    0.9542    0.9765     17362
           7     0.0000    0.0000    0.0000      3632
           8     0.0000    0.0000    0.0000      5890
           9     0.0000    0.0000    0.0000      6320
          10     0.0000    0.0000    0.0000      2164
          11     0.7862    0.5000    0.6113     23242
          12     0.6843    0.9245    0.7865     11314
          13     0.0051    1.0000    0.0101        32
          14     1.0000    0.5000    0.6667     11310

    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [45]:
# XGBoost on the embedding-only features: sweep (n_estimators, max_depth,
# learning_rate) and keep the configuration with the highest macro F1.
# Class imbalance is handled through the precomputed `sample_weight_emb`.
# NOTE(review): selection uses X_emb_test / y_emb_test, so the reported best
# score is measured on the data used for selection.
# NOTE(review): this cell passes device='gpu' while the other XGBoost cells
# pass device='cuda' -- confirm the difference is intentional.
try:
    from xgboost import XGBClassifier

    print("="*60, "XGBOOST (Embeddings Only)", "="*60, sep="\n")

    n_estimators_grid = [100, 200, 300]
    max_depth_grid = [6, 8, 10]
    learning_rate_grid = [0.01, 0.05, 0.1]
    params = list(
        itertools.product(n_estimators_grid, max_depth_grid, learning_rate_grid)
    )

    score = -1
    best_params = None
    best_model = None

    print(f"Testing {len(params)} configurations...")

    for n_est, depth, lr in tqdm(params):
        xgb_clf = XGBClassifier(
            tree_method='hist',
            device='gpu',
            n_estimators=n_est,
            max_depth=depth,
            learning_rate=lr,
            random_state=13,
        )
        xgb_clf.fit(X_emb_train, y_emb_train, sample_weight=sample_weight_emb)

        y_pred = xgb_clf.predict(X_emb_test)
        f1 = f1_score(y_emb_test, y_pred, average='macro')

        # Only a strict improvement replaces the incumbent.
        if not f1 > score:
            continue

        score, best_model = f1, xgb_clf
        best_params = dict(n_estimators=n_est, max_depth=depth, learning_rate=lr)
        # Persist the current best model (overwrites any earlier winner).
        xgb_clf.save_model('best_xgb_classifier_embeddings.json')

    print("\nBest Parameters:", best_params)
    print(f"Best Macro F1: {score:.4f}")
    print("Model saved to: best_xgb_classifier_embeddings.json\n")
    print(classification_report(y_emb_test, best_model.predict(X_emb_test), digits=4))

except ImportError:
    print("XGBoost not installed.")

XGBOOST (Embeddings Only)
Testing 27 configurations...


100%|██████████| 27/27 [30:41<00:00, 68.21s/it] 



Best Parameters: {'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.05}
Best Macro F1: 0.5536
Model saved to: best_xgb_classifier_embeddings.json

              precision    recall  f1-score   support

           0     0.8305    0.9623    0.8916    104997
           1     0.9999    0.8780    0.9350     12438
           2     0.0272    0.8627    0.0528       102
           3     0.8000    0.1176    0.2051        34
           4     0.9969    0.2991    0.4602     61790
           5     0.0032    0.6436    0.0065       188
           6     1.0000    1.0000    1.0000     17362
           7     1.0000    1.0000    1.0000      3632
           8     1.0000    0.5000    0.6667      5890
           9     0.0000    0.0000    0.0000      6320
          10     1.0000    0.5000    0.6667      2164
          11     0.7862    0.5000    0.6113     23242
          12     0.6982    0.9220    0.7947     11314
          13     0.0066    0.6562    0.0131        32
          14     1.0000    1.0000  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [39]:
# Embeddings Only - CatBoost (Expanded grid, no l2_leaf_reg/border_count)
# Sweep (iterations, depth, learning_rate) on the embedding-only features and
# keep the configuration with the highest macro F1. Class imbalance is handled
# by CatBoost itself via auto_class_weights='Balanced'.
# NOTE(review): selection uses X_emb_test / y_emb_test, so the reported best
# score is measured on the data used for selection.
try:
    from catboost import CatBoostClassifier
    
    print("="*60)
    print("CATBOOST (Embeddings Only, Expanded)")
    print("="*60)
    
    iterations_grid = [200, 300, 500]
    depth_grid = [4, 8, 10]
    learning_rate_grid = [0.01, 0.05, 0.1]
    
    params = list(itertools.product(iterations_grid, depth_grid, learning_rate_grid))
    
    best_score = -1
    best_params = None
    best_model = None
    best_model_path = None
    
    print(f"Testing {len(params)} configurations...")
    
    for iterations, depth, lr in tqdm(params):
        try:
            cat_clf = CatBoostClassifier(
                iterations=iterations,
                depth=depth,
                learning_rate=lr,
                auto_class_weights='Balanced',
                random_seed=13,
                verbose=False,
                task_type='GPU'
            )
            
            cat_clf.fit(X_emb_train, y_emb_train)
            y_pred = cat_clf.predict(X_emb_test)
            f1 = f1_score(y_emb_test, y_pred, average='macro')
            
            if f1 > best_score:
                best_score = f1
                best_params = {'iterations': iterations, 'depth': depth, 'learning_rate': lr}
                best_model = cat_clf
                # Save the best model with unique name for embeddings
                best_model_path = "best_catboost_classifier_embeddings.cbm"
                best_model.save_model(best_model_path)
        except Exception as e:
            # Best-effort sweep: report the failing configuration and move on.
            print(f"\nError with params (it={iterations}, d={depth}, lr={lr}): {str(e)}")
            continue
    
    if best_model is None:
        # Every configuration raised (for example no usable GPU for
        # task_type='GPU'); there is nothing to report or evaluate, and
        # calling best_model.predict would fail with AttributeError.
        print("\nAll configurations failed. CatBoost might not support your GPU (try task_type='CPU').")
    else:
        print("\nBest Parameters:", best_params)
        print(f"Best Macro F1: {best_score:.4f}\n")
        if best_model_path:
            print(f"Best CatBoost model saved to: {best_model_path}")
        print(classification_report(y_emb_test, best_model.predict(X_emb_test), digits=4))
    
except ImportError:
    print("CatBoost not installed.")

CATBOOST (Embeddings Only, Expanded)
Testing 27 configurations...


100%|██████████| 27/27 [14:59<00:00, 33.31s/it]



Best Parameters: {'iterations': 500, 'depth': 10, 'learning_rate': 0.1}
Best Macro F1: 0.7352

Best CatBoost model saved to: best_catboost_classifier_embeddings.cbm
              precision    recall  f1-score   support

           0     0.9933    0.9571    0.9749    104997
           1     0.9998    1.0000    0.9999     12438
           2     0.8721    0.7353    0.7979       102
           3     0.3947    0.8824    0.5455        34
           4     0.9970    0.4990    0.6651     61790
           5     0.0031    0.5106    0.0061       188
           6     1.0000    1.0000    1.0000     17362
           7     0.9997    1.0000    0.9999      3632
           8     1.0000    1.0000    1.0000      5890
           9     0.2138    0.5000    0.2995      6320
          10     1.0000    1.0000    1.0000      2164
          11     0.7862    0.5000    0.6113     23242
          12     0.7049    0.9406    0.8059     11314
          13     0.3333    0.3125    0.3226        32
          14     1.0000

In [47]:
# Extra Trees on the embedding-only features: sweep a small
# (n_estimators, max_depth) grid and keep the configuration with the highest
# macro F1. Failures are not caught here: any exception aborts the cell.
# NOTE(review): the winner is chosen by its score on X_emb_test / y_emb_test,
# so the reported "best" macro F1 is measured on the data used for selection.
from sklearn.ensemble import ExtraTreesClassifier
import pickle

print("="*60, "EXTRA TREES (Embeddings Only)", "="*60, sep="\n")

n_estimators_grid = [100, 200, 300]
max_depth_grid = [10, 20, 30]
# Cartesian product, n_estimators varying slowest (same order as
# itertools.product).
params = [(n, d) for n in n_estimators_grid for d in max_depth_grid]

score = -1
best_params = None
best_model = None

print(f"Testing {len(params)} configurations...")

for n_est, depth in tqdm(params):
    et_clf = ExtraTreesClassifier(
        class_weight='balanced',
        n_estimators=n_est,
        max_depth=depth,
        n_jobs=-1,
        random_state=13,
    )
    et_clf.fit(X_emb_train, y_emb_train)

    y_pred = et_clf.predict(X_emb_test)
    f1 = f1_score(y_emb_test, y_pred, average='macro')

    # Only a strict improvement replaces the incumbent.
    if not f1 > score:
        continue

    score, best_model = f1, et_clf
    best_params = dict(n_estimators=n_est, max_depth=depth)
    # Persist the current best model (overwrites any earlier winner).
    with open('best_et_classifier_embeddings.pkl', 'wb') as f:
        pickle.dump(et_clf, f)

print("\nBest Parameters:", best_params)
print(f"Best Macro F1: {score:.4f}")
print("Model saved to: best_et_classifier_embeddings.pkl\n")
print(classification_report(y_emb_test, best_model.predict(X_emb_test), digits=4))

EXTRA TREES (Embeddings Only)
Testing 9 configurations...


100%|██████████| 9/9 [05:23<00:00, 35.97s/it]



Best Parameters: {'n_estimators': 200, 'max_depth': 10}
Best Macro F1: 0.6906
Model saved to: best_et_classifier_embeddings.pkl

              precision    recall  f1-score   support

           0     0.9987    0.9543    0.9760    104997
           1     0.9982    0.9761    0.9870     12438
           2     0.8636    0.7451    0.8000       102
           3     0.8000    0.2353    0.3636        34
           4     0.9969    0.5996    0.7488     61790
           5     0.0033    0.4415    0.0066       188
           6     1.0000    1.0000    1.0000     17362
           7     1.0000    1.0000    1.0000      3632
           8     1.0000    1.0000    1.0000      5890
           9     0.2138    1.0000    0.3523      6320
          10     1.0000    1.0000    1.0000      2164
          11     0.0000    0.0000    0.0000     23242
          12     0.6897    0.9881    0.8123     11314
          13     0.2083    0.6250    0.3125        32
          14     1.0000    1.0000    1.0000     11310

    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Summary Comparison Table

Create a comparison table of all methods (Fused vs Embeddings-only vs Raw features):

In [55]:
# Print the instructions and an empty comparison-table template.
# NOTE: The scores are not collected automatically — after running the cells
# above, record each method's best Macro F1 by hand in place of the placeholders.

summary_width = 95
summary_row_template = "{:<20} {:<25} {:<25} {:<25}"
summary_placeholder = '[INSERT SCORE]'
summary_heavy_rule = "=" * summary_width
summary_light_rule = "-" * summary_width

summary_lines = [
    summary_heavy_rule,
    "PERFORMANCE COMPARISON: FUSED vs EMBEDDINGS vs RAW FEATURES",
    summary_heavy_rule,
    "\nInstructions:",
    "1. Run all cells above to get F1 scores for each method",
    "2. Record the best Macro F1 score for each method",
    "3. Compare three approaches:",
    "   - Fused Features: Embeddings + Raw (Multimodal)",
    "   - Embeddings Only: Graph features only",
    "   - Raw Features: Traditional features only",
    "\nExpected format:",
    summary_light_rule,
    summary_row_template.format('Method', 'Fused (Emb+Raw)', 'Embeddings Only', 'Raw Only'),
    summary_light_rule,
]

# One placeholder row per classifier family, all sharing the header's column layout.
summary_lines.extend(
    summary_row_template.format(
        method_name, summary_placeholder, summary_placeholder, summary_placeholder
    )
    for method_name in ('Random Forest', 'XGBoost', 'CatBoost', 'Extra Trees')
)

summary_lines.extend([
    summary_light_rule,
    "\nAnalysis:",
    "- Fused features should show best performance (multimodal learning)",
    "- Embeddings capture structural/graph patterns",
    "- Raw features provide traditional statistical information",
    "- Compare to understand the contribution of each modality",
])

print("\n".join(summary_lines))

PERFORMANCE COMPARISON: FUSED vs EMBEDDINGS vs RAW FEATURES

Instructions:
1. Run all cells above to get F1 scores for each method
2. Record the best Macro F1 score for each method
3. Compare three approaches:
   - Fused Features: Embeddings + Raw (Multimodal)
   - Embeddings Only: Graph features only
   - Raw Features: Traditional features only

Expected format:
-----------------------------------------------------------------------------------------------
Method               Fused (Emb+Raw)           Embeddings Only           Raw Only                 
-----------------------------------------------------------------------------------------------
Random Forest        [INSERT SCORE]            [INSERT SCORE]            [INSERT SCORE]           
XGBoost              [INSERT SCORE]            [INSERT SCORE]            [INSERT SCORE]           
CatBoost             [INSERT SCORE]            [INSERT SCORE]            [INSERT SCORE]           
Extra Trees          [INSERT SCORE]           

## Key Insights & Interpretation

After running all experiments, analyze the results to answer:

1. **Which approach performs best overall?**
   - Fused features (multimodal) should ideally outperform single modalities
   - Compare the magnitude of improvements

2. **What is the value of graph embeddings?**
   - Compare Embeddings-only vs Raw-only to see if graph structure helps
   - If embeddings alone beat raw features, graph learning is beneficial

3. **Is multimodal fusion effective?**
   - Compare Fused against Embeddings-only and Raw-only, each evaluated separately
   - Synergy should provide additional gains beyond individual modalities

4. **Which classifier is most suitable?**
   - Identify the best performing algorithm for each feature type
   - Consider computational cost vs accuracy trade-offs

5. **Feature contribution analysis:**
   - If Fused ≈ Embeddings > Raw: Graph structure dominates
   - If Fused ≈ Raw > Embeddings: Traditional features dominate
   - If Fused > Both: True synergy from multimodal learning