# Anomaly Detection Model

## Setup & EDA

### Import Libraries

In [1]:
# library for mathematical functions and data preprocessing such as table, etc
import pandas as pd
import numpy as np

# Library for preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split

# Library for machine learning functions/algorithms
from sklearn.cluster import DBSCAN
from sklearn.ensemble import IsolationForest
import tensorflow as tf
from keras.models import Model
from keras.layers import Input, Dense, Dropout
from keras.optimizers import SGD, Nadam
from keras.optimizers.legacy import Adam

# Use the legacy Adam optimizer
from mealpy import WOA, DE
from mealpy.utils.problem import FloatVar
from sklearn.neighbors import NearestNeighbors

#  Library for evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

### KDD99 Dataset

In [2]:
# data
df_kdd99_training = pd.read_csv('kdd99-training.csv')
df_kdd99_training.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal


In [3]:
df_kdd99_testing = pd.read_csv('kdd99-testing.csv')
df_kdd99_testing.head()


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,http,SF,54540,8314,0,0,0,2,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.03,0.03,back.
1,0,tcp,http,SF,54540,8314,0,0,0,2,...,255,1.0,0.0,0.0,0.0,0.01,0.01,0.04,0.04,back.
2,0,tcp,http,SF,54540,8314,0,0,0,2,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.03,0.03,back.
3,0,tcp,http,SF,54540,8314,0,0,0,2,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.02,0.02,back.
4,0,tcp,http,SF,54540,8314,0,0,0,2,...,255,1.0,0.0,0.0,0.0,0.01,0.01,0.05,0.05,back.


In [4]:
df_kdd99_training.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,...,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0,494020.0
mean,47.9794,3025.616,868.5308,4.5e-05,0.006433,1.4e-05,0.034519,0.000152,0.148245,0.010212,...,232.471248,188.666052,0.753781,0.030906,0.601936,0.006684,0.176754,0.176443,0.058118,0.057412
std,707.747185,988219.1,33040.03,0.006673,0.134805,0.00551,0.782103,0.01552,0.355343,1.798328,...,64.744601,106.040205,0.41078,0.109259,0.481309,0.042133,0.380593,0.38092,0.23059,0.230141
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,45.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,46.0,0.41,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,520.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1032.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.04,1.0,0.0,0.0,0.0,0.0,0.0
max,58329.0,693375600.0,5155468.0,1.0,3.0,3.0,30.0,5.0,1.0,884.0,...,255.0,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
df_kdd99_training.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 494020 entries, 0 to 494019
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     494020 non-null  int64  
 1   protocol_type                494020 non-null  object 
 2   service                      494020 non-null  object 
 3   flag                         494020 non-null  object 
 4   src_bytes                    494020 non-null  int64  
 5   dst_bytes                    494020 non-null  int64  
 6   land                         494020 non-null  int64  
 7   wrong_fragment               494020 non-null  int64  
 8   urgent                       494020 non-null  int64  
 9   hot                          494020 non-null  int64  
 10  num_failed_logins            494020 non-null  int64  
 11  logged_in                    494020 non-null  int64  
 12  num_compromised              494020 non-null  int64  
 13 

In [6]:
df_kdd99_testing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 311028 entries, 0 to 311027
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     311028 non-null  int64  
 1   protocol_type                311028 non-null  object 
 2   service                      311028 non-null  object 
 3   flag                         311028 non-null  object 
 4   src_bytes                    311028 non-null  int64  
 5   dst_bytes                    311028 non-null  int64  
 6   land                         311028 non-null  int64  
 7   wrong_fragment               311028 non-null  int64  
 8   urgent                       311028 non-null  int64  
 9   hot                          311028 non-null  int64  
 10  num_failed_logins            311028 non-null  int64  
 11  logged_in                    311028 non-null  int64  
 12  num_compromised              311028 non-null  int64  
 13 

In [7]:
# Check which column needs fixing
for column in df_kdd99_testing.columns:
    unique_values = df_kdd99_testing[column].unique()
    print(f"Unique values in '{column}':")
    print(unique_values)
    print()

Unique values in 'duration':
[    0     9     2 ...  5049 10121  5033]

Unique values in 'protocol_type':
['tcp' 'icmp' 'udp']

Unique values in 'service':
['http' 'telnet' 'login' 'imap4' 'eco_i' 'private' 'smtp' 'whois' 'ssh'
 'ctf' 'ecr_i' 'domain' 'ftp' 'gopher' 'finger' 'rje' 'time' 'uucp' 'efs'
 'other' 'vmnet' 'klogin' 'netbios_ns' 'pop_2' 'netstat' 'remote_job'
 'shell' 'hostnames' 'ftp_data' 'http_443' 'netbios_ssn' 'iso_tsap'
 'csnet_ns' 'link' 'ldap' 'supdup' 'echo' 'pop_3' 'courier' 'name' 'nnsp'
 'exec' 'printer' 'systat' 'daytime' 'bgp' 'sql_net' 'auth' 'Z39_50' 'mtp'
 'kshell' 'uucp_path' 'netbios_dgm' 'sunrpc' 'nntp' 'discard' 'domain_u'
 'urp_i' 'IRC' 'ntp_u' 'urh_i' 'X11' 'tim_i' 'harvest']

Unique values in 'flag':
['SF' 'RSTR' 'RSTO' 'REJ' 'S0' 'SH' 'S1' 'S3' 'S2' 'OTH' 'RSTOS0']

Unique values in 'src_bytes':
[  54540   53168   54020 ...     520    1237 5133876]

Unique values in 'dst_bytes':
[     8314      7300      1460 ...      6944      4850 400291060]

Unique

In [8]:
# Check which column needs fixing
for column in df_kdd99_training.columns:
    unique_values = df_kdd99_training[column].unique()
    print(f"Unique values in '{column}':")
    print(unique_values)
    print()

Unique values in 'duration':
[   0    1   79 ... 2695 2751  120]

Unique values in 'protocol_type':
['tcp' 'udp' 'icmp']

Unique values in 'service':
['http' 'smtp' 'finger' 'domain_u' 'auth' 'telnet' 'ftp' 'eco_i' 'ntp_u'
 'ecr_i' 'other' 'private' 'pop_3' 'ftp_data' 'rje' 'time' 'mtp' 'link'
 'remote_job' 'gopher' 'ssh' 'name' 'whois' 'domain' 'login' 'imap4'
 'daytime' 'ctf' 'nntp' 'shell' 'IRC' 'nnsp' 'http_443' 'exec' 'printer'
 'efs' 'courier' 'uucp' 'klogin' 'kshell' 'echo' 'discard' 'systat'
 'supdup' 'iso_tsap' 'hostnames' 'csnet_ns' 'pop_2' 'sunrpc' 'uucp_path'
 'netbios_ns' 'netbios_ssn' 'netbios_dgm' 'sql_net' 'vmnet' 'bgp' 'Z39_50'
 'ldap' 'netstat' 'urh_i' 'X11' 'urp_i' 'pm_dump' 'tftp_u' 'tim_i' 'red_i']

Unique values in 'flag':
['SF' 'S1' 'REJ' 'S2' 'S0' 'S3' 'RSTO' 'RSTR' 'RSTOS0' 'OTH' 'SH']

Unique values in 'src_bytes':
[181 239 235 ... 500 484 475]

Unique values in 'dst_bytes':
[ 5450   486  1337 ... 34557 10592 13828]

Unique values in 'land':
[0 1]

Unique valu

In [9]:
missing_values_kdd99_testing = df_kdd99_testing.isnull().sum()
print(missing_values_kdd99_testing)

duplicates_kdd99_testing = df_kdd99_testing.duplicated().sum()
print(duplicates_kdd99_testing)


duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      0
dst_bytes                      0
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
num_compromised                0
root_shell                     0
su_attempted                   0
num_root                       0
num_file_creations             0
num_shells                     0
num_access_files               0
num_outbound_cmds              0
is_host_login                  0
is_guest_login                 0
count                          0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_rate                  0
srv_diff_h

In [10]:
missing_values_kdd99_training = df_kdd99_training.isnull().sum()
print(missing_values_kdd99_training)

duplicates_kdd99_training = df_kdd99_training.duplicated().sum()
print(duplicates_kdd99_training)


duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      0
dst_bytes                      0
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
num_compromised                0
root_shell                     0
su_attempted                   0
num_root                       0
num_file_creations             0
num_shells                     0
num_access_files               0
num_outbound_cmds              0
is_host_login                  0
is_guest_login                 0
count                          0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_rate                  0
srv_diff_h

### UNSW-NB15 Dataset

In [11]:
# data
df_unsw_training = pd.read_csv('UNSW-training-set.csv')
df_unsw_testing = pd.read_csv('UNSW-testing-set.csv')
df_unsw_training.head()

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,tcp,-,FIN,6,4,258,172,74.08749,...,1,1,0,0,0,1,1,0,Normal,0
1,2,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,...,1,2,0,0,0,1,6,0,Normal,0
2,3,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,...,1,3,0,0,0,2,6,0,Normal,0
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,...,1,3,1,1,0,2,1,0,Normal,0
4,5,0.449454,tcp,-,FIN,10,6,534,268,33.373826,...,1,40,0,0,0,2,39,0,Normal,0


In [12]:
df_unsw_testing.head()

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,1.1e-05,udp,-,INT,2,0,496,0,90909.0902,...,1,2,0,0,0,1,2,0,Normal,0
1,2,8e-06,udp,-,INT,2,0,1762,0,125000.0003,...,1,2,0,0,0,1,2,0,Normal,0
2,3,5e-06,udp,-,INT,2,0,1068,0,200000.0051,...,1,3,0,0,0,1,3,0,Normal,0
3,4,6e-06,udp,-,INT,2,0,900,0,166666.6608,...,1,3,0,0,0,2,3,0,Normal,0
4,5,1e-05,udp,-,INT,2,0,2126,0,100000.0025,...,1,3,0,0,0,2,3,0,Normal,0


In [13]:
# Check which column needs fixing
df_unsw_training.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175341 entries, 0 to 175340
Data columns (total 45 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 175341 non-null  int64  
 1   dur                175341 non-null  float64
 2   proto              175341 non-null  object 
 3   service            175341 non-null  object 
 4   state              175341 non-null  object 
 5   spkts              175341 non-null  int64  
 6   dpkts              175341 non-null  int64  
 7   sbytes             175341 non-null  int64  
 8   dbytes             175341 non-null  int64  
 9   rate               175341 non-null  float64
 10  sttl               175341 non-null  int64  
 11  dttl               175341 non-null  int64  
 12  sload              175341 non-null  float64
 13  dload              175341 non-null  float64
 14  sloss              175341 non-null  int64  
 15  dloss              175341 non-null  int64  
 16  si

In [14]:
# Check which column needs fixing
df_unsw_testing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82332 entries, 0 to 82331
Data columns (total 45 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 82332 non-null  int64  
 1   dur                82332 non-null  float64
 2   proto              82332 non-null  object 
 3   service            82332 non-null  object 
 4   state              82332 non-null  object 
 5   spkts              82332 non-null  int64  
 6   dpkts              82332 non-null  int64  
 7   sbytes             82332 non-null  int64  
 8   dbytes             82332 non-null  int64  
 9   rate               82332 non-null  float64
 10  sttl               82332 non-null  int64  
 11  dttl               82332 non-null  int64  
 12  sload              82332 non-null  float64
 13  dload              82332 non-null  float64
 14  sloss              82332 non-null  int64  
 15  dloss              82332 non-null  int64  
 16  sinpkt             823

In [15]:
# Check which column needs fixing
for column in df_unsw_training.columns:
    unique_values = df_unsw_training[column].unique()
    print(f"Unique values in '{column}':")
    print(unique_values)
    print()
    

Unique values in 'id':
[     1      2      3 ... 175339 175340 175341]

Unique values in 'dur':
[0.121478 0.649902 1.623129 ... 3.71911  0.996503 1.557125]

Unique values in 'proto':
['tcp' 'udp' 'arp' 'ospf' 'icmp' 'igmp' 'rtp' 'ddp' 'ipv6-frag' 'cftp'
 'wsn' 'pvp' 'wb-expak' 'mtp' 'pri-enc' 'sat-mon' 'cphb' 'sun-nd' 'iso-ip'
 'xtp' 'il' 'unas' 'mfe-nsp' '3pc' 'ipv6-route' 'idrp' 'bna' 'swipe'
 'kryptolan' 'cpnx' 'rsvp' 'wb-mon' 'vmtp' 'ib' 'dgp' 'eigrp' 'ax.25'
 'gmtp' 'pnni' 'sep' 'pgm' 'idpr-cmtp' 'zero' 'rvd' 'mobile' 'narp' 'fc'
 'pipe' 'ipcomp' 'ipv6-no' 'sat-expak' 'ipv6-opts' 'snp' 'ipcv'
 'br-sat-mon' 'ttp' 'tcf' 'nsfnet-igp' 'sprite-rpc' 'aes-sp3-d' 'sccopmce'
 'sctp' 'qnx' 'scps' 'etherip' 'aris' 'pim' 'compaq-peer' 'vrrp' 'iatp'
 'stp' 'l2tp' 'srp' 'sm' 'isis' 'smp' 'fire' 'ptp' 'crtp' 'sps'
 'merit-inp' 'idpr' 'skip' 'any' 'larp' 'ipip' 'micp' 'encap' 'ifmp'
 'tp++' 'a/n' 'ipv6' 'i-nlsp' 'ipx-n-ip' 'sdrp' 'tlsp' 'gre' 'mhrp' 'ddx'
 'ippc' 'visa' 'secure-vmtp' 'uti' 'vines

In [16]:
# Check for null values
missing_values_unsw_training = df_unsw_training.isnull().sum()
print(missing_values_unsw_training)

# Check for duplicate rows
duplicate_unsw_training = df_unsw_training.duplicated().sum()
print(duplicate_unsw_training)

id                   0
dur                  0
proto                0
service              0
state                0
spkts                0
dpkts                0
sbytes               0
dbytes               0
rate                 0
sttl                 0
dttl                 0
sload                0
dload                0
sloss                0
dloss                0
sinpkt               0
dinpkt               0
sjit                 0
djit                 0
swin                 0
stcpb                0
dtcpb                0
dwin                 0
tcprtt               0
synack               0
ackdat               0
smean                0
dmean                0
trans_depth          0
response_body_len    0
ct_srv_src           0
ct_state_ttl         0
ct_dst_ltm           0
ct_src_dport_ltm     0
ct_dst_sport_ltm     0
ct_dst_src_ltm       0
is_ftp_login         0
ct_ftp_cmd           0
ct_flw_http_mthd     0
ct_src_ltm           0
ct_srv_dst           0
is_sm_ips_ports      0
attack_cat 

In [17]:
# Check for null values
missing_values_unsw_testing = df_unsw_testing.isnull().sum()
print(missing_values_unsw_testing)

# Check for duplicate rows
duplicate_unsw_testing = df_unsw_testing.duplicated().sum()
print(duplicate_unsw_testing)

id                   0
dur                  0
proto                0
service              0
state                0
spkts                0
dpkts                0
sbytes               0
dbytes               0
rate                 0
sttl                 0
dttl                 0
sload                0
dload                0
sloss                0
dloss                0
sinpkt               0
dinpkt               0
sjit                 0
djit                 0
swin                 0
stcpb                0
dtcpb                0
dwin                 0
tcprtt               0
synack               0
ackdat               0
smean                0
dmean                0
trans_depth          0
response_body_len    0
ct_srv_src           0
ct_state_ttl         0
ct_dst_ltm           0
ct_src_dport_ltm     0
ct_dst_sport_ltm     0
ct_dst_src_ltm       0
is_ftp_login         0
ct_ftp_cmd           0
ct_flw_http_mthd     0
ct_src_ltm           0
ct_srv_dst           0
is_sm_ips_ports      0
attack_cat 

## Preprocessing

### Remove Duplicates

#### KDD99

In [18]:
print(f"Number of rows before removing duplicates: {len(df_kdd99_testing)}")

# Remove duplicates and overwrite the original DataFrame
df_kdd99_testing = df_kdd99_testing.drop_duplicates()

# Check number of rows after removing duplicates
print(f"Number of rows after removing duplicates: {len(df_kdd99_testing)}")

# Check if any duplicates remain
remaining_duplicates_df_kdd99_testing = df_kdd99_testing.duplicated().sum()
print(f"Number of duplicate rows remaining: {remaining_duplicates_df_kdd99_testing}")


Number of rows before removing duplicates: 311028
Number of rows after removing duplicates: 97325
Number of duplicate rows remaining: 0


#### UNSW

In [19]:
print(f"Number of rows before removing duplicates: {len(df_kdd99_training)}")

# Remove duplicates and overwrite the original DataFrame
df_kdd99_training = df_kdd99_training.drop_duplicates()

# Check number of rows after removing duplicates
print(f"Number of rows after removing duplicates: {len(df_kdd99_training)}")

# Check if any duplicates remain
remaining_duplicates = df_kdd99_training.duplicated().sum()
print(f"Number of duplicate rows remaining: {remaining_duplicates}")


Number of rows before removing duplicates: 494020
Number of rows after removing duplicates: 145585
Number of duplicate rows remaining: 0


### Label Encoding

#### KDD99

In [20]:
le = LabelEncoder() 
#training
df_kdd99_training['protocol_type'] = le.fit_transform(df_kdd99_training['protocol_type'])
df_kdd99_training['service'] = le.fit_transform(df_kdd99_training['service'])
df_kdd99_training['flag'] = le.fit_transform(df_kdd99_training['flag'])

#testing
df_kdd99_testing['protocol_type'] = le.fit_transform(df_kdd99_testing['protocol_type'])
df_kdd99_testing['service'] = le.fit_transform(df_kdd99_testing['service'])
df_kdd99_testing['flag'] = le.fit_transform(df_kdd99_testing['flag'])


#### UNSW

In [21]:
#training
df_unsw_training['state'] = le.fit_transform(df_unsw_training['state'])
df_unsw_training['service'] = le.fit_transform(df_unsw_training['service'])
df_unsw_training['proto'] = le.fit_transform(df_unsw_training['proto'])
#testing
df_unsw_testing['state'] = le.fit_transform(df_unsw_testing['state'])
df_unsw_testing['service'] = le.fit_transform(df_unsw_testing['service'])
df_unsw_testing['proto'] = le.fit_transform(df_unsw_testing['proto'])

### Remove Useless Columns

In [22]:
columns_to_drop_kdd = ['label']

# training
df_autoencoder_kdd99_train = df_kdd99_training[df_kdd99_training['label'] == 'normal'].drop(columns=columns_to_drop_kdd)

y = df_kdd99_training['label']
df_kdd99_training = df_kdd99_training.drop(columns=columns_to_drop_kdd)
X = df_kdd99_training

df_kdd99_train, df_kdd99_validation, df_kdd99_train_label, df_kdd99_validation_label = train_test_split(
    X, y, test_size=0.05, random_state=42, stratify=y
)

# testing
df_kdd99_testing_label = df_kdd99_testing['label']
df_kdd99_testing = df_kdd99_testing.drop(columns=columns_to_drop_kdd)

columns_to_drop_unsw = ['attack_cat', 'id', 'label']

# training
df_autoencoder_unsw_train = df_unsw_training[df_unsw_training['label'] == 0].drop(columns=columns_to_drop_unsw)

y = df_unsw_training['label']
df_unsw_training = df_unsw_training.drop(columns=columns_to_drop_unsw)
X = df_unsw_training

df_unsw_train, df_unsw_validation, df_unsw_train_label, df_unsw_validation_label = train_test_split(
    X, y, test_size=0.05, random_state=42, stratify=y
)

# testing
df_unsw_testing_label = df_unsw_testing['label']
df_unsw_testing = df_unsw_testing.drop(columns=columns_to_drop_unsw)

In [23]:
# KDD99 sampling
df_kdd99_train = df_kdd99_train.sample(frac=0.50, random_state=42)
df_kdd99_validation = df_kdd99_validation.sample(frac=0.50, random_state=42)
df_kdd99_testing = df_kdd99_testing.sample(frac=0.50, random_state=42)
df_autoencoder_kdd99_train = df_autoencoder_kdd99_train.sample(frac=0.50, random_state=42)

# UNSW sampling
df_unsw_train = df_unsw_train.sample(frac=0.5, random_state=42)
df_unsw_validation = df_unsw_validation.sample(frac=0.5, random_state=42)
df_unsw_testing = df_unsw_testing.sample(frac=0.5, random_state=42)
df_autoencoder_unsw_train = df_autoencoder_unsw_train.sample(frac=0.5, random_state=42)

# Update corresponding labels to match sampled data
df_kdd99_train_label = df_kdd99_train_label.loc[df_kdd99_train.index]
df_kdd99_validation_label = df_kdd99_validation_label.loc[df_kdd99_validation.index]
df_kdd99_testing_label = df_kdd99_testing_label.loc[df_kdd99_testing.index]

df_unsw_train_label = df_unsw_train_label.loc[df_unsw_train.index]
df_unsw_validation_label = df_unsw_validation_label.loc[df_unsw_validation.index]
df_unsw_testing_label = df_unsw_testing_label.loc[df_unsw_testing.index]

### Change labels to attack or not attack

In [24]:
df_kdd99_train_label = df_kdd99_train_label.apply(lambda x: 0 if x == 'normal' else 1)
df_kdd99_validation_label = df_kdd99_validation_label.apply(lambda x: 0 if x == 'normal' else 1)
df_kdd99_testing_label = df_kdd99_testing_label.apply(lambda x: 0 if x == 'normal.' else 1)

### Check distribution

In [25]:
# Training dataset anomaly percentage
anomalous_kdd99_train = (df_kdd99_train_label.sum() / len(df_kdd99_train_label)) * 100

# Validation dataset anomaly percentage
anomalous_kdd99_validation = (df_kdd99_validation_label.sum() / len(df_kdd99_validation_label)) * 100

# Testing dataset anomaly percentage
anomalous_kdd99_testing = (df_kdd99_testing_label.sum() / len(df_kdd99_testing_label)) * 100

print(f"KDD99 Training Anomaly Percentage: {anomalous_kdd99_train:.2f}%")
print(f"KDD99 Validation Anomaly Percentage: {anomalous_kdd99_validation:.2f}%")
print(f"KDD99 Testing Anomaly Percentage: {anomalous_kdd99_testing:.2f}%")


KDD99 Training Anomaly Percentage: 39.59%
KDD99 Validation Anomaly Percentage: 38.71%
KDD99 Testing Anomaly Percentage: 42.08%


In [26]:
# Training dataset anomaly percentage
anomalous_unsw_train = (df_unsw_train_label.sum() / len(df_unsw_train_label)) * 100

# Validation dataset anomaly percentage
anomalous_unsw_validation = (df_unsw_validation_label.sum() / len(df_unsw_validation_label)) * 100

# Testing dataset anomaly percentage
anomalous_unsw_testing = (df_unsw_testing_label.sum() / len(df_unsw_testing_label)) * 100

print(f"UNSW Training Anomaly Percentage: {anomalous_unsw_train:.2f}%")
print(f"UNSW Validation Anomaly Percentage: {anomalous_unsw_validation:.2f}%")
print(f"UNSW Testing Anomaly Percentage: {anomalous_unsw_testing:.2f}%")


UNSW Training Anomaly Percentage: 68.02%
UNSW Validation Anomaly Percentage: 67.63%
UNSW Testing Anomaly Percentage: 54.67%


### Normalize Data

In [27]:
scaler = RobustScaler()

# kdd
df_kdd99_testing = pd.DataFrame(scaler.fit_transform(df_kdd99_testing), columns=df_kdd99_testing.columns)
df_autoencoder_kdd99_train = pd.DataFrame(scaler.fit_transform(df_autoencoder_kdd99_train),columns=df_autoencoder_kdd99_train.columns)
df_kdd99_train = pd.DataFrame(scaler.transform(df_kdd99_train),columns=df_kdd99_train.columns)
df_kdd99_validation = pd.DataFrame(scaler.transform(df_kdd99_validation),columns=df_kdd99_validation.columns)

# unsw
df_unsw_testing = pd.DataFrame(scaler.fit_transform(df_unsw_testing), columns=df_unsw_testing.columns)
df_autoencoder_unsw_train = pd.DataFrame(scaler.fit_transform(df_autoencoder_unsw_train),columns=df_autoencoder_unsw_train.columns)
df_unsw_train= pd.DataFrame(scaler.transform(df_unsw_train),columns=df_unsw_train.columns)
df_unsw_validation = pd.DataFrame(scaler.transform(df_unsw_validation),columns=df_unsw_validation.columns)


### Final Check for Dataset

#### KDD99

In [28]:
# kdd
df_kdd99_train_label.unique()


array([0, 1])

In [29]:
df_kdd99_testing_label.unique()


array([1, 0])

In [30]:
df_kdd99_testing.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48662 entries, 0 to 48661
Data columns (total 41 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   duration                     48662 non-null  float64
 1   protocol_type                48662 non-null  float64
 2   service                      48662 non-null  float64
 3   flag                         48662 non-null  float64
 4   src_bytes                    48662 non-null  float64
 5   dst_bytes                    48662 non-null  float64
 6   land                         48662 non-null  float64
 7   wrong_fragment               48662 non-null  float64
 8   urgent                       48662 non-null  float64
 9   hot                          48662 non-null  float64
 10  num_failed_logins            48662 non-null  float64
 11  logged_in                    48662 non-null  float64
 12  num_compromised              48662 non-null  float64
 13  root_shell      

In [31]:
df_autoencoder_kdd99_train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43916 entries, 0 to 43915
Data columns (total 41 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   duration                     43916 non-null  float64
 1   protocol_type                43916 non-null  float64
 2   service                      43916 non-null  float64
 3   flag                         43916 non-null  float64
 4   src_bytes                    43916 non-null  float64
 5   dst_bytes                    43916 non-null  float64
 6   land                         43916 non-null  float64
 7   wrong_fragment               43916 non-null  float64
 8   urgent                       43916 non-null  float64
 9   hot                          43916 non-null  float64
 10  num_failed_logins            43916 non-null  float64
 11  logged_in                    43916 non-null  float64
 12  num_compromised              43916 non-null  float64
 13  root_shell      

In [32]:
df_kdd99_train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69152 entries, 0 to 69151
Data columns (total 41 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   duration                     69152 non-null  float64
 1   protocol_type                69152 non-null  float64
 2   service                      69152 non-null  float64
 3   flag                         69152 non-null  float64
 4   src_bytes                    69152 non-null  float64
 5   dst_bytes                    69152 non-null  float64
 6   land                         69152 non-null  float64
 7   wrong_fragment               69152 non-null  float64
 8   urgent                       69152 non-null  float64
 9   hot                          69152 non-null  float64
 10  num_failed_logins            69152 non-null  float64
 11  logged_in                    69152 non-null  float64
 12  num_compromised              69152 non-null  float64
 13  root_shell      

In [33]:
df_kdd99_validation.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3640 entries, 0 to 3639
Data columns (total 41 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   duration                     3640 non-null   float64
 1   protocol_type                3640 non-null   float64
 2   service                      3640 non-null   float64
 3   flag                         3640 non-null   float64
 4   src_bytes                    3640 non-null   float64
 5   dst_bytes                    3640 non-null   float64
 6   land                         3640 non-null   float64
 7   wrong_fragment               3640 non-null   float64
 8   urgent                       3640 non-null   float64
 9   hot                          3640 non-null   float64
 10  num_failed_logins            3640 non-null   float64
 11  logged_in                    3640 non-null   float64
 12  num_compromised              3640 non-null   float64
 13  root_shell        

#### UNSW

In [34]:
df_unsw_testing.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41166 entries, 0 to 41165
Data columns (total 42 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   dur                41166 non-null  float64
 1   proto              41166 non-null  float64
 2   service            41166 non-null  float64
 3   state              41166 non-null  float64
 4   spkts              41166 non-null  float64
 5   dpkts              41166 non-null  float64
 6   sbytes             41166 non-null  float64
 7   dbytes             41166 non-null  float64
 8   rate               41166 non-null  float64
 9   sttl               41166 non-null  float64
 10  dttl               41166 non-null  float64
 11  sload              41166 non-null  float64
 12  dload              41166 non-null  float64
 13  sloss              41166 non-null  float64
 14  dloss              41166 non-null  float64
 15  sinpkt             41166 non-null  float64
 16  dinpkt             411

In [35]:
df_autoencoder_unsw_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28000 entries, 0 to 27999
Data columns (total 42 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   dur                28000 non-null  float64
 1   proto              28000 non-null  float64
 2   service            28000 non-null  float64
 3   state              28000 non-null  float64
 4   spkts              28000 non-null  float64
 5   dpkts              28000 non-null  float64
 6   sbytes             28000 non-null  float64
 7   dbytes             28000 non-null  float64
 8   rate               28000 non-null  float64
 9   sttl               28000 non-null  float64
 10  dttl               28000 non-null  float64
 11  sload              28000 non-null  float64
 12  dload              28000 non-null  float64
 13  sloss              28000 non-null  float64
 14  dloss              28000 non-null  float64
 15  sinpkt             28000 non-null  float64
 16  dinpkt             280

In [36]:
df_unsw_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83286 entries, 0 to 83285
Data columns (total 42 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   dur                83286 non-null  float64
 1   proto              83286 non-null  float64
 2   service            83286 non-null  float64
 3   state              83286 non-null  float64
 4   spkts              83286 non-null  float64
 5   dpkts              83286 non-null  float64
 6   sbytes             83286 non-null  float64
 7   dbytes             83286 non-null  float64
 8   rate               83286 non-null  float64
 9   sttl               83286 non-null  float64
 10  dttl               83286 non-null  float64
 11  sload              83286 non-null  float64
 12  dload              83286 non-null  float64
 13  sloss              83286 non-null  float64
 14  dloss              83286 non-null  float64
 15  sinpkt             83286 non-null  float64
 16  dinpkt             832

In [37]:
df_unsw_validation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4384 entries, 0 to 4383
Data columns (total 42 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   dur                4384 non-null   float64
 1   proto              4384 non-null   float64
 2   service            4384 non-null   float64
 3   state              4384 non-null   float64
 4   spkts              4384 non-null   float64
 5   dpkts              4384 non-null   float64
 6   sbytes             4384 non-null   float64
 7   dbytes             4384 non-null   float64
 8   rate               4384 non-null   float64
 9   sttl               4384 non-null   float64
 10  dttl               4384 non-null   float64
 11  sload              4384 non-null   float64
 12  dload              4384 non-null   float64
 13  sloss              4384 non-null   float64
 14  dloss              4384 non-null   float64
 15  sinpkt             4384 non-null   float64
 16  dinpkt             4384 

In [38]:
df_unsw_train_label.unique()

array([1, 0])

In [39]:
df_kdd99_testing_label.unique()

array([1, 0])

## **Model Generation with Optimizer + Evaluation**

### +++++++++++++++++++++++++++++++++++++++++++++++ **DBSCAN** +++++++++++++++++++++++++++++++++++++++++++++++

In [40]:
def create_dbscan(eps, min_samples):
    """Create a DBSCAN model with specified parameters."""
    return DBSCAN(
        eps=float(eps),
        min_samples=int(min_samples),
        n_jobs=-1
    )

In [41]:
def objective_function(solution, X_train, X_val, y_val, dataset_name):
    """Improved objective function using validation set and classification metrics"""
    eps, min_samples = solution
    
    # Create and fit DBSCAN
    dbscan = DBSCAN(eps=eps, min_samples=int(min_samples))
    dbscan.fit(X_train)  # Train on training data
    
    # 1. Identify core points from training data
    core_samples_mask = np.zeros_like(dbscan.labels_, dtype=bool)
    core_samples_mask[dbscan.core_sample_indices_] = True
    core_points = X_train[core_samples_mask]
    
    # 2. For each validation point, calculate distance to nearest core point
    if len(core_points) > 0:
        nbrs = NearestNeighbors(n_neighbors=1).fit(core_points)
        distances, _ = nbrs.kneighbors(X_val)
    else:
        distances = np.array([[np.inf]] * len(X_val))
    
    # 3. Set dataset-specific threshold
    if dataset_name == "kdd99":
        percentile = 60
    elif dataset_name == "unsw":
        percentile = 40
    
    if len(core_points) > 0:
        train_distances, _ = nbrs.kneighbors(X_train)
        threshold = np.percentile(train_distances, percentile)
    else:
        threshold = np.inf
    
    # 4. Classify validation points
    y_pred = (distances > threshold).astype(int).flatten()
    
    # Handle degenerate cases
    if len(np.unique(y_pred)) == 1:
        return 1.0  # Worst possible fitness (all same prediction)
    
    # Use F1-score as the optimization metric (can be changed to other metrics)
    return 1 - f1_score(y_val, y_pred)  # Minimize (1 - F1)


In [42]:
def optimize_dbscan(X_train, X_val, y_val, optimizer_type, dataset_name, epochs=50, pop_size=10):
    problem_dict = {
        "bounds": FloatVar(lb=[0.1, 2], ub=[1.0, 20], name="delta"),
        "minmax": "min",
        "obj_func": lambda solution: objective_function(solution, X_train, X_val, y_val, dataset_name)
    }

    # Choose Optimizer
    if optimizer_type == "WOA":
        model = WOA.HI_WOA(epoch=epochs, pop_size=pop_size, feedback_max=10)
    elif optimizer_type == "DE":
        model = DE.JADE(epoch=epochs, pop_size=pop_size,
                        miu_f=0.5, miu_cr=0.5, pt=0.1, ap=0.1)
    else:
        raise ValueError("Optimizer not supported. Choose 'WOA' or 'DE'.")

    # Run Optimization
    g_best = model.solve(problem_dict)
    return g_best.solution

In [43]:
def evaluate_model(model, X_train, X_test, y_test, dataset_name):
    """Evaluate DBSCAN model performance on test set using distance-based approach."""
    # 1. Identify core points from training data
    model.fit(X_train)
    core_samples_mask = np.zeros_like(model.labels_, dtype=bool)
    core_samples_mask[model.core_sample_indices_] = True
    core_points = X_train[core_samples_mask]
    
    # 2. For each test point, calculate distance to nearest core point
    if len(core_points) > 0:
        nbrs = NearestNeighbors(n_neighbors=1).fit(core_points)
        distances, _ = nbrs.kneighbors(X_test)
    else:
        distances = np.array([[np.inf]] * len(X_test))
    
    # 3. Set dataset-specific threshold
    if dataset_name == "kdd99":
        percentile = 60
    elif dataset_name == "unsw":
        percentile = 40
    
    if len(core_points) > 0:
        train_distances, _ = nbrs.kneighbors(X_train)
        threshold = np.percentile(train_distances, percentile)
    else:
        threshold = np.inf
    
    # 4. Classify test points
    y_pred = (distances > threshold).astype(int).flatten()
    y_scores = distances.flatten()  # Use distances as scores for AUC
    
    return {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "auc": roc_auc_score(y_test, y_scores),
        "confusion_matrix": confusion_matrix(y_test, y_pred)
    }




In [44]:
def main_pipeline(dataset_name, optimizer_type):
    """Main pipeline with proper train/test split."""
    # Load dataset
    if dataset_name == "kdd99":
        X_train = df_kdd99_training
        X_val = df_kdd99_validation
        y_val = df_kdd99_validation_label
        X_test = df_kdd99_testing
        y_test = df_kdd99_testing_label
    elif dataset_name == "unsw":
        X_train = df_unsw_train
        X_val = df_unsw_validation
        y_val = df_unsw_validation_label
        X_test = df_unsw_testing
        y_test = df_unsw_testing_label
    else:
        raise ValueError("Dataset not supported. Choose 'kdd99' or 'unsw'.")
    
    # Optimize parameters
    best_params = optimize_dbscan(X_train, X_val, y_val, optimizer_type, dataset_name)
    
    # Create final model with best parameters
    eps, min_samples = best_params
    final_model = DBSCAN(eps=eps, min_samples=int(min_samples))
    
    # Evaluate on test set using distance-based approach
    metrics = evaluate_model(final_model, X_train, X_test, y_test, dataset_name)
    
    # Print results
    print(f"\n=== Results for {dataset_name} with {optimizer_type} ===")
    print("\nBest Parameters:")
    print(f"eps: {eps:.4f}")
    print(f"min_samples: {int(min_samples)}")
    
    print("\nTest Set Metrics:")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1 Score: {metrics['f1']:.4f}")
    print(f"AUC: {metrics['auc']:.4f}")
    print("Confusion Matrix:")
    print(metrics["confusion_matrix"])

In [45]:
main_pipeline('unsw', 'DE')

[WinError 2] The system cannot find the file specified
  File "c:\Users\DSAIRC 01\AppData\Local\Programs\Python\Python311\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\DSAIRC 01\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\DSAIRC 01\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\DSAIRC 01\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
2025/03/26 11:26:39 PM, INFO, mealpy.evolutionary_based.DE.JADE: Solving single objective optimization problem.
2


=== Results for unsw with DE ===

Best Parameters:
eps: 1.0000
min_samples: 18

Test Set Metrics:
Accuracy: 0.5467
Precision: 0.5467
Recall: 1.0000
F1 Score: 0.7069
AUC: 0.3403
Confusion Matrix:
[[    0 18660]
 [    0 22506]]


In [46]:
main_pipeline('kdd99', 'WOA')

2025/03/27 12:26:42 AM, INFO, mealpy.swarm_based.WOA.HI_WOA: Solving single objective optimization problem.
2025/03/27 12:32:16 AM, INFO, mealpy.swarm_based.WOA.HI_WOA: >>>Problem: P, Epoch: 1, Current best: 0.9387755102040817, Global best: 0.9387755102040817, Runtime: 176.76551 seconds
2025/03/27 12:35:18 AM, INFO, mealpy.swarm_based.WOA.HI_WOA: >>>Problem: P, Epoch: 2, Current best: 0.9386581469648563, Global best: 0.9386581469648563, Runtime: 181.96774 seconds
2025/03/27 12:38:20 AM, INFO, mealpy.swarm_based.WOA.HI_WOA: >>>Problem: P, Epoch: 3, Current best: 0.9226450405489707, Global best: 0.9226450405489707, Runtime: 182.14333 seconds
2025/03/27 12:41:26 AM, INFO, mealpy.swarm_based.WOA.HI_WOA: >>>Problem: P, Epoch: 4, Current best: 0.9226450405489707, Global best: 0.9226450405489707, Runtime: 185.84573 seconds
2025/03/27 12:44:31 AM, INFO, mealpy.swarm_based.WOA.HI_WOA: >>>Problem: P, Epoch: 5, Current best: 0.9226450405489707, Global best: 0.9226450405489707, Runtime: 184.62949 


=== Results for kdd99 with WOA ===

Best Parameters:
eps: 1.0000
min_samples: 2

Test Set Metrics:
Accuracy: 0.5479
Precision: 0.2158
Recall: 0.0283
F1 Score: 0.0500
AUC: 0.2044
Confusion Matrix:
[[26082  2104]
 [19897   579]]


In [47]:
main_pipeline('kdd99', 'DE')

2025/03/27 03:59:07 AM, INFO, mealpy.evolutionary_based.DE.JADE: Solving single objective optimization problem.
2025/03/27 04:04:24 AM, INFO, mealpy.evolutionary_based.DE.JADE: >>>Problem: P, Epoch: 1, Current best: 0.9944598337950139, Global best: 0.9944598337950139, Runtime: 161.59975 seconds
2025/03/27 04:07:11 AM, INFO, mealpy.evolutionary_based.DE.JADE: >>>Problem: P, Epoch: 2, Current best: 0.9386581469648563, Global best: 0.9386581469648563, Runtime: 167.17440 seconds
2025/03/27 04:10:10 AM, INFO, mealpy.evolutionary_based.DE.JADE: >>>Problem: P, Epoch: 3, Current best: 0.9386581469648563, Global best: 0.9386581469648563, Runtime: 178.88860 seconds
2025/03/27 04:13:12 AM, INFO, mealpy.evolutionary_based.DE.JADE: >>>Problem: P, Epoch: 4, Current best: 0.9386581469648563, Global best: 0.9386581469648563, Runtime: 181.31031 seconds
2025/03/27 04:16:21 AM, INFO, mealpy.evolutionary_based.DE.JADE: >>>Problem: P, Epoch: 5, Current best: 0.9386581469648563, Global best: 0.9386581469648


=== Results for kdd99 with DE ===

Best Parameters:
eps: 0.1000
min_samples: 2

Test Set Metrics:
Accuracy: 0.5545
Precision: 0.0275
Recall: 0.0017
F1 Score: 0.0032
AUC: 0.5339
Confusion Matrix:
[[26947  1239]
 [20441    35]]


# DONE

In [137]:
main_pipeline('unsw', 'WOA')


2025/03/26 10:29:57 PM, INFO, mealpy.swarm_based.WOA.HI_WOA: Solving single objective optimization problem.
2025/03/26 10:31:07 PM, INFO, mealpy.swarm_based.WOA.HI_WOA: >>>Problem: P, Epoch: 1, Current best: 0.4157303370786517, Global best: 0.4157303370786517, Runtime: 36.53319 seconds
2025/03/26 10:31:42 PM, INFO, mealpy.swarm_based.WOA.HI_WOA: >>>Problem: P, Epoch: 2, Current best: 0.3607355327203894, Global best: 0.3607355327203894, Runtime: 34.50083 seconds
2025/03/26 10:32:18 PM, INFO, mealpy.swarm_based.WOA.HI_WOA: >>>Problem: P, Epoch: 3, Current best: 0.3607355327203894, Global best: 0.3607355327203894, Runtime: 36.34108 seconds
2025/03/26 10:32:53 PM, INFO, mealpy.swarm_based.WOA.HI_WOA: >>>Problem: P, Epoch: 4, Current best: 0.3575932264456855, Global best: 0.3575932264456855, Runtime: 34.63153 seconds
2025/03/26 10:33:29 PM, INFO, mealpy.swarm_based.WOA.HI_WOA: >>>Problem: P, Epoch: 5, Current best: 0.3575932264456855, Global best: 0.3575932264456855, Runtime: 35.71510 secon


=== Results for unsw with WOA ===

Best Parameters:
eps: 0.9999
min_samples: 20

Test Set Metrics:
Accuracy: 0.5456
Precision: 0.5463
Recall: 0.9971
F1 Score: 0.7058
AUC: 0.3377
Confusion Matrix:
[[   21 18639]
 [   66 22440]]
