✅ Identify & drop highly correlated features

From your heatmap, examples of highly collinear pairs (r > 0.95):

spkts and sbytes

smean and spkts

stcpb and dtcpb

sinpkt and dmean

dbytes and dpkts

In [1]:
import pandas as pd

data = pd.read_parquet(r"Data\UNSW_NB15_training-set.parquet", engine="pyarrow")
print(data.head())

        dur proto service state  spkts  dpkts  sbytes  dbytes       rate  \
0  0.121478   tcp       -   FIN      6      4     258     172  74.087486   
1  0.649902   tcp       -   FIN     14     38     734   42014  78.473373   
2  1.623129   tcp       -   FIN      8     16     364   13186  14.170161   
3  1.681642   tcp     ftp   FIN     12     12     628     770  13.677108   
4  0.449454   tcp       -   FIN     10      6     534     268  33.373825   

          sload  ...  trans_depth  response_body_len  ct_src_dport_ltm  \
0  14158.942383  ...            0                  0                 1   
1   8395.112305  ...            0                  0                 1   
2   1572.271851  ...            0                  0                 1   
3   2740.178955  ...            0                  0                 1   
4   8561.499023  ...            0                  0                 2   

   ct_dst_sport_ltm  is_ftp_login  ct_ftp_cmd  ct_flw_http_mthd  \
0                 1            

In [2]:
#Drop one feature per pair:

high_corr_features = ['spkts', 'smean', 'dtcpb', 'dmean', 'dpkts']
data_reduced = data.drop(columns=high_corr_features)
data_reduced.columns


Index(['dur', 'proto', 'service', 'state', 'sbytes', 'dbytes', 'rate', 'sload',
       'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin',
       'stcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'trans_depth',
       'response_body_len', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'is_sm_ips_ports',
       'attack_cat', 'label'],
      dtype='object')

Encode categorical variables

Normalize or standardize numerical features

In [3]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

data_encoded = data.copy()
for col in data_encoded.select_dtypes(include='category').columns:
    data_encoded[col] = LabelEncoder().fit_transform(data_encoded[col])

# Repeat for df_reduced
scaler = StandardScaler()
X_full = scaler.fit_transform(data_encoded.drop(columns=['attack_cat', 'label']))
X_reduced = scaler.fit_transform(data_encoded.drop(columns=high_corr_features + ['attack_cat', 'label']))
y = data_encoded['label']


In [4]:
X_reduced.shape

(175341, 29)

In [5]:
from sklearn.ensemble import IsolationForest
from sklearn.mixture import GaussianMixture
from sklearn.cluster import DBSCAN

# Isolation Forest
iso = IsolationForest(contamination=0.05, random_state=42)
pred_iso = iso.fit_predict(X_full)  # -1 = anomaly, 1 = normal


In [6]:
# GMM
gmm = GaussianMixture(n_components=2, random_state=42)
gmm_labels = gmm.fit_predict(X_full)


In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=10, random_state=42)
X_pca = pca.fit_transform(X_full)

dbscan = DBSCAN(eps=1.5, min_samples=5)
db_labels = dbscan.fit_predict(X_pca)


In [7]:
gmm_labels

array([0, 0, 0, ..., 1, 1, 1], shape=(175341,))

In [8]:
from sklearn.metrics import f1_score, adjusted_rand_score

f1 = f1_score(y, (pred_iso == -1).astype(int))
ari = adjusted_rand_score(y, (pred_iso == -1).astype(int))
print("Isolation Forest F1:", f1, "| ARI:", ari)


Isolation Forest F1: 0.08965872545040122 | ARI: 0.003743917028665791


In [9]:
from sklearn.metrics import f1_score, adjusted_rand_score

f1 = f1_score(y, gmm_labels.astype(int))
ari = adjusted_rand_score(y, gmm_labels.astype(int))
print("Gmm F1:", f1, "| ARI:", ari)


Gmm F1: 0.7283791995027622 | ARI: 0.1091433542732965


In [10]:
test_data = pd.read_parquet(r"Data\UNSW_NB15_testing-set.parquet", engine="pyarrow")
test_data.shape

(82332, 36)

In [11]:
test_data.columns

Index(['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt',
       'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt',
       'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'is_sm_ips_ports',
       'attack_cat', 'label'],
      dtype='object')

In [12]:
Test_data_copy = test_data.copy()

In [13]:
for col in test_data.select_dtypes(include='category').columns:
    test_data[col] = LabelEncoder().fit_transform(test_data[col])
X_test = scaler.fit_transform(test_data.drop(columns=['attack_cat', 'label']))
y_test = test_data['label']

In [14]:
print(X_test.shape)
print(y_test.shape)

(82332, 34)
(82332,)


In [32]:
X_full.shape

(175341, 34)

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix


X_train = X_full
y_train = y
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print(classification_report(y_test, pred))


              precision    recall  f1-score   support

           0       0.99      0.57      0.72     37000
           1       0.74      1.00      0.85     45332

    accuracy                           0.80     82332
   macro avg       0.86      0.78      0.79     82332
weighted avg       0.85      0.80      0.79     82332



In [None]:
support_vector = SVC(kernel='linear',random_state=42)
support_vector.fit(X_train, y_train)
pred = support_vector.predict(X_test)
print(classification_report(y_test, pred))

In [16]:
Knn = KNeighborsClassifier()
Knn.fit(X_train, y_train)
pred = Knn.predict(X_test)
print(classification_report(y_test, pred))


              precision    recall  f1-score   support

           0       0.94      0.66      0.77     37000
           1       0.78      0.97      0.86     45332

    accuracy                           0.83     82332
   macro avg       0.86      0.81      0.82     82332
weighted avg       0.85      0.83      0.82     82332



In [21]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
pred = decision_tree.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.75      0.53      0.62     37000
           1       0.69      0.85      0.76     45332

    accuracy                           0.71     82332
   macro avg       0.72      0.69      0.69     82332
weighted avg       0.72      0.71      0.70     82332

