✅ Identify & drop highly correlated features

From your heatmap, examples of highly collinear pairs (r > 0.95):

spkts and sbytes

smean and spkts

stcpb and dtcpb

sinpkt and dmean

dbytes and dpkts

In [1]:
import pandas as pd

data = pd.read_parquet(r"Data\UNSW_NB15_training-set.parquet", engine="pyarrow")
print(data.head())

        dur proto service state  spkts  dpkts  sbytes  dbytes       rate  \
0  0.121478   tcp       -   FIN      6      4     258     172  74.087486   
1  0.649902   tcp       -   FIN     14     38     734   42014  78.473373   
2  1.623129   tcp       -   FIN      8     16     364   13186  14.170161   
3  1.681642   tcp     ftp   FIN     12     12     628     770  13.677108   
4  0.449454   tcp       -   FIN     10      6     534     268  33.373825   

          sload  ...  trans_depth  response_body_len  ct_src_dport_ltm  \
0  14158.942383  ...            0                  0                 1   
1   8395.112305  ...            0                  0                 1   
2   1572.271851  ...            0                  0                 1   
3   2740.178955  ...            0                  0                 1   
4   8561.499023  ...            0                  0                 2   

   ct_dst_sport_ltm  is_ftp_login  ct_ftp_cmd  ct_flw_http_mthd  \
0                 1            

In [2]:
#Drop one feature per pair:

high_corr_features = ['spkts', 'smean', 'dtcpb', 'dmean', 'dpkts']
data_reduced = data.drop(columns=high_corr_features)
data_reduced.columns


Index(['dur', 'proto', 'service', 'state', 'sbytes', 'dbytes', 'rate', 'sload',
       'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin',
       'stcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'trans_depth',
       'response_body_len', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'is_sm_ips_ports',
       'attack_cat', 'label'],
      dtype='object')

Encode categorical variables

Normalize or standardize numerical features

In [3]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

data_encoded = data.copy()
for col in data_encoded.select_dtypes(include='category').columns:
    data_encoded[col] = LabelEncoder().fit_transform(data_encoded[col])

# Repeat for df_reduced
scaler = StandardScaler()
X_full = scaler.fit_transform(data_encoded.drop(columns=['attack_cat', 'label']))
X_reduced = scaler.fit_transform(data_encoded.drop(columns=high_corr_features + ['attack_cat', 'label']))
y = data_encoded['label']


In [4]:
X_reduced.shape

(175341, 29)

In [5]:
from sklearn.ensemble import IsolationForest
from sklearn.mixture import GaussianMixture
from sklearn.cluster import DBSCAN

# Isolation Forest
iso = IsolationForest(contamination=0.05, random_state=42)
pred_iso = iso.fit_predict(X_full)  # -1 = anomaly, 1 = normal


In [6]:
# GMM
gmm = GaussianMixture(n_components=2, random_state=42)
gmm_labels = gmm.fit_predict(X_full)


In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=10, random_state=42)
X_pca = pca.fit_transform(X_full)

dbscan = DBSCAN(eps=1.5, min_samples=5)
db_labels = dbscan.fit_predict(X_pca)


In [7]:
gmm_labels

array([0, 0, 0, ..., 1, 1, 1], shape=(175341,))

In [8]:
from sklearn.metrics import f1_score, adjusted_rand_score

f1 = f1_score(y, (pred_iso == -1).astype(int))
ari = adjusted_rand_score(y, (pred_iso == -1).astype(int))
print("Isolation Forest F1:", f1, "| ARI:", ari)


Isolation Forest F1: 0.08965872545040122 | ARI: 0.003743917028665791


In [9]:
from sklearn.metrics import f1_score, adjusted_rand_score

f1 = f1_score(y, gmm_labels.astype(int))
ari = adjusted_rand_score(y, gmm_labels.astype(int))
print("Gmm F1:", f1, "| ARI:", ari)


Gmm F1: 0.7283791995027622 | ARI: 0.1091433542732965


In [10]:
test_data = pd.read_parquet(r"Data\UNSW_NB15_testing-set.parquet", engine="pyarrow")
test_data.shape

(82332, 36)

In [11]:
test_data.columns

Index(['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt',
       'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt',
       'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'is_sm_ips_ports',
       'attack_cat', 'label'],
      dtype='object')

In [12]:
Test_data_copy = test_data.copy()

In [13]:
for col in test_data.select_dtypes(include='category').columns:
    test_data[col] = LabelEncoder().fit_transform(test_data[col])
X_test = scaler.fit_transform(test_data.drop(columns=['attack_cat', 'label']))
y_test = test_data['label']

In [14]:
print(X_test.shape)
print(y_test.shape)

(82332, 34)
(82332,)


In [15]:
X_full.shape

(175341, 34)

In [16]:
y.shape

(175341,)

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix


X_train = X_full
y_train = y
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print(classification_report(y_test, pred))


              precision    recall  f1-score   support

           0       0.99      0.56      0.71     37000
           1       0.73      1.00      0.84     45332

    accuracy                           0.80     82332
   macro avg       0.86      0.78      0.78     82332
weighted avg       0.85      0.80      0.78     82332



In [None]:
support_vector = SVC(kernel='linear',random_state=42)
support_vector.fit(X_train, y_train)
pred = support_vector.predict(X_test)
print(classification_report(y_test, pred))

In [18]:
Knn = KNeighborsClassifier()
Knn.fit(X_train, y_train)
pred = Knn.predict(X_test)
print(classification_report(y_test, pred))


              precision    recall  f1-score   support

           0       0.94      0.66      0.77     37000
           1       0.78      0.97      0.86     45332

    accuracy                           0.83     82332
   macro avg       0.86      0.81      0.82     82332
weighted avg       0.85      0.83      0.82     82332



In [19]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
pred = decision_tree.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.87      0.54      0.67     37000
           1       0.71      0.93      0.81     45332

    accuracy                           0.76     82332
   macro avg       0.79      0.74      0.74     82332
weighted avg       0.78      0.76      0.74     82332



Now let's fit knn, decsion trees, random forest and gmm (since they're performing quiet well) using dataset that we have removed some elements that had collinearlity

In [26]:
# GMM
gmm = GaussianMixture(n_components=2, random_state=42)
gmm_labels = gmm.fit_predict(X_reduced)

f1 = f1_score(y, gmm_labels.astype(int))
ari = adjusted_rand_score(y, gmm_labels.astype(int))
print("Gmm F1:", f1, "| ARI:", ari)


Gmm F1: 0.035621751104013555 | ARI: 0.0022309682063044673


In [20]:
X_test

array([[-0.21372745,  0.41056274, -0.6744059 , ..., -0.09061736,
        -0.20314282, -0.10607007],
       [-0.21372808,  0.41056274, -0.6744059 , ..., -0.09061736,
        -0.20314282, -0.10607007],
       [-0.21372872,  0.41056274, -0.6744059 , ..., -0.09061736,
        -0.20314282, -0.10607007],
       ...,
       [-0.21372978, -5.54527746, -0.6744059 , ..., -0.09061736,
        -0.20314282,  9.42773017],
       [-0.21372978, -5.54527746, -0.6744059 , ..., -0.09061736,
        -0.20314282,  9.42773017],
       [-0.21372787,  0.41056274, -0.6744059 , ..., -0.09061736,
        -0.20314282, -0.10607007]], shape=(82332, 34))

In [89]:
X_test = scaler.fit_transform(X_test)

In [21]:
X_test[:5]

array([[-0.21372745,  0.41056274, -0.6744059 ,  0.93269533, -0.1244551 ,
        -0.15181641, -0.04368361, -0.08736871,  0.05718099,  0.6439127 ,
        -0.26349797, -0.07353054, -0.11324391, -0.12217934, -0.09416902,
        -0.11217671, -0.14721835, -1.04791956, -0.77984004, -0.77675409,
        -1.00624379, -0.48202491, -0.41290971, -0.48407269,  0.52031938,
        -0.47537059, -0.17364821, -0.04190986, -0.46831162, -0.45018649,
        -0.09085748, -0.09061736, -0.20314282, -0.10607007],
       [-0.21372808,  0.41056274, -0.6744059 ,  0.93269533, -0.1244551 ,
        -0.15181641, -0.03630776, -0.08736871,  0.28656485,  4.53935073,
        -0.26349797, -0.07353054, -0.11324391, -0.12217982, -0.09416902,
        -0.11217671, -0.14721835, -1.04791956, -0.77984004, -0.77675409,
        -1.00624379, -0.48202491, -0.41290971, -0.48407269,  3.55671589,
        -0.47537059, -0.17364821, -0.04190986, -0.46831162, -0.45018649,
        -0.09085748, -0.09061736, -0.20314282, -0.10607007],
  

In [None]:
clf = RandomForestClassifier()
clf.fit(X_reduced, y_train)
pred = clf.predict(X_test)
print(classification_report(y_test, pred))

In [36]:
Knn = KNeighborsClassifier()
Knn.fit(X_reduced, y_train)
pred = Knn.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.94      0.65      0.77     37000
           1       0.77      0.97      0.86     45332

    accuracy                           0.82     82332
   macro avg       0.85      0.81      0.81     82332
weighted avg       0.85      0.82      0.82     82332



since this dataset is imbalanced we'll try to handle the dataset using different techniques and try and increase the F1 score of the minority class

In [22]:
from imblearn.over_sampling import SMOTE
sm  = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_full, y)

In [23]:
print('After OverSampling, the shape of train_X: {}'.format(X_train.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train.shape))

print("After OverSampling, counts of label '1': {}".format(sum(y_train == 1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train == 0)))

After OverSampling, the shape of train_X: (238682, 34)
After OverSampling, the shape of train_y: (238682,) 

After OverSampling, counts of label '1': 119341
After OverSampling, counts of label '0': 119341


In [24]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix



In [25]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
pred = decision_tree.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.45      0.94      0.61     37000
           1       0.54      0.06      0.11     45332

    accuracy                           0.45     82332
   macro avg       0.50      0.50      0.36     82332
weighted avg       0.50      0.45      0.33     82332



In [26]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.77      0.88      0.82     37000
           1       0.89      0.78      0.83     45332

    accuracy                           0.82     82332
   macro avg       0.83      0.83      0.82     82332
weighted avg       0.83      0.82      0.83     82332



In [27]:
Knn = KNeighborsClassifier()
Knn.fit(X_train, y_train)
pred = Knn.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.89      0.75      0.82     37000
           1       0.82      0.93      0.87     45332

    accuracy                           0.85     82332
   macro avg       0.86      0.84      0.84     82332
weighted avg       0.85      0.85      0.85     82332



At this point I decided to go with knn classifier and also random forest. There was not much I could do to the data to improve the metrics. Let's try fine tuning the parameteres using random searchCV and see if we can improve the metrics by 2-3%

In [35]:
#view the parameters of the current model
Knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [36]:
#view the parameters of the model
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

We specify different parameters for each of our selected model

In [None]:
knn_param_grid = {
    'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 20, 30, 40],
    'n_neighbors': [2,3,5,6],
    'weights': ['uniform', 'distance']  
}

clf_param_grid = {
    'bootstrap': [True, False],
    'min_samples_split': [2,4,5,6],
    'class_weight': None,
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': None,
    'max_features': ['sqrt','log2', None],
    'monotonic_cst': None,
    'n_estimators': [10,20, 40, 50, 80,100],
    'oob_score': [True, False],
    'random_state': 42,
    }

In [None]:
from sklearn.model_selection import RandomizedSearchCV

