In [1]:
import time

import joblib
import numpy as np
import pandas as pd

from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


## Isolation Forest

In [2]:
# Read the Monday feature matrix (benign-only and already scaled, going by the
# file name) that the Isolation Forest is fitted on.
monday_features_csv = "../datasets/feature_sets/cicids_monday_benign_isoforest_features_scaled.csv"
X_mon = pd.read_csv(monday_features_csv)

# Cell output: (n_rows, n_columns) of the loaded frame.
X_mon.shape


(529481, 78)

In [3]:
# Preview the first five rows to sanity-check the loaded columns and values.
X_mon.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,68,69,70,71,72,73,74,75,76,77
0,1.803407,-0.36151,-0.009406,-0.00982,-0.083587,-0.006693,-0.412139,-0.393785,-0.486765,-0.392281,...,-0.007532,0.00658,-0.116587,-0.10887,-0.141407,-0.087755,-0.267178,-0.093323,-0.270187,-0.257263
1,1.803407,-0.36151,-0.009406,-0.00982,-0.083587,-0.006693,-0.412139,-0.393785,-0.486765,-0.392281,...,-0.007532,0.00658,-0.116587,-0.10887,-0.141407,-0.087755,-0.267178,-0.093323,-0.270187,-0.257263
2,1.803407,-0.36151,-0.009406,-0.00982,-0.083587,-0.006693,-0.412139,-0.393785,-0.486765,-0.392281,...,-0.007532,0.00658,-0.116587,-0.10887,-0.141407,-0.087755,-0.267178,-0.093323,-0.270187,-0.257263
3,1.803407,-0.36151,-0.009406,-0.00982,-0.083587,-0.006693,-0.412139,-0.393785,-0.486765,-0.392281,...,-0.007532,0.00658,-0.116587,-0.10887,-0.141407,-0.087755,-0.267178,-0.093323,-0.270187,-0.257263
4,1.817345,-0.36151,-0.009406,-0.00982,-0.083587,-0.006693,-0.412139,-0.393785,-0.486765,-0.392281,...,-0.007532,0.00658,-0.116587,-0.10887,-0.141407,-0.087755,-0.267178,-0.093323,-0.270187,-0.257263


#### Training the IsoForest Model

In [4]:
# Fit an Isolation Forest on the Monday feature matrix.
# contamination=0.05 sets the decision threshold so that about 5% of the
# training rows are labelled as outliers by predict(), even though the
# training file is meant to contain only benign traffic.
iso_forest = IsolationForest(
    n_estimators=100,
    contamination=0.05,
    random_state=42,   # fixed seed so the trees are reproducible
    n_jobs=-1,         # use all CPU cores
)

# Cell output: the fitted estimator's repr.
iso_forest.fit(X_mon)


0,1,2
,"n_estimators  n_estimators: int, default=100 The number of base estimators in the ensemble.",100
,"max_samples  max_samples: ""auto"", int or float, default=""auto"" The number of samples to draw from X to train each base estimator. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. - If ""auto"", then `max_samples=min(256, n_samples)`. If max_samples is larger than the number of samples provided, all samples will be used for all trees (no sampling).",'auto'
,"contamination  contamination: 'auto' or float, default='auto' The amount of contamination of the data set, i.e. the proportion of outliers in the data set. Used when fitting to define the threshold on the scores of the samples. - If 'auto', the threshold is determined as in the  original paper. - If float, the contamination should be in the range (0, 0.5]. .. versionchanged:: 0.22  The default value of ``contamination`` changed from 0.1  to ``'auto'``.",0.05
,"max_features  max_features: int or float, default=1.0 The number of features to draw from X to train each base estimator. - If int, then draw `max_features` features. - If float, then draw `max(1, int(max_features * n_features_in_))` features. Note: using a float number less than 1.0 or integer less than number of features will enable feature subsampling and leads to a longer runtime.",1.0
,"bootstrap  bootstrap: bool, default=False If True, individual trees are fit on random subsets of the training data sampled with replacement. If False, sampling without replacement is performed.",False
,"n_jobs  n_jobs: int, default=None The number of jobs to run in parallel for :meth:`fit`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",-1
,"random_state  random_state: int, RandomState instance or None, default=None Controls the pseudo-randomness of the selection of the feature and split values for each branching step and each tree in the forest. Pass an int for reproducible results across multiple function calls. See :term:`Glossary `.",42
,"verbose  verbose: int, default=0 Controls the verbosity of the tree building process.",0
,"warm_start  warm_start: bool, default=False When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest. See :term:`the Glossary `. .. versionadded:: 0.21",False


#### Internal Validation
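- Expected output: the number of training samples predicted as Normal versus Anomaly.
- The model was trained exclusively on benign traffic, so any anomalies it flags here are deviations from the learned normal behaviour rather than labelled attacks. With `contamination=0.05`, roughly 5% of the training samples are flagged by construction.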

In [5]:
# Internal sanity check: score the same rows the forest was fitted on.
# predict() yields +1 for inliers and -1 for outliers; map them to readable labels.
iso_label_names = {-1: "Anomaly", 1: "Normal"}
preds = iso_forest.predict(X_mon)
pred_series = pd.Series(preds).map(iso_label_names)

# Cell output: number of rows per predicted label.
pred_series.value_counts()


Normal     503406
Anomaly     26075
Name: count, dtype: int64

In [6]:
# Comparing with labels
y_mon_true = pd.read_csv(
    "../datasets/feature_sets/cicids_monday_labels_encoded.csv"
)

comparison = pd.crosstab(
    y_mon_true.squeeze(),
    pred_series,
    rownames=["Actual"],
    colnames=["Isolation Forest Prediction"]
)

comparison

Isolation Forest Prediction,Anomaly,Normal
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,26075,503406


#### Saving the Trained Isolation Forest Model

In [7]:
# Persist the fitted Isolation Forest to disk.
# Cell output: the list of file paths returned by joblib.dump.
joblib.dump(value=iso_forest, filename="../models/isolation_forest_cicids_monday.pkl")


['../models/isolation_forest_cicids_monday.pkl']

#### Testing on another file (Tuesday) containing a mixed dataset with both benign and attack traffic

In [8]:
# Load the mixed Tuesday feature matrix (already scaled, going by the file name).
# NOTE: the name X_test is reassigned further down by train_test_split.
tuesday_features_csv = "../datasets/preprocessed/cicids_tuesday_features_scaled.csv"
X_test = pd.read_csv(tuesday_features_csv)

In [9]:
# Apply the Monday-trained forest to the Tuesday features.
# This rebinds preds / pred_series, replacing the Monday results computed earlier.
# NOTE(review): confirm the Tuesday file uses the same scaler and column order as
# the Monday training file; the two are read from different directories.
tuesday_label_names = {-1: "Anomaly", 1: "Normal"}
preds = iso_forest.predict(X_test)
pred_series = pd.Series(preds).map(tuesday_label_names)

# Cell output: number of rows per predicted label.
pred_series.value_counts()

Normal     416008
Anomaly     29637
Name: count, dtype: int64

In [10]:
# Compare the encoded Tuesday labels with the Isolation Forest predictions.
y_true = pd.read_csv(
    "../datasets/preprocessed/cicids_tuesday_labels_encoded.csv"
)

# pd.crosstab aligns Series on their index, so rows present in only one of the
# two inputs would be dropped without any error. Fail loudly instead.
assert len(y_true) == len(pred_series), (
    f"Label/prediction length mismatch: {len(y_true)} vs {len(pred_series)}"
)

comparison = pd.crosstab(
    # Squeeze along columns only, so a single-row label file still yields a Series.
    y_true.squeeze("columns"),
    pred_series,
    rownames=["Actual"],
    colnames=["Isolation Forest Prediction"]
)

# Cell output: rows are the encoded true labels, columns the predicted label.
# NOTE(review): presumably 0 is the benign class and the other codes are attack
# types - confirm against the label encoder used during preprocessing.
comparison


Isolation Forest Prediction,Anomaly,Normal
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,29637,402176
1,0,7935
2,0,5897


## Training the SGD-SVM 
- A traditional SVM was not practical for a dataset of this size.
- `SGDClassifier` with hinge loss trains a linear SVM using stochastic gradient descent, which is optimized for large datasets.

In [2]:
# Read the combined feature matrix and its label file from disk.
X = pd.read_csv("../datasets/feature_sets/cicids_combined_mixed_features.csv")
labels_frame = pd.read_csv("../datasets/feature_sets/cicids_combined_mixed_labels.csv")

# Flatten the label frame into a 1-D NumPy array.
y = labels_frame.to_numpy().ravel()

# Quick shape check: the row counts of X and y should agree.
print(X.shape, y.shape)

(2827876, 78) (2827876,)


#### Train Test Split (stratified)

In [3]:
# Hold out 20% of the rows for evaluation; stratify=y keeps the class
# proportions the same in both partitions, and the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Report the size of each partition.
print("Training samples", len(X_train))
print("Testing samples:", len(X_test))

Training samples 2262300
Testing samples: 565576


### Building SGD-SVM Pipeline

In [4]:
# Linear SVM trained with stochastic gradient descent (hinge loss).
linear_svm = SGDClassifier(
    loss="hinge",
    tol=1e-3,
    max_iter=1000,
    random_state=42,
    n_jobs=-1,
)

# Standardise the features first, then feed them to the SGD-based SVM.
sgd_svm_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("sgd_svm", linear_svm),
    ]
)

### Training the model

In [5]:
# Fit the scaler + SGD-SVM pipeline on the training split and report wall-clock time.
start_time = time.time()
sgd_svm_pipeline.fit(X_train, y_train)
end_time = time.time()

# Convert the elapsed seconds to minutes for the log line.
elapsed_minutes = (end_time - start_time) / 60
print(f"Training completed in {elapsed_minutes:.2f} minutes")



Training completed in 1.09 minutes


### SGD-SVM Model Evaluation

In [6]:
# Predict on the held-out split and print per-class precision / recall / F1.
y_pred = sgd_svm_pipeline.predict(X_test)

print("Classification Report:\n")
# zero_division=0: a class that is never predicted has an undefined precision.
# Report it as 0.0 explicitly instead of emitting UndefinedMetricWarning.
print(classification_report(y_test, y_pred, zero_division=0))

Classification Report:



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

           0       0.95      0.99      0.97    454265
           1       0.94      0.93      0.94     61711
           2       0.97      0.65      0.78     47209
           3       0.82      0.51      0.63      1230
           4       0.75      0.15      0.25      1159
           5       0.00      0.00      0.00         2

    accuracy                           0.95    565576
   macro avg       0.74      0.54      0.59    565576
weighted avg       0.95      0.95      0.95    565576



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


#### Confusion matrix and accuracy score

In [8]:
# Confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)
# Blank separator between the matrix and the accuracy line
# (the original string contained a stray apostrophe that was printed).
print("\n")
# Accuracy score
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")


Confusion Matrix:
 [[450032   3268    784    130     51      0]
 [  4291  57383     31      6      0      0]
 [ 16241     48  30920      0      0      0]
 [   416    122     54    633      5      0]
 [   954     29      0      5    171      0]
 [     2      0      0      0      0      0]]
'

Accuracy: 0.9533


#### Saving SVM model

In [9]:
# Write the fitted pipeline (scaler + SGD-SVM) to disk as a single artifact.
joblib.dump(value=sgd_svm_pipeline, filename="../models/sgd_svm_model.pkl")
print("SGD-SVM model saved successfully.")

SGD-SVM model saved successfully.


## Random Forest Training

In [10]:
from sklearn.ensemble import RandomForestClassifier

- Datasets already loaded during the SGD-SVM training (the same `X_train` / `X_test` split is reused)

### Initialization of Random Forest

In [11]:
# Hyper-parameters for the Random Forest, collected in one place.
rf_params = dict(
    n_estimators=150,   # number of trees in the ensemble
    max_depth=None,     # no depth limit: trees grow fully
    n_jobs=-1,          # use all CPU cores
    random_state=42,    # fixed seed for repeatable results
)
rf_model = RandomForestClassifier(**rf_params)

### Training the Model

In [12]:
# Fit the Random Forest on the training split and report wall-clock time.
start_time = time.time()
rf_model.fit(X_train, y_train)
end_time = time.time()

# Convert the elapsed seconds to minutes for the log line.
rf_elapsed_minutes = (end_time - start_time) / 60
print(f"Random Forest training completed in {rf_elapsed_minutes:.2f} minutes")


Random Forest training completed in 19.67 minutes


#### Model Evaluation

In [13]:
# Predict on the held-out split with the fitted Random Forest.
y_pred_rf = rf_model.predict(X_test)

# Per-class precision / recall / F1 against the true labels.
print("Classification Report:\n")
rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)

Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    454265
           1       1.00      1.00      1.00     61711
           2       1.00      1.00      1.00     47209
           3       0.97      0.90      0.93      1230
           4       0.99      1.00      0.99      1159
           5       1.00      1.00      1.00         2

    accuracy                           1.00    565576
   macro avg       0.99      0.98      0.99    565576
weighted avg       1.00      1.00      1.00    565576



#### confusion matrix and accuracy score

In [15]:
# Compute both summary metrics for the Random Forest first, then print them.
cm_rf = confusion_matrix(y_test, y_pred_rf)   # rows: true class, columns: predicted class
rf_acc = accuracy_score(y_test, y_pred_rf)    # overall fraction of correct predictions

print("Confusion Matrix:\n", cm_rf)
print("\n")
print(f"Accuracy: {rf_acc:.4f}")

Confusion Matrix:
 [[454219      2     43      1      0      0]
 [    33  61644      7     27      0      0]
 [    18      7  47184      0      0      0]
 [     5    111      0   1105      9      0]
 [     4      0      0      1   1154      0]
 [     0      0      0      0      0      2]]


Accuracy: 0.9995


In [16]:
# Write the fitted Random Forest to disk.
joblib.dump(value=rf_model, filename="../models/random_forest_model.pkl")
print("Random Forest model saved successfully.")

Random Forest model saved successfully.
