In [None]:
# Loading UNSW-NB15 dataset

import pandas as pd
import numpy as np

trainPATH = './UNSW_NB15/train.csv'
testPATH = './UNSW_NB15/test.csv'

trainDf = pd.read_csv(trainPATH)
testDf = pd.read_csv(testPATH)

trainDf = trainDf.drop(columns=['attack_cat'])
testDf = testDf.drop(columns=['attack_cat'])

In [None]:
print("Train:",trainDf.shape)
print("Test:",testDf.shape)

Data preprocessing
- handling missing values
- outlier detection 
- encoding categorical variables
- balancing classes
- normalization or standardization

In [None]:
# 1. handling missing values
print("Missing in train:\n", trainDf.isnull().sum())
print("Missing in test:\n", testDf.isnull().sum())

No missing values.

In [None]:
num_cols = trainDf.select_dtypes(include=['int64', 'float64']).columns
cat_cols = trainDf.select_dtypes(include=['object']).columns

test_num_cols = testDf.select_dtypes(include=['int64', 'float64']).columns
test_cat_cols = testDf.select_dtypes(include=['object']).columns

print('TrainDf numeric columns:',num_cols,'\n\nTrainDf categorical columns:',cat_cols)
print('\n TestDf numeric columns:',test_num_cols,'\n\nTestDf categorical columns:',test_cat_cols)


In [None]:
# 2. outlier detection and treatment

# outlier detection
def outlier_det(df, cols):
    outliers = {}

    for col in cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1

        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR

        outlier = df[(df[col] < lower) | (df[col] > upper)]
        outliers[col] = len(outlier) 

    outliers = dict(sorted(outliers.items(), key= lambda x: x[1], reverse=True))

    return outliers

In [None]:
print("Using IQR method to detect outliers gave the following \n Column: outlier amount")
print('TrainDf: ',outlier_det(trainDf,num_cols))
print('TestDf: ',outlier_det(testDf, test_num_cols))

In [None]:
outlier_percent = {
    col: f'{round((count / len(trainDf)) * 100, 2)}%'
    for col, count in outlier_det(trainDf, num_cols).items()
}

test_outlier_percent = {
    col: f'{round((count / len(testDf)) * 100, 2)}%'
    for col, count in outlier_det(testDf, test_num_cols).items()
}

print('Outlier percentages for each column','\n Column: outlier percentage')
print('TrainDf:', outlier_percent)
print('TestDf: ', test_outlier_percent)

In [None]:
# saved original before outlier treatment for plotting
trainDf_original = trainDf.copy()
testDf_original = testDf.copy()

In [None]:
# outlier treatment

def outlier_treat(df, cols):
    for col in cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1

        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR

        df[col] = np.where(df[col]< lower, lower, np.where(df[col] > upper, upper, df[col])) 

    return df

trainDf = outlier_treat(trainDf, num_cols)

print('After outlier treatment\nColumn: outlier')
print('TrainDf: ',outlier_det(trainDf, num_cols))


In [None]:
# 3. Encoding of categorical variables

# separating features from labels
x_train = trainDf.drop(columns=['label'])
y_train = trainDf['label']

x_test = testDf.drop(columns=['label'])
y_test = testDf['label']

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Applying OneHotEncoder on train
ohe = OneHotEncoder(handle_unknown = 'ignore', sparse_output = False)
ohe.fit(x_train[cat_cols])

In [None]:
x_train_ohe = ohe.transform(x_train[cat_cols])
x_test_ohe  = ohe.transform(x_test[cat_cols])

In [None]:
ohe_cols = ohe.get_feature_names_out(cat_cols)

x_train_ohe = pd.DataFrame(x_train_ohe, columns=ohe_cols, index=x_train.index)
x_test_ohe  = pd.DataFrame(x_test_ohe,  columns=ohe_cols, index=x_test.index)

In [None]:
x_train_num = x_train.drop(columns=cat_cols)
x_test_num  = x_test.drop(columns=cat_cols)

# Combine numerical + encoded categorical features
x_train_encoded = pd.concat([x_train_num, x_train_ohe], axis=1)
x_test_encoded  = pd.concat([x_test_num,  x_test_ohe], axis=1)


In [None]:
print("Encoded train shape:", x_train_encoded.shape)
print("Encoded test shape:", x_test_encoded.shape)


In [None]:
# 4. Balancing classes on training set using SMOTE
from imblearn.over_sampling import SMOTE 

sm = SMOTE(random_state=42)

x_train_res, y_train_res = sm.fit_resample(x_train_encoded, y_train)

In [None]:
print('Before SMOTE:', y_train.value_counts())
print('\nAfter SMOTE:', y_train_res.value_counts())

In [None]:
# 5. Normalization or Standardization

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# scale training set only
x_train_scaled = scaler.fit_transform(x_train_res)
# transform test set
x_test_scaled = scaler.transform(x_test_encoded)

print("Train scaled shape:", x_train_scaled.shape)
print("Test scaled shape:", x_test_scaled.shape)


Exploratory Data Analysis (EDA)
-	Compute summary statistics (mean, median, variance, distribution, etc.)
-	Perform visualizations (histograms, box plots, correlation heatmaps, scatter plots, etc.)
-	Examine relationships between variables and the target feature
-	Present observations and insights from data patterns.


In [None]:
# summary statistics

trainDf_original.describe().T

### Summary Statistics of Numerical Features

The table above provides summary statistics (count, mean, standard deviation, min, quartiles, max) for all numerical features in the training dataset. These statistics help identify data distribution properties, potential skewness, and unusually large ranges that may indicate outliers.

**Key observations:**
- Several features (e.g., `sbytes`, `dbytes`, `dur`) show very large max values relative to their medians, suggesting heavy right-skewed distributions.
- Network traffic features often have long tails due to occasional large flows, which aligns with the observed results.
- The large standard deviations in several columns confirm the presence of extreme values, motivating the need for outlier detection and treatment.


In [None]:
trainDf_original[cat_cols].describe()


### Summary Statistics of Categorical Features

This table summarizes the categorical variables in the dataset, including the number of unique categories and the most frequent category for each feature.

**Insights:**
- The `proto` column is dominated by common protocols such as TCP and UDP.
- The `service` column contains many unique values due to a diverse set of network services.
- The `state` column indicates connection state transitions. Some states are significantly more frequent, reflecting typical network traffic patterns.


In [None]:
# visualizations (histograms, box plots, correlation heatmaps, scatter plots, etc.)

import seaborn as sns
import matplotlib.pyplot as plt

sns.countplot(x='label', data=trainDf_original)
plt.title("Normal vs Malicious Distribution (Training Set)")
plt.show()

### Class Distribution (Normal vs Malicious)

This count plot illustrates the distribution of the target variable (`label`) in the training dataset.

**Observations:**
- The dataset is imbalanced, with more malicious samples (label = 1) than normal samples (label = 0).
- This imbalance can negatively affect model performance, making class balancing (e.g., SMOTE) necessary.
- The imbalance also reflects the UNSW-NB15 dataset design, where the training split intentionally includes more attack traffic.


In [None]:
trainDf_original[num_cols].hist(figsize=(18, 15), bins=50, color='skyblue')
plt.suptitle("Numerical Feature Distributions (Histogram)", fontsize=16)
plt.tight_layout()
plt.show()

### Histograms of Numerical Features

The histograms above show the distribution of all numerical features. These plots help visualize skewness, spread, and feature ranges.

**Insights:**
- Many features exhibit heavy right-skew, meaning most values are small, with a few extremely large values.
- Features such as `sbytes`, `dbytes`, and `dur` show long-tailed distributions, common in network traffic datasets.
- The presence of long tails supports the need for outlier treatment using the IQR method.


In [None]:
plt.figure(figsize=(18, 10))
trainDf_original[num_cols].boxplot()
plt.xticks(rotation=90)
plt.title("Boxplots of Numerical Features (Raw Data)")
plt.show()

### Boxplot of Numerical Features (Before Outlier Treatment)

The boxplot visualizes the spread and outliers for each numerical feature.

**Insights:**
- Nearly all features contain extreme outliers, visible as distant points beyond whiskers.
- Outliers are typical in cybersecurity data due to high-traffic anomalies and rare attack patterns.
- This plot validates the statistical findings and demonstrates why outlier treatment is essential before model training.


In [None]:
plt.figure(figsize=(18, 12))
corr = trainDf_original[num_cols].corr()
sns.heatmap(corr, cmap='coolwarm', annot=False)
plt.title("Correlation Heatmap")
plt.show()

### Correlation Heatmap

This heatmap visualizes pairwise correlations between numerical features.

**Key observations:**
- Several packet- and byte-related features (e.g., `sbytes`, `spkts`, `dbytes`, `dpkts`) show moderate to strong correlations.
- Features such as TTL-based values (`sttl`, `dttl`) also correlate with each other.
- Most features show low correlation, suggesting that the dataset does not suffer from excessive multicollinearity, which is beneficial for training models.


In [None]:
plt.figure(figsize=(7,5))
sns.boxplot(x='label', y='sbytes', data=trainDf_original)
plt.title("sbytes vs Label")

plt.figure(figsize=(7,5))
sns.boxplot(x='label', y='dbytes', data=trainDf_original)
plt.title("dbytes vs Label")

plt.figure(figsize=(7,5))
sns.boxplot(x='label', y='dur', data=trainDf_original)
plt.title("dur vs Label")

plt.figure(figsize=(7,5))
sns.boxplot(x='label', y='ct_state_ttl', data=trainDf_original)
plt.title("ct_state_ttl vs Label")

plt.figure(figsize=(7,5))
sns.boxplot(x='label', y='rate', data=trainDf_original)
plt.title("rate vs Label")

plt.tight_layout()
plt.show()

### Numerical Feature Comparison (Benign vs Malicious)

These boxplots compare selected numerical features across benign (label 0) and malicious (label 1) traffic.

**Patterns observed:**
- **`sbytes` and `dbytes`**: Malicious flows tend to have higher byte counts compared to benign traffic.
- **`dur`**: Attack traffic shows greater variance in flow duration.
- **`ct_state_ttl`**: This aggregated state/TTL feature shows clear differences between classes, making it potentially useful for classification.
- **`rate`**: Malicious traffic often has higher connection rates, indicative of scanning or DoS behavior.

These patterns indicate strong predictive power for these features.


In [None]:
plt.figure(figsize=(7,5))
sns.scatterplot(x='sbytes', y='dbytes', hue='label', data=trainDf_original, alpha=0.4)
plt.title("Scatter Plot of sbytes vs dbytes")
plt.show()

### Scatter Plot of sbytes vs dbytes

This scatter plot shows the relationship between source bytes (`sbytes`) and destination bytes (`dbytes`), colored by class label.

**Insights:**
- Benign traffic is clustered near the origin, with lower byte counts.
- Malicious traffic shows more spread and includes many high-byte flows.
- The clear separation in some regions indicates this feature pair can help distinguish between benign and malicious samples.


In [None]:
trainDf_original['proto'].value_counts().plot(kind='bar', figsize=(10,5))
plt.title("Protocol Types Frequency")
plt.show()

trainDf_original['service'].value_counts().head(20).plot(kind='bar', figsize=(10,5))
plt.title("Top 20 Services")
plt.show()

trainDf_original['state'].value_counts().plot(kind='bar', figsize=(10,5))
plt.title("Connection State Frequency")
plt.show()

### Categorical Feature Distributions

Bar plots are shown for protocol types, top 20 services, and connection states.

**Insights:**
- **Protocol distribution** is dominated by TCP and UDP, which aligns with real-world internet traffic.
- **Service distribution** has a heavy tail with many rarely used services, some of which are used predominantly by attack traffic.
- **State distribution** reveals which connection transitions occur most frequently, with some states highly correlated with malicious activity.

Understanding these distributions helps interpret model behavior and feature importance later.


### Modeling
All students must apply at least three of the following algorithms:
1.	Decision Tree
2.	Naive Bayes
3.	Support Vector Machine (SVM)
4.	Artificial Neural Network (MLP or CNN depending on data type)
5.	Random Forest (optional)
6.	Logistic Regression (optional)


In [None]:
# 1. Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
from sklearn.metrics import RocCurveDisplay

dt = DecisionTreeClassifier(random_state=42)

dt.fit(x_train_scaled, y_train_res)

y_pred_dt = dt.predict(x_test_scaled)
y_pred_prob_dt = dt.predict_proba(x_test_scaled)[:, 1]

print("Decision Tree Metrics:")
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print("ROC-AUC:", roc_auc_score(y_test, y_pred_prob_dt))
print("\nClassification Report:\n", classification_report(y_test, y_pred_dt))

# Confusion Matrix
cm_dt = confusion_matrix(y_test, y_pred_dt)
print("\nConfusion Matrix:\n", cm_dt)

# ROC Curve
RocCurveDisplay.from_estimator(dt, x_test_scaled, y_test)

In [None]:
# 2. Naive Bayes
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()

nb.fit(x_train_scaled, y_train_res)

y_pred_nb = nb.predict(x_test_scaled)
y_pred_prob_nb = nb.predict_proba(x_test_scaled)[:, 1]

print("Naive Bayes Metrics:")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print("ROC-AUC:", roc_auc_score(y_test, y_pred_prob_nb))
print("\nClassification Report:\n", classification_report(y_test, y_pred_nb))

cm_nb = confusion_matrix(y_test, y_pred_nb)
print("\nConfusion Matrix:\n", cm_nb)

RocCurveDisplay.from_estimator(nb, x_test_scaled, y_test)

In [None]:
# 3. Random Forest
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    n_jobs=-1,
    random_state=42
)

# Train
rf.fit(x_train_res, y_train_res)

# Predict
y_pred_rf = rf.predict(x_test_encoded)
y_pred_prob_rf = rf.predict_proba(x_test_encoded)[:, 1]

print("Random Forest Metrics:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test, y_pred_prob_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))

# Confusion Matrix
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

RocCurveDisplay.from_estimator(rf, x_test_scaled, y_test)

In [None]:
# 5-Fold Cross-Validation

from sklearn.model_selection import cross_val_score

rf_cv = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    n_jobs=-1,
    random_state=42
)

# 5-fold cross-validation
cv_scores = cross_val_score(
    rf_cv,
    x_train_encoded,
    y_train,
    cv=5,
    scoring='f1'
)

print("5-Fold CV F1 Scores:", cv_scores)
print("Mean F1:", cv_scores.mean())
print("Std Dev:", cv_scores.std())

### What this does:

- Splits the training set into 5 folds

- Re-trains and evaluates 5 times

- Computes average F1 score

- Shows stability of the model

In [None]:
from sklearn.model_selection import GridSearchCV

rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [20, 40, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42, n_jobs=-1),
    param_grid=rf_params,
    cv=3,
    scoring='f1',
    n_jobs=-1
)

rf_grid.fit(x_train_encoded, y_train)

print("Best Parameters:", rf_grid.best_params_)
best_rf = rf_grid.best_estimator_


In [None]:
# Train tuned model
best_rf.fit(x_train_res, y_train_res)

# Predict using tuned model
y_pred_rf_best = best_rf.predict(x_test_encoded)
y_pred_prob_rf_best = best_rf.predict_proba(x_test_encoded)[:, 1]

print("Random Forest (Tuned) Metrics:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf_best))
print("ROC-AUC:", roc_auc_score(y_test, y_pred_prob_rf_best))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf_best))

# Confusion Matrix
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_rf_best))

# ROC Curve
RocCurveDisplay.from_estimator(best_rf, x_test_scaled, y_test)


Parameter tuning was performed using GridSearchCV with 3-fold cross-validation. <br>
We optimized hyperparameters such as n_estimators, max_depth, min_samples_split, and min_samples_leaf to improve model performance. <br>
The best model found by the grid search was used for final evaluation.

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef, cohen_kappa_score, balanced_accuracy_score

def evaluate_model(model_name, y_true, y_pred, y_prob):
    print(f"\n      {model_name} Evaluation      ")
    
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc = roc_auc_score(y_true, y_prob)
    cm = confusion_matrix(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)
    kappa = cohen_kappa_score(y_true, y_pred)
    bal_acc = balanced_accuracy_score(y_true, y_pred)

    print(f"Accuracy:      {acc:.4f}")
    print(f"Precision:     {prec:.4f}")
    print(f"Recall:        {rec:.4f}")
    print(f"F1-score:      {f1:.4f}")
    print(f"ROC-AUC:       {roc:.4f}")
    print(f"MCC:           {mcc:.4f}")
    print(f"Cohen’s Kappa: {kappa:.4f}")
    print(f"Balanced Acc:  {bal_acc:.4f}")
    print("\nConfusion Matrix:")
    print(cm)

    return {
        "Model": model_name,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1-score": f1,
        "ROC-AUC": roc,
        "MCC": mcc,
        "Kappa": kappa,
        "Balanced Acc": bal_acc
    }


In [None]:
results_dt = evaluate_model(
    "Decision Tree",
    y_test,
    y_pred_dt,
    y_pred_prob_dt
)


In [None]:
results_nb = evaluate_model(
    "Naive Bayes",
    y_test,
    y_pred_nb,
    y_pred_prob_nb
)


In [None]:
results_rf = evaluate_model(
    "Random Forest",
    y_test,
    y_pred_rf,
    y_pred_prob_rf
)


In [None]:
results_tuned_rf = evaluate_model(
    "Tuned Random Forest",
    y_test,
    y_pred_rf_best,
    y_pred_prob_rf_best
)

In [None]:
plt.figure(figsize=(10, 7))

RocCurveDisplay.from_predictions(y_test, y_pred_prob_dt, name="Decision Tree")
RocCurveDisplay.from_predictions(y_test, y_pred_prob_nb, name="Naive Bayes")
RocCurveDisplay.from_predictions(y_test, y_pred_prob_rf, name="Random Forest")
RocCurveDisplay.from_predictions(y_test, y_pred_prob_rf_best, name="Tuned Random Forest")

plt.title("ROC Curves for All Models")
plt.show()


In [None]:
evaluation_df = pd.DataFrame([
    results_dt,
    results_nb,
    results_rf,
    results_tuned_rf
])

evaluation_df


### Results and Discussion

1. Best-Performing Model<br>
    Among the evaluated supervised models (Decision Tree, Naive Bayes, Random Forest), the Random Forest classifier demonstrated the strongest performance. <br>
    It achieved the highest overall metrics, including accuracy, F1-score, and ROC-AUC. <br>
    This indicates that Random Forest was the most effective model for distinguishing between normal and malicious network flows in the UNSW-NB15 dataset.

2. Interpretation of Results and Model Behavior<br>
The Decision Tree showed limited generalization and struggled with high-dimensional data, resulting in low recall and overall accuracy.<br>
Naive Bayes performed moderately but suffered from poor recall for attacks due to its strong independence assumptions.<br>
In contrast, Random Forest handled both numerical and categorical features effectively, capturing complex patterns in the data and maintaining a better balance between precision and recall.<br> 
Its ensemble structure reduced overfitting and produced the most reliable results across all evaluation metrics.

3. Limitations of the Dataset and the Models<br>
The UNSW-NB15 dataset contains highly imbalanced classes, skewed numerical distributions, and high-cardinality categorical features, all of which add complexity to model training.<br>
Additionally, the binary attack label collapses multiple attack types into one group, making classification less precise.<br>
The models also have inherent limitations: Decision Trees overfit easily, Naive Bayes oversimplifies feature relationships, and Random Forests can become computationally expensive and may still misclassify subtle or unseen attack patterns.

4. Practical Meaning of the Findings <br>
Feature importance analysis from the Random Forest model showed that attributes such as `ct_state_ttl`, `sbytes`, `dbytes`, and rate play major roles in detecting malicious flows.<br>
These features correspond to packet behavior, traffic volume, and connection state transitions—factors commonly altered during cyberattacks.<br>
This means the trained models can help identify suspicious network activities based on measurable deviations in packet rate, byte count, or TTL values, making them practically useful for intrusion detection systems.

## **Autoencoder for Anomaly Detection**
#### **1. Concept of Autoencoder-Based Anomaly Detection**

An **autoencoder** is a neural network trained to reconstruct its input.
For anomaly detection:
- Train the autoencoder only on benign (normal) traffic.
- The model learns the normal pattern of network flows.
- During testing, malicious traffic produces a high reconstruction error because it differs from normal traffic patterns.
- A threshold is set to classify:
    - Low error → Normal
    - High error → Anomalous (attack)

This method is useful because it can detect:
- unseen attacks
- novel intrusion patterns
- deviations from normal behavior

This gives an advantage over supervised models, which only detect attacks seen in training.

In [None]:
benign_mask = (y_train == 0)

X_benign_encoded = x_train_encoded[benign_mask].copy()
X_benign_encoded.columns = X_benign_encoded.columns.astype(str)

X_benign_scaled = scaler.transform(X_benign_encoded)


In [None]:
from sklearn.neural_network import MLPRegressor

input_dim = X_benign_scaled.shape[1]

autoencoder = MLPRegressor(
    hidden_layer_sizes=(128, 64, 32, 64, 128),  # optimized symmetric AE
    activation='relu',
    solver='adam',
    max_iter=80,
    random_state=42,
    verbose=True
)


In [None]:
autoencoder.fit(X_benign_scaled, X_benign_scaled)

In [None]:
# Ensure columns are strings
x_test_encoded.columns = x_test_encoded.columns.astype(str)
X_test_scaled = scaler.transform(x_test_encoded)

# AE reconstruction
X_test_reconstructed = autoencoder.predict(X_test_scaled)

# Reconstruction error (MSE)
mse = np.mean((X_test_scaled - X_test_reconstructed)**2, axis=1)

In [None]:
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_test, mse)
youden_index = tpr - fpr
best_threshold = thresholds[np.argmax(youden_index)]

print("Best threshold (Youden):", best_threshold)


In [None]:
y_pred_ae = (mse > best_threshold).astype(int)


In [None]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix, classification_report
)

print("=== Autoencoder (Optimized) Results ===")
print("Accuracy:", accuracy_score(y_test, y_pred_ae))
print("Precision:", precision_score(y_test, y_pred_ae))
print("Recall:", recall_score(y_test, y_pred_ae))
print("F1-score:", f1_score(y_test, y_pred_ae))
print("ROC-AUC:", roc_auc_score(y_test, mse))
print("\nClassification Report:\n", classification_report(y_test, y_pred_ae))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_ae))


In [None]:
results_ae = {
    "Model": "Autoencoder",
    "Accuracy": accuracy_score(y_test, y_pred_ae),
    "Precision": precision_score(y_test, y_pred_ae),
    "Recall": recall_score(y_test, y_pred_ae),
    "F1-score": f1_score(y_test, y_pred_ae),
    "ROC-AUC": roc_auc_score(y_test, mse),
}

evaluation_df = pd.concat([
    evaluation_df,      # your supervised model results
    pd.DataFrame([results_ae])
], ignore_index=True)

evaluation_df


#### Best-Performing Supervised Model

The Random Forest classifier achieved the best overall performance among the supervised models, with:
- Accuracy: 0.703
- F1-score: 0.719
- ROC-AUC: 0.823
- Balanced Accuracy: 0.705

This indicates a strong ability to detect malicious traffic while maintaining a reasonable false-positive rate.
The tuned Random Forest slightly improved ROC-AUC (0.826), showing better ranking ability, although accuracy decreased marginally due to reduced overfitting.

### **Autoencoder vs Supervised Models**

The optimized Autoencoder produced a very different performance profile:
- Recall (attack detection): 0.856 - the highest of all models
- F1-score: 0.693 (competitive with Random Forest)
- Accuracy: 0.582
- ROC-AUC: 0.407 (expected for anomaly detection)

As an unsupervised anomaly detection model, the Autoencoder is trained only on benign traffic, learning the normal behavior of the network.<br>
It flags deviations as anomalies, achieving excellent attack recall but at the cost of more false positives.

This makes the Autoencoder particularly useful for:

- detecting unknown or novel attack types,
- identifying deviations from normal behavior,
- complementing supervised classifiers that rely on labeled attack patterns.

In contrast, the Random Forest is more accurate overall but may fail to detect unseen or subtle attacks, since it only learns from known labeled data.

## Conclusion

#### 1. Random Forest is best for known attacks, offering strong balanced performance.<br>
#### 2. Autoencoder is best for unknown or emerging attacks, offering high recall and anomaly sensitivity. <bt>
#### 3. Combining both approaches would yield a more robust Intrusion Detection System, where supervised learning handles known threats and the Autoencoder flags unexpected behavior.