In [7]:
import pandas as pd
import numpy as np
import platform
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


# Detect operating system and set base paths
if platform.system() == "Windows":
    base_unsw = Path(r"C:\Users\kegem\OneDrive\Datascience Masters\DS Summer 2025 SEMESTER 3\DX799S O1 Data Science Capstone (Summer 1 2025)\Network Security DataSet\CSV Files\Training and Testing Sets")
    base_beth = Path(r"C:\Users\kegem\OneDrive\Datascience Masters\DS Summer 2025 SEMESTER 3\DX799S O1 Data Science Capstone (Summer 1 2025)\Beth DataSet")
    base_cyber = Path(r"C:\Users\kegem\OneDrive\Datascience Masters\DS Summer 2025 SEMESTER 3\DX799S O1 Data Science Capstone (Summer 1 2025)\Cybersecurity Attacks DataSets")
else:
    base_unsw = Path("/Users/kevinegemba/Library/CloudStorage/OneDrive-Personal/Datascience Masters/DS Summer 2025 SEMESTER 3/DX799S O1 Data Science Capstone (Summer 1 2025)/Network Security DataSet/CSV Files/Training and Testing Sets")
    base_beth = Path("/Users/kevinegemba/Library/CloudStorage/OneDrive-Personal/Datascience Masters/DS Summer 2025 SEMESTER 3/DX799S O1 Data Science Capstone (Summer 1 2025)/Beth DataSet")
    base_cyber = Path("/Users/kevinegemba/Library/CloudStorage/OneDrive-Personal/Datascience Masters/DS Summer 2025 SEMESTER 3/DX799S O1 Data Science Capstone (Summer 1 2025)/Cybersecurity Attacks DataSets")

# File paths
unsw_train_set = base_unsw / "UNSW_NB15_training-set.csv"
unsw_test_set = base_unsw / "UNSW_NB15_testing-set.csv"
beth_train_set = base_beth / "labelled_training_data.csv"
beth_test_set = base_beth / "labelled_testing_data.csv"
cyber_attack_set = base_cyber / "cybersecurity_attacks.csv"

# Load and test
datasets = {
    "UNSW-NB15 Training": unsw_train_set,
    "UNSW-NB15 Testing": unsw_test_set,
    "BETH Training": beth_train_set,
    "BETH Testing": beth_test_set,
    "Cybersecurity Attacks": cyber_attack_set
}

for name, path in datasets.items():
    try:
        df = pd.read_csv(path)
        print(f" {name} loaded successfully! Shape: {df.shape}")
    except FileNotFoundError:
        print(f" {name} - File not found. Check the path: {path}")
    except Exception as e:
        print(f" {name} - An error occurred: {e}")


 UNSW-NB15 Training loaded successfully! Shape: (175341, 45)
 UNSW-NB15 Testing loaded successfully! Shape: (82332, 45)
 BETH Training loaded successfully! Shape: (763144, 16)
 BETH Testing loaded successfully! Shape: (188967, 16)
 Cybersecurity Attacks loaded successfully! Shape: (40000, 25)


In [12]:
# Load the Cybersecurity Attacks dataset
cybersecurity_attacks = pd.read_csv(cyber_attack_set)

# Display the actual column names
print(cybersecurity_attacks.columns.tolist())



In [24]:

# Cybersecurity Attacks Dataset

print("\nKNN on Cybersecurity Attacks Dataset")
cyber_df = pd.read_csv(cyber_attack_set)
cyber_df.drop(columns=['Flow ID', 'Timestamp'], errors='ignore', inplace=True)
cyber_df.dropna(inplace=True)

#Drop non-numeric columns (e.g., timestamps, IPs, strings)
non_numeric_cols = cybersecurity_attacks.select_dtypes(include=['object']).columns
print("Dropping non-numeric columns:", non_numeric_cols.tolist())
X_cyber = cybersecurity_attacks.drop(columns=['Attack Type'] + non_numeric_cols.tolist(), errors='ignore')

# Use Attack Type as the classification target
y_cyber = cybersecurity_attacks['Attack Type']

# Scale features
scaler_cyber = StandardScaler()
X_cyber_scaled = scaler_cyber.fit_transform(X_cyber)
X_train_cyber, X_test_cyber, y_train_cyber, y_test_cyber = train_test_split(X_cyber_scaled, y_cyber, test_size=0.3, random_state=42)

knn_cyber = KNeighborsClassifier(n_neighbors=5, p=2)
knn_cyber.fit(X_train_cyber, y_train_cyber)
y_pred_cyber = knn_cyber.predict(X_test_cyber)
print(confusion_matrix(y_test_cyber, y_pred_cyber))
print(classification_report(y_test_cyber, y_pred_cyber))



KNN on Cybersecurity Attacks Dataset
[[1874 1247  875]
 [1855 1340  853]
 [1854 1290  812]]
              precision    recall  f1-score   support

        DDoS       0.34      0.47      0.39      3996
   Intrusion       0.35      0.33      0.34      4048
     Malware       0.32      0.21      0.25      3956

    accuracy                           0.34     12000
   macro avg       0.33      0.34      0.33     12000
weighted avg       0.33      0.34      0.33     12000




# KNN Model Results: Cybersecurity Attacks Dataset

## Model Overview
In this analysis, I applied the **K-Nearest Neighbors (KNN)** classifier to the Cybersecurity Attacks dataset using the Euclidean distance metric (`p=2`) and `k=5`.

Before modeling, wI:
- Dropped **non-numeric columns** such as IP addresses, protocol names, user and device metadata, and timestamps.
- Used **`Attack Type`** (DDoS, Intrusion, Malware) as the **multiclass classification target**.
- Scaled all numeric features using `StandardScaler` to ensure that distance-based calculations were not biased by feature scale.


## Confusion Matrix

[[1874 1247  875]
[1855 1340  853]
[1854 1290  812]]

- Each row represents actual class labels, and each column represents predicted class labels.
- For example, 1874 actual DDoS samples were correctly classified, while 1247 were misclassified as Intrusion and 875 as Malware.


## Classification Report

| Class       | Precision | Recall | F1-Score | Support |
|-------------|-----------|--------|----------|---------|
| DDoS        | 0.34      | 0.47   | 0.39     | 3996    |
| Intrusion   | 0.35      | 0.33   | 0.34     | 4048    |
| Malware     | 0.32      | 0.21   | 0.25     | 3956    |
| **Accuracy**|           |        | **0.34** | 12000   |

- **Macro average F1**: 0.33 — indicates the average F1 across classes, treating each class equally.
- **Weighted average F1**: 0.33 — takes into account class support (i.e., number of samples per class).

---

## Why KNN?

- KNN is a **simple, interpretable, and non-parametric model** — it makes no assumptions about the data distribution.
- It is effective when **local neighborhoods contain informative structure**.
- Useful for datasets where **relationships are not linearly separable** but still cluster in feature space.

---

## Limitations Observed

1. **Low overall accuracy (34%)** and **poor recall for Malware 

In [22]:
beth_all = pd.read_csv(beth_train_set)
print(beth_all['evil'].value_counts())

evil
0    763144
Name: count, dtype: int64


In [25]:

#  BETH Dataset (Ensuring Class 1 in Test Set)

print("\nKNN on BETH Dataset (With Balanced Test Set Including Class 1)")

# Load dataset
beth_df = pd.read_csv(beth_train_set)

# Clean and prepare the target variable
beth_df['evil'] = pd.to_numeric(beth_df['evil'], errors='coerce')

# Drop irrelevant or high-cardinality columns
beth_df.drop(columns=['args', 'stackAddresses', 'hostName', 'processName', 'eventName'], errors='ignore', inplace=True)

# Drop any rows missing the target
beth_df.dropna(subset=['evil'], inplace=True)

# Get dummies and ensure proper type for categorical variables
beth_df = pd.get_dummies(beth_df, drop_first=True)
beth_df = beth_df.astype({col: 'int64' for col in beth_df.select_dtypes('bool').columns})
beth_df.dropna(inplace=True)


# Manual Stratification to Include Class 1 in Test Set

# Separate positive and negative classes
beth_0 = beth_df[beth_df['evil'] == 0]
beth_1 = beth_df[beth_df['evil'] == 1]

# If too few class 1 examples, use them all for test
beth_1_test = beth_1
beth_0_train, beth_0_test = train_test_split(beth_0, test_size=0.3, random_state=42)

# Combine class 0 + class 1 into train/test sets
beth_train = beth_0_train
beth_test = pd.concat([beth_0_test, beth_1_test])
print("Test Set Evil Class Distribution:\n", beth_test['evil'].value_counts())

# Split features and targets
X_train_beth = beth_train.drop(columns='evil')
y_train_beth = beth_train['evil']
X_test_beth = beth_test.drop(columns='evil')
y_test_beth = beth_test['evil']

# Scale features
scaler_beth = StandardScaler()
X_train_beth_scaled = scaler_beth.fit_transform(X_train_beth)
X_test_beth_scaled = scaler_beth.transform(X_test_beth)

# Fit KNN
knn_beth = KNeighborsClassifier(n_neighbors=5, p=2)
knn_beth.fit(X_train_beth_scaled, y_train_beth)
y_pred_beth = knn_beth.predict(X_test_beth_scaled)

# Output performance
print(confusion_matrix(y_test_beth, y_pred_beth))
print(classification_report(y_test_beth, y_pred_beth))


KNN on BETH Dataset (With Balanced Test Set Including Class 1)
Test Set Evil Class Distribution:
 evil
0    228944
Name: count, dtype: int64
[[228944]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    228944

    accuracy                           1.00    228944
   macro avg       1.00      1.00      1.00    228944
weighted avg       1.00      1.00      1.00    228944







## KNN on BETH Dataset (With Balanced Test Set Including Class 1)

### Objective
I aimed to build a K-Nearest Neighbors (KNN) classifier to detect malicious activity (`evil = 1`) in the BETH dataset. However, the original dataset was highly imbalanced — specifically, the training data had **no instances of class 1**. This made classification impossible using standard train-test splits. To address this:

> I implemented **manual stratification**, forcing the rare class 1 examples into the test set to validate if KNN could detect them when present.

---

### Data Cleaning and Processing
- Irrelevant or high-cardinality columns like `args`, `stackAddresses`, `hostName`, `processName`, and `eventName` were dropped.
- The target column `evil` was coerced into numeric type.
- Missing values in the target were dropped.
- Categorical columns were one-hot encoded using `pd.get_dummies()`.
- Boolean columns were explicitly cast to `int64`.
- Remaining `NaNs` were dropped to ensure compatibility with KNN.

---

### Manual Stratified Split
- I split the dataset **by class label** to control the distribution:
  - All `evil = 1` rows (malicious activity) were reserved for testing.
  - The `evil = 0` rows were randomly split 70/30 for training and testing.
- The test set was constructed by **concatenating all class 1 examples** with the 30% sample from class 0.
  
**Resulting Test Set Distribution**:
```plaintext
0    228944

Despite the manual effort, the test set still contains only class 0 because class 1 appears to be entirely absent in the original dataset.


Model Training and Performance
	•	StandardScaler was used to normalize feature distributions — crucial for distance-based models like KNN.
	•	The KNN model was trained using k=5 neighbors and Euclidean distance (p=2).

Results:

Confusion Matrix:
[[228944]]

Classification Report:
              precision    recall  f1-score   support
           0       1.00      1.00      1.00    228944

Warning: Only one class was found in y_true. This leads to meaningless metrics like F1-score, and recall becomes trivially 1.0.

⸻

Interpretation and Limitations
	•	The classifier reports perfect accuracy, but this is misleading.
	•	The lack of any class 1 examples in the test set means the model was never asked to classify a malicious case.
	•	Consequently, the model never learned to distinguish between normal and abnormal behavior.

⸻

Recommendations
	1.	Inspect full dataset distribution:

beth_df['evil'].value_counts()

If class 1 is entirely absent, supervised classification is not possible.

	2.	If any class 1 samples exist, consider:
	•	SMOTE (Synthetic Minority Oversampling Technique) to generate synthetic samples of class 1.
	•	Downsampling the majority class to balance proportions.
	•	Anomaly Detection if class 1 is extremely rare (e.g., Isolation Forest, One-Class SVM).
	3.	If class 1 never appears:
	•	This dataset may only represent normal (non-attack) activity, and is better suited for unsupervised learning.

⸻

Conclusion

While KNN technically runs on this dataset, the results are not meaningful due to class imbalance. Manual stratification does not help if class 1 is absent altogether. I recommend shifting to unsupervised anomaly detection or using a different labeled dataset that contains both benign and malicious examples.


In [26]:
# UNSW-NB15 Dataset

print("\n KNN on UNSW-NB15 Dataset")
unsw_df = pd.read_csv(unsw_train_set)
unsw_df.drop(columns=['id', 'attack_cat', 'proto', 'service', 'state'], errors='ignore', inplace=True)
unsw_df['label'] = pd.to_numeric(unsw_df['label'], errors='coerce')
unsw_df.dropna(subset=['label'], inplace=True)
unsw_df = pd.get_dummies(unsw_df, drop_first=True)
unsw_df = unsw_df.astype({col: 'int64' for col in unsw_df.select_dtypes('bool').columns})
unsw_df.dropna(inplace=True)

X_unsw = unsw_df.drop(columns='label')
y_unsw = unsw_df['label']
scaler_unsw = StandardScaler()
X_unsw_scaled = scaler_unsw.fit_transform(X_unsw)
X_train_unsw, X_test_unsw, y_train_unsw, y_test_unsw = train_test_split(X_unsw_scaled, y_unsw, test_size=0.3, random_state=42)

knn_unsw = KNeighborsClassifier(n_neighbors=5, p=2)
knn_unsw.fit(X_train_unsw, y_train_unsw)
y_pred_unsw = knn_unsw.predict(X_test_unsw)
print(confusion_matrix(y_test_unsw, y_pred_unsw))
print(classification_report(y_test_unsw, y_pred_unsw))


 KNN on UNSW-NB15 Dataset
[[14679  2093]
 [ 1242 34589]]
              precision    recall  f1-score   support

           0       0.92      0.88      0.90     16772
           1       0.94      0.97      0.95     35831

    accuracy                           0.94     52603
   macro avg       0.93      0.92      0.93     52603
weighted avg       0.94      0.94      0.94     52603



## KNN on UNSW-NB15 Dataset

### Objective
The goal was to classify network traffic as **normal (label = 0)** or **attack (label = 1)** using the K-Nearest Neighbors (KNN) algorithm on the **UNSW-NB15** dataset. This dataset is a comprehensive benchmark for intrusion detection, featuring modern attack types and rich network features.

---

### Data Preprocessing Steps

- **Dropped columns** that are either high-cardinality (`id`, `proto`, `service`, `state`) or redundant (`attack_cat`) for binary classification.
- Converted `label` to numeric to ensure it can be used as a classification target.
- Removed rows with missing target labels.
- Applied **one-hot encoding** on categorical variables using `pd.get_dummies()` and ensured all boolean types were converted to integers.
- Dropped any remaining rows with null values to avoid errors during scaling and modeling.

---

### Feature Scaling and Model Training

- Features were standardized using `StandardScaler`, which is essential for KNN since it relies on distance metrics (e.g., Euclidean).
- The dataset was split into 70% training and 30% testing using `train_test_split` with a fixed `random_state=42` for reproducibility.
- KNN was trained with:
  - `k = 5` (number of neighbors)
  - `p = 2` (Euclidean distance)

---

### Performance Results

```plaintext
Confusion Matrix:
[[14679  2093]
 [ 1242 34589]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.88      0.90     16772
           1       0.94      0.97      0.95     35831

    accuracy                           0.94     52603
   macro avg       0.93      0.92      0.93     52603
weighted avg       0.94      0.94      0.94     52603


⸻

Interpretation
	•	High overall accuracy (94%) and strong precision/recall for both classes show that KNN performs well on this dataset.
	•	Class 1 (attacks) is detected with 94% precision and 97% recall, indicating low false positives and high true positives.
	•	Class 0 (normal) also shows solid performance but has slightly lower recall (88%), meaning it occasionally misclassifies normal traffic as attacks.

⸻

Limitations
	•	KNN is computationally expensive for large datasets since it stores the entire training set and computes distance at prediction time.
	•	The model does not learn patterns, making it sensitive to noise and irrelevant features.
	•	Scalability issues may arise when deploying in real-time systems without optimizations like KD-trees or Ball Trees.
	•	Performance is dependent on feature scaling, distance metric (p), and hyperparameter k.

⸻

Recommendations for Improvement
	•	Hyperparameter Tuning: Perform grid search to find the optimal value of k and experiment with different distance metrics (p=1, p=3, etc.).
	•	Feature Selection: Use PCA or mutual information to reduce dimensionality and retain only the most relevant features.
	•	Class Balancing: Although balanced here, further investigation into class distribution in other subsets or unseen data is important.
	•	Compare with Other Models: Try ensemble methods like Random Forest or boosting algorithms, which often outperform KNN on high-dimensional data.
	•	Model Interpretability: Use SHAP or permutation importance on simpler models to explain why predictions are made.

⸻

Conclusion

KNN served as a strong baseline on the UNSW-NB15 dataset, offering competitive classification performance with minimal tuning. However, for real-world or high-throughput scenarios, more scalable and interpretable models may be preferable.

