<a href="https://colab.research.google.com/github/Hushpuppyzac/DLI-Assignment/blob/main/Isaac_KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# ✅ STEP 1: Download and run notebook from GitHub (no Google Drive required)

import requests
import nbformat
from IPython import get_ipython

def run_notebook_from_github(url, timeout=30):
    """
    Downloads a Jupyter notebook from a raw GitHub URL and executes its code
    cells in the current IPython shell.

    The cells run in the caller's kernel, so any variables they define become
    available to later cells of this notebook. Because the downloaded code is
    executed as-is, only point this at notebooks you trust.

    Parameters:
    url (str): Raw GitHub URL to a .ipynb notebook file
    timeout (float): Seconds to wait for the HTTP request before giving up,
        so a stalled connection cannot hang the kernel indefinitely

    Returns:
    None. Progress and failures are reported via print(). Download/parse
    failures and a missing IPython shell abort early; a failing cell is
    reported and execution continues with the next cell.
    """
    print("Downloading notebook from GitHub...")
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except Exception as e:
        print(f"Failed to download notebook: {e}")
        return

    print("📖 Parsing notebook content...")
    try:
        notebook = nbformat.reads(response.text, as_version=4)
    except Exception as e:
        print(f"Failed to parse notebook: {e}")
        return

    ipython = get_ipython()
    if ipython is None:
        # get_ipython() returns None when no interactive shell is active.
        print("Failed to run notebook: no active IPython shell found.")
        return
    print("Running notebook cells...\n")

    failed_cells = []
    for i, cell in enumerate(notebook.cells):
        if cell.cell_type != 'code':
            continue

        print(f"▶  Executing cell [{i + 1}]...")
        try:
            result = ipython.run_cell(cell.source)
        except Exception as e:
            # Defensive: run_cell normally reports errors via its result.
            error = e
        else:
            # run_cell does not raise when the cell fails; the exception is
            # stored on the returned result object instead.
            error = getattr(result, "error_before_exec", None)
            if error is None:
                error = getattr(result, "error_in_exec", None)

        if error is not None:
            failed_cells.append(i + 1)
            print(f" Error in cell [{i + 1}]: {error}")

    if failed_cells:
        print(f"\n {len(failed_cells)} cell(s) reported errors: {failed_cells}")
    print("\n All executable cells have been processed.")

# Raw GitHub location of the data-cleaning notebook to execute.
notebook_url = (
    "https://raw.githubusercontent.com/"
    "Hushpuppyzac/DLI-Assignment/main/CleanedData.ipynb"
)

# Download the notebook and run its code cells in this kernel.
run_notebook_from_github(url=notebook_url)

Downloading notebook from GitHub...
📖 Parsing notebook content...
Running notebook cells...

▶  Executing cell [2]...
 AFTER CLEANING
Total rows after cleaning: 223108
Label distribution after cleaning:
Label
1    128016
0     95092
Name: count, dtype: int64
Class Distribution Before Balancing:
Label
1    128016
0     95092
Name: count, dtype: int64

 AFTER UNDERSAMPLING
Total rows after balancing: 190184
Class balance:
Label
1    95092
0    95092
Name: count, dtype: int64

SAMPLE OF CLEANED DATAFRAME
Total Rows     : 223108
DDoS Attacks   : 128016
Benign Records : 95092

 Cleaned DataFrame (First 5 Rows):


Unnamed: 0,Destination Port,Flow Duration,Label
0,54865,3,0
1,55054,109,0
2,55055,52,0
3,46236,34,0
4,54863,3,0



 DDoS Samples (First 5):


Unnamed: 0,Destination Port,Flow Duration,Label
18883,80,1293792,1
18884,80,4421382,1
18885,80,1083538,1
18886,80,80034360,1
18887,80,642654,1



 Benign Samples (First 5):


Unnamed: 0,Destination Port,Flow Duration,Label
0,54865,3,0
1,55054,109,0
2,55055,52,0
3,46236,34,0
4,54863,3,0



 Sample Extracted Features (First 5):


Unnamed: 0,Flow Preview,pkt_length_diff,pkt_length_var_ratio,byte_ratio,duration_per_packet,avg_to_max_ratio
0,54865 | 3,0,1.0,1200000.0,1.5,1.5
1,55054 | 109,0,1.0,1.0,54.5,1.5
2,55055 | 52,0,1.0,1.0,26.0,1.5
3,46236 | 34,0,1.0,1.0,17.0,1.5
4,54863 | 3,0,1.0,1200000.0,1.5,1.5



 Final Columns:
['Destination Port', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'ECE Flag Count', 'Down/Up Ratio', 'Average Packet Size', 'Avg Fwd 

In [15]:
# ✅ STEP 2: Train KNN Classifier on Cleaned Dataset

# 🔹 Import required libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import numpy as np # Import numpy

# 🔹 Step 2.1: Check if cleaned data exists
# df_balanced is not defined in this cell; it must already be in the kernel
# namespace (e.g. created by the notebook executed in STEP 1).
try:
    df_balanced
except NameError:
    raise NameError("❌ 'df_balanced' not found. Make sure your notebook defines it.") from None

# 🔹 Step 2.2: Separate features and target
X = df_balanced.drop(columns=['Label'])  # Features
y = df_balanced['Label']                 # Target (0 = BENIGN, 1 = DDoS)

# 🔹 Step 2.3: Train-test split
# stratify=y keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 🔹 Step 2.4: Handle infinite and missing values
# Replace infinite values with NaN so they are imputed like any other gap.
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)

# Impute NaN values with per-column medians computed on the training set only,
# so no statistic from the test set leaks into preprocessing.
# Medians are applied to every column of BOTH splits: a column that is clean in
# the training split may still contain NaN in the test split, and KNN cannot
# predict on NaN inputs.
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# 🔹 Step 2.5: Feature scaling (important for distance-based algorithms like KNN)
# The scaler is fitted on the training split and only applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 🔹 Step 2.6: Train KNN model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_scaled, y_train)

# 🔹 Step 2.7: Make predictions
y_pred = knn_model.predict(X_test_scaled)

# 🔹 Step 2.8: Evaluation
print("\n✅ STEP 3: KNN Model Evaluation")
print("=" * 60)
print("📊 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\n📈 Classification Report:")
print(classification_report(y_test, y_pred))
print(f"🎯 Accuracy Score: {accuracy_score(y_test, y_pred):.4f}")
print("=" * 60)


✅ STEP 3: KNN Model Evaluation
📊 Confusion Matrix:
[[19015     4]
 [    9 19009]]

📈 Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19019
           1       1.00      1.00      1.00     19018

    accuracy                           1.00     38037
   macro avg       1.00      1.00      1.00     38037
weighted avg       1.00      1.00      1.00     38037

🎯 Accuracy Score: 0.9997
