In [None]:
# ✅ STEP 1: Download and run notebook from GitHub (no Google Drive required)

import requests
import nbformat
from IPython import get_ipython

def run_notebook_from_github(url, timeout=30):
    """
    Downloads and executes a Jupyter notebook from a GitHub raw URL.

    Each code cell of the downloaded notebook is executed in the current
    IPython session through ``run_cell``. Problems (download failure, parse
    failure, failing cells, no IPython session) are reported with ``print``;
    the function itself does not raise for them.

    Parameters:
    url (str): Raw GitHub URL to a .ipynb notebook file
    timeout (float): Seconds to wait for the HTTP request before giving up

    Returns:
    None
    """
    # NOTE(security): this runs whatever code the remote notebook contains
    # inside the current kernel. Only point it at a source you trust.
    ipython = get_ipython()
    if ipython is None:
        # get_ipython() returns None when not running under IPython/Jupyter.
        print("Failed to run notebook: no active IPython session.")
        return

    print("Downloading notebook from GitHub...")
    try:
        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except Exception as e:
        print(f"Failed to download notebook: {e}")
        return

    print("📖 Parsing notebook content...")
    try:
        notebook = nbformat.reads(response.text, as_version=4)
    except Exception as e:
        print(f"Failed to parse notebook: {e}")
        return

    print("Running notebook cells...\n")

    failed_cells = 0
    for i, cell in enumerate(notebook.cells):
        if cell.cell_type != 'code':
            continue

        print(f"▶  Executing cell [{i + 1}]...")
        try:
            result = ipython.run_cell(cell.source)
        except Exception as e:
            failed_cells += 1
            print(f" Error in cell [{i + 1}]: {e}")
            continue

        # run_cell does not raise when the cell's own code fails; the exception
        # is reported on the returned ExecutionResult instead, so inspect it.
        error = result.error_before_exec
        if error is None:
            error = result.error_in_exec
        if error is not None:
            failed_cells += 1
            print(f" Error in cell [{i + 1}]: {error}")

    print("\n All executable cells have been processed.")
    if failed_cells:
        print(f" {failed_cells} cell(s) reported an error.")

# 🔗 Raw-content URL of the notebook to fetch
notebook_url = (
    "https://raw.githubusercontent.com/Hushpuppyzac/DLI-Assignment/main/CleanedData.ipynb"
)

# ▶️ Download it and execute its code cells in this kernel
run_notebook_from_github(url=notebook_url)


In [None]:
# ✅ STEP 2: Load the CSV, clean it, and prepare balanced, scaled train/test sets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, roc_curve, precision_recall_curve)

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# Load dataset
df = pd.read_csv("Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv")
# Strip surrounding whitespace from every header name so columns (including
# the label column) are addressable by their plain names from here on.
df.columns = df.columns.str.strip()

# Clean
# Keep only the two classes of interest. The label column has to be looked up
# by its stripped name: after the strip above, a ' Label' key no longer exists
# and indexing with it raises KeyError.
df = df[df['Label'].isin(['BENIGN', 'DDoS'])].copy()
# Treat +/-inf as missing so dropna() removes those rows too, then de-duplicate.
df = df.replace([np.inf, -np.inf], np.nan).dropna().drop_duplicates()
# Drop columns that hold a single distinct value across all remaining rows.
constant_cols = [col for col in df.columns if df[col].nunique() == 1]
df = df.drop(columns=constant_cols)

# Encode the target as integers: DDoS -> 1, everything else (BENIGN) -> 0.
df = df.assign(Label=(df['Label'] == 'DDoS').astype('int64'))

# Split
# Features are the numeric columns only; the target is the encoded Label.
X = df.drop(columns='Label').select_dtypes(include=[np.number])
y = df['Label']
# Stratified 80/20 split; the test set is left untouched by the balancing below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

# Balance training set
# Down-sample whichever class is larger to the size of the other one. The
# majority class is determined from the data rather than assumed: sampling
# without replacement fails if the "majority" frame is actually the smaller one.
df_train = pd.concat([X_train, y_train], axis=1)
majority_label = df_train['Label'].value_counts().idxmax()
major = df_train[df_train['Label'] == majority_label]
minor = df_train[df_train['Label'] != majority_label]
major_down = resample(major, replace=False, n_samples=len(minor), random_state=42)
# Shuffle so the two classes are interleaved rather than stacked.
df_balanced = pd.concat([major_down, minor]).sample(frac=1, random_state=42)

X_train_bal = df_balanced.drop(columns='Label')
y_train_bal = df_balanced['Label']

# Feature engineering
def extract_features(df_input):
    """
    Return a copy of ``df_input`` with five derived columns appended.

    The input frame is not modified. Every ratio adds a small constant to its
    denominator so a zero denominator does not divide by zero, and is rounded
    to 3 decimals.

    Added columns (in this order):
    pkt_length_diff      : Max Packet Length - Min Packet Length
    pkt_length_var_ratio : Max Packet Length / Packet Length Mean
    byte_ratio           : Total Length of Fwd Packets / Total Length of Bwd Packets
    duration_per_packet  : Flow Duration / (Total Fwd Packets + Total Backward Packets)
    avg_to_max_ratio     : Average Packet Size / Max Packet Length
    """
    eps = 1e-5  # keeps denominators non-zero

    max_len = df_input['Max Packet Length']
    packet_count = df_input['Total Fwd Packets'] + df_input['Total Backward Packets']

    derived = {
        'pkt_length_diff': max_len - df_input['Min Packet Length'],
        'pkt_length_var_ratio': (max_len / (df_input['Packet Length Mean'] + eps)).round(3),
        'byte_ratio': (
            df_input['Total Length of Fwd Packets']
            / (df_input['Total Length of Bwd Packets'] + eps)
        ).round(3),
        'duration_per_packet': (df_input['Flow Duration'] / (packet_count + eps)).round(3),
        'avg_to_max_ratio': (df_input['Average Packet Size'] / (max_len + eps)).round(3),
    }
    # assign() works on a copy, so the caller's frame is left untouched.
    return df_input.assign(**derived)

# Add the engineered columns to both the balanced training set and the test set.
X_train_f = extract_features(X_train_bal)
X_test_f = extract_features(X_test)

# Scale
# The scaler is fitted on the training features only and then applied to the
# test features, so no test-set statistics are used during fitting.
scaler = StandardScaler()
cols = X_train_f.select_dtypes(include=np.number).columns
# fit_transform/transform return bare arrays; rebuild the frames with the
# original row index so they stay aligned with y_train_bal / y_test instead of
# silently getting a fresh 0..n-1 index.
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_f[cols]), columns=cols, index=X_train_f.index)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test_f[cols]), columns=cols, index=X_test_f.index)

# ✅ STEP 3: Train models
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42, max_depth=5, class_weight='balanced'),
    "Naive Bayes": GaussianNB(),
    "KNN": KNeighborsClassifier(n_neighbors=3)
}

# Metrics that are computed from the hard predictions, keyed by the name
# under which they are stored in `results`.
label_metrics = {
    "acc": accuracy_score,
    "prec": precision_score,
    "rec": recall_score,
    "f1": f1_score,
}

# Fit every model on the balanced, scaled training data and evaluate it on the
# scaled test data. `results` maps model name -> fitted model, predictions,
# scores and metrics.
results = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train_bal)
    y_pred = model.predict(X_test_scaled)
    # Second predict_proba column is used as the score for ROC / PR analysis.
    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    entry = {"model": model, "y_pred": y_pred, "y_prob": y_prob}
    for key, metric in label_metrics.items():
        entry[key] = metric(y_test, y_pred)
    entry["roc_auc"] = roc_auc_score(y_test, y_prob)
    entry["cm"] = confusion_matrix(y_test, y_pred)
    results[name] = entry

# ✅ STEP 4: Plot ROC and PR curves
# One figure, two side-by-side panels: ROC on the left, precision-recall on the right.
fig, (roc_ax, pr_ax) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: one ROC curve per model, plus the dashed diagonal for reference.
for name, res in results.items():
    fpr, tpr, _ = roc_curve(y_test, res['y_prob'])
    roc_ax.plot(fpr, tpr, label=f"{name} (AUC={res['roc_auc']:.4f})")
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.set_title("ROC Curve")
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.legend()

# Right panel: one precision-recall curve per model.
for name, res in results.items():
    prec, rec, _ = precision_recall_curve(y_test, res['y_prob'])
    pr_ax.plot(rec, prec, label=name)
pr_ax.set_title("Precision-Recall Curve")
pr_ax.set_xlabel("Recall")
pr_ax.set_ylabel("Precision")
pr_ax.legend()

fig.tight_layout()
plt.show()

# ✅ STEP 5: Confusion Matrices
# One annotated heatmap per model, each drawn on its own figure.
for name, res in results.items():
    fig, ax = plt.subplots()
    sns.heatmap(res['cm'], annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f"Confusion Matrix - {name}")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    plt.show()

# ✅ STEP 6: Evaluation Summary
# Print the stored test-set metrics for each model, followed by a separator line.
for name, res in results.items():
    report = [
        f"📊 {name} Evaluation:",
        f"Accuracy : {res['acc']:.4f}",
        f"Precision: {res['prec']:.4f}",
        f"Recall   : {res['rec']:.4f}",
        f"F1 Score : {res['f1']:.4f}",
        f"ROC AUC  : {res['roc_auc']:.4f}",
        "-" * 40,
    ]
    print("\n".join(report))
