In [None]:
# 📦 Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.utils import resample
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve
)
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

# Seed NumPy's global random generator so NumPy-driven randomness repeats across runs.
np.random.seed(42)


In [None]:
# 📂 Load dataset
url = "https://raw.githubusercontent.com/Hushpuppyzac/DLI-Assignment/main/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv"
df = pd.read_csv(url)
# Strip surrounding whitespace from every header. After this no column name can
# start with a space, so a separate ' Label' -> 'Label' rename is not needed.
df.columns = df.columns.str.strip()

# Treat +/-inf as missing, then drop rows with missing values and duplicate rows.
df = (
    df.replace([np.inf, -np.inf], np.nan)
    .dropna()
    .drop_duplicates()
)

# Drop constant columns (a single distinct value carries no signal for a classifier).
constant_cols = [col for col in df.columns if df[col].nunique() == 1]
df = df.drop(columns=constant_cols)

# Encode labels: keep only BENIGN / DDoS rows, then map DDoS -> 1 and BENIGN -> 0.
# .copy() detaches the filtered frame so the assignment below does not write into
# a slice of the previous frame (which pandas reports as SettingWithCopyWarning).
df = df[df['Label'].isin(['BENIGN', 'DDoS'])].copy()
df['Label'] = (df['Label'] == 'DDoS').astype('int64')


In [None]:
# Target vector and numeric-only feature matrix.
y = df['Label']
X = df.drop(columns='Label').select_dtypes(include=[np.number])

# 80/20 hold-out split, stratified on the label so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [None]:
# Balance training data via undersampling
df_train = pd.concat([X_train, y_train], axis=1)

# Pick the majority class from the observed counts instead of assuming it is
# label 1: resample(replace=False) raises ValueError when asked for more rows
# than the frame holds, which would happen if the hard-coded class were the
# smaller one.
major_label = df_train['Label'].value_counts().idxmax()
df_major = df_train[df_train['Label'] == major_label]
df_minor = df_train[df_train['Label'] != major_label]

# Downsample the majority class to the minority size, then shuffle the rows.
df_major_down = resample(df_major, replace=False, n_samples=len(df_minor), random_state=42)
df_balanced = pd.concat([df_major_down, df_minor]).sample(frac=1, random_state=42)

X_train_bal = df_balanced.drop('Label', axis=1)
y_train_bal = df_balanced['Label']

# Feature engineering
def extract_features(df_input, eps=1e-5):
    """Return a copy of ``df_input`` with five derived packet/flow features appended.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Must contain the columns 'Max Packet Length', 'Min Packet Length',
        'Packet Length Mean', 'Total Length of Fwd Packets',
        'Total Length of Bwd Packets', 'Flow Duration', 'Total Fwd Packets',
        'Total Backward Packets' and 'Average Packet Size'.
        The input frame is not modified.
    eps : float, default 1e-5
        Constant added to every denominator to guard against an exactly-zero
        divisor. The default reproduces the previously hard-coded value.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by 'pkt_length_diff',
        'pkt_length_var_ratio', 'byte_ratio', 'duration_per_packet' and
        'avg_to_max_ratio'. The four ratio columns are rounded to 3 decimals.

    Raises
    ------
    KeyError
        If any required column is missing from ``df_input``.
    """
    max_len = df_input['Max Packet Length']
    total_packets = df_input['Total Fwd Packets'] + df_input['Total Backward Packets']

    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    return df_input.assign(
        pkt_length_diff=max_len - df_input['Min Packet Length'],
        pkt_length_var_ratio=(max_len / (df_input['Packet Length Mean'] + eps)).round(3),
        byte_ratio=(
            df_input['Total Length of Fwd Packets']
            / (df_input['Total Length of Bwd Packets'] + eps)
        ).round(3),
        duration_per_packet=(df_input['Flow Duration'] / (total_packets + eps)).round(3),
        avg_to_max_ratio=(df_input['Average Packet Size'] / (max_len + eps)).round(3),
    )

# Derive the engineered features for the balanced training set and the untouched test set.
X_train_f, X_test_f = (
    extract_features(part) for part in (X_train_bal, X_test)
)


In [None]:
# Standardise the numeric features. The scaler is fitted on the training data
# only, and the same fitted transform is then applied to the test data.
scaler = StandardScaler()
cols = X_train_f.select_dtypes(include=[np.number]).columns
scaler.fit(X_train_f[cols])

# scaler.transform() output is wrapped back into DataFrames here. Passing the
# original index keeps the scaled rows labelled the same way as y_train_bal /
# y_test, so index-based operations (joins, boolean masks built from the
# labels) stay aligned.
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train_f[cols]), columns=cols, index=X_train_f.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test_f[cols]), columns=cols, index=X_test_f.index
)


In [None]:
# Candidate classifiers, keyed by the display name used in plots and reports.
models = {
    "Decision Tree": DecisionTreeClassifier(
        criterion='entropy',
        max_depth=5,
        min_samples_split=50,
        min_samples_leaf=20,
        class_weight='balanced',
        random_state=42,
    ),
    "Naive Bayes": GaussianNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
}

# Metrics computed from hard predictions, keyed by their field name in `results`.
label_metrics = {
    "acc": accuracy_score,
    "prec": precision_score,
    "rec": recall_score,
    "f1": f1_score,
}

# Fit each model on the balanced training set and score it on the held-out test set.
results = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train_bal)
    y_pred = model.predict(X_test_scaled)

    # Positive-class probability, when the estimator exposes one.
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test_scaled)[:, 1]
    else:
        y_prob = None

    entry = {"model": model, "y_pred": y_pred, "y_prob": y_prob}
    entry.update({key: metric(y_test, y_pred) for key, metric in label_metrics.items()})
    entry["roc_auc"] = None if y_prob is None else roc_auc_score(y_test, y_prob)
    entry["cm"] = confusion_matrix(y_test, y_pred)
    results[name] = entry


In [None]:
# ROC and PR Curves
fig, (roc_ax, pr_ax) = plt.subplots(1, 2, figsize=(15, 5))

# Draw both curves for every model that produced probability scores.
for name, res in results.items():
    if res['y_prob'] is None:
        continue
    fpr, tpr, _ = roc_curve(y_test, res['y_prob'])
    roc_ax.plot(fpr, tpr, label=f"{name} (AUC={res['roc_auc']:.4f})")
    prec, rec, _ = precision_recall_curve(y_test, res['y_prob'])
    pr_ax.plot(rec, prec, label=f"{name}")

# Left panel: ROC, with the chance diagonal for reference.
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.set_title("ROC Curve")
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.legend()

# Right panel: precision-recall.
pr_ax.set_title("Precision-Recall Curve")
pr_ax.set_xlabel("Recall")
pr_ax.set_ylabel("Precision")
pr_ax.legend()

fig.tight_layout()
plt.show()


In [None]:
# One annotated confusion-matrix heatmap per model.
for name, res in results.items():
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(res["cm"], annot=True, fmt='d', cmap="Blues", ax=ax)
    ax.set_title(f"Confusion Matrix - {name}")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    plt.show()


In [None]:
# Text summary of the test-set metrics for each model.
for name, res in results.items():
    report = [
        f"📊 {name} Evaluation:",
        f"Accuracy:  {res['acc']:.4f}",
        f"Precision: {res['prec']:.4f}",
        f"Recall:    {res['rec']:.4f}",
        f"F1 Score:  {res['f1']:.4f}",
    ]
    # ROC AUC is only stored when the model produced probability scores.
    if res['roc_auc'] is not None:
        report.append(f"ROC AUC:   {res['roc_auc']:.4f}")
    report.append("-" * 40)
    print(*report, sep="\n")
