In [None]:
# 📦 Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, RocCurveDisplay
from IPython.display import display

# Seed NumPy's global random number generator so that any NumPy-based
# randomness in this notebook is repeatable from run to run.
SEED = 42
np.random.seed(SEED)


In [None]:
# 📂 Load the dataset
# The CSV is fetched over HTTP straight from the GitHub raw URL below, so this
# cell needs network access.
url = "https://raw.githubusercontent.com/Hushpuppyzac/DLI-Assignment/main/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Report the (rows, columns) shape and preview the first five rows.
print(f"✅ Data loaded. Shape: {df.shape}")
display(df.head(n=5))


In [None]:
# Clean the loaded flow table and turn the 'Label' column into a 0/1 target.
#
# NOTE: this cell rebinds `df` and replaces the string labels with integers, so
# it is not re-runnable on its own output (the BENIGN/DDoS filter below would
# match nothing). Re-run the load cell first if this cell must be executed again.

# Strip surrounding whitespace from every header. This already turns a raw
# ' Label' header into 'Label', so no separate rename step is required.
df.columns = df.columns.str.strip()

# Treat +/-inf as missing, then drop rows with any missing value and rows that
# are exact duplicates of an earlier row.
df = (
    df.replace([np.inf, -np.inf], np.nan)
      .dropna()
      .drop_duplicates()
)

# Drop columns that hold a single distinct value across the remaining rows.
constant_cols = [col for col in df.columns if df[col].nunique() == 1]
df = df.drop(columns=constant_cols)

# Keep only the two classes of interest. The explicit .copy() makes the result
# an independent frame, so the column assignment below does not write into a
# slice of another frame (which is what raises pandas' SettingWithCopyWarning).
df = df[df['Label'].isin(['BENIGN', 'DDoS'])].copy()

# Encode the target: 'DDoS' -> 1, everything else that survived the filter
# ('BENIGN') -> 0. Vectorised comparison instead of a per-row lambda.
df['Label'] = (df['Label'] == 'DDoS').astype('int64')


In [None]:
# Target vector: the encoded 'Label' column.
y = df['Label']

# Feature matrix: every remaining column except the target, numeric dtypes only.
X = df.drop(columns='Label').select_dtypes(include=[np.number])

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
# Balance the training partition by randomly under-sampling the larger class
# down to the size of the smaller one. Only training rows are touched; the test
# partition keeps its original class proportions.
df_train = pd.concat([X_train, y_train], axis=1)

rows_label_1 = df_train[df_train['Label'] == 1]
rows_label_0 = df_train[df_train['Label'] == 0]

# Decide which class is the majority from the actual row counts instead of
# assuming it is label 1: resample(replace=False) raises ValueError when asked
# for more rows than the frame holds, which is what would happen if label 0
# were the larger class. On a tie label 1 is treated as the majority.
if len(rows_label_1) >= len(rows_label_0):
    df_majority, df_minority = rows_label_1, rows_label_0
else:
    df_majority, df_minority = rows_label_0, rows_label_1

# Draw, without replacement, as many majority rows as there are minority rows.
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=42,
)

# Recombine, shuffle the row order (frac=1 keeps every row), and renumber.
df_balanced = (
    pd.concat([df_majority_downsampled, df_minority])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

X_train_balanced = df_balanced.drop('Label', axis=1)
y_train_balanced = df_balanced['Label']


In [None]:
def extract_features(df_input):
    """Add five derived flow features to ``df_input`` and return it.

    The frame is modified in place (the returned object is the same frame that
    was passed in), so callers that need the original untouched should pass a
    copy.

    Columns read: 'Max Packet Length', 'Min Packet Length',
    'Packet Length Mean', 'Total Length of Fwd Packets',
    'Total Length of Bwd Packets', 'Flow Duration', 'Total Fwd Packets',
    'Total Backward Packets', 'Average Packet Size'.

    Columns added, in this order:
        pkt_length_diff      -- max packet length minus min packet length
        pkt_length_var_ratio -- max packet length / packet length mean
        byte_ratio           -- forward bytes / backward bytes
        duration_per_packet  -- flow duration / (fwd + bwd packet count)
        avg_to_max_ratio     -- average packet size / max packet length

    Every ratio has a small constant added to its denominator (presumably to
    avoid division by zero) and is rounded to three decimals.
    """
    eps = 1e-5

    def rounded_ratio(numerator, denominator):
        # Shared form of all four ratio features.
        return (numerator / (denominator + eps)).round(3)

    df_input['pkt_length_diff'] = (
        df_input['Max Packet Length'] - df_input['Min Packet Length']
    )
    df_input['pkt_length_var_ratio'] = rounded_ratio(
        df_input['Max Packet Length'], df_input['Packet Length Mean']
    )
    df_input['byte_ratio'] = rounded_ratio(
        df_input['Total Length of Fwd Packets'],
        df_input['Total Length of Bwd Packets'],
    )

    flow_duration = df_input['Flow Duration']
    packets_total = df_input['Total Fwd Packets'] + df_input['Total Backward Packets']
    df_input['duration_per_packet'] = rounded_ratio(flow_duration, packets_total)

    df_input['avg_to_max_ratio'] = rounded_ratio(
        df_input['Average Packet Size'], df_input['Max Packet Length']
    )
    return df_input

# Derive the extra features for both partitions. Each frame is copied first
# because extract_features writes its new columns into the frame it is given.
X_train_featured, X_test_featured = (
    extract_features(frame.copy()) for frame in (X_train_balanced, X_test)
)


In [None]:
# Standardise the numeric columns. The scaler's statistics are learned from the
# training frame only and then applied unchanged to both partitions.
num_cols = X_train_featured.select_dtypes(include=[np.number]).columns.tolist()
scaler = StandardScaler().fit(X_train_featured[num_cols])

# transform() returns a plain array, so rewrap each result as a DataFrame that
# keeps the column names and the row index of the frame it came from.
X_train_scaled, X_test_scaled = (
    pd.DataFrame(scaler.transform(frame[num_cols]), columns=num_cols, index=frame.index)
    for frame in (X_train_featured, X_test_featured)
)


In [None]:
# Fit a decision tree (depth capped at 10, at least 10 samples per leaf) on the
# balanced, scaled training data. fit() returns the estimator itself.
dt_model = DecisionTreeClassifier(
    max_depth=10,
    min_samples_leaf=10,
    random_state=42,
).fit(X_train_scaled, y_train_balanced)

# Hard class predictions for the threshold-based metrics, and the predicted
# probability of class 1 (second column of predict_proba) for ROC-AUC.
y_pred = dt_model.predict(X_test_scaled)
y_prob = dt_model.predict_proba(X_test_scaled)[:, 1]

# Score against the untouched test labels.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
prec = precision_score(y_true=y_test, y_pred=y_pred)
rec = recall_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred)
roc_auc = roc_auc_score(y_true=y_test, y_score=y_prob)

print(f"✅ Accuracy: {acc:.4f}")
print(f"✅ Precision: {prec:.4f}")
print(f"✅ Recall: {rec:.4f}")
print(f"✅ F1 Score: {f1:.4f}")
print(f"✅ ROC-AUC: {roc_auc:.4f}")

# Rows are true classes, columns are predicted classes.
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# ROC curve computed from the fitted estimator on the test partition.
RocCurveDisplay.from_estimator(estimator=dt_model, X=X_test_scaled, y=y_test)
plt.title("ROC Curve - Decision Tree")
plt.grid(True)
plt.show()
