# CICDDoS2019 Dataset

This notebook reproduces the same supervised learning pipeline described in the paper, excluding SVM.

Models included: Random Forest, KNN, Logistic Regression, Decision Tree.


### Import Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from imblearn.over_sampling import SMOTE

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)

# Lift the column display limit so wide frames are shown with every column
# instead of being elided in the middle.
pd.options.display.max_columns = None
print("Libraries imported successfully.")

### Load the Dataset

In [None]:
# Locations probed for the CSV, in priority order, so the notebook runs both
# locally and in shared environments without editing.
candidate_paths = [
    "../DataSets/cicddos2019.csv",
    "./cicddos2019.csv",
    "/mnt/data/cicddos2019.csv",
]

# First candidate that exists on disk, or None when none of them do.
csv_path = next(
    (candidate for candidate in candidate_paths if os.path.exists(candidate)),
    None,
)

if csv_path is None:
    raise FileNotFoundError(
        "Could not find cicddos2019.csv. Put it in the same folder as this notebook, "
        "or in ../DataSets/, or update candidate_paths."
    )

# low_memory=False makes pandas read the file in one pass rather than in
# chunks, so each column gets a single consistently inferred dtype.
df = pd.read_csv(csv_path, low_memory=False)

print(f"Loaded dataset from: {csv_path}")
print(f"Original Dataset Shape: {df.shape}")
df.head()


### Data Cleaning

In [None]:
print(f"Shape before cleaning: {df.shape}")

# Strip surrounding whitespace from the column names so later lookups by name
# do not depend on how the header was padded.
df.columns = df.columns.str.strip()

# Accidental index columns such as 'Unnamed: 0'.
index_artifacts = [col for col in df.columns if col.lower().startswith('unnamed')]

# Remaining cleaning steps as a single chain:
#   - drop the accidental index columns (an empty list is a no-op),
#   - turn +/-inf into NaN so they are treated as missing,
#   - discard rows with any missing value,
#   - discard exact duplicate rows.
df = (
    df.drop(columns=index_artifacts)
      .replace([np.inf, -np.inf], np.nan)
      .dropna()
      .drop_duplicates()
)

print(f"Shape after cleaning: {df.shape}")


### Label Encoding (Binary: Benign vs Attack)

CICDDoS2019 often includes a `Class` column (Benign/Attack) and a `Label` column (attack type).
We convert `Class` to a simple binary target and store it in `Label`, overwriting the attack-type strings; `Class` is then dropped:
- 0 = Benign
- 1 = Attack


In [None]:
# Binary encoding: 0 = Benign, 1 = Attack.
#
# Source column, in order of preference:
#   1. 'Class'  (Benign/Attack) when the CSV provides it;
#   2. 'Label'  when it still holds text (anything other than "benign",
#      case-insensitive, counts as an attack);
#   3. nothing  when 'Label' is already numeric, i.e. this cell has already
#      run on this frame. This keeps the cell safe to re-execute.
if 'Class' in df.columns:
    label_source = 'Class'
elif 'Label' in df.columns and not pd.api.types.is_numeric_dtype(df['Label']):
    label_source = 'Label'
elif 'Label' in df.columns:
    label_source = None
else:
    raise KeyError(
        "Expected a 'Class' or 'Label' column to build the binary target; found neither."
    )

if label_source is None:
    # An already-numeric 'Label' is only acceptable if it is the 0/1 target.
    unexpected = set(df['Label'].unique()) - {0, 1}
    if unexpected:
        raise ValueError(
            f"'Label' is numeric but not binary (unexpected values: {sorted(unexpected)})."
        )
else:
    is_attack = df[label_source].astype(str).str.strip().str.lower() != 'benign'
    df['Label'] = is_attack.astype(int)
    # 'Class' is redundant once 'Label' holds the binary target. errors='ignore'
    # covers the fallback path where 'Class' never existed.
    df = df.drop(columns=['Class'], errors='ignore')

print("Label mapping: 0 = Benign, 1 = Attack")
print(df['Label'].value_counts().rename({0: "Benign", 1: "Attack"}))


### Feature Reduction and Train/Test Split (70/30)

In [None]:
# Display names for the two target values, reused by every distribution print.
label_names = {0: "Benign", 1: "Attack"}

y = df['Label']
# Everything except the target, restricted to numeric columns
# (this drops Timestamp if it is text).
X = df.drop(columns='Label').select_dtypes(include=[np.number])

print(f"Numeric feature count: {X.shape[1]}")
print("Class distribution (full dataset):")
print(y.value_counts().rename(label_names))

# 70/30 split; stratify=y keeps the Benign/Attack ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
    stratify=y,
)

print("\nTrain shape:", X_train.shape, "Test shape:", X_test.shape)
for split_name, split_target in (("Train", y_train), ("Test", y_test)):
    print(f"{split_name} distribution:")
    print(split_target.value_counts().rename(label_names))

### Normalization (StandardScaler)

In [None]:
# Learn the per-feature statistics from the training split only, then apply
# the same fitted scaler to both splits so no test data influences the scaling.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Normalization complete.")

### Handling Class Imbalance (SMOTE)

SMOTE is applied to the training set only, to avoid leaking information from the test set.

In [None]:
# Oversample the scaled training split only; the test split is left untouched.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

# Class counts after resampling, labelled with readable names.
balanced_counts = (
    pd.Series(y_train_balanced)
    .value_counts()
    .rename({0: "Benign", 1: "Attack"})
)

print("After SMOTE (training only):")
print("X_train_balanced shape:", X_train_balanced.shape)
print("y_train_balanced distribution:")
print(balanced_counts)

### Dimensionality Reduction using PCA

PCA is fitted on the (scaled, balanced) training features, then applied to the test set.

In [None]:
# Fit the projection on the balanced training features, then apply that same
# fitted projection to the scaled test features.
pca = PCA(n_components=0.95, random_state=42)
X_train_pca = pca.fit_transform(X_train_balanced)
X_test_pca = pca.transform(X_test_scaled)

# Figures reported below: input width, output width, variance retained.
n_original_features = X.shape[1]
n_components_kept = X_train_pca.shape[1]
variance_retained = pca.explained_variance_ratio_.sum()

print(f"Original feature count: {n_original_features}")
print(f"PCA components kept: {n_components_kept}")
print(f"Explained variance (sum): {variance_retained:.4f}")

### Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 100 trees, fixed seed for repeatable results, all available cores (n_jobs=-1).
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

print("Training Random Forest...")
rf_model.fit(X_train_pca, y_train_balanced)

print("Predicting on Test Set...")
y_pred_rf = rf_model.predict(X_test_pca)

# Score the held-out predictions. The four metric functions share the
# (y_true, y_pred) signature, so they are applied in one pass.
rf_accuracy, rf_precision, rf_recall, rf_f1 = (
    score_fn(y_test, y_pred_rf)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

print("RANDOM FOREST RESULTS")
for caption, score in (
    ("Accuracy:  ", rf_accuracy),
    ("Precision: ", rf_precision),
    ("Recall:    ", rf_recall),
    ("F1-Score:  ", rf_f1),
):
    print(f"{caption}{score:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf, target_names=["Benign", "Attack"]))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))


In [None]:
# Cross-validation on training data (optional, can take some time).
#
# The folds are cut from the raw training split and scaling, SMOTE and PCA are
# re-fitted inside every fold. Cross-validating X_train_pca / y_train_balanced
# instead would place synthetic SMOTE points (interpolated from samples in the
# validation fold) into the training folds, and would use a scaler/PCA that has
# already seen the validation rows, which inflates the scores.
# imbalanced-learn's Pipeline applies the sampler during fit only, so each
# validation fold is scored on real, un-resampled data.
from imblearn.pipeline import Pipeline as ImbPipeline

rf_cv_pipeline = ImbPipeline(steps=[
    ("scaler", StandardScaler()),
    ("smote", SMOTE(random_state=42)),
    ("pca", PCA(n_components=0.95, random_state=42)),
    # cross_val_score clones the whole pipeline for every fold, so the
    # already-fitted rf_model is not modified.
    ("model", rf_model),
])

rf_scores = cross_val_score(rf_cv_pipeline, X_train, y_train, cv=3, scoring="accuracy")
print("Random Forest CV scores:", rf_scores)
print("Mean CV accuracy:", rf_scores.mean())

### K-Nearest Neighbours (KNN) Model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Each prediction is a vote among the 5 nearest training points.
knn_model = KNeighborsClassifier(n_neighbors=5)

print("Training KNN...")
knn_model.fit(X_train_pca, y_train_balanced)

print("Predicting on Test Set...")
y_pred_knn = knn_model.predict(X_test_pca)

# Score the held-out predictions. The four metric functions share the
# (y_true, y_pred) signature, so they are applied in one pass.
knn_accuracy, knn_precision, knn_recall, knn_f1 = (
    score_fn(y_test, y_pred_knn)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

print("KNN RESULTS")
for caption, score in (
    ("Accuracy:  ", knn_accuracy),
    ("Precision: ", knn_precision),
    ("Recall:    ", knn_recall),
    ("F1-Score:  ", knn_f1),
):
    print(f"{caption}{score:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_knn, target_names=["Benign", "Attack"]))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_knn))


In [None]:
# Cross-validation on training data (optional, can take some time).
#
# The folds are cut from the raw training split and scaling, SMOTE and PCA are
# re-fitted inside every fold. Cross-validating X_train_pca / y_train_balanced
# instead would place synthetic SMOTE points (interpolated from samples in the
# validation fold) into the training folds, and would use a scaler/PCA that has
# already seen the validation rows, which inflates the scores.
# imbalanced-learn's Pipeline applies the sampler during fit only, so each
# validation fold is scored on real, un-resampled data.
from imblearn.pipeline import Pipeline as ImbPipeline

knn_cv_pipeline = ImbPipeline(steps=[
    ("scaler", StandardScaler()),
    ("smote", SMOTE(random_state=42)),
    ("pca", PCA(n_components=0.95, random_state=42)),
    # cross_val_score clones the whole pipeline for every fold, so the
    # already-fitted knn_model is not modified.
    ("model", knn_model),
])

knn_scores = cross_val_score(knn_cv_pipeline, X_train, y_train, cv=3, scoring="accuracy")
print("KNN CV scores:", knn_scores)
print("Mean CV accuracy:", knn_scores.mean())

### Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression

# Otherwise-default model; the raised iteration cap gives the solver room to converge.
lr_model = LogisticRegression(max_iter=2000, n_jobs=-1)

print("Training Logistic Regression...")
lr_model.fit(X_train_pca, y_train_balanced)

print("Predicting on Test Set...")
y_pred_lr = lr_model.predict(X_test_pca)

# Score the held-out predictions. The four metric functions share the
# (y_true, y_pred) signature, so they are applied in one pass.
lr_accuracy, lr_precision, lr_recall, lr_f1 = (
    score_fn(y_test, y_pred_lr)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

print("LOGISTIC REGRESSION RESULTS")
for caption, score in (
    ("Accuracy:  ", lr_accuracy),
    ("Precision: ", lr_precision),
    ("Recall:    ", lr_recall),
    ("F1-Score:  ", lr_f1),
):
    print(f"{caption}{score:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_lr, target_names=["Benign", "Attack"]))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_lr))


In [None]:
# Cross-validation on training data (optional, can take some time).
#
# The folds are cut from the raw training split and scaling, SMOTE and PCA are
# re-fitted inside every fold. Cross-validating X_train_pca / y_train_balanced
# instead would place synthetic SMOTE points (interpolated from samples in the
# validation fold) into the training folds, and would use a scaler/PCA that has
# already seen the validation rows, which inflates the scores.
# imbalanced-learn's Pipeline applies the sampler during fit only, so each
# validation fold is scored on real, un-resampled data.
from imblearn.pipeline import Pipeline as ImbPipeline

lr_cv_pipeline = ImbPipeline(steps=[
    ("scaler", StandardScaler()),
    ("smote", SMOTE(random_state=42)),
    ("pca", PCA(n_components=0.95, random_state=42)),
    # cross_val_score clones the whole pipeline for every fold, so the
    # already-fitted lr_model is not modified.
    ("model", lr_model),
])

lr_scores = cross_val_score(lr_cv_pipeline, X_train, y_train, cv=3, scoring="accuracy")
print("Logistic Regression CV scores:", lr_scores)
print("Mean CV accuracy:", lr_scores.mean())

### Decision Tree Model

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Default tree with a fixed seed for repeatable results.
dt_model = DecisionTreeClassifier(random_state=42)

print("Training Decision Tree...")
dt_model.fit(X_train_pca, y_train_balanced)

print("Predicting on Test Set...")
y_pred_dt = dt_model.predict(X_test_pca)

# Score the held-out predictions. The four metric functions share the
# (y_true, y_pred) signature, so they are applied in one pass.
dt_accuracy, dt_precision, dt_recall, dt_f1 = (
    score_fn(y_test, y_pred_dt)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

print("DECISION TREE RESULTS")
for caption, score in (
    ("Accuracy:  ", dt_accuracy),
    ("Precision: ", dt_precision),
    ("Recall:    ", dt_recall),
    ("F1-Score:  ", dt_f1),
):
    print(f"{caption}{score:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_dt, target_names=["Benign", "Attack"]))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_dt))


In [None]:
# Cross-validation on training data (optional, can take some time).
#
# The folds are cut from the raw training split and scaling, SMOTE and PCA are
# re-fitted inside every fold. Cross-validating X_train_pca / y_train_balanced
# instead would place synthetic SMOTE points (interpolated from samples in the
# validation fold) into the training folds, and would use a scaler/PCA that has
# already seen the validation rows, which inflates the scores.
# imbalanced-learn's Pipeline applies the sampler during fit only, so each
# validation fold is scored on real, un-resampled data.
from imblearn.pipeline import Pipeline as ImbPipeline

dt_cv_pipeline = ImbPipeline(steps=[
    ("scaler", StandardScaler()),
    ("smote", SMOTE(random_state=42)),
    ("pca", PCA(n_components=0.95, random_state=42)),
    # cross_val_score clones the whole pipeline for every fold, so the
    # already-fitted dt_model is not modified.
    ("model", dt_model),
])

dt_scores = cross_val_score(dt_cv_pipeline, X_train, y_train, cv=3, scoring="accuracy")
print("Decision Tree CV scores:", dt_scores)
print("Mean CV accuracy:", dt_scores.mean())

### Summary Table (All Models)

In [None]:
# One row per model: (name, accuracy, precision, recall, F1) on the test set.
summary_rows = [
    ("Random Forest", rf_accuracy, rf_precision, rf_recall, rf_f1),
    ("KNN", knn_accuracy, knn_precision, knn_recall, knn_f1),
    ("Logistic Regression", lr_accuracy, lr_precision, lr_recall, lr_f1),
    ("Decision Tree", dt_accuracy, dt_precision, dt_recall, dt_f1),
]
summary_columns = ["Model", "Accuracy", "Precision", "Recall", "F1"]

# Best accuracy first; the original row labels are kept by sort_values.
results_df = pd.DataFrame(summary_rows, columns=summary_columns)
results_df = results_df.sort_values("Accuracy", ascending=False)

results_df

### Optional: Accuracy Bar Plot

In [None]:
# One bar per model, in the order of results_df, on a fixed 0-1 accuracy scale.
fig, ax = plt.subplots(figsize=(7, 4))
ax.bar(results_df["Model"], results_df["Accuracy"])

# Tilt the model names so the longer ones do not overlap.
plt.setp(ax.get_xticklabels(), rotation=20, ha="right")

ax.set_ylabel("Accuracy")
ax.set_title("Accuracy Comparison (CICDDoS2019)")
ax.set_ylim(0, 1)
plt.show()