# **Download Datasets**

In [27]:
# ! wget http://cicresearch.ca/CICDataset/CICDDoS2019/Dataset/CSVs/CSV-01-12.zip

# # Unzip zips and delete redundant files
# ! unzip CSV-01-12.zip && rm CSV-01-12.zip
# ! mv 01-12/DrDoS_SSDP.csv .
# ! rm -rf 01-12/

In [28]:
import kagglehub

import pandas as pd
import numpy as np
import os
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
# import joblib
# import matplotlib.pyplot as plt
# import seaborn as sns

In [29]:
# Download latest version
repository = "rodrigorosasilva/cic-ddos2019-30gb-full-dataset-csv-files"
csv_file_path = "01-12/DrDoS_SSDP.csv"
path = kagglehub.dataset_download(repository, path=csv_file_path)

# **Data Exploration**

In [30]:
csv_file_path = "/kaggle/input/cic-ddos2019-30gb-full-dataset-csv-files/01-12/DrDoS_SSDP.csv"

# always include low_memory=False when loading large network flow CSVs
df = pd.read_csv(csv_file_path, low_memory=False)

print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2611374 entries, 0 to 2611373
Data columns (total 88 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   Unnamed: 0                    int64  
 1   Flow ID                       object 
 2    Source IP                    object 
 3    Source Port                  int64  
 4    Destination IP               object 
 5    Destination Port             int64  
 6    Protocol                     int64  
 7    Timestamp                    object 
 8    Flow Duration                int64  
 9    Total Fwd Packets            int64  
 10   Total Backward Packets       int64  
 11  Total Length of Fwd Packets   float64
 12   Total Length of Bwd Packets  float64
 13   Fwd Packet Length Max        float64
 14   Fwd Packet Length Min        float64
 15   Fwd Packet Length Mean       float64
 16   Fwd Packet Length Std        float64
 17  Bwd Packet Length Max         float64
 18   Bwd Packet Length Min

In [31]:
print("======== All Columns ========")
for col in df.columns:
    print(col.strip())
print()

print("======== Label Values ========")
df[' Label'].unique()

Unnamed: 0
Flow ID
Source IP
Source Port
Destination IP
Destination Port
Protocol
Timestamp
Flow Duration
Total Fwd Packets
Total Backward Packets
Total Length of Fwd Packets
Total Length of Bwd Packets
Fwd Packet Length Max
Fwd Packet Length Min
Fwd Packet Length Mean
Fwd Packet Length Std
Bwd Packet Length Max
Bwd Packet Length Min
Bwd Packet Length Mean
Bwd Packet Length Std
Flow Bytes/s
Flow Packets/s
Flow IAT Mean
Flow IAT Std
Flow IAT Max
Flow IAT Min
Fwd IAT Total
Fwd IAT Mean
Fwd IAT Std
Fwd IAT Max
Fwd IAT Min
Bwd IAT Total
Bwd IAT Mean
Bwd IAT Std
Bwd IAT Max
Bwd IAT Min
Fwd PSH Flags
Bwd PSH Flags
Fwd URG Flags
Bwd URG Flags
Fwd Header Length
Bwd Header Length
Fwd Packets/s
Bwd Packets/s
Min Packet Length
Max Packet Length
Packet Length Mean
Packet Length Std
Packet Length Variance
FIN Flag Count
SYN Flag Count
RST Flag Count
PSH Flag Count
ACK Flag Count
URG Flag Count
CWE Flag Count
ECE Flag Count
Down/Up Ratio
Average Packet Size
Avg Fwd Segment Size
Avg Bwd Segment Size


array(['DrDoS_SSDP', 'BENIGN'], dtype=object)

# **Data Cleaning**

In [32]:
# Strip whitespace from all column names
df.columns = df.columns.str.strip()


# Handle missing values
print("Missing values before handling: ")
missing_values_initial = df.isnull().sum()
print(missing_values_initial[missing_values_initial > 0])


# Impute missing numerical values with median
for col in df.select_dtypes(include=np.number).columns:
    if df[col].isnull().any():
        df[col].fillna(df[col].median(), inplace=True)


print("Missing values after numerical imputation:")
missing_values_after_imputation = df.isnull().sum()
print(missing_values_after_imputation[missing_values_after_imputation > 0])

Missing values before handling: 
Flow Bytes/s    2
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


Missing values after numerical imputation:
Series([], dtype: int64)


## Manual Feature Reduction

In [33]:
# # Drop irrelevant columns
# columns_to_drop = [
#     'Unnamed: 0', # Redundant index column
#     'Flow ID',
#     'Source IP',
#     'Destination IP',
#     'Timestamp',
#     'SimillarHTTP', # Still problematic/irrelevant due to object Dtype and low utility
#     'Fwd Header Length.1' # Drop one of the duplicate 'Fwd Header Length'
# ]

# # Ensure columns exist before dropping
# existing_columns_to_drop = [col for col in columns_to_drop if col in df.columns]
# df.drop(columns=existing_columns_to_drop, inplace=True)
# print(f"\nDropped columns: {existing_columns_to_drop}")

In [34]:
# Excluded Source IP and Destination IP as it's common practice to do so to
# prevent the model from simply memorizing IP addresses

# Define features that can be extracted from a single packet's headers
# Note: We use statistical proxies like 'Packet Length Mean' because the dataset is flow-based.
# We also keep 'Label' for training the model.
features_to_keep = [
    'Source Port',
    'Destination Port',
    'Protocol',
    # 'Min Packet Length',
    # 'Max Packet Length',
    # 'Packet Length Mean',
    # 'Fwd Header Length',
    # 'Bwd Header Length',
    # 'FIN Flag Count',
    # 'SYN Flag Count',
    # 'RST Flag Count',
    # 'PSH Flag Count',
    # 'ACK Flag Count',
    # 'URG Flag Count',
    'Label'
]

# Filter the DataFrame to only include the desired columns
# First, find which of our desired columns actually exist in the DataFrame
existing_columns = [col for col in features_to_keep if col in df.columns]

# Create a new DataFrame with only those columns
df = df[existing_columns]

print(f"Dataframe now has {len(df.columns)} columns: {list(df.columns)}")


Dataframe now has 4 columns: ['Source Port', 'Destination Port', 'Protocol', 'Label']


## Replace negative and positive infinities with NaN

In [35]:
# Handle infinite values (often appear due to division by zero, e.g., idle time with no packets)
df.replace([np.inf, -np.inf], np.nan, inplace=True)

## Extract Features and Target

In [36]:
X = df.drop('Label', axis=1)
y = df['Label']

## Drop rows with Nan

In [37]:
df.dropna(inplace=True)

## Fix Label Inconsistencies

In [38]:
# # Let's explicitly map 'BENIGN' to 'Benign' for cleaner labels in reports.
y.replace({'BENIGN': 'Benign'}, inplace=True)

print(y.unique())
print(df['Label'].unique())

['DrDoS_SSDP' 'Benign']
['DrDoS_SSDP' 'Benign']


# **Training the Model**

## Apply Label Encoding

In [39]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)

print(f"Encoded labels mapping: {list(le.classes_)} -> {le.transform(le.classes_)}")
print(f"Sample of original labels (after mapping): {y.head().tolist()}")
print(f"Sample of encoded labels: {y_encoded[:5].tolist()}")

Encoded labels mapping: ['Benign', 'DrDoS_SSDP'] -> [0 1]
Sample of original labels (after mapping): ['DrDoS_SSDP', 'DrDoS_SSDP', 'DrDoS_SSDP', 'DrDoS_SSDP', 'DrDoS_SSDP']
Sample of encoded labels: [1, 1, 1, 1, 1]


## Create Train-Test Split

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded)

print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")
print(f"Training labels distribution: {np.bincount(y_train)}")
print(f"Testing labels distribution: {np.bincount(y_test)}")

Training data shape: (1827961, 3)
Testing data shape: (783413, 3)
Training labels distribution: [    534 1827427]
Testing labels distribution: [   229 783184]


## Train the DT

In [41]:
# Initialize the Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)

start = time.time()
dt_model.fit(X_train, y_train)
print(f"Decision Tree training complete in {time.time() - start}s")

Decision Tree training complete in 2.7051868438720703s


## Benchmarking the model

In [42]:
# Assuming your trained model is named 'dt_model'
y_pred = dt_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)


target_names = le.classes_

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=target_names))

Accuracy: 1.0000
Confusion Matrix:
[[   229      0]
 [    10 783174]]

Classification Report:
              precision    recall  f1-score   support

      Benign       0.96      1.00      0.98       229
  DrDoS_SSDP       1.00      1.00      1.00    783184

    accuracy                           1.00    783413
   macro avg       0.98      1.00      0.99    783413
weighted avg       1.00      1.00      1.00    783413



# **Export the model to PPML**

In [43]:
!pip install nyoka



In [44]:
from nyoka import skl_to_pmml
from sklearn.pipeline import Pipeline


pipeline_obj = Pipeline([
    ("model", dt_model)
])


feature_names = list(X_train.columns)

skl_to_pmml(
    pipeline=pipeline_obj,
    col_names=feature_names,
    target_name='your_target_column_name', # Replace with your actual target name
    pmml_f_name='SSDP_flood_detection.pmml'
)

print("Model successfully exported to SSDP_flood_detection.pmml")



Model successfully exported to SSDP_flood_detection.pmml


# Redundant


In [45]:
import numpy as np

print("Number of inf/-inf values BEFORE replacement:")
initial_infs = np.isinf(df.select_dtypes(include=np.number)).sum().sum()
print(f"Total numerical infs: {initial_infs}")

# --- Perform the replacement ---
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# After replacement:
print("\nNumber of inf/-inf values AFTER replacement:")
after_infs = np.isinf(df.select_dtypes(include=np.number)).sum().sum()
print(f"Total numerical infs: {after_infs}")


# Impute missing numerical values with median
for col in df.select_dtypes(include=np.number).columns:
    if df[col].isnull().any():
        df[col].fillna(df[col].median(), inplace=True)

# Impute missing numerical values with median
for col in df.select_dtypes(include=np.number).columns:
    if df[col].isnull().any():
        df[col].fillna(df[col].median(), inplace=True)

Number of inf/-inf values BEFORE replacement:
Total numerical infs: 0

Number of inf/-inf values AFTER replacement:
Total numerical infs: 0
