In [1]:
# Colab-only step: mount the user's Google Drive at /content/drive so that
# files stored there can be read through the regular filesystem.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
import joblib
import matplotlib.pyplot as plt

In [None]:
# Location of the CSV to analyse. The value is a placeholder and must be
# replaced with the real path of the file before this cell is run.
dataset_path = "/Change to your dataset's path, Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv"

# Read the file and strip surrounding whitespace from every column name in one
# chained expression, so later lookups by name do not depend on header padding.
df3 = pd.read_csv(dataset_path).rename(columns=str.strip)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Replace the class values in 'Label' with integer codes.
# The encoder is fitted first and applied second; the fitted object stays
# available as `label_encoder` for mapping codes back to the original values.
label_encoder = LabelEncoder()
label_encoder.fit(df3['Label'])
df3['Label'] = label_encoder.transform(df3['Label'])

In [None]:
# Important features
import pandas as pd

# Pairwise correlation between all columns of df3 (pandas default method).
correlation_matrix = df3.corr()

# Keep only the correlations with the target column 'Label', ordered from the
# strongest positive value down to the strongest negative one.
label_correlation = correlation_matrix['Label'].sort_values(ascending=False)

# Print the complete ranking without row truncation.
# option_context lifts the row limit only for the duration of the print and
# then restores the value that was configured before -- also when the print
# raises. A set_option/reset_option pair would leave the limit disabled on
# error and would reset to the library default instead of the prior setting.
with pd.option_context('display.max_rows', None):
    print(label_correlation)

Label                          1.000000
Bwd Packet Length Mean         0.603299
Avg Bwd Segment Size           0.603299
Bwd Packet Length Max          0.577323
Bwd Packet Length Std          0.576155
Packet Length Mean             0.454283
Average Packet Size            0.453472
Packet Length Std              0.443749
Max Packet Length              0.414399
Packet Length Variance         0.408089
PSH Flag Count                 0.247740
Flow IAT Std                   0.129630
Flow IAT Mean                  0.127503
Fwd IAT Max                    0.105457
Flow IAT Max                   0.100654
Fwd IAT Std                    0.099651
ACK Flag Count                 0.096975
Idle Max                       0.096775
Idle Mean                      0.087114
Idle Std                       0.078023
Idle Min                       0.061243
Subflow Bwd Bytes              0.041244
Total Length of Bwd Packets    0.041244
Fwd IAT Total                  0.040565
Active Min                     0.033821


In [None]:
# Feature columns kept for modelling. The order is significant: it becomes the
# column order of the frame the models are trained on. The list is assembled
# from small groups of related measurements purely for readability.
necessary_features = (
    # Backward packet length statistics
    ['Bwd Packet Length Mean', 'Avg Bwd Segment Size',
     'Bwd Packet Length Max', 'Bwd Packet Length Std']
    # Overall packet size statistics
    + ['Packet Length Mean', 'Average Packet Size', 'Packet Length Std',
       'Max Packet Length', 'Packet Length Variance']
    + ['PSH Flag Count']
    # Inter-arrival time (IAT) statistics
    + ['Flow IAT Std', 'Flow IAT Mean', 'Fwd IAT Max', 'Flow IAT Max',
       'Fwd IAT Std']
    + ['ACK Flag Count']
    # Idle period statistics
    + ['Idle Max', 'Idle Mean', 'Idle Std', 'Idle Min']
    # Backward volume
    + ['Subflow Bwd Bytes', 'Total Length of Bwd Packets']
    # Remaining timing / activity measurements
    + ['Fwd IAT Total', 'Active Min', 'Flow Duration', 'Active Mean',
       'Fwd IAT Mean']
)

In [None]:
# Narrow df3 down to the chosen feature columns followed by the target column.
columns_to_keep = [*necessary_features, 'Label']
df3 = df3[columns_to_keep]

In [None]:
# Show the column names currently present in df3.
remaining_columns = df3.columns
print(remaining_columns)

Index(['Bwd Packet Length Mean', 'Avg Bwd Segment Size',
       'Bwd Packet Length Max', 'Bwd Packet Length Std', 'Packet Length Mean',
       'Average Packet Size', 'Packet Length Std', 'Max Packet Length',
       'Packet Length Variance', 'PSH Flag Count', 'Flow IAT Std',
       'Flow IAT Mean', 'Fwd IAT Max', 'Flow IAT Max', 'Fwd IAT Std',
       'ACK Flag Count', 'Idle Max', 'Idle Mean', 'Idle Std', 'Idle Min',
       'Subflow Bwd Bytes', 'Total Length of Bwd Packets', 'Fwd IAT Total',
       'Active Min', 'Flow Duration', 'Active Mean', 'Fwd IAT Mean', 'Label'],
      dtype='object')


In [None]:
# Check whether the classes in 'Label' are balanced; a strong imbalance can
# bias a classifier toward the majority class.

# Number of rows per encoded class (0 = BENIGN, 1 = DDoS).
class_distribution = df3['Label'].value_counts()

# Print the distribution
print("Class Distribution:")
print(class_distribution)

# Derive the shares from the counts computed above instead of hard-coding
# them, so the percentages stay correct when the dataset or filtering changes.
# .get() yields 0 for a class that does not occur at all.
total = int(class_distribution.sum())  # Total samples
ddos_count = int(class_distribution.get(1, 0))
benign_count = int(class_distribution.get(0, 0))

# Guard against an empty frame so the cell does not fail with a division by zero.
ddos_percentage = (ddos_count / total) * 100 if total else 0.0  # Percentage of DDoS samples
benign_percentage = (benign_count / total) * 100 if total else 0.0  # Percentage of BENIGN samples
print(f"DDoS: {ddos_percentage:.2f}%")
print(f"BENIGN: {benign_percentage:.2f}%")

Class Distribution:
Label
1    128027
0     97718
Name: count, dtype: int64
DDoS: 56.71%
BENIGN: 43.29%


## **Training session**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from joblib import dump

In [None]:
# Step 1: Separate the target from the predictors.
# 'Label' is the column to predict; every other column of df3 is a feature.
y = df3['Label']  # Target
X = df3.drop('Label', axis='columns')  # Features

In [None]:
# Step 2: Hold out 20% of the rows for testing.
# random_state is fixed so the same split is produced on every run.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Step 3: Fit a Random Forest and score it on the held-out rows.
# class_weight='balanced' reweights the classes by their frequency in the
# training labels; random_state makes the forest reproducible.
rf_model = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
).fit(X_train, y_train)

# Predictions on the test split and the fraction that match the true labels.
rf_predictions = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_true=y_test, y_pred=rf_predictions)

In [None]:
# Persist the fitted Random Forest next to the notebook.
rf_model_file = 'random_forest_model.joblib'
dump(rf_model, rf_model_file)

# Report the test accuracy, first as a fraction with four decimals ...
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")

# ... and then as a percentage with full precision.
rf_accuracy_percent = rf_accuracy * 100
print(rf_accuracy_percent, "%")

Random Forest Accuracy: 0.9995
99.9490575649516 %


In [None]:
# Step 4: Train XGBoost
# scale_pos_weight compensates for class imbalance: it is the ratio of
# negative (label 0, BENIGN) to positive (label 1, DDoS) training samples.
# The counts are computed once and looked up by label value.
train_class_counts = y_train.value_counts()
scale_pos_weight = train_class_counts[0] / train_class_counts[1]  # BENIGN/DDoS ratio

# `use_label_encoder` is intentionally not passed: the installed XGBoost
# ignores it and only emits a "Parameters: { "use_label_encoder" } are not
# used." warning. The labels are already integer-encoded before training.
xgb_model = XGBClassifier(
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Predictions on the test split and the fraction that match the true labels.
xgb_predictions = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test, xgb_predictions)

Parameters: { "use_label_encoder" } are not used.



In [None]:
# Persist the fitted XGBoost model next to the notebook.
xgb_model_file = 'xgboost_model.joblib'
dump(xgb_model, xgb_model_file)

# Report the test accuracy as a four-decimal fraction, then as a percentage.
print(f"XGBoost Accuracy: {xgb_accuracy:.4f}")
xgb_accuracy_percent = xgb_accuracy * 100
print(xgb_accuracy_percent, "%")

XGBoost Accuracy: 0.9996
99.95791711887307 %
