In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored there can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Data Analysis Process**

## **Step 1: Import Libraries**

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset

## **Step 2: Analyze data**

In [None]:
# Load the merged flow CSV from the mounted Drive and preview the first rows.
# NOTE(review): the saved output echoes the read_csv line, which looks like a
# pandas warning was emitted here -- confirm whether dtypes should be specified.
csv_path = "/content/drive/MyDrive/security/merged.csv"
df = pd.read_csv(csv_path)
df.head()

  df = pd.read_csv("/content/drive/MyDrive/security/merged.csv")


Unnamed: 0.1,Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,SimillarHTTP,Inbound,Label
0,21243,192.168.50.6-23.194.142.213-57215-443-6,192.168.50.6,57215,23.194.142.213,443,6,2018-12-01 11:22:40.970628,20740,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,BENIGN
1,18666,192.168.50.253-224.0.0.5-0-0-0,192.168.50.253,0,224.0.0.5,0,0,2018-12-01 11:22:41.955234,5,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,BENIGN
2,15346,8.0.6.4-8.6.0.1-0-0-0,8.6.0.1,0,8.0.6.4,0,0,2018-12-01 11:22:43.923507,2,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,BENIGN
3,160,192.168.50.254-224.0.0.5-0-0-0,192.168.50.254,0,224.0.0.5,0,0,2018-12-01 11:22:48.452989,3,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,BENIGN
4,19686,192.168.50.8-125.56.201.105-59307-80-6,192.168.50.8,59307,125.56.201.105,80,6,2018-12-01 11:22:49.599759,173888,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,BENIGN


In [None]:
# Count how many rows belong to each traffic class (the column name has a
# leading space in this dataset).
rows_by_label = df.groupby(' Label')
num_of_label = rows_by_label.size()
num_of_label

Unnamed: 0_level_0,0
Label,Unnamed: 1_level_1
BENIGN,24701
DrDoS_LDAP,148198
DrDoS_NTP,123591
DrDoS_NetBIOS,145922
DrDoS_SSDP,146107
DrDoS_UDP,131730
Syn,138539
UDP-lag,132720
WebDDoS,413


## **Step 3: Encode Label Column**

In [None]:
# Collapse the traffic classes into a binary target: 0 = BENIGN, 1 = anything else.
# The dtype guard makes this cell safe to re-run: once the column holds
# integers, no value equals 'BENIGN', so an unguarded second run would
# relabel every row (including benign traffic) as 1.
if not pd.api.types.is_numeric_dtype(df[' Label']):
    df[' Label'] = (df[' Label'] != 'BENIGN').astype(int)

In [None]:
# ddos_type_map = {}
# for i in range(len(num_of_label)):
#     ddos_type_map[num_of_label.index[i]] = i
# ddos_type_map

{'BENIGN': 0,
 'DrDoS_LDAP': 1,
 'DrDoS_NTP': 2,
 'DrDoS_NetBIOS': 3,
 'DrDoS_SSDP': 4,
 'DrDoS_UDP': 5,
 'Syn': 6,
 'UDP-lag': 7,
 'WebDDoS': 8}

In [None]:
# df[' Label'] = df[' Label'].map(ddos_type_map)
# df.sample(5)

Unnamed: 0.1,Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,SimillarHTTP,Inbound,Label
304957,24715,172.16.0.5-192.168.50.1-46909-16711-17,172.16.0.5,46909,192.168.50.1,16711,17,2018-12-01 11:47:10.638302,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,3
974769,9197,172.16.0.5-192.168.50.1-37336-37336-6,172.16.0.5,37336,192.168.50.1,37336,6,2018-12-01 13:29:50.702139,33699367,4,...,0.0,1.0,1.0,33699360.0,0.0,33699365.0,33699365.0,0,1,7
239233,5161,172.16.0.5-192.168.50.1-967-34591-17,172.16.0.5,967,192.168.50.1,34591,17,2018-12-01 10:36:30.532894,1851,30,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,2
756568,636612,172.16.0.5-192.168.50.1-19843-19843-6,172.16.0.5,19843,192.168.50.1,19843,6,2018-12-01 13:30:34.842988,111747983,16,...,26.019224,49.0,1.0,15963980.0,4682704.0,23177260.0,12074480.0,0,1,6
175577,3814,172.16.0.5-192.168.50.1-634-37674-17,172.16.0.5,634,192.168.50.1,37674,17,2018-12-01 10:36:09.212344,989,28,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,2


### **Prioritizing Numerical Columns**

- Categorical columns require encoding techniques like:
  > **One-Hot Encoding**: Expands dimensions, leading to high sparsity.<br>
  > **Label Encoding**: Imposes false ordinal relationships between categories.
- Numeric columns skip this step, saving preprocessing time and reducing the risk of encoding errors.
- ML and DL models operate on matrices and tensors filled with numbers.
- Numeric data is already in the required format, enabling faster processing without additional preprocessing.
- **Random Forest** calculates splits based on feature values, and **ANNs** perform weighted sums and activations—both rely on numeric inputs.


In [None]:
# Names of every column with a numeric dtype; non-numeric columns are ignored
# from here on.
numeric_frame = df.select_dtypes(include=np.number)
numeric_cols = numeric_frame.columns.tolist()
numeric_cols

['Unnamed: 0',
 ' Source Port',
 ' Destination Port',
 ' Protocol',
 ' Flow Duration',
 ' Total Fwd Packets',
 ' Total Backward Packets',
 'Total Length of Fwd Packets',
 ' Total Length of Bwd Packets',
 ' Fwd Packet Length Max',
 ' Fwd Packet Length Min',
 ' Fwd Packet Length Mean',
 ' Fwd Packet Length Std',
 'Bwd Packet Length Max',
 ' Bwd Packet Length Min',
 ' Bwd Packet Length Mean',
 ' Bwd Packet Length Std',
 'Flow Bytes/s',
 ' Flow Packets/s',
 ' Flow IAT Mean',
 ' Flow IAT Std',
 ' Flow IAT Max',
 ' Flow IAT Min',
 'Fwd IAT Total',
 ' Fwd IAT Mean',
 ' Fwd IAT Std',
 ' Fwd IAT Max',
 ' Fwd IAT Min',
 'Bwd IAT Total',
 ' Bwd IAT Mean',
 ' Bwd IAT Std',
 ' Bwd IAT Max',
 ' Bwd IAT Min',
 'Fwd PSH Flags',
 ' Bwd PSH Flags',
 ' Fwd URG Flags',
 ' Bwd URG Flags',
 ' Fwd Header Length',
 ' Bwd Header Length',
 'Fwd Packets/s',
 ' Bwd Packets/s',
 ' Min Packet Length',
 ' Max Packet Length',
 ' Packet Length Mean',
 ' Packet Length Std',
 ' Packet Length Variance',
 'FIN Flag Count'

Counterpoint Handling:

- If categorical features are needed, convert them into numeric values first, e.g., target encoding or embedding layers in DL.

In [None]:
# Features: every numeric column except the target. Target: the binary label.
# NOTE(review): numeric_cols includes 'Unnamed: 0', which looks like a leftover
# CSV index column -- confirm that it is meant to be used as a feature.
y = df[' Label']
X = df.loc[:, numeric_cols].drop(columns=' Label')

## **Step 4: Split Dataset**

Split dataset into **train** and **test** sets

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Treat +/-inf as missing, then discard every row with a missing feature value.
non_finite = [np.inf, -np.inf]
X_train = X_train.replace(non_finite, np.nan).dropna()
X_test = X_test.replace(non_finite, np.nan).dropna()

# Keep only the labels whose feature rows survived the cleaning above.
y_train = y_train.loc[X_train.index]
y_test = y_test.loc[X_test.index]

# Standardise the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# **Why use Random Forest?**

Random Forest (RF) is a tree-based ensemble method that excels at feature selection and preprocessing.

- Feature Importance Selection:
  > RF identifies the most important features (e.g., packet rates, flow durations) for classification, reducing dimensionality and noise.
- Initial Classification:
  > It provides a baseline classification that is interpretable and fast.
- Noise Reduction:
  > RF is relatively robust to overfitting, and keeping only its top-ranked features removes noisy inputs before the ANN is trained.
- Feature Engineering:
  > RF highlights patterns and relationships in the data that might not be obvious.

In [None]:
# Fit a 100-tree random forest on the scaled training data and use its
# feature importances to pick the 10 strongest columns.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

importances = rf.feature_importances_
ascending_rank = np.argsort(importances)
importances_features = ascending_rank[-10:]  # last 10 = highest importance

# Note: this overwrites X_train / X_test with the reduced matrices, so
# re-running this cell without re-running the split cell would fit the
# forest on the already-reduced data.
X_train, X_test = X_train[:, importances_features], X_test[:, importances_features]

In [None]:
# Show the per-feature importance scores taken from the fitted forest.
importances

array([2.99751782e-03, 4.76584755e-02, 1.10352826e-01, 3.76417730e-03,
       8.75370649e-03, 7.05353844e-03, 4.87131279e-03, 1.58933915e-03,
       4.40299302e-02, 2.65086912e-03, 3.58873860e-03, 6.18379695e-03,
       2.12291190e-03, 5.06347488e-02, 1.93749601e-02, 3.37408643e-02,
       1.07311415e-03, 6.73336118e-03, 4.85065610e-03, 4.78818127e-03,
       1.02646004e-02, 7.34126306e-03, 2.52353612e-03, 3.86013217e-03,
       3.42687862e-03, 1.62780146e-03, 3.40479305e-03, 7.60900351e-04,
       1.31219477e-02, 5.83690403e-03, 9.63320988e-04, 9.53971861e-03,
       7.02896633e-03, 8.87821364e-03, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.06588090e-02, 5.06980775e-03, 3.55365709e-03,
       2.81841598e-02, 3.56316787e-03, 1.07256025e-02, 4.10145899e-03,
       8.41030391e-03, 5.64779979e-03, 0.00000000e+00, 6.31289698e-04,
       1.06248839e-02, 0.00000000e+00, 2.90353858e-03, 8.04005375e-02,
       1.33177108e-02, 0.00000000e+00, 3.16376805e-02, 3.75690032e-03,
      

In [None]:
# Column positions of the 10 selected features, ordered from least to most important.
importances_features

array([54, 15, 70,  8,  1, 13, 57, 51,  2, 81])

In [None]:
# Translate the selected column positions back into column names, one per line.
selected_names = [numeric_cols[position] for position in importances_features]
for feature_name in selected_names:
    print(feature_name)

 Down/Up Ratio
 Bwd Packet Length Mean
 Init_Win_bytes_backward
 Total Length of Bwd Packets
 Source Port
Bwd Packet Length Max
 Avg Bwd Segment Size
 URG Flag Count
 Destination Port
 Inbound


# **Why use ANN?**

ANNs (Artificial Neural Networks) are deep learning models that are excellent for capturing non-linear patterns and complex relationships in data.

- Learning Non-linear Dependencies:
  > ANN can learn non-linear relationships between input features, making it ideal for DDoS attacks, which often involve irregular traffic patterns.
- Generalization for Complex Patterns:
  > ANN generalizes well when data has hidden structures, which may be missed by traditional machine learning models like RF.
- Real-Time Adaptation:
  > With proper training, ANNs handle real-time detection through rapid inference on incoming data streams.
- Sequential or Context Awareness:
  > Advanced versions like LSTMs can model sequences for time-sensitive attacks.

## **Step 1: Data Preparation**

In [None]:
def _float_tensor(values, as_column=False):
    """Build a float32 tensor from `values`; optionally add a trailing axis of size 1."""
    tensor = torch.tensor(values, dtype=torch.float32)
    return tensor.unsqueeze(1) if as_column else tensor


# Features stay 2-D; labels become (N, 1) columns to line up with the
# network's single output unit.
X_train_tensor = _float_tensor(X_train)
X_test_tensor = _float_tensor(X_test)
y_train_tensor = _float_tensor(y_train.values, as_column=True)
y_test_tensor = _float_tensor(y_test.values, as_column=True)

# Pair features with labels.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Mini-batches of 32; only the training data is shuffled.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32)

## **Step 2: Initialize Model**

In [None]:
class DDoSNet(nn.Module):
    """Small fully connected binary classifier.

    Architecture: input -> 64 -> 32 -> 1, with ReLU activations, dropout
    (0.5 then 0.3) after each hidden layer, and a final sigmoid so the
    output lies in [0, 1].

    Args:
        input_size: number of features in each input row.
    """

    def __init__(self, input_size):
        super().__init__()
        hidden_1, hidden_2 = 64, 32
        layers = [
            nn.Linear(input_size, hidden_1),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(hidden_1, hidden_2),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(hidden_2, 1),
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run `x` through the layer stack and return the sigmoid outputs."""
        probabilities = self.model(x)
        return probabilities


# Build the network with one input unit per selected feature column.
_, input_size = X_train.shape
model = DDoSNet(input_size)

## **Step 3: Train the model**

In [None]:
# Loss and optimizer: binary cross-entropy on the sigmoid outputs, Adam updates.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    model.train()  # enable dropout
    loss_sum = 0.0
    samples_seen = 0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        # BCELoss returns the batch mean, so weight it by the batch size to
        # get a correct average when the last batch is smaller.
        batch_size = batch_X.size(0)
        loss_sum += loss.item() * batch_size
        samples_seen += batch_size

    # Report the mean loss over the whole epoch. Printing only the last
    # mini-batch's loss gives a noisy number that says little about progress.
    epoch_loss = loss_sum / max(samples_seen, 1)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

Epoch 1/20, Loss: 0.0001
Epoch 2/20, Loss: 0.0210
Epoch 3/20, Loss: 0.0000
Epoch 4/20, Loss: 0.0000
Epoch 5/20, Loss: 0.0006
Epoch 6/20, Loss: 0.0000
Epoch 7/20, Loss: 0.0035
Epoch 8/20, Loss: 0.0000
Epoch 9/20, Loss: 0.0001
Epoch 10/20, Loss: 0.0000
Epoch 11/20, Loss: 0.0000
Epoch 12/20, Loss: 0.0036
Epoch 13/20, Loss: 0.0000
Epoch 14/20, Loss: 0.0002
Epoch 15/20, Loss: 0.0005
Epoch 16/20, Loss: 0.0000
Epoch 17/20, Loss: 0.0000
Epoch 18/20, Loss: 0.0000
Epoch 19/20, Loss: 0.0000
Epoch 20/20, Loss: 0.0000


## **Step 4: Model Evaluation**

In [None]:
# Score the held-out set with dropout disabled and gradients switched off.
model.eval()
with torch.no_grad():
    y_pred_list = [model(batch_X).cpu().numpy() for batch_X, _ in test_loader]

# Stack the per-batch outputs and threshold at 0.5 to get hard 0/1 labels.
y_pred = (np.vstack(y_pred_list) > 0.5).astype(int)

# Metrics
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.9995624361886108
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4910
           1       1.00      1.00      1.00    187062

    accuracy                           1.00    191972
   macro avg       0.99      1.00      1.00    191972
weighted avg       1.00      1.00      1.00    191972

[[  4876     34]
 [    50 187012]]


In [None]:
# Sanity check: shape of the test feature tensor (rows, selected features).
X_test_tensor.shape

torch.Size([191972, 10])

## **Step 5: Model Testing**

In [None]:
# Predict
def predict_ddos(test_tensor, threshold=0.5, net=None):
    """Classify one sample or a batch of samples as attack (1) or benign (0).

    Args:
        test_tensor: float tensor of features, either a single row or a
            batch of rows, in the layout the network was trained on.
        threshold: a sample is labelled 1 when the network output is
            strictly greater than this value. Defaults to 0.5.
        net: network to use. Defaults to the notebook-level `model`.

    Returns:
        An int32 tensor of 0/1 labels with the same shape as the network output.

    Note:
        The network is switched to eval mode and left in that mode.
    """
    net = model if net is None else net
    net.eval()  # disable dropout for inference
    with torch.no_grad():  # no gradients needed for inference
        prediction = net(test_tensor)

    return (prediction > threshold).int()

# # Print Results
# if result == 1:
#     print("DDoS Attack Detected!")
# else:
#     print("Normal Traffic")


In [None]:
# Count predicted attack / benign flows on the test set.
# The whole test tensor is scored in one batched call instead of one forward
# pass per row; the counts are the same but it avoids ~190k separate model calls.
test_predictions = predict_ddos(X_test_tensor)
attack = int((test_predictions == 1).sum())
not_attack = test_predictions.numel() - attack

In [None]:
# Number of test flows the model labelled as attacks.
attack

187046