In [2]:
import pandas as pd

In [4]:
# Read the three raw CSV files in a fixed order; each result keeps its own
# name (df1, df2, df3) because the concatenation cell refers to them.
df1, df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/metasploitable-2.csv',
        './data/normal_data.csv',
        './data/OVS.csv',
    )
)

In [5]:
# Stack the three source frames row-wise. ignore_index=True discards the
# per-file row numbers so the combined frame gets one fresh running index.
source_frames = [df1, df2, df3]
df = pd.concat(source_frames, ignore_index=True)

In [6]:
# Sanity check of the combined frame: its overall shape, the distinct label
# strings, and the number of rows carrying each label.
label_column = df['Label']
print("Shape of combined data:", df.shape)
print("\nUnique labels (attack types):", label_column.unique())
print("\nCount of each label:", label_column.value_counts(), sep="\n")

Shape of combined data: (343889, 84)

Unique labels (attack types): ['U2R' 'BFA' 'DDoS' 'DoS' 'Probe' 'Normal' 'DDoS ' 'Web-Attack' 'BOTNET']

Count of each label:
Label
Probe         98129
DDoS          73529
Normal        68424
DoS           53616
DDoS          48413
BFA            1405
Web-Attack      192
BOTNET          164
U2R              17
Name: count, dtype: int64


In [7]:
# Trim leading/trailing whitespace from every label so spelling variants of
# the same class (the raw data contains both 'DDoS' and 'DDoS ') are merged.
stripped_labels = df['Label'].str.strip()
df['Label'] = stripped_labels

In [8]:
# Re-check the labels after whitespace stripping: distinct values first,
# then the row count per label.
cleaned_label_column = df['Label']
print("Unique labels after cleaning:", cleaned_label_column.unique())
print("\nUpdated count of each label:", cleaned_label_column.value_counts(), sep="\n")

Unique labels after cleaning: ['U2R' 'BFA' 'DDoS' 'DoS' 'Probe' 'Normal' 'Web-Attack' 'BOTNET']

Updated count of each label:
Label
DDoS          121942
Probe          98129
Normal         68424
DoS            53616
BFA             1405
Web-Attack       192
BOTNET           164
U2R               17
Name: count, dtype: int64


In [9]:
# Remove the flow identifier, both endpoint addresses and the timestamp
# column from the combined frame.
#
# The result is rebound to `df` instead of using inplace=True, and
# errors="ignore" is passed, so this cell can be executed more than once:
# with the in-place form a second run raised KeyError because the four
# columns had already been removed.
df = df.drop(columns=['Flow ID', 'Src IP', 'Dst IP', 'Timestamp'], errors="ignore")


In [10]:
# Count missing values per column (infinite values are dealt with separately
# in a later cell).
missing_per_column = df.isna().sum()
print(missing_per_column)


Src Port         0
Dst Port         0
Protocol         0
Flow Duration    0
Tot Fwd Pkts     0
                ..
Idle Mean        0
Idle Std         0
Idle Max         0
Idle Min         0
Label            0
Length: 80, dtype: int64


In [13]:
import numpy as np

# Convert +inf / -inf to NaN first, so that the single dropna() call removes
# every row containing either an infinite or a missing value.
df = df.replace([np.inf, -np.inf], np.nan).dropna()


In [15]:
import os

In [16]:
# Create the export directory (relative to the working directory) if needed;
# exist_ok=True makes this a no-op when the directory is already present.
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)

In [18]:
# Keep every row whose label is not U2R. The label counts printed earlier
# show only 17 U2R rows; presumably that is why the class is excluded —
# TODO confirm the rationale.
is_u2r = df["Label"] == "U2R"
filtered_df = df[~is_u2r]

In [19]:
# Benign traffic only; this same subset is appended to each of the three
# per-attack datasets built in the following cells.
normal_mask = filtered_df["Label"].eq("Normal")
normal_data = filtered_df.loc[normal_mask]

In [20]:
# DDoS dataset: all DDoS rows, followed by the BFA rows (BFA is grouped into
# this dataset; its label text is left unchanged), followed by the Normal rows.
ddos_parts = [
    filtered_df[filtered_df["Label"] == attack_label]
    for attack_label in ("DDoS", "BFA")
]
ddos_parts.append(normal_data)
ddos_df = pd.concat(ddos_parts, ignore_index=True)

In [21]:
# Probe dataset: all Probe rows, followed by the Web-Attack rows (Web-Attack
# is grouped into this dataset; its label text is left unchanged), followed
# by the Normal rows.
probe_parts = [
    filtered_df[filtered_df["Label"] == attack_label]
    for attack_label in ("Probe", "Web-Attack")
]
probe_parts.append(normal_data)
probe_df = pd.concat(probe_parts, ignore_index=True)

In [22]:
# DoS dataset: all DoS rows, followed by the BOTNET rows (BOTNET is grouped
# into this dataset; its label text is left unchanged), followed by the
# Normal rows.
dos_parts = [
    filtered_df[filtered_df["Label"] == attack_label]
    for attack_label in ("DoS", "BOTNET")
]
dos_parts.append(normal_data)
dos_df = pd.concat(dos_parts, ignore_index=True)

In [23]:
# Write each per-attack dataset to its CSV file, in the same order as before;
# index=False keeps the pandas row index out of the files.
for csv_path, dataset in (
    ("output/ddos_data.csv", ddos_df),
    ("output/probe_data.csv", probe_df),
    ("output/dos_data.csv", dos_df),
):
    dataset.to_csv(csv_path, index=False)

In [24]:
# Confirmation message listing the three exported files, emitted as one
# newline-joined block of text.
saved_report = "\n".join([
    "Files saved in /output:",
    " - ddos_data.csv",
    " - probe_data.csv",
    " - dos_data.csv",
])
print(saved_report)

Files saved in /output:
 - ddos_data.csv
 - probe_data.csv
 - dos_data.csv


In [25]:
# Display name -> path of each exported CSV, in the order in which the
# verification loop should report them.
files = dict([
    ("DDoS Dataset", "output/ddos_data.csv"),
    ("Probe Dataset", "output/probe_data.csv"),
    ("DoS Dataset", "output/dos_data.csv"),
])

In [26]:
# Re-read every exported CSV and print its label distribution to verify what
# was actually written to disk.
#
# The frame read back is bound to its own name rather than `df`: reusing `df`
# here replaced the cleaned, combined frame with the last export, so
# re-running any earlier cell that reads `df` would silently operate on the
# wrong data.
for name, path in files.items():
    saved_df = pd.read_csv(path)
    print(f"Label counts in {name}:")
    print(saved_df['Label'].value_counts())
    # Visual separator between the per-file reports.
    print("\n" + "-"*40 + "\n")

Label counts in DDoS Dataset:
Label
DDoS      121942
Normal     68424
BFA         1405
Name: count, dtype: int64

----------------------------------------

Label counts in Probe Dataset:
Label
Probe         98129
Normal        68424
Web-Attack      192
Name: count, dtype: int64

----------------------------------------

Label counts in DoS Dataset:
Label
Normal    68424
DoS       53616
BOTNET      164
Name: count, dtype: int64

----------------------------------------

