<a href="https://colab.research.google.com/github/H-Gallagher/DDoS-Prediction-using-Machine-Learning/blob/main/CIC_DDoS2019_Combine_Attack_Datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# File Overview

This notebook combines the individual attack-type files of the CIC-DDoS2019 dataset into a single training file and a single test file. The dataset is publicly available at: https://www.unb.ca/cic/datasets/ddos-2019.html

- Approximately 60,000 observations are sampled from each training-day attack dataset to create a new combined training dataset.
- Approximately 40,000 observations are sampled from each test-day attack dataset to create a new combined test dataset.

In [None]:
# Import dependencies.
import gc
import pandas as pd
import numpy as np

# Output paths

In [None]:
# Destination CSV that collects the sampled training-day attack data.
output_path_train = (
    "/content/drive/MyDrive/Colab Notebooks/CT5108_Data_Analytics_Project"
    "/data/CICDDoS2019/DDoS_combined_train.csv"
)

# Destination CSV that collects the sampled test-day attack data.
output_path_test = (
    "/content/drive/MyDrive/Colab Notebooks/CT5108_Data_Analytics_Project"
    "/data/CICDDoS2019/DDoS_combined_test.csv"
)

# Create combined training dataset

## UDP

In [None]:
# Load the training-day DrDoS_UDP capture (stored locally as "DrDoS_UDP_training_day.csv").
udp = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/CT5108_Data_Analytics_Project/data/CICDDoS2019/training/DrDoS_UDP_training_day.csv",
    sep=",",
    header=0,
    skipinitialspace=True,
)

In [None]:
# Derive a reproducible sampling seed: fix NumPy's global RNG, then draw one integer in [1, 100).
np.random.seed(17)
# Index the 1-element array before int(): calling int() on a 1-D array is deprecated since NumPy 1.25.
seed = int(np.random.randint(1, 100, size=1)[0])

# Stratified sample: draw 2% of the rows of every Label group.
udp2 = udp.groupby('Label', group_keys=False).apply(lambda grp: grp.sample(frac=0.02, random_state=seed))

# Release memory: https://stackoverflow.com/questions/39100971/how-do-i-release-memory-used-by-a-pandas-dataframe
del udp
gc.collect()

179

In [None]:
# Print the (rows, columns) shape of the stratified UDP sample.
print(udp2.shape)

In [None]:
# Show how many sampled rows fall under each Label.
udp2["Label"].value_counts()

DrDoS_UDP    62693
BENIGN          43
Name: Label, dtype: int64

In [None]:
# Start the combined training file: overwrite any existing file and emit the header row.
udp2.to_csv(path_or_buf=output_path_train, index=False, mode="w")

## DrDoS_DNS

In [None]:
# Load the training-day DrDoS_DNS capture, capped at the first 1,000,000 rows
# (original note: total is 1,048,575, too large to load into memory).
DrDoS_DNS = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/CT5108_Data_Analytics_Project/data/CICDDoS2019/training/DrDoS_DNS.csv",
    sep=",",
    header=0,
    skipinitialspace=True,
    nrows=1000000,
)

In [None]:
# Derive a reproducible sampling seed: fix NumPy's global RNG, then draw one integer in [1, 100).
np.random.seed(33)
# Index the 1-element array before int(): calling int() on a 1-D array is deprecated since NumPy 1.25.
seed = int(np.random.randint(1, 100, size=1)[0])

# Stratified sample: draw 6% of the rows of every Label group.
DrDoS_DNS2 = DrDoS_DNS.groupby('Label', group_keys=False).apply(lambda grp: grp.sample(frac=0.06, random_state=seed))

# Release the full frame now that only the sample is needed.
del DrDoS_DNS
gc.collect()

88

In [None]:
# Print the (rows, columns) shape of the stratified DNS sample.
print(DrDoS_DNS2.shape)

(60000, 88)
Benign labels: 120
UDP labels: 59880


In [None]:
# Show how many sampled rows fall under each Label.
DrDoS_DNS2["Label"].value_counts()

In [None]:
# Append the DNS sample to the combined training file (no header: it is already in the file).
DrDoS_DNS2.to_csv(path_or_buf=output_path_train, header=False, index=False, mode="a")

## DrDoS_LDAP

In [None]:
# Load the training-day DrDoS_LDAP capture.
DrDoS_LDAP = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/CT5108_Data_Analytics_Project/data/CICDDoS2019/training/DrDoS_LDAP.csv",
    sep=",",
    header=0,
    skipinitialspace=True,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Derive a reproducible sampling seed: fix NumPy's global RNG, then draw one integer in [1, 100).
np.random.seed(71)
# Index the 1-element array before int(): calling int() on a 1-D array is deprecated since NumPy 1.25.
seed = int(np.random.randint(1, 100, size=1)[0])

# Stratified sample: draw 3% of the rows of every Label group.
DrDoS_LDAP2 = DrDoS_LDAP.groupby('Label', group_keys=False).apply(lambda grp: grp.sample(frac=0.03, random_state=seed))

# Release the full frame now that only the sample is needed.
del DrDoS_LDAP
gc.collect()

44

In [None]:
# Print the (rows, columns) shape of the stratified LDAP sample.
print(DrDoS_LDAP2.shape)

(65446, 88)
Benign labels: 48
UDP labels: 65398


In [None]:
# Show how many sampled rows fall under each Label.
DrDoS_LDAP2["Label"].value_counts()

DrDoS_LDAP    65398
BENIGN           48
Name: Label, dtype: int64

In [None]:
# Append the LDAP sample to the combined training file (no header: it is already in the file).
DrDoS_LDAP2.to_csv(path_or_buf=output_path_train, header=False, index=False, mode="a")

## MSSQL

In [None]:
# Load the training-day DrDoS_MSSQL capture.
MSSQL = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/CT5108_Data_Analytics_Project/data/CICDDoS2019/training/DrDoS_MSSQL.csv",
    sep=",",
    header=0,
    skipinitialspace=True,
)

In [None]:
# Derive a reproducible sampling seed: fix NumPy's global RNG, then draw one integer in [1, 100).
np.random.seed(53)
# Index the 1-element array before int(): calling int() on a 1-D array is deprecated since NumPy 1.25.
seed = int(np.random.randint(1, 100, size=1)[0])

# Stratified sample: draw 1.3% of the rows of every Label group.
MSSQL2 = MSSQL.groupby('Label', group_keys=False).apply(lambda grp: grp.sample(frac=0.013, random_state=seed))

# Release the full frame now that only the sample is needed.
del MSSQL
gc.collect()

44

In [None]:
# Print the (rows, columns) shape of the stratified MSSQL sample.
print(MSSQL2.shape)

(58818, 88)
Benign labels: 26
UDP labels: 58792


In [None]:
# Show how many sampled rows fall under each Label.
MSSQL2["Label"].value_counts()

DrDoS_MSSQL    58792
BENIGN            26
Name: Label, dtype: int64

In [None]:
# Append the MSSQL sample to the combined training file (no header: it is already in the file).
MSSQL2.to_csv(path_or_buf=output_path_train, header=False, index=False, mode="a")

## DrDoS_NetBIOS

In [None]:
# Load the training-day DrDoS_NetBIOS capture.
NetBIOS = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/CT5108_Data_Analytics_Project/data/CICDDoS2019/training/DrDoS_NetBIOS.csv",
    sep=",",
    header=0,
    skipinitialspace=True,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Derive a reproducible sampling seed: fix NumPy's global RNG, then draw one integer in [1, 100).
np.random.seed(37)
# Index the 1-element array before int(): calling int() on a 1-D array is deprecated since NumPy 1.25.
seed = int(np.random.randint(1, 100, size=1)[0])

# Stratified sample: draw 1.55% of the rows of every Label group.
NetBIOS2 = NetBIOS.groupby('Label', group_keys=False).apply(lambda grp: grp.sample(frac=0.0155, random_state=seed))

# Release the full frame now that only the sample is needed.
del NetBIOS
gc.collect()

44

In [None]:
# Print the (rows, columns) shape of the stratified NetBIOS sample.
print(NetBIOS2.shape)

(63472, 88)
Benign labels: 26
UDP labels: 63446


In [None]:
# Show how many sampled rows fall under each Label.
NetBIOS2["Label"].value_counts()

DrDoS_NetBIOS    63446
BENIGN              26
Name: Label, dtype: int64

In [None]:
# Append the NetBIOS sample to the combined training file (no header: it is already in the file).
NetBIOS2.to_csv(path_or_buf=output_path_train, header=False, index=False, mode="a")

## DrDoS_NTP

In [None]:
# Load the training-day DrDoS_NTP capture.
DrDoS_NTP = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/CT5108_Data_Analytics_Project/data/CICDDoS2019/training/DrDoS_NTP.csv",
    sep=",",
    header=0,
    skipinitialspace=True,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Derive a reproducible sampling seed: fix NumPy's global RNG, then draw one integer in [1, 100).
np.random.seed(94)
# Index the 1-element array before int(): calling int() on a 1-D array is deprecated since NumPy 1.25.
seed = int(np.random.randint(1, 100, size=1)[0])

# Stratified sample: draw 5% of the rows of every Label group.
DrDoS_NTP2 = DrDoS_NTP.groupby('Label', group_keys=False).apply(lambda grp: grp.sample(frac=0.05, random_state=seed))

# Release the full frame now that only the sample is needed.
del DrDoS_NTP
gc.collect()

In [None]:
# Print the (rows, columns) shape of the stratified NTP sample.
print(DrDoS_NTP2.shape)

(60850, 88)
Benign labels: 718
UDP labels: 60132


In [None]:
# Show how many sampled rows fall under each Label.
DrDoS_NTP2["Label"].value_counts()

DrDoS_NTP    60132
BENIGN         718
Name: Label, dtype: int64

In [None]:
# Append the NTP sample to the combined training file (no header: it is already in the file).
DrDoS_NTP2.to_csv(path_or_buf=output_path_train, header=False, index=False, mode="a")

## DrDoS_SNMP

In [None]:
# Load the training-day DrDoS_SNMP capture, capped at the first 1,050,000 rows.
DrDoS_SNMP = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/CT5108_Data_Analytics_Project/data/CICDDoS2019/training/DrDoS_SNMP.csv",
    sep=",",
    header=0,
    skipinitialspace=True,
    nrows=1050000,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Derive a reproducible sampling seed: fix NumPy's global RNG, then draw one integer in [1, 100).
np.random.seed(11)
# Index the 1-element array before int(): calling int() on a 1-D array is deprecated since NumPy 1.25.
seed = int(np.random.randint(1, 100, size=1)[0])

# Stratified sample: draw 6% of the rows of every Label group.
DrDoS_SNMP2 = DrDoS_SNMP.groupby('Label', group_keys=False).apply(lambda grp: grp.sample(frac=0.06, random_state=seed))

# Release the full SNMP frame. The cell previously deleted DrDoS_NTP, which no longer
# exists at this point (NameError) and left the SNMP frame in memory.
del DrDoS_SNMP
gc.collect()

In [None]:
# Print the (rows, columns) shape of the stratified SNMP sample.
print(DrDoS_SNMP2.shape)

(63000, 88)
Benign labels: 49
UDP labels: 62951


In [None]:
# Show how many sampled rows fall under each Label.
DrDoS_SNMP2["Label"].value_counts()

DrDoS_SNMP    62951
BENIGN           49
Name: Label, dtype: int64

In [None]:
# Append the SNMP sample to the combined training file (no header: it is already in the file).
DrDoS_SNMP2.to_csv(path_or_buf=output_path_train, header=False, index=False, mode="a")

## DrDoS_SSDP

In [None]:
# Load the training-day DrDoS_SSDP capture.
DrDoS_SSDP = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/CT5108_Data_Analytics_Project/data/CICDDoS2019/training/DrDoS_SSDP.csv",
    sep=",",
    header=0,
    skipinitialspace=True,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Derive a reproducible sampling seed: fix NumPy's global RNG, then draw one integer in [1, 100).
np.random.seed(11)
# Index the 1-element array before int(): calling int() on a 1-D array is deprecated since NumPy 1.25.
seed = int(np.random.randint(1, 100, size=1)[0])

# Stratified sample: draw 2.3% of the rows of every Label group.
DrDoS_SSDP2 = DrDoS_SSDP.groupby('Label', group_keys=False).apply(lambda grp: grp.sample(frac=0.023, random_state=seed))

# The stale `del DrDoS_NTP` was removed: that frame was already deleted in the NTP
# section, so the statement raised NameError. The full SSDP frame is released by the
# `del DrDoS_SSDP` cell that follows this one.
gc.collect()

In [None]:
# Release the full SSDP frame now that the stratified sample has been drawn from it.
del DrDoS_SSDP
gc.collect()

326

In [None]:
# Print the (rows, columns) shape of the stratified SSDP sample.
print(DrDoS_SSDP2.shape)

(60062, 88)
Benign labels: 18
UDP labels: 60044


In [None]:
# Show how many sampled rows fall under each Label.
DrDoS_SSDP2["Label"].value_counts()

DrDoS_SSDP    60044
BENIGN           18
Name: Label, dtype: int64

In [None]:
# Append the SSDP sample to the combined training file (no header: it is already in the file).
DrDoS_SSDP2.to_csv(path_or_buf=output_path_train, header=False, index=False, mode="a")

## Syn

In [None]:
# Load the training-day Syn capture.
Syn = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/CT5108_Data_Analytics_Project/data/CICDDoS2019/training/Syn.csv",
    sep=",",
    header=0,
    skipinitialspace=True,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Derive a reproducible sampling seed: fix NumPy's global RNG, then draw one integer in [1, 100).
np.random.seed(11)
# Index the 1-element array before int(): calling int() on a 1-D array is deprecated since NumPy 1.25.
seed = int(np.random.randint(1, 100, size=1)[0])

# Stratified sample: draw 4% of the rows of every Label group.
Syn2 = Syn.groupby('Label', group_keys=False).apply(lambda grp: grp.sample(frac=0.04, random_state=seed))

# Release the full frame now that only the sample is needed.
del Syn
gc.collect()

276

In [None]:
# Print the (rows, columns) shape of the stratified Syn sample.
print(Syn2.shape)

(63308, 88)
Benign labels: 16
UDP labels: 63292


In [None]:
# Show how many sampled rows fall under each Label.
Syn2["Label"].value_counts()

Syn       63292
BENIGN       16
Name: Label, dtype: int64

In [None]:
# Append the Syn sample to the combined training file (no header: it is already in the file).
Syn2.to_csv(path_or_buf=output_path_train, header=False, index=False, mode="a")

## TFTP

In [None]:
# Load the training-day TFTP capture, capped at the first 1,000,000 rows.
tftp = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/CT5108_Data_Analytics_Project/data/CICDDoS2019/training/TFTP.csv",
    sep=",",
    header=0,
    skipinitialspace=True,
    nrows=1000000,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Derive a reproducible sampling seed: fix NumPy's global RNG, then draw one integer in [1, 100).
np.random.seed(87)
# Index the 1-element array before int(): calling int() on a 1-D array is deprecated since NumPy 1.25.
seed = int(np.random.randint(1, 100, size=1)[0])

# Stratified sample: draw 6% of the rows of every Label group.
tftp2 = tftp.groupby('Label', group_keys=False).apply(lambda grp: grp.sample(frac=0.06, random_state=seed))

# Release the full frame now that only the sample is needed.
del tftp
gc.collect()

In [None]:
# Print the (rows, columns) shape of the stratified TFTP sample.
print(tftp2.shape)

(60000, 88)
Benign labels: 70
UDP labels: 59930


In [None]:
# Show how many sampled rows fall under each Label.
tftp2["Label"].value_counts()

TFTP      59930
BENIGN       70
Name: Label, dtype: int64

In [None]:
# Append the TFTP sample to the combined training file (no header: it is already in the file).
tftp2.to_csv(path_or_buf=output_path_train, header=False, index=False, mode="a")

## UDPLag

In [None]:
# Load the training-day UDPLag capture.
UDPLag = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/CT5108_Data_Analytics_Project/data/CICDDoS2019/training/UDPLag.csv",
    sep=",",
    header=0,
    skipinitialspace=True,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Derive a reproducible sampling seed: fix NumPy's global RNG, then draw one integer in [1, 100).
np.random.seed(87)
# Index the 1-element array before int(): calling int() on a 1-D array is deprecated since NumPy 1.25.
seed = int(np.random.randint(1, 100, size=1)[0])

# Stratified sample: draw 16% of the rows of every Label group.
# The full UDPLag frame is kept because a later cell still inspects it.
UDPLag2 = UDPLag.groupby('Label', group_keys=False).apply(lambda grp: grp.sample(frac=0.16, random_state=seed))

In [None]:
# Print the (rows, columns) shape of the stratified UDPLag sample.
print(UDPLag2.shape)

(59297, 88)
Benign labels: 593
UDP Lag labels: 58634
WebDDoS labels: 70


In [None]:
# Show the per-Label row counts of the full (unsampled) UDPLag frame.
UDPLag["Label"].value_counts()

UDP-lag    366461
BENIGN       3705
WebDDoS       439
Name: Label, dtype: int64

In [None]:
# Append the UDPLag sample to the combined training file (no header: it is already in the file).
UDPLag2.to_csv(path_or_buf=output_path_train, header=False, index=False, mode="a")

# Create combined test dataset

## LDAP

In [None]:
# Load the test-day LDAP capture.
LDAP_test = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/CT5108_Data_Analytics_Project/data/CICDDoS2019/testing/LDAP.csv",
    sep=",",
    header=0,
    skipinitialspace=True,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Derive a reproducible sampling seed: fix NumPy's global RNG, then draw one integer in [1, 100).
np.random.seed(87)
# Index the 1-element array before int(): calling int() on a 1-D array is deprecated since NumPy 1.25.
seed = int(np.random.randint(1, 100, size=1)[0])

# Stratified sample: draw 1.9% of the rows of every Label group.
LDAP_test2 = LDAP_test.groupby('Label', group_keys=False).apply(lambda grp: grp.sample(frac=0.019, random_state=seed))

In [None]:
# Print the (rows, columns) shape of the stratified test-day LDAP sample.
print(LDAP_test2.shape)

(40151, 88)
Benign labels: 97
LDAP labels: 36199
NetBIOS labels: 3855


In [None]:
# Show how many sampled rows fall under each Label.
LDAP_test2["Label"].value_counts()

LDAP       36199
NetBIOS     3855
BENIGN        97
Name: Label, dtype: int64

In [None]:
# Start the combined test file: overwrite any existing file and emit the header row.
# `output_path` is never defined in this notebook; the test destination is `output_path_test`.
LDAP_test2.to_csv(output_path_test, mode='w', index=False)

## MSSQL

In [None]:
# Load the test-day MSSQL capture, capped at the first 1,100,000 rows.
MSSQL = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/CT5108_Data_Analytics_Project/data/CICDDoS2019/testing/MSSQL.csv",
    sep=",",
    header=0,
    skipinitialspace=True,
    nrows=1100000,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Derive a reproducible sampling seed: fix NumPy's global RNG, then draw one integer in [1, 100).
np.random.seed(3)
# Index the 1-element array before int(): calling int() on a 1-D array is deprecated since NumPy 1.25.
seed = int(np.random.randint(1, 100, size=1)[0])

# Stratified sample: draw 3.5% of the rows of every Label group.
MSSQL2 = MSSQL.groupby('Label', group_keys=False).apply(lambda grp: grp.sample(frac=0.035, random_state=seed))

# Release the full frame now that only the sample is needed.
del MSSQL
gc.collect()

211

In [None]:
# Print the (rows, columns) shape of the stratified test-day MSSQL sample.
print(MSSQL2.shape)

(38500, 88)
Benign labels: 37
LDAP labels: 348
MSSQL labels: 38115


In [None]:
# Show how many sampled rows fall under each Label.
MSSQL2["Label"].value_counts()

MSSQL     38115
LDAP        348
BENIGN       37
Name: Label, dtype: int64

In [None]:
# Append the MSSQL sample to the combined test file (no header: it is already in the file).
# `output_path` is never defined in this notebook; the test destination is `output_path_test`.
MSSQL2.to_csv(output_path_test, mode='a', index=False, header=False)

## NetBIOS

In [None]:
# Load the test-day NetBIOS capture.
NetBIOS = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/CT5108_Data_Analytics_Project/data/CICDDoS2019/testing/NetBIOS.csv",
    sep=",",
    header=0,
    skipinitialspace=True,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Derive a reproducible sampling seed: fix NumPy's global RNG, then draw one integer in [1, 100).
np.random.seed(3)
# Index the 1-element array before int(): calling int() on a 1-D array is deprecated since NumPy 1.25.
seed = int(np.random.randint(1, 100, size=1)[0])

# Stratified sample: draw 1.2% of the rows of every Label group.
NetBIOS2 = NetBIOS.groupby('Label', group_keys=False).apply(lambda grp: grp.sample(frac=0.012, random_state=seed))

# Release the full frame now that only the sample is needed.
del NetBIOS
gc.collect()

295

In [None]:
# Print the (rows, columns) shape of the stratified test-day NetBIOS sample.
print(NetBIOS2.shape)

(41471, 88)
Benign labels: 16
NetBIOS labels: 41455


In [None]:
# Show how many sampled rows fall under each Label.
NetBIOS2["Label"].value_counts()

NetBIOS    41455
BENIGN        16
Name: Label, dtype: int64

In [None]:
# Append the NetBIOS sample to the combined test file (no header: it is already in the file).
# `output_path` is never defined in this notebook; the test destination is `output_path_test`.
NetBIOS2.to_csv(output_path_test, mode='a', index=False, header=False)

## Portmap

In [None]:
# Load the test-day Portmap capture.
Portmap = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/CT5108_Data_Analytics_Project/data/CICDDoS2019/testing/Portmap.csv",
    sep=",",
    header=0,
    skipinitialspace=True,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Derive a reproducible sampling seed: fix NumPy's global RNG, then draw one integer in [1, 100).
np.random.seed(88)
# Index the 1-element array before int(): calling int() on a 1-D array is deprecated since NumPy 1.25.
seed = int(np.random.randint(1, 100, size=1)[0])

# Stratified sample: draw 20% of the rows of every Label group.
Portmap2 = Portmap.groupby('Label', group_keys=False).apply(lambda grp: grp.sample(frac=0.2, random_state=seed))

In [None]:
# Print the (rows, columns) shape of the stratified test-day Portmap sample.
print(Portmap2.shape)

(38339, 88)
Benign labels: 947
NetBIOS labels: 37392


In [None]:
# Show how many sampled rows fall under each Label.
Portmap2["Label"].value_counts()

Portmap    37392
BENIGN       947
Name: Label, dtype: int64

In [None]:
# Append the Portmap sample to the combined test file (no header: it is already in the file).
# `output_path` is never defined in this notebook; the test destination is `output_path_test`.
Portmap2.to_csv(output_path_test, mode='a', index=False, header=False)

## Syn2

In [None]:
# Load the test-day Syn2 capture (note: here `Syn2` is the full frame, not a sample).
Syn2 = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/CT5108_Data_Analytics_Project/data/CICDDoS2019/testing/Syn2.csv",
    sep=",",
    header=0,
    skipinitialspace=True,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Derive a reproducible sampling seed: fix NumPy's global RNG, then draw one integer in [1, 100).
np.random.seed(67)
# Index the 1-element array before int(): calling int() on a 1-D array is deprecated since NumPy 1.25.
seed = int(np.random.randint(1, 100, size=1)[0])

# Stratified sample: draw 0.9% of the rows of every Label group.
Syn22 = Syn2.groupby('Label', group_keys=False).apply(lambda grp: grp.sample(frac=0.009, random_state=seed))

# Release the full frame now that only the sample is needed.
del Syn2
gc.collect()

575

In [None]:
# Print the (rows, columns) shape of the stratified test-day Syn sample.
print(Syn22.shape)

(38885, 88)
Benign labels: 322
NetBIOS labels: 38563


In [None]:
# Show how many sampled rows fall under each Label.
Syn22["Label"].value_counts()

Syn       38563
BENIGN      322
Name: Label, dtype: int64

In [None]:
# Append the Syn sample to the combined test file (no header: it is already in the file).
# `output_path` is never defined in this notebook; the test destination is `output_path_test`.
Syn22.to_csv(output_path_test, mode='a', index=False, header=False)

## UDP

In [None]:
# Load the test-day UDP capture (stored locally as "UDP_testing_day.csv").
udp = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/CT5108_Data_Analytics_Project/data/CICDDoS2019/testing/UDP_testing_day.csv",
    sep=",",
    header=0,
    skipinitialspace=True,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Derive a reproducible sampling seed: fix NumPy's global RNG, then draw one integer in [1, 100).
np.random.seed(67)
# Index the 1-element array before int(): calling int() on a 1-D array is deprecated since NumPy 1.25.
seed = int(np.random.randint(1, 100, size=1)[0])

# Stratified sample: draw 1.1% of the rows of every Label group.
udp2 = udp.groupby('Label', group_keys=False).apply(lambda grp: grp.sample(frac=0.011, random_state=seed))

In [None]:
# Print the (rows, columns) shape of the stratified test-day UDP sample.
print(udp2.shape)

(41603, 88)
Benign labels: 34
UDP labels: 41301
MSSQL labels: 268


In [None]:
# Show how many sampled rows fall under each Label.
udp2["Label"].value_counts()

UDP       41301
MSSQL       268
BENIGN       34
Name: Label, dtype: int64

In [None]:
# Append the UDP sample to the combined test file (no header: it is already in the file).
# `output_path` is never defined in this notebook; the test destination is `output_path_test`.
udp2.to_csv(output_path_test, mode='a', index=False, header=False)

## UDP Lag

In [None]:
# Load the test-day UDPLag capture.
UDPLag = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/CT5108_Data_Analytics_Project/data/CICDDoS2019/testing/UDPLag.csv",
    sep=",",
    header=0,
    skipinitialspace=True,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Derive a reproducible sampling seed: fix NumPy's global RNG, then draw one integer in [1, 100).
np.random.seed(55)
# Index the 1-element array before int(): calling int() on a 1-D array is deprecated since NumPy 1.25.
seed = int(np.random.randint(1, 100, size=1)[0])

# Stratified sample: draw 5.5% of the rows of every Label group.
# The full UDPLag frame is kept because a later cell still inspects it.
UDPLag2 = UDPLag.groupby('Label', group_keys=False).apply(lambda grp: grp.sample(frac=0.055, random_state=seed))

In [None]:
# Print the (rows, columns) shape of the stratified test-day UDPLag sample.
print(UDPLag2.shape)

(39884, 88)
Benign labels: 224
UDPLag labels: 103
UDP labels: 6186
Syn labels: 33371


In [None]:
# Show the per-Label row counts of the full (unsampled) test-day UDPLag frame.
UDPLag["Label"].value_counts()

Syn       606749
UDP       112475
BENIGN      4068
UDPLag      1873
Name: Label, dtype: int64

In [None]:
# Append the UDPLag sample to the combined test file (no header: it is already in the file).
# `output_path` is never defined in this notebook; the test destination is `output_path_test`.
UDPLag2.to_csv(output_path_test, mode='a', index=False, header=False)

# Load new datasets

In [None]:
# Reload the combined training CSV to inspect it. Reading via `output_path_train`
# (the same variable the samples were written with) keeps the read and write paths in sync.
train = pd.read_csv(output_path_train, header=0, sep=",", skipinitialspace=True)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Display the (rows, columns) shape of the combined training frame.
train.shape

(676989, 88)

In [None]:
# Show the per-Label row counts of the combined training frame.
train["Label"].value_counts()

DrDoS_LDAP       65398
DrDoS_NetBIOS    63446
Syn              63292
DrDoS_SNMP       62951
DrDoS_UDP        62693
DrDoS_NTP        60132
DrDoS_SSDP       60044
TFTP             59930
DrDoS_DNS        59880
DrDoS_MSSQL      58792
UDP-lag          58634
BENIGN            1727
WebDDoS             70
Name: Label, dtype: int64

In [None]:
# Reload the combined test CSV to inspect it. Reading via `output_path_test`
# avoids a second hard-coded copy of the path that could drift from the defined one.
test = pd.read_csv(output_path_test, header=0, sep=",", skipinitialspace=True)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Display the (rows, columns) shape of the combined test frame.
test.shape

(278833, 88)

In [None]:
# Show the per-Label row counts of the combined test frame.
test["Label"].value_counts()

Syn        71934
UDP        47487
NetBIOS    45310
MSSQL      38383
Portmap    37392
LDAP       36547
BENIGN      1677
UDPLag       103
Name: Label, dtype: int64