# 01 - Explore the CICIoT2023 dataset

Imports

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

In [2]:
# Folder holding the CICIoT2023 CSV files, given relative to the working directory.
# The trailing '/' lets a file name be appended by plain string concatenation.
DATASET_DIRECTORY = 'datasets/'

Locate the dataset files and split them into training and test sets

In [7]:
# Collect the name of every CSV file in the dataset directory, in sorted order
df_sets = sorted(name for name in os.listdir(DATASET_DIRECTORY) if name.endswith('.csv'))

# Split the list of files (not the rows) into training and test sets:
# the first 80% of the sorted files are for training, the remaining 20% for testing
split_index = int(len(df_sets) * .8)
training_sets = df_sets[:split_index]
test_sets = df_sets[split_index:]

Examine file counts and sizes

In [8]:
# Report how many files ended up in each set
print(f'Training sets: {len(training_sets)}')
print(f'Test sets: {len(test_sets)}')

Training sets: 125
Test sets: 32


In [9]:
# Show the min, max and average file size. Output size in MB (1 MB = 1000000 bytes)
# Each file is stat'ed once and the sizes are reused for all three statistics;
# os.path.join builds the path whether or not DATASET_DIRECTORY ends with a separator.
file_sizes = [os.path.getsize(os.path.join(DATASET_DIRECTORY, k)) for k in df_sets]
print('Min file size: {} MB'.format(round(min(file_sizes) / 1000000, 2)))
print('Max file size: {} MB'.format(round(max(file_sizes) / 1000000, 2)))
print('Avg file size: {} MB'.format(round(np.mean(file_sizes) / 1000000, 2)))

Min file size: 62.41 MB
Max file size: 132.99 MB
Avg file size: 82.28 MB


In [10]:
# Load the last CSV file of the training sets into a dataframe
df = pd.read_csv(DATASET_DIRECTORY + training_sets[-1])

# Report the dimensions of the dataframe
row_count, column_count = df.shape
print(f'Rows: {row_count}')
print(f'Columns: {column_count}')

# Display the first 5 rows of the dataframe
df.head()

Rows: 444704
Columns: 47


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,0.025926,56.06,6.0,63.82,3.52848,3.52848,0.0,0.0,1.0,0.0,...,0.496995,54.2,82981080.0,9.5,10.405144,0.704371,2.49895,0.1,141.55,DoS-SYN_Flood
1,0.0,54.0,6.0,64.0,26.996367,26.996367,0.0,0.0,1.0,0.0,...,0.0,54.0,83089820.0,9.5,10.392305,0.0,0.0,0.0,141.55,DDoS-SYN_Flood
2,0.0,54.0,6.0,64.0,2.986424,2.986424,0.0,1.0,0.0,1.0,...,0.0,54.0,83343830.0,9.5,10.392305,0.0,0.0,0.0,141.55,DDoS-RSTFINFlood
3,0.0,53.46,5.94,63.36,66.531372,66.531372,0.0,0.0,1.0,0.0,...,0.13185,54.06,83094040.0,9.5,10.39565,0.186918,0.195153,0.09,141.55,DDoS-SYN_Flood
4,0.0,81.0,6.0,64.0,10.473893,10.473893,0.0,0.0,0.0,0.0,...,0.0,54.0,82925830.0,9.5,10.392305,0.0,0.0,0.0,141.55,DoS-TCP_Flood


Columns used in the paper

In [3]:
# Feature columns, listed in a fixed order.
# 'Magnitue' is spelled exactly as in the CSV header, so it must not be "corrected" here.
X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
    'Rate', 'Srate', 'Drate',
    'fin_flag_number', 'syn_flag_number', 'rst_flag_number', 'psh_flag_number',
    'ack_flag_number', 'ece_flag_number', 'cwr_flag_number',
    'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count',
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC',
    'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC',
    'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size',
    'IAT', 'Number', 'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight',
]

# Target column
y_column = 'label'

In [4]:
# Report how many feature columns there are and which column is the target
print(f'X_columns: {len(X_columns)}')
print(f'y_column: {y_column}')

X_columns: 46
y_column: label


In [5]:
# Collect the distinct values of the y_column once and keep them as attack_labels
attack_labels = df[y_column].unique()

# Show how many distinct values there are
print(f'Unique values in y_column: {len(attack_labels)}')

# Print them out
print(attack_labels)


NameError: name 'df' is not defined

In [14]:
# Count the number of rows for each attack label.
# The counts are computed in a single pass with value_counts() rather than by
# filtering the whole dataframe once per label; iterating over attack_labels
# keeps the printed order unchanged.
label_counts = df[y_column].value_counts()
for label in attack_labels:
    # .get(..., 0) keeps the "0 rows" result for a label that is absent from the column
    print('{}: {}'.format(label, label_counts.get(label, 0)))
    

DoS-SYN_Flood: 19312
DDoS-SYN_Flood: 38859
DDoS-RSTFINFlood: 38401
DoS-TCP_Flood: 25498
DDoS-UDP_Flood: 51022
DDoS-UDP_Fragmentation: 2669
DDoS-SynonymousIP_Flood: 34362
DDoS-ICMP_Flood: 68587
Mirai-greeth_flood: 9502
DDoS-PSHACK_Flood: 39323
DDoS-TCP_Flood: 42872
Mirai-greip_flood: 7174
DDoS-SlowLoris: 199
DoS-UDP_Flood: 31567
Recon-HostDiscovery: 1272
DDoS-ICMP_Fragmentation: 4331
DDoS-ACK_Fragmentation: 2663
BenignTraffic: 10463
Mirai-udpplain: 8470
DNS_Spoofing: 1681
MITM-ArpSpoofing: 2920
Recon-PortScan: 870
Recon-OSScan: 970
DoS-HTTP_Flood: 703
DictionaryBruteForce: 129
DDoS-HTTP_Flood: 267
BrowserHijacking: 50
VulnerabilityScan: 378
SqlInjection: 41
CommandInjection: 46
Recon-PingSweep: 20
XSS: 46
Backdoor_Malware: 25
Uploading_Attack: 12


In [15]:
# Label name -> class id for the 34-class task: 33 attack classes + 1 for benign traffic (id 0)
dict_34_classes = {
    'BenignTraffic': 0,
    # DDoS and DoS
    'DDoS-RSTFINFlood': 1,
    'DDoS-PSHACK_Flood': 2,
    'DDoS-SYN_Flood': 3,
    'DDoS-UDP_Flood': 4,
    'DDoS-TCP_Flood': 5,
    'DDoS-ICMP_Flood': 6,
    'DDoS-SynonymousIP_Flood': 7,
    'DDoS-ACK_Fragmentation': 8,
    'DDoS-UDP_Fragmentation': 9,
    'DDoS-ICMP_Fragmentation': 10,
    'DDoS-SlowLoris': 11,
    'DDoS-HTTP_Flood': 12,
    'DoS-UDP_Flood': 13,
    'DoS-SYN_Flood': 14,
    'DoS-TCP_Flood': 15,
    'DoS-HTTP_Flood': 16,
    # Mirai
    'Mirai-greeth_flood': 17,
    'Mirai-greip_flood': 18,
    'Mirai-udpplain': 19,
    # Reconnaissance
    'Recon-PingSweep': 20,
    'Recon-OSScan': 21,
    'Recon-PortScan': 22,
    'VulnerabilityScan': 23,
    'Recon-HostDiscovery': 24,
    # Spoofing
    'DNS_Spoofing': 25,
    'MITM-ArpSpoofing': 26,
    # Web
    'BrowserHijacking': 27,
    'Backdoor_Malware': 28,
    'XSS': 29,
    'Uploading_Attack': 30,
    'SqlInjection': 31,
    'CommandInjection': 32,
    # Brute Force
    'DictionaryBruteForce': 33,
}

# Class id of the 34-class scheme -> class id of the 7-class scheme
dict_7_classes = {
    0: 0,                               # Benign traffic
    **dict.fromkeys(range(1, 17), 1),   # DDoS and DoS
    **dict.fromkeys(range(17, 20), 2),  # Mirai
    **dict.fromkeys(range(20, 25), 3),  # Reconnaissance
    **dict.fromkeys(range(25, 27), 4),  # Spoofing
    **dict.fromkeys(range(27, 33), 5),  # Web
    33: 6,                              # Brute Force
}

# Class id of the 34-class scheme -> binary label:
# id 0 (benign traffic) stays 0, every other id (1-33: DDoS and DoS, Mirai,
# Reconnaissance, Spoofing, Web, Brute Force) becomes 1
dict_2_classes = {class_id: int(class_id != 0) for class_id in range(34)}

# Map y column to the dict_34_classes values.
# The mapping is validated before it is written back: Series.map turns every
# value that is missing from the dict into NaN, so re-running this cell (labels
# already numeric) or meeting an unexpected label would otherwise silently wipe
# the column. Raising first leaves df untouched in that case.
labels_34 = df[y_column].map(dict_34_classes)
unmapped_labels = df.loc[labels_34.isna(), y_column].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        'Labels not found in dict_34_classes (was this cell already run?): {}'.format(list(unmapped_labels))
    )
df[y_column] = labels_34


In [16]:
# Peek at the first 10 entries of the y column
df[y_column].iloc[:10]


0    14
1     3
2     1
3     3
4    15
5     4
6    15
7     9
8     7
9     7
Name: label, dtype: int64

In [17]:
# Tally how often each value occurs in the y column
class_counts = df[y_column].value_counts()
print(class_counts)

# Report how many distinct values the y column holds
distinct_values = df[y_column].unique()
print(f'Unique values in y_column: {len(distinct_values)}')

6     68587
4     51022
5     42872
2     39323
3     38859
1     38401
7     34362
13    31567
15    25498
14    19312
0     10463
17     9502
19     8470
18     7174
10     4331
26     2920
9      2669
8      2663
25     1681
24     1272
21      970
22      870
16      703
23      378
12      267
11      199
33      129
27       50
32       46
29       46
31       41
28       25
20       20
30       12
Name: label, dtype: int64
Unique values in y_column: 34


In [18]:
# Further map the y_column to the dict_7_classes values.
# NOTE: this overwrites the 34-class ids in place, so the cell is meant to run
# exactly once after the 34-class mapping. Running it again would feed the
# 7-class ids (0-6) back through dict_7_classes and collapse them further.
df[y_column] = df[y_column].map(dict_7_classes)

# Show the first 10 rows of the y column. It is printed explicitly because a
# bare expression is only rendered when it is the last statement of a cell.
print(df[y_column].head(10))

# Show the unique values in the y column and their counts
print(df[y_column].value_counts())


1    400635
2     25146
0     10463
4      4601
3      3510
5       220
6       129
Name: label, dtype: int64
