# Imports

In [None]:
import pandas as pd
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load data

In [None]:
path = 'data/CIC_IoT2023/'

# Datasets this notebook itself writes into `path` (see create_bc_data and
# create_mc_data). They must not be read back as raw input on a re-run,
# otherwise their rows would be duplicated in `frame`.
derived_files = {
    'binary_classification_data.csv',
    'multiclass_classification_data.csv',
}

# glob returns files in an OS-dependent order; sorting keeps the row order of
# `frame` (and therefore any seeded sampling done later) stable between runs.
all_files = sorted(
    filename
    for filename in glob.glob(os.path.join(path, "*.csv"))
    if os.path.basename(filename) not in derived_files
)

if not all_files:
    # Fail with an explicit message instead of pandas' generic
    # "No objects to concatenate".
    raise FileNotFoundError(f"No raw CSV files found in {path!r}")

# One DataFrame per raw CSV file, in the same order as `all_files`.
li = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]

# Stack all files into one table with a fresh 0..n-1 index.
frame = pd.concat(li, axis=0, ignore_index=True)

In [None]:
# Show the CSV paths that were collected for loading.
all_files

In [None]:
# Preview the first rows of the combined DataFrame.
frame.head()

In [None]:
# Print a summary of the combined DataFrame (columns, dtypes, non-null counts).
frame.info()

In [None]:
# List the distinct raw values found in the 'label' column.
frame['label'].unique()

In [None]:
# Count how many rows carry each raw label.
frame['label'].value_counts()

In [None]:
# Raw dataset label -> coarse traffic category.
# Written one category at a time: every label listed in a group is mapped to
# that group's category name.
attack_map = {
    **dict.fromkeys(['BenignTraffic'], 'Benign'),
    **dict.fromkeys(
        [
            'DDoS-ICMP_Flood',
            'DDoS-UDP_Flood',
            'DDoS-TCP_Flood',
            'DDoS-PSHACK_Flood',
            'DDoS-SYN_Flood',
            'DDoS-RSTFINFlood',
            'DDoS-SynonymousIP_Flood',
            'DDoS-ICMP_Fragmentation',
            'DDoS-UDP_Fragmentation',
            'DDoS-ACK_Fragmentation',
            'DDoS-HTTP_Flood',
            'DDoS-SlowLoris',
        ],
        'DDoS',
    ),
    **dict.fromkeys(
        ['DoS-UDP_Flood', 'DoS-TCP_Flood', 'DoS-SYN_Flood', 'DoS-HTTP_Flood'],
        'DoS',
    ),
    **dict.fromkeys(['DictionaryBruteForce'], 'Bruteforce'),
    **dict.fromkeys(['MITM-ArpSpoofing', 'DNS_Spoofing'], 'Spoofing'),
    **dict.fromkeys(
        [
            'Recon-HostDiscovery',
            'Recon-OSScan',
            'Recon-PortScan',
            'Recon-PingSweep',
            'VulnerabilityScan',
        ],
        'Recon',
    ),
    **dict.fromkeys(
        [
            'BrowserHijacking',
            'CommandInjection',
            'SqlInjection',
            'XSS',
            'Backdoor_Malware',
            'Uploading_Attack',
        ],
        'Web',
    ),
    **dict.fromkeys(
        ['Mirai-greeth_flood', 'Mirai-udpplain', 'Mirai-greip_flood'],
        'Mirai',
    ),
}

In [None]:
# Translate each raw label into its coarse category via attack_map.
# Labels that are not keys of attack_map become NaN.
frame['Attack Type'] = frame['label'].map(attack_map)

In [None]:
# List the distinct categories produced by the mapping; a NaN here would
# point to a raw label that attack_map does not cover.
frame['Attack Type'].unique()

In [None]:
# Binary target: 'Benign' where the category equals 'Benign', 'Attack' for
# every other row (this includes rows whose category is NaN).
frame['Class'] = np.where(frame['Attack Type'] == 'Benign', 'Benign', 'Attack')

In [None]:
# List the distinct values of the binary 'Class' column.
frame['Class'].unique()

In [None]:
def hist_plot(label, count, name, rotation):
    """Draw a bar chart of category counts and save it as a PNG.

    Args:
        label: Category names, one per bar, in the same order as ``count``.
        count: Bar heights (for example a list or a pandas Series). Entries
            are read by position, so they must line up with ``label``.
        name: Text used for the chart title, the x-axis label and the output
            file name (``images/plots/<name>.png``).
        rotation: Rotation, in degrees, applied to the x tick labels.
    """
    # Read the counts by position once. The previous ``count[i]`` used an
    # integer subscript on a label-indexed Series, which pandas has deprecated.
    values = list(count)

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.bar(label, values)

    # Write each bar's count just above its top edge, centred on the bar.
    for rect, value in zip(ax.patches, values):
        ax.text(
            rect.get_x() + rect.get_width() / 2,
            rect.get_height() + 5,
            f"{value}",
            ha="center",
            va="bottom",
        )

    # Do not append "distribution" when the caller's name already ends with it
    # (otherwise the title reads "... Distribution distribution").
    if str(name).lower().endswith('distribution'):
        title = f'{name}'
    else:
        title = f'{name} distribution'
    ax.set_title(title, fontsize=16, fontweight="bold")
    ax.set_xlabel(f'{name}')
    ax.set_ylabel('Distribution')
    ax.grid(which='Major', axis='both')
    # Keep the grid lines behind the bars.
    ax.set_axisbelow(True)
    ax.tick_params(axis='x', labelrotation=rotation)

    # Save this specific figure (not whichever figure happens to be current)
    # and make sure the target folder exists first.
    out_dir = "images/plots"
    os.makedirs(out_dir, exist_ok=True)
    fig.savefig(f"{out_dir}/{name}.png")

In [None]:
# Tally the two traffic classes once; the index of the tally supplies the bar
# names in the same order as the counts.
count = frame['Class'].value_counts()
label = count.index.tolist()

In [None]:
# Chart the Benign vs. Attack row counts; hist_plot also writes the figure
# to images/plots/.
hist_plot(label, count, "Traffic type distribution", 0)

In [None]:
# Tally the coarse categories once; the index of the tally supplies the bar
# names in the same order as the counts.
count = frame['Attack Type'].value_counts()
label = count.index.tolist()

In [None]:
# Chart the row count of every coarse category; hist_plot also writes the
# figure to images/plots/.
hist_plot(label, count, "Attack Distribution", 0)

In [None]:
# Per-label row counts within the DDoS category.
# The bar names are taken from the counts' own index so that every name lines
# up with its count: unique() returns labels in first-appearance order while
# value_counts() sorts by frequency, so mixing the two mislabels the bars.
count = frame.loc[frame['Attack Type'] == 'DDoS', 'label'].value_counts()
label = count.index.tolist()

In [None]:
# Chart the per-label DDoS counts with x tick labels rotated 90 degrees;
# hist_plot also writes the figure to images/plots/.
hist_plot(label, count, "DDoS Attack Distribution", 90)

In [None]:
# Per-label row counts within the DoS category.
# The bar names are taken from the counts' own index so that every name lines
# up with its count: unique() returns labels in first-appearance order while
# value_counts() sorts by frequency, so mixing the two mislabels the bars.
count = frame.loc[frame['Attack Type'] == 'DoS', 'label'].value_counts()
label = count.index.tolist()

In [None]:
# Chart the per-label DoS counts; hist_plot also writes the figure to
# images/plots/.
hist_plot(label, count, "DoS Attack Distribution", 0)

In [None]:
# Per-label row counts within the Mirai category.
# The bar names are taken from the counts' own index so that every name lines
# up with its count: unique() returns labels in first-appearance order while
# value_counts() sorts by frequency, so mixing the two mislabels the bars.
count = frame.loc[frame['Attack Type'] == 'Mirai', 'label'].value_counts()
label = count.index.tolist()

In [None]:
# Chart the per-label Mirai counts; hist_plot also writes the figure to
# images/plots/.
hist_plot(label, count, "Mirai Attack Distribution", 0)

In [None]:
# Per-label row counts within the Spoofing category.
# The bar names are taken from the counts' own index so that every name lines
# up with its count: unique() returns labels in first-appearance order while
# value_counts() sorts by frequency, so mixing the two mislabels the bars.
count = frame.loc[frame['Attack Type'] == 'Spoofing', 'label'].value_counts()
label = count.index.tolist()

In [None]:
# Chart the per-label Spoofing counts; hist_plot also writes the figure to
# images/plots/.
hist_plot(label, count, "Spoofing Attack Distribution", 0)

In [None]:
# Per-label row counts within the Recon category.
# The bar names are taken from the counts' own index so that every name lines
# up with its count: unique() returns labels in first-appearance order while
# value_counts() sorts by frequency, so mixing the two mislabels the bars.
count = frame.loc[frame['Attack Type'] == 'Recon', 'label'].value_counts()
label = count.index.tolist()

In [None]:
# Chart the per-label Recon counts; hist_plot also writes the figure to
# images/plots/.
hist_plot(label, count, "Recon Attack Distribution", 0)

In [None]:
# Per-label row counts within the Web category.
# The bar names are taken from the counts' own index so that every name lines
# up with its count: unique() returns labels in first-appearance order while
# value_counts() sorts by frequency, so mixing the two mislabels the bars.
count = frame.loc[frame['Attack Type'] == 'Web', 'label'].value_counts()
label = count.index.tolist()

In [None]:
# Chart the per-label Web counts with x tick labels rotated 90 degrees;
# hist_plot also writes the figure to images/plots/.
hist_plot(label, count, "Web Attack Distribution", 90)

In [None]:
# Per-label row counts within the Bruteforce category.
# The bar names are taken from the counts' own index so that names and counts
# stay in the same order even if more labels are mapped to this category
# (unique() and value_counts() do not share an ordering).
count = frame.loc[frame['Attack Type'] == 'Bruteforce', 'label'].value_counts()
label = count.index.tolist()

In [None]:
# Chart the per-label Bruteforce counts; hist_plot also writes the figure to
# images/plots/.
hist_plot(label, count, "Brute Force Attack Distribution", 0)

In [None]:
# Report the (rows, columns) shape of the combined DataFrame.
frame.shape

In [None]:
# List the column names of the combined DataFrame.
frame.columns

# Binary Classification Data

In [None]:
def create_bc_data(sample_size=300000, random_state=0):
    """Write a balanced Benign-vs-Attack sample of ``frame`` to CSV.

    The same number of rows is drawn from the benign rows and from all other
    rows of the module-level ``frame``. The result is saved to
    ``data/CIC_IoT2023/binary_classification_data.csv`` and the resulting
    ``Class`` counts are printed.

    Args:
        sample_size: Rows to draw from each of the two groups. If either group
            has fewer rows, both draws are reduced to the smaller group's size
            so the output stays balanced.
        random_state: Seed passed to ``DataFrame.sample`` so repeated runs
            select the same rows.
    """
    normal = frame.loc[frame['Attack Type'] == 'Benign']
    intrusion = frame.loc[frame['Attack Type'] != 'Benign']

    # sample() without replacement raises when asked for more rows than exist,
    # so cap the request at the size of the smaller group.
    n_rows = min(sample_size, len(normal), len(intrusion))
    if n_rows < sample_size:
        print(f"Only {n_rows} rows per class available (requested {sample_size})")

    normal_data = normal.sample(n=n_rows, random_state=random_state)
    intrusion_data = intrusion.sample(n=n_rows, random_state=random_state)

    bc_data = pd.concat([intrusion_data, normal_data])
    print(bc_data['Class'].value_counts())
    bc_data.to_csv('data/CIC_IoT2023/binary_classification_data.csv')

# Multiclass Classification Data

In [None]:
def create_mc_data(sample_size):
    """Write a per-category capped sample of ``frame`` to CSV.

    For every value of ``frame['Attack Type']``, at most ``sample_size`` rows
    are kept: larger categories are down-sampled with a fixed seed, smaller
    ones are kept whole. The result is saved to
    ``data/CIC_IoT2023/multiclass_classification_data.csv`` and the resulting
    category counts are printed.

    Args:
        sample_size: Maximum number of rows to keep per attack category.
    """
    dfs = []
    for name in frame['Attack Type'].value_counts().index:
        subset = frame[frame['Attack Type'] == name]
        if len(subset) > sample_size:
            # Fixed seed so the same rows are selected on every run.
            subset = subset.sample(n=sample_size, random_state=0)
        dfs.append(subset)

    mc_data = pd.concat(dfs, ignore_index=True)
    mc_data.to_csv('data/CIC_IoT2023/multiclass_classification_data.csv')
    # A bare expression inside a function is discarded, so print the summary
    # explicitly (as create_bc_data does).
    print(mc_data['Attack Type'].value_counts())

In [None]:
# Build the multiclass dataset, keeping at most 10000 rows per attack category.
create_mc_data(10000)