## Extraction
---
This part loads every CSV, JSON, and Parquet file from the `fall2024data/` folder and concatenates them into a single DataFrame.


In [2]:
import glob
import os

import numpy as np
import pandas as pd
import pyarrow.parquet as pq

In [3]:
# Collects one DataFrame per input file; the pieces are concatenated later.
_ids = []

In [4]:
# Read every CSV file in fall2024data/ and queue it for concatenation.
# glob.glob() returns paths in arbitrary (filesystem-dependent) order, so the
# paths are sorted to keep the row order of the combined frame reproducible.
for csvfile in sorted(glob.glob('fall2024data/*.csv')):
    _ids.append(pd.read_csv(csvfile, sep=','))

In [5]:
# Read every JSON-lines file (one record per line) in fall2024data/ and queue
# it for concatenation.
# glob.glob() returns paths in arbitrary (filesystem-dependent) order, so the
# paths are sorted to keep the row order of the combined frame reproducible.
for jsonfile in sorted(glob.glob('fall2024data/*.json')):
    _ids.append(pd.read_json(jsonfile, lines=True))

In [6]:
# Read every Parquet file in fall2024data/ via pyarrow, convert it to a pandas
# DataFrame, and queue it for concatenation.
# glob.glob() returns paths in arbitrary (filesystem-dependent) order, so the
# paths are sorted to keep the row order of the combined frame reproducible.
for pqfile in sorted(glob.glob('fall2024data/*.parquet')):
    _ids.append(pq.read_table(pqfile).to_pandas())

In [7]:
# Stack all loaded frames row-wise; ignore_index renumbers the rows 0..n-1.
ids = pd.concat(objs=_ids, ignore_index=True)

## Transform
---
In this part, the data is split by its label and processed to gain some insight:
* Data types conversion
* Data format conversion (cm to inches, etc.)
* Remove duplicates
* Identifying errors in data
* Handling out-of-range and outlier data
* Add any other transformations you find necessary.


Also, drop every row whose label is 'Heartbleed'.

In [8]:
# Boolean mask: True where the last column (the label) equals 'Heartbleed'.
Hbd = ids.iloc[:, -1].eq('Heartbleed')

In [9]:
# Row labels of the entries flagged True in the Heartbleed mask.
H_idx = Hbd[Hbd].index

In [10]:
# Remove the flagged rows from `ids` in place.
ids.drop(index=H_idx, inplace=True)

General information about the data

In [11]:
# (number of rows, number of columns) of the combined frame
ids.shape

(61117, 79)

In [12]:
# Per-column dtype and non-null count
ids.info()

<class 'pandas.core.frame.DataFrame'>
Index: 61117 entries, 0 to 61127
Data columns (total 79 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0    Destination Port             61117 non-null  int64  
 1    Flow Duration                61117 non-null  int64  
 2    Total Fwd Packets            61117 non-null  int64  
 3    Total Backward Packets       61117 non-null  int64  
 4   Total Length of Fwd Packets   61117 non-null  int64  
 5    Total Length of Bwd Packets  61117 non-null  int64  
 6    Fwd Packet Length Max        61117 non-null  int64  
 7    Fwd Packet Length Min        61117 non-null  int64  
 8    Fwd Packet Length Mean       61117 non-null  float64
 9    Fwd Packet Length Std        61117 non-null  float64
 10  Bwd Packet Length Max         61117 non-null  int64  
 11   Bwd Packet Length Min        61117 non-null  int64  
 12   Bwd Packet Length Mean       61117 non-null  float64
 13   Bwd P

In [13]:
# Write the summary statistics of the whole dataset to a.csv.
ids.describe().to_csv(path_or_buf='a.csv')

The dataset seems too large and has too many outliers to be summarized as a whole.
Therefore, I will split it into one subset per label.

In [14]:
# Sorted distinct values of the last column (the label).
np.unique(ids.iloc[:, -1].to_numpy())

array(['BENIGN', 'DoS GoldenEye', 'DoS Hulk', 'DoS Slowhttptest'],
      dtype=object)

In [15]:
# One sub-frame per traffic class, selected by the ' Label' column
# (the column name really does start with a space).
BENIGN, DoS_GoldenEye, DoS_Hulk, DoS_Slowhttptest = (
    ids.loc[ids[' Label'].eq(label_name)]
    for label_name in ('BENIGN', 'DoS GoldenEye', 'DoS Hulk', 'DoS Slowhttptest')
)


In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) of the BENIGN subset
BENIGN.describe()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,5005.0,5005.0,5005.0,5005.0,5005.0,5005.0,5005.0,5005.0,5005.0,5005.0,...,5005.0,5005.0,5005.0,5005.0,5005.0,5005.0,5005.0,5005.0,5005.0,5005.0
mean,8896.135664,12384370.0,6.36983,6.431369,775.210989,5657.561,246.608791,22.282517,70.413606,78.658708,...,3.617982,26.053147,68328.57,48104.09,160493.7,41280.04,4174032.0,153254.3,4295982.0,4005783.0
std,19021.064009,31963290.0,24.489302,35.970694,5561.961381,60462.13,826.750063,43.988482,197.475971,285.818241,...,21.295997,6.453267,450714.8,314148.5,873226.2,365791.4,14848380.0,2034671.0,15189070.0,14687320.0
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,53.0,181.0,2.0,1.0,31.0,6.0,28.0,0.0,12.75,0.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,80.0,30944.0,2.0,2.0,70.0,142.0,42.0,6.0,39.0,0.0,...,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,443.0,656714.0,4.0,2.0,187.0,352.0,80.0,41.0,52.0,23.276598,...,3.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,62011.0,119998900.0,1235.0,1881.0,293115.0,3192321.0,24820.0,1472.0,3731.837838,5185.44164,...,1233.0,52.0,11000000.0,7785188.0,19500000.0,11000000.0,120000000.0,59300000.0,120000000.0,120000000.0


In [17]:
# Write the summary statistics of the 'DoS Slowhttptest' subset to disk.
DoS_Slowhttptest.describe().to_csv(path_or_buf='DoS_Slowhttptest.csv')

In [18]:
# Write the summary statistics of the 'DoS Hulk' subset to disk.
DoS_Hulk.describe().to_csv(path_or_buf='DoS_Hulk.csv')

In [19]:
# Write the summary statistics of the 'DoS GoldenEye' subset to disk.
DoS_GoldenEye.describe().to_csv(path_or_buf='DoS_GoldenEye.csv')

In [20]:
# Column-wise mean of every feature; the trailing ' Label' column is left out.
BENIGN.drop(columns=' Label').mean()

 Destination Port              8.896136e+03
 Flow Duration                 1.238437e+07
 Total Fwd Packets             6.369830e+00
 Total Backward Packets        6.431369e+00
Total Length of Fwd Packets    7.752110e+02
                                   ...     
 Active Min                    4.128004e+04
Idle Mean                      4.174032e+06
 Idle Std                      1.532543e+05
 Idle Max                      4.295982e+06
 Idle Min                      4.005783e+06
Length: 78, dtype: float64

In [21]:
# Names of the columns that hold at least one NaN
ids.isna().any().loc[lambda has_nan: has_nan].index

Index(['Flow Bytes/s', ' Flow Packets/s'], dtype='object')

In [22]:
# Number of NaN entries in each of the two affected columns
ids[['Flow Bytes/s', ' Flow Packets/s']].isna().sum()

Flow Bytes/s       114
 Flow Packets/s     25
dtype: int64

In [23]:
# Rows in which at least one column is NaN
ids[ids.isna().any(axis=1)]

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
3475,80,0,2,0,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,DoS Hulk
3512,80,0,2,0,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,DoS Hulk
3599,80,0,2,0,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,DoS Hulk
3869,80,0,2,0,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,DoS Hulk
4111,80,0,2,0,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,DoS Hulk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50023,80,0,2,0,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,DoS Hulk
50266,80,0,2,0,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,DoS Hulk
50349,80,0,2,0,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,DoS Hulk
50690,80,0,2,0,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,DoS Hulk


In [24]:
# Label distribution of the rows holding a NaN: they all belong to 'DoS Hulk'
np.unique(
    ids[ids.isna().any(axis=1)].iloc[:, -1].to_numpy(),
    return_counts=True,
)

(array(['DoS Hulk'], dtype=object), array([114]))

In [25]:
# Feature-only copy (label column removed, NaN rows dropped) intended for
# visualizing correlation coefficients
ids_corr = ids.iloc[:, :-1].dropna(axis=0, how='any')

In [26]:
# Discard, in place, every row of `ids` that contains a NaN.
ids.dropna(axis=0, how='any', inplace=True)

In [27]:
# Total number of NaN cells left in the frame (expected: 0)
ids.isna().to_numpy().sum()

0

In [28]:
import matplotlib.pyplot as plt

In [29]:
# Per-label subsets, in the order their panels are stacked in each figure.
target = [
    BENIGN,
    DoS_GoldenEye,
    DoS_Hulk,
    DoS_Slowhttptest,
]

In [30]:

# Draw one figure per feature column, with one scatter panel per traffic class,
# and save it as Features_plot/<column position>.jpeg.
# savefig() does not create missing directories, so the output folder is
# created up front; otherwise a fresh checkout fails on the first figure.
os.makedirs("Features_plot", exist_ok=True)

for j in range(len(BENIGN.columns)-1):  # the last column is the label, not a feature
    fig, ax = plt.subplots(len(target), 1, constrained_layout=True)
    fig.set_dpi(600)

    target_col = j
    fig.suptitle(BENIGN.columns[target_col])

    for i, subset in enumerate(target):
        # Panel title is the class label, read from the subset's last column.
        ax[i].set_title(subset.iloc[0, -1])
        # x is the row position inside the subset, y is the feature value.
        ax[i].scatter(range(subset.shape[0]),
                      subset.iloc[:, target_col].to_numpy(),
                      marker='x',
                      s=5)

    fig.savefig(f"Features_plot/{j}.jpeg", dpi=600)
    # Close each figure once saved so they do not accumulate in memory.
    plt.close(fig)
    print(f"Features_plot/{j}.jpeg")

Features_plot/0.jpeg
Features_plot/1.jpeg
Features_plot/2.jpeg
Features_plot/3.jpeg
Features_plot/4.jpeg
Features_plot/5.jpeg
Features_plot/6.jpeg
Features_plot/7.jpeg
Features_plot/8.jpeg
Features_plot/9.jpeg
Features_plot/10.jpeg
Features_plot/11.jpeg
Features_plot/12.jpeg
Features_plot/13.jpeg
Features_plot/14.jpeg
Features_plot/15.jpeg
Features_plot/16.jpeg
Features_plot/17.jpeg
Features_plot/18.jpeg
Features_plot/19.jpeg
Features_plot/20.jpeg
Features_plot/21.jpeg
Features_plot/22.jpeg
Features_plot/23.jpeg
Features_plot/24.jpeg
Features_plot/25.jpeg
Features_plot/26.jpeg
Features_plot/27.jpeg
Features_plot/28.jpeg
Features_plot/29.jpeg
Features_plot/30.jpeg
Features_plot/31.jpeg
Features_plot/32.jpeg
Features_plot/33.jpeg
Features_plot/34.jpeg
Features_plot/35.jpeg
Features_plot/36.jpeg
Features_plot/37.jpeg
Features_plot/38.jpeg
Features_plot/39.jpeg
Features_plot/40.jpeg
Features_plot/41.jpeg
Features_plot/42.jpeg
Features_plot/43.jpeg
Features_plot/44.jpeg
Features_plot/45.jpe

In [None]:
# TODO: unfinished draft of a second per-feature loop. A bare `for` header is a
# SyntaxError that stops "Restart & Run All" at this cell, so the loop gets an
# explicit no-op body until the real one is written.
for j in range(len(BENIGN.columns)-1):
    pass


# Load
---


In [31]:
# Persist the cleaned frame; with the default settings the row index is
# written as the first (unnamed) CSV column.
ids.to_csv(path_or_buf='test.csv')