# ETL Process
---
 This notebook will:
 * read the data inside 'fall2024data/'
 * combine the data into one DataFrame
 * clean up the feature names (strip surrounding whitespace)
 * drop rows containing NaN or Inf values
 * save a plot of each feature to 'Features_plot/'
 * save analysis (summary statistics) data to 'Analysis/'
 * save the processed data as CSV files in 'Datasets/'

    'Dataset.csv'           - whole dataset of traffic\
    'BENIGN.csv'            - set of data labeled 'BENIGN'\
    'DoS_GoldenEye.csv'     - set of data labeled 'DoS GoldenEye'\
    'DoS_Hulk.csv'          - set of data labeled 'DoS Hulk'\
    'DoS_Slowhttptest.csv'  - set of data labeled 'DoS Slowhttptest'
    
 ...

## Extraction
---
This part loads the CSV, JSON, and Parquet files from the 'fall2024data/' folder and concatenates them into one DataFrame


In [None]:
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import glob

In [None]:
# Accumulator for the per-file DataFrames; the loader cells append to it.
_ids = []

In [None]:
# Load every CSV file in the input folder and queue it for concatenation.
# glob returns matches in a filesystem-dependent order, so sort the paths to
# keep the row order (and the index later written to disk) reproducible.
for csvfile in sorted(glob.glob('fall2024data/*.csv')):
    _ids.append(pd.read_csv(csvfile, sep=','))

In [None]:
# Load every JSON file in the input folder and queue it for concatenation.
# lines=True: each file is read as one JSON object per line.
# glob returns matches in a filesystem-dependent order, so sort the paths to
# keep the row order (and the index later written to disk) reproducible.
for jsonfile in sorted(glob.glob('fall2024data/*.json')):
    _ids.append(pd.read_json(jsonfile, lines=True))

In [None]:
# Load every Parquet file in the input folder and queue it for concatenation.
# glob returns matches in a filesystem-dependent order, so sort the paths to
# keep the row order (and the index later written to disk) reproducible.
for pqfile in sorted(glob.glob('fall2024data/*.parquet')):
    # Read the file as an Arrow table, then convert it to a pandas DataFrame.
    _ids.append(pq.read_table(pqfile).to_pandas())

In [None]:
# Stack all loaded frames row-wise into one DataFrame with a fresh 0..n-1 index.
ids = pd.concat(objs=_ids, axis=0, ignore_index=True)

## Transform
---
In this part, data will be separated by its Label and processed to show some insight
* Data types conversion
* Data format conversion (cm to inches, etc.)
* Remove duplicates
* Identifying errors in data
* Handling out-of-range and outlier data
* Add any other transformations you find necessary.

Also, drop the rows labeled 'Heartbleed'.


In [None]:
# Remove every row whose label (the last column) is 'Heartbleed'.
Hbd = ids.iloc[:, -1].eq('Heartbleed')  # boolean mask, True on Heartbleed rows
H_idx = ids.index[Hbd]                  # index labels of the rows to remove
ids.drop(index=H_idx, inplace=True)

### General Info about data

In [None]:
# Show the dimensions of the combined frame as (rows, columns).
# Original author's note: 61117 samples with 78 features and 1 label.
# NOTE(review): these numbers depend on the input files — confirm on re-run.
ids.shape

In [None]:
# Print the dtype and non-null count of every column.
# Original author's note: all of the features are numeric, so no type
# conversion is needed. NOTE(review): confirm against the printed output.
ids.info()

In [None]:
# List the distinct labels present in the last column.
ids.iloc[:, -1].unique()

### Change the name of the features

In [None]:
# Strip leading/trailing whitespace from every column name.
cols = [name.strip() for name in ids.columns]
ids.columns = cols

### Drop rows which contain NaN or Inf values

In [None]:
# Turn +Inf / -Inf into NaN so a single dropna() removes both kinds of bad rows.
ids.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

In [None]:
# Names of the columns that hold at least one NaN value.
ids.columns[ids.isna().any(axis=0)]

In [None]:
# Count, per label, the rows that contain at least one NaN (formerly NaN or Inf).
# Original author's note: only 'DoS Hulk' and 'BENIGN' rows are affected.
# A row is incomplete when its non-null count is below the number of columns.
incomplete_rows = ids.count(axis=1) < ids.shape[1]
np.unique(ids.loc[incomplete_rows, :].to_numpy()[:, -1], return_counts=True)

In [None]:
# Drop every row that has a NaN in any column.
ids.dropna(axis=0, how='any', inplace=True)
# Sanity check: total number of NaN cells remaining.
ids.isna().sum(axis=0).sum()

In [None]:
# Split the cleaned data into one DataFrame per traffic label.
label_col = ids['Label']
BENIGN = ids.loc[label_col.eq('BENIGN')]
DoS_GoldenEye = ids.loc[label_col.eq('DoS GoldenEye')]
DoS_Hulk = ids.loc[label_col.eq('DoS Hulk')]
DoS_Slowhttptest = ids.loc[label_col.eq('DoS Slowhttptest')]

### Plotting each feature
Just to look at the distribution of each feature

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Per-label subsets, in the order their subplots are stacked in each figure.
target = [
    BENIGN,
    DoS_GoldenEye,
    DoS_Hulk,
    DoS_Slowhttptest,
]

In [None]:
import os

# Figure.savefig does not create missing directories, so make sure the output
# folder exists before the first figure is written.
os.makedirs('Features_plot', exist_ok=True)

# One figure per feature column; the last column is the label and is skipped.
# Each figure stacks one scatter subplot per entry in `target`
# (x = row position within that subset, y = feature value).
for j, feature in enumerate(BENIGN.columns[:-1]):
    # squeeze=False always yields a 2-D axes array, even for a single subset.
    fig, axes = plt.subplots(len(target), 1, constrained_layout=True,
                             squeeze=False)
    fig.set_dpi(600)
    fig.suptitle(feature)

    for ax, frame in zip(axes[:, 0], target):
        if frame.empty:
            # No rows for this label: leave the subplot blank instead of
            # raising IndexError on iloc[0, -1] below.
            continue
        # Title each subplot with the label value of that subset's first row.
        ax.set_title(frame.iloc[0, -1])
        ax.scatter(range(frame.shape[0]),
                   frame.iloc[:, j].to_numpy(),
                   marker='x',
                   s=5)  # a scalar size applies to every marker

    # '/' in a column name would be read as a path separator, so strip it.
    out_path = f"Features_plot/{j}_{feature.replace('/', '')}.jpeg"
    fig.savefig(out_path, dpi=600)
    plt.close(fig)  # release the figure so memory does not grow per feature
    print(out_path)

In [None]:
# Summary statistics of the whole cleaned dataset, written to the analysis folder.
ids_summary = ids.describe()
ids_summary.to_csv('Analysis/ids_describe.csv')

In [None]:
# Summary statistics for each DoS attack subset, one CSV per subset.
for subset, out_csv in (
    (DoS_GoldenEye, 'Analysis/GoldenEye_describe.csv'),
    (DoS_Hulk, 'Analysis/Hulk_describe.csv'),
    (DoS_Slowhttptest, 'Analysis/Slowhttptest_describe.csv'),
):
    subset.describe().to_csv(out_csv)

# Load
---


In [None]:
# Save each per-label subset to its own CSV in 'Datasets/'.
# The DataFrame index is written as the first (unnamed) column.
# 'BENIGN.csv' is listed as an output in the notebook header but was never
# written; it is saved here alongside the attack subsets.
BENIGN.to_csv('Datasets/BENIGN.csv')
DoS_Slowhttptest.to_csv('Datasets/DoS_Slowhttptest.csv')
DoS_Hulk.to_csv('Datasets/DoS_Hulk.csv')
DoS_GoldenEye.to_csv('Datasets/DoS_GoldenEye.csv')

In [None]:
# Save the whole cleaned dataset (all labels) to a single CSV.
ids.to_csv(path_or_buf='Datasets/Dataset.csv')