In [1]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import warnings
from sklearn.utils import shuffle
warnings.filterwarnings('ignore')



In [4]:
csv_list = ['02-14-2018.csv', '02-15-2018.csv', '02-16-2018.csv', '02-20-2018.csv', 
            '02-21-2018.csv', '02-22-2018.csv', '02-23-2018.csv', '02-28-2018.csv', 
            '03-01-2018.csv', '03-02-2018.csv']


output_file = 'concat_df.csv'


columns_to_keep = ['Dst Port', 'Flow IAT Min', 'Init Fwd Win Byts', 'Flow Duration', 'Fwd IAT Min', 'Bwd IAT Min', 'Init Bwd Win Byts', 'Flow IAT Std', 'Fwd IAT Tot', 'Flow IAT Mean', 'Flow IAT Max', 'Flow Pkts/s', 
                   'TotLen Fwd Pkts', 'Fwd IAT Mean', 'Fwd Pkts/s', 'Fwd IAT Max', 'Fwd Pkt Len Std', 'Fwd Pkt Len Max', 'Pkt Size Avg', 'Fwd Header Len', 'Tot Fwd Pkts', 'Fwd Act Data Pkts', 'ACK Flag Cnt', 'Label']

def process_file(file_path, chunksize=100000):
    dfs = []
    for chunk in pd.read_csv(file_path, chunksize=chunksize, usecols=columns_to_keep):
        dfs.append(chunk)
    return pd.concat(dfs, ignore_index=True)


def concatenate_files(file_paths):
    dfs = []
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_file, file) for file in file_paths]
        for future in as_completed(futures):
            dfs.append(future.result())

    
    final_df = pd.concat(dfs, ignore_index=True)

    final_df.to_csv(output_file, index=False)

# Remove output file if it already exists
if os.path.exists(output_file):
    os.remove(output_file)

# Run the concatenation
concatenate_files(csv_list)

print(f"Concatenated CSV written to {output_file}")

Concatenated CSV written to concat_df.csv


In [5]:
df_filtered = pd.read_csv('concat_df.csv')
df_filtered.head()

Unnamed: 0,Dst Port,Flow Duration,Tot Fwd Pkts,TotLen Fwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Std,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,...,Fwd IAT Min,Bwd IAT Min,Fwd Header Len,Fwd Pkts/s,ACK Flag Cnt,Pkt Size Avg,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Label
0,0,115307855,5,0,0,0,0.04336218,28800000,32400000,61000000,...,812396,0,0,0.04336218,0,0,-1,-1,0,Benign
1,0,60997457,2,0,0,0,0.032788252,61000000,0,61000000,...,61000000,0,0,0.032788252,0,0,-1,-1,0,Benign
2,67,61149019,5,1500,300,0,0.081767461,15300000,12800000,32600000,...,3530939,0,40,0.081767461,0,360,-1,-1,4,Benign
3,0,60997555,2,0,0,0,0.032788199,61000000,0,61000000,...,61000000,0,0,0.032788199,0,0,-1,-1,0,Benign
4,0,61997503,3,0,0,0,0.048389046,31000000,42400000,61000000,...,999909,0,0,0.048389046,0,0,-1,-1,0,Benign


In [6]:
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16233002 entries, 0 to 16233001
Data columns (total 24 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   Dst Port           object
 1   Flow Duration      object
 2   Tot Fwd Pkts       object
 3   TotLen Fwd Pkts    object
 4   Fwd Pkt Len Max    object
 5   Fwd Pkt Len Std    object
 6   Flow Pkts/s        object
 7   Flow IAT Mean      object
 8   Flow IAT Std       object
 9   Flow IAT Max       object
 10  Flow IAT Min       object
 11  Fwd IAT Tot        object
 12  Fwd IAT Mean       object
 13  Fwd IAT Max        object
 14  Fwd IAT Min        object
 15  Bwd IAT Min        object
 16  Fwd Header Len     object
 17  Fwd Pkts/s         object
 18  ACK Flag Cnt       object
 19  Pkt Size Avg       object
 20  Init Fwd Win Byts  object
 21  Init Bwd Win Byts  object
 22  Fwd Act Data Pkts  object
 23  Label              object
dtypes: object(24)
memory usage: 2.9+ GB


In [7]:
df_filtered.describe()

Unnamed: 0,Dst Port,Flow Duration,Tot Fwd Pkts,TotLen Fwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Std,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,...,Fwd IAT Min,Bwd IAT Min,Fwd Header Len,Fwd Pkts/s,ACK Flag Cnt,Pkt Size Avg,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Label
count,16233002,16233002,16233002,16233002,16233002,16233002.0,16233002.0,16233002.0,16233002.0,16233002,...,16233002,16233002,16233002,16233002.0,16233002,16233002.0,16233002,16233002,16233002,16233002
unique,84935,5168801,5756,24332,2893,387342.0,6690516.0,6120568.0,7410439.0,3137837,...,980067,520187,8898,6617109.0,5,418449.0,18330,18105,3031,16
top,53,1,1,0,0,0.0,2000000.0,1.0,0.0,1,...,0,0,40,500000.0,0,0.0,8192,-1,0,Benign
freq,3832396,193577,4192912,4819457,4819457,8775702.0,177800.0,184841.0,7456487.0,196251,...,4407388,9453448,3252371,147436.0,10301889,4814361.0,4929163,7885337,8251244,13484708


In [8]:
df_filtered.isnull().sum()

Dst Port             0
Flow Duration        0
Tot Fwd Pkts         0
TotLen Fwd Pkts      0
Fwd Pkt Len Max      0
Fwd Pkt Len Std      0
Flow Pkts/s          0
Flow IAT Mean        0
Flow IAT Std         0
Flow IAT Max         0
Flow IAT Min         0
Fwd IAT Tot          0
Fwd IAT Mean         0
Fwd IAT Max          0
Fwd IAT Min          0
Bwd IAT Min          0
Fwd Header Len       0
Fwd Pkts/s           0
ACK Flag Cnt         0
Pkt Size Avg         0
Init Fwd Win Byts    0
Init Bwd Win Byts    0
Fwd Act Data Pkts    0
Label                0
dtype: int64

In [9]:
df_filtered['Label'].value_counts()

Label
Benign                      13484708
DDOS attack-HOIC              686012
DDoS attacks-LOIC-HTTP        576191
DoS attacks-Hulk              461912
Bot                           286191
FTP-BruteForce                193360
SSH-Bruteforce                187589
Infilteration                 161934
DoS attacks-SlowHTTPTest      139890
DoS attacks-GoldenEye          41508
DoS attacks-Slowloris          10990
DDOS attack-LOIC-UDP            1730
Brute Force -Web                 611
Brute Force -XSS                 230
SQL Injection                     87
Label                             59
Name: count, dtype: int64

In [10]:
benign_df = df_filtered[df_filtered['Label'] == 'Benign']
non_benign_df = df_filtered[df_filtered['Label'] != 'Benign']

In [11]:
benign_sampled = shuffle(benign_df, n_samples = 1183245, random_state = 42)

In [12]:
new_df = pd.concat([benign_sampled, non_benign_df])

In [13]:
new_df.to_csv('reduced_dataset.csv', index=False)

In [14]:
df = pd.read_csv('reduced_dataset.csv')
df.head()

Unnamed: 0,Dst Port,Flow Duration,Tot Fwd Pkts,TotLen Fwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Std,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,...,Fwd IAT Min,Bwd IAT Min,Fwd Header Len,Fwd Pkts/s,ACK Flag Cnt,Pkt Size Avg,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Label
0,445,21035444,1,0.0,0.0,0.0,0.237694,5258861.0,5130278.201409,12015355.0,...,0.0,3004536.0,32,0.047539,0,0.0,8192,0,0,Benign
1,53,414,1,46.0,46.0,0.0,4830.917874,414.0,0.0,414.0,...,0.0,0.0,8,2415.458937,0,77.0,-1,-1,0,Benign
2,51832,57,2,0.0,0.0,0.0,35087.719298,57.0,0.0,57.0,...,57.0,0.0,40,35087.719298,1,0.0,260,-1,0,Benign
3,53,2749,1,49.0,49.0,0.0,727.537286,2749.0,0.0,2749.0,...,0.0,0.0,8,363.768643,0,131.0,-1,-1,0,Benign
4,63974,1308,5,935.0,935.0,418.144712,5351.681957,218.0,283.818252,742.0,...,3.0,742.0,124,3822.629969,0,179.714286,65535,32768,1,Benign


In [15]:
df['Label'].value_counts()

Label
Benign                      1183245
DDOS attack-HOIC             686012
DDoS attacks-LOIC-HTTP       576191
DoS attacks-Hulk             461912
Bot                          286191
FTP-BruteForce               193360
SSH-Bruteforce               187589
Infilteration                161934
DoS attacks-SlowHTTPTest     139890
DoS attacks-GoldenEye         41508
DoS attacks-Slowloris         10990
DDOS attack-LOIC-UDP           1730
Brute Force -Web                611
Brute Force -XSS                230
SQL Injection                    87
Label                            59
Name: count, dtype: int64