# Exploring the datasets

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import torch
import numpy as np

## Preliminary and general analysis of train part

In [6]:
# Relative, Windows-style (backslash-separated) path of the `ddos_balanced` folder
# that the cells below scan for dataset files.
folder_selector='.\\datasets\\kaggle DDoS Dataset\\ddos_balanced'

In [7]:
def serch_csvs_in_folder(path):
    """Recursively collect the paths of the CSV files found under ``path``.

    Parameters
    ----------
    path : str
        Root folder to walk. A non-existent folder yields an empty list,
        because ``os.walk`` silently produces nothing for it.

    Returns
    -------
    list of str
        One ``os.path.join(dirname, filename)`` entry for every file whose
        name ends in ``.csv`` (case-insensitive), in ``os.walk`` order.
    """
    csv_files = []
    for dirname, _, filenames in os.walk(path):
        for filename in filenames:
            # Every returned path is later handed to pd.read_csv, so files that
            # are not CSVs (notes, archives, OS metadata, ...) must be skipped.
            if filename.lower().endswith('.csv'):
                csv_files.append(os.path.join(dirname, filename))
    return csv_files

In [8]:
# List the dataset files found under the currently selected folder; the bare
# name on the last line displays the list as the cell output.
csv_files=serch_csvs_in_folder(folder_selector)
csv_files

['.\\datasets\\kaggle DDoS Dataset\\ddos_balanced\\final_dataset.csv']

In [9]:
# Size on disk of every listed file: bytes divided by 2**30, rendered as text.
bytes_per_unit = 2.0**30
sizes = []
for file in csv_files:
    size_in_units = os.path.getsize(file) / bytes_per_unit
    sizes.append(str(size_in_units) + " GB")
sizes

['6.328099204227328 GB']

In [7]:
# Count how many rows of each class (value of the 'Label' column) the files in
# `csv_files` contain, accumulating the per-chunk counts into one dict.
classes_distribution_test = {}

for ds in csv_files: #for every dataset
    # Only the label column is needed, so only that column is parsed; the
    # callable strips the header names because some carry surrounding whitespace.
    with pd.read_csv(ds, chunksize=10**6,
                     usecols=lambda name: name.strip() == 'Label') as reader: #that has to be processed in chunks
        for chunk in reader:
            chunk.columns = chunk.columns.str.strip()

            labels_count = chunk['Label'].value_counts()

            # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
            for class_, count in labels_count.items():
                classes_distribution_test[class_] = classes_distribution_test.get(class_, 0) + count

classes_distribution_test

{'ddos': 6472647, 'Benign': 6321980}

In [8]:
# Start the down-sampled output file: write only the header row of the source
# dataset to `dest` (mode='w' truncates any previous content).
classes_seen = {}
dest='.\\datasets\\kaggle DDoS Dataset\\small\\final.csv'
# Take the header from the file list explicitly instead of from `ds`, a loop
# variable that only exists if the counting cell above was run beforehand.
# csv_files[-1] is the value that loop leaves in `ds`.
header_only=pd.read_csv(csv_files[-1], nrows=0)
header_only.to_csv(dest, mode='w', header=True, index=False)

In [9]:
# Append to `dest` a cleaned, down-sampled copy of the dataset: rows containing
# NaN or +-inf are discarded, and for every label only the first
# MAX_ROWS_PER_CLASS remaining rows (in file order) are written.
# NOTE: rows are appended (mode='a'), so re-running this cell without first
# re-creating `dest` duplicates the data.
MAX_ROWS_PER_CLASS = 500000

# One counter per known label; dict.fromkeys does not require `classes_seen`
# to have been defined by an earlier cell.
classes_seen = dict.fromkeys(classes_distribution_test, 0)
for ds in csv_files: #for every dataset
    with pd.read_csv(ds, chunksize=10**6) as reader: #that has to be processed in chunks
        for chunk in reader:

            #drop na and +-inf values (builds a new frame, the chunk is untouched)
            ch_copy = chunk.replace([np.inf, -np.inf], np.nan).dropna()

            labels = ch_copy['Label'].astype(str)

            # rows of the same label already counted in previous chunks
            seen_before = labels.map(classes_seen)
            # 0-based position of each row among the rows of its label in this chunk
            rank_in_chunk = labels.groupby(labels).cumcount()

            # every cleaned row counts as seen, whether it is kept or not;
            # a label missing from classes_seen raises KeyError here
            for label, count in labels.value_counts().items():
                classes_seen[label] += int(count)

            # a row is kept while its running per-label count stays within the cap
            keep = (seen_before + rank_in_chunk) < MAX_ROWS_PER_CLASS
            ch_copy = ch_copy[keep] #finally drop excess

            #append with no header
            ch_copy.to_csv(dest, mode='a', header=False, index=False)

In [None]:
# NOTE(review): `df` is not defined by any cell visible in this notebook —
# presumably it was loaded in a cell that has since been removed; confirm
# before running on a fresh kernel.
#df.describe()
df.info()
#df.head()

In [None]:
# First record of the frame, shown as a Series indexed by column name.
first_row = df.iloc[0]
first_row

In [None]:
# Column names with surrounding whitespace removed, in alphabetical order.
columns = sorted(df.columns.str.strip())
columns

['ACK Flag Count',
 'Active Max',
 'Active Mean',
 'Active Min',
 'Active Std',
 'Average Packet Size',
 'Avg Bwd Segment Size',
 'Avg Fwd Segment Size',
 'Bwd Avg Bulk Rate',
 'Bwd Avg Bytes/Bulk',
 'Bwd Avg Packets/Bulk',
 'Bwd Header Length',
 'Bwd IAT Max',
 'Bwd IAT Mean',
 'Bwd IAT Min',
 'Bwd IAT Std',
 'Bwd IAT Total',
 'Bwd PSH Flags',
 'Bwd Packet Length Max',
 'Bwd Packet Length Mean',
 'Bwd Packet Length Min',
 'Bwd Packet Length Std',
 'Bwd Packets/s',
 'Bwd URG Flags',
 'CWE Flag Count',
 'Destination Port',
 'Down/Up Ratio',
 'ECE Flag Count',
 'FIN Flag Count',
 'Flow Bytes/s',
 'Flow Duration',
 'Flow IAT Max',
 'Flow IAT Mean',
 'Flow IAT Min',
 'Flow IAT Std',
 'Flow Packets/s',
 'Fwd Avg Bulk Rate',
 'Fwd Avg Bytes/Bulk',
 'Fwd Avg Packets/Bulk',
 'Fwd Header Length',
 'Fwd Header Length.1',
 'Fwd IAT Max',
 'Fwd IAT Mean',
 'Fwd IAT Min',
 'Fwd IAT Std',
 'Fwd IAT Total',
 'Fwd PSH Flags',
 'Fwd Packet Length Max',
 'Fwd Packet Length Mean',
 'Fwd Packet Length Min

In [None]:
#df.drop(['Unnamed: 0','Timestamp','Source Port','Source IP','SimillarHTTP',
#'Protocol','Flow ID','Destination IP'], axis=1, inplace=True)

# Remove exact duplicate rows, keeping the first occurrence, then report the size.
df.drop_duplicates(keep='first', inplace=True)
rows_after_dedup = len(df)
print("After dropping duplicates, the length of df:", rows_after_dedup)

After dropping duplicates, the length of df: 2522362


In [None]:
# Turn +-inf into NaN so that a single dropna() discards both kinds of bad rows.
infinite_values = [np.inf, -np.inf]
df.replace(infinite_values, np.nan, inplace=True)
df.dropna(inplace=True)
rows_after_dropna = len(df)
print("After dropping NaNs, the length of df:", rows_after_dropna)

After dropping NaNs, the length of df: 2520798


In [None]:
# Disabled step: the code is wrapped in a string literal, so nothing below is
# executed. When enabled it drops every column holding a single unique value.
"""for col in df.columns:
    if len(df[col].unique()) == 1:
        print('Removing column', str(col))
        df.drop(col,inplace=True,axis=1)"""

Removing column Bwd PSH Flags
Removing column Bwd URG Flags
Removing column Fwd Avg Bytes/Bulk
Removing column Fwd Avg Packets/Bulk
Removing column Fwd Avg Bulk Rate
Removing column Bwd Avg Bytes/Bulk
Removing column Bwd Avg Packets/Bulk
Removing column Bwd Avg Bulk Rate


---
## Analyzing train dataset part

In [None]:
# Remove the `df` name from the namespace before moving on to the next dataset.
del df

In [None]:
# Re-point the folder selector to the `train` folder of CICDDoS2019.
folder_selector='.\\datasets\\CICDDoS2019\\original\\train'

In [None]:
# Replace the file list with the files found under the newly selected folder.
csv_files = serch_csvs_in_folder(folder_selector)
csv_files

['.\\datasets\\CICDDoS2019\\original\\train\\DrDoS_DNS.csv',
 '.\\datasets\\CICDDoS2019\\original\\train\\DrDoS_LDAP.csv',
 '.\\datasets\\CICDDoS2019\\original\\train\\DrDoS_MSSQL.csv',
 '.\\datasets\\CICDDoS2019\\original\\train\\DrDoS_NetBIOS.csv',
 '.\\datasets\\CICDDoS2019\\original\\train\\DrDoS_NTP.csv',
 '.\\datasets\\CICDDoS2019\\original\\train\\DrDoS_SNMP.csv',
 '.\\datasets\\CICDDoS2019\\original\\train\\DrDoS_SSDP.csv',
 '.\\datasets\\CICDDoS2019\\original\\train\\DrDoS_UDP.csv',
 '.\\datasets\\CICDDoS2019\\original\\train\\Syn.csv',
 '.\\datasets\\CICDDoS2019\\original\\train\\TFTP.csv',
 '.\\datasets\\CICDDoS2019\\original\\train\\UDPLag.csv']

In [None]:
# Count how many rows of each class (value of the 'Label' column) the files in
# `csv_files` contain, accumulating the per-chunk counts into one dict.
classes_distribution_train = {}

for ds in csv_files: #for every dataset
    # Only the label column is needed, so only that column is parsed; the
    # callable strips the header names because some carry surrounding whitespace.
    with pd.read_csv(ds, chunksize=10**6,
                     usecols=lambda name: name.strip() == 'Label') as reader: #that has to be processed in chunks
        for chunk in reader:
            chunk.columns = chunk.columns.str.strip()

            labels_count = chunk['Label'].value_counts()

            # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
            for class_, count in labels_count.items():
                classes_distribution_train[class_] = classes_distribution_train.get(class_, 0) + count

classes_distribution_train

  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:


KeyboardInterrupt: 

In [None]:
dict_={}

for k,v in classes_distribution_train.items():
    dict_[k.replace('DrDoS_', '')] = classes_distribution_train[k]

dict_['UDPLag'] = dict_.pop('UDP-lag')

classes_distribution_train = dict_

%store classes_distribution_train

Stored 'classes_distribution_train' (dict)


In [None]:
%store -r classes_distribution_train
plt.bar(classes_distribution_train.keys(), classes_distribution_train.values())
plt.show()

---
## Analyzing test dataset part

In [None]:
# Re-point the folder selector to the `test` folder of CICDDoS2019.
folder_selector='.\\datasets\\CICDDoS2019\\original\\test'

In [None]:
# Replace the file list with the files found under the newly selected folder.
csv_files = serch_csvs_in_folder(folder_selector)
csv_files

['.\\datasets\\CICDDoS2019\\original\\test\\LDAP.csv',
 '.\\datasets\\CICDDoS2019\\original\\test\\MSSQL.csv',
 '.\\datasets\\CICDDoS2019\\original\\test\\NetBIOS.csv',
 '.\\datasets\\CICDDoS2019\\original\\test\\Portmap.csv',
 '.\\datasets\\CICDDoS2019\\original\\test\\Syn.csv',
 '.\\datasets\\CICDDoS2019\\original\\test\\UDP.csv',
 '.\\datasets\\CICDDoS2019\\original\\test\\UDPLag.csv']

In [None]:
# Count how many rows of each class (value of the 'Label' column) the files in
# `csv_files` contain, accumulating the per-chunk counts into one dict.
classes_distribution_test = {}

for ds in csv_files: #for every dataset
    # Only the label column is needed, so only that column is parsed; the
    # callable strips the header names because some carry surrounding whitespace.
    with pd.read_csv(ds, chunksize=10**6,
                     usecols=lambda name: name.strip() == 'Label') as reader: #that has to be processed in chunks
        for chunk in reader:
            chunk.columns = chunk.columns.str.strip()

            labels_count = chunk['Label'].value_counts()

            # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
            for class_, count in labels_count.items():
                classes_distribution_test[class_] = classes_distribution_test.get(class_, 0) + count

classes_distribution_test

  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:
  for chunk in reader:


{'LDAP': 1915122,
 'NetBIOS': 3657497,
 'BENIGN': 56965,
 'MSSQL': 5787453,
 'Portmap': 186960,
 'Syn': 4891500,
 'UDP': 3867155,
 'UDPLag': 1873}

In [None]:
# Persist the dict with IPython's %store magic so `%store -r` can restore it later.
%store classes_distribution_test

Stored 'classes_distribution_test' (dict)


In [None]:
%store -r classes_distribution_test
plt.bar(classes_distribution_test.keys(), classes_distribution_test.values())
plt.show()

___
## Maintain all data from the original dataset

In [20]:
# Source file and destination of the refined copy.
original='.\\datasets\\kaggle DDoS Dataset\\ddos_balanced\\final_dataset.csv'
dest='.\\datasets\\kaggle DDoS Dataset\\refined_total\\refined.csv'

# Read the header alone (nrows=0), remove the columns that are not kept, and
# start the destination file with that header (mode='w' truncates it).
discarded_columns = ['Unnamed: 0','Timestamp','Src IP','Flow ID','Dst IP']
header_only = pd.read_csv(original, nrows=0).drop(columns=discarded_columns)
header_only.to_csv(dest, mode='w', header=True, index=False)

In [27]:
# Show the names of the retained columns, then how many there are.
kept_columns = header_only.columns.to_list()
print(kept_columns)
print(len(kept_columns))

['Src Port', 'Dst Port', 'Protocol', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg', 'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Bwd Byt

In [22]:
# Append to `dest` a cleaned copy of the original dataset, chunk by chunk.
# The source is `original` (the file whose header was written to `dest`), not
# `csv_files`: when the notebook runs top to bottom, `csv_files` still lists
# the files of the last selected folder, and the columns dropped below would
# not be found in them.
# NOTE: rows are appended (mode='a'), so re-running this cell without first
# re-creating `dest` duplicates the data.
with pd.read_csv(original, chunksize=10**6) as reader: #has to be processed in chunks
    for chunk in reader:

        ch_copy = chunk.copy() #duplicate chunk to edit it

        ch_copy.columns = ch_copy.columns.str.strip()

        #drop na and +-inf values (before removing columns, as NaNs in the
        #columns removed below also discard the row)
        ch_copy = ch_copy.replace([np.inf, -np.inf], np.nan).dropna()

        #drop useless cols
        ch_copy = ch_copy.drop(columns=['Unnamed: 0','Timestamp','Src IP',
                                        'Flow ID','Dst IP'])

        #append with no header
        ch_copy.to_csv(dest, mode='a', header=False, index=False)

In [23]:
# Size on disk of the refined file (bytes divided by 2**30). Only `dest` is
# measured, so the list has exactly one entry regardless of how many files
# `csv_files` currently holds.
sizes = [str(os.path.getsize(dest)/(2.0**30)) + " GB"]
sizes

['5.202487978152931 GB']

In [25]:
# Count how many rows of each class (value of the 'Label' column) the original
# Kaggle file contains. `original` is read directly rather than `csv_files`,
# which holds the files of whichever folder was selected last.
classes_distribution_kaggle_bal = {}

# Only the label column is needed, so only that column is parsed; the callable
# strips the header names in case they carry surrounding whitespace.
with pd.read_csv(original, chunksize=10**6,
                 usecols=lambda name: name.strip() == 'Label') as reader: #has to be processed in chunks
    for chunk in reader:
        chunk.columns = chunk.columns.str.strip()

        labels_count = chunk['Label'].value_counts()

        for class_, count in labels_count.items():
            classes_distribution_kaggle_bal[class_] = classes_distribution_kaggle_bal.get(class_, 0) + count

classes_distribution_kaggle_bal

  for class_,count in labels_count.iteritems():
  for class_,count in labels_count.iteritems():
  for class_,count in labels_count.iteritems():
  for class_,count in labels_count.iteritems():
  for class_,count in labels_count.iteritems():
  for class_,count in labels_count.iteritems():
  for class_,count in labels_count.iteritems():
  for class_,count in labels_count.iteritems():
  for class_,count in labels_count.iteritems():
  for class_,count in labels_count.iteritems():
  for class_,count in labels_count.iteritems():
  for class_,count in labels_count.iteritems():
  for class_,count in labels_count.iteritems():


{'ddos': 6472647, 'Benign': 6321980}

In [26]:
# Persist the dict with IPython's %store magic so `%store -r` can restore it later.
%store classes_distribution_kaggle_bal

Stored 'classes_distribution_kaggle_bal' (dict)
