In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

import seaborn as sns
import matplotlib.pyplot as plt

Merging two datasets then downscaling them to fit the model

In [2]:
data1 = pd.read_csv('02-20-2018.csv')
data2 = pd.read_csv('02-21-2018.csv')

data = pd.concat([data1, data2])

data.head(5)

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,172.31.69.25-94.231.103.172-22-45498-6,94.231.103.172,45498.0,172.31.69.25,22,6,20/02/2018 08:34:07,888751,11,11,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,8.0.6.4-8.6.0.1-0-0-0,8.6.0.1,0.0,8.0.6.4,0,0,20/02/2018 08:33:22,112642816,3,0,...,0,0.0,0.0,0.0,0.0,56300000.0,7.071068,56300000.0,56300000.0,Benign
2,8.0.6.4-8.6.0.1-0-0-0,8.6.0.1,0.0,8.0.6.4,0,0,20/02/2018 08:36:11,112642712,3,0,...,0,0.0,0.0,0.0,0.0,56300000.0,18.384776,56300000.0,56300000.0,Benign
3,8.0.6.4-8.6.0.1-0-0-0,8.6.0.1,0.0,8.0.6.4,0,0,20/02/2018 08:39:00,112642648,3,0,...,0,0.0,0.0,0.0,0.0,56300000.0,5.656854,56300000.0,56300000.0,Benign
4,8.0.6.4-8.6.0.1-0-0-0,8.6.0.1,0.0,8.0.6.4,0,0,20/02/2018 08:41:49,112642702,3,0,...,0,0.0,0.0,0.0,0.0,56300000.0,65.053824,56300000.0,56300000.0,Benign


In [3]:
data.shape

(8997323, 84)

In [4]:
data['Label'].value_counts()

Label
Benign                    7733390
DDOS attack-HOIC           686012
DDoS attacks-LOIC-HTTP     576191
DDOS attack-LOIC-UDP         1730
Name: count, dtype: int64

In [5]:
attacks = 0
for i in range(1, 4):
    attacks += data['Label'].value_counts()[i]
    
print(attacks)

1263933


  attacks += data['Label'].value_counts()[i]
  attacks += data['Label'].value_counts()[i]
  attacks += data['Label'].value_counts()[i]


In [6]:
benign_samples = data[data['Label'] == 'Benign']
attack_samples = data[data['Label'] != 'Benign']

num_attacks = len(attack_samples)
if num_attacks > 2 * len(benign_samples):
    num_attacks = len(benign_samples) // 2

balanced_benign = benign_samples.sample(n=num_attacks, random_state=42)

data = pd.concat([balanced_benign, attack_samples])

data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [7]:
data.shape

(2527866, 84)

Data Normalisation

In [8]:
label_encoder = LabelEncoder()
data['Label'] = label_encoder.fit_transform(data['Label'])

In [9]:
# Drop columns which cannot contribute to prediction
data = data.drop(columns=['Timestamp','Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol'])

data[['Flow Byts/s', 'Flow Pkts/s']] = np.log1p(data[['Flow Byts/s', 'Flow Pkts/s']])

data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.dropna(inplace=True)

numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
numerical_cols = numerical_cols.difference(['Label'])

scaler = StandardScaler()
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

print(data.columns)

Index(['Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts',
       'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min',
       'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max',
       'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s',
       'Flow Pkts/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
       'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot', 'Bwd IAT Mean',
       'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
       'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Len',
       'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min',
       'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var',
       'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt',
       'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt',
       'Down/Up Ratio', 'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg',
   

In [10]:
data.head(5)

Unnamed: 0,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,1.281942,-0.030638,-0.038609,-0.03191,-0.017441,-0.540471,-0.363191,-0.681175,-0.526686,-0.742814,...,0.427249,-0.057607,-0.048256,-0.064798,-0.044861,2.726943,-0.072133,2.686088,2.752107,3
1,-0.371297,-0.030897,-0.029221,-0.031618,-0.016985,-0.392423,1.532894,0.017622,-0.526686,-0.599357,...,-2.028648,-0.057607,-0.048256,-0.064798,-0.044861,-0.352854,-0.072133,-0.35401,-0.348722,0
2,-0.370788,-0.030379,-0.001057,-0.029373,-0.011353,0.746719,-0.363191,1.344044,1.376072,1.173361,...,0.427249,-0.057607,-0.048256,-0.064798,-0.044861,-0.352854,-0.072133,-0.35401,-0.348722,1
3,-0.371376,-0.030638,-0.038609,-0.03191,-0.017441,-0.540471,-0.363191,-0.681175,-0.526686,-0.742814,...,0.427249,-0.057607,-0.048256,-0.064798,-0.044861,-0.352854,-0.072133,-0.35401,-0.348722,0
4,-0.1567,-0.030119,-0.001057,-0.031124,-0.015937,-0.141565,-0.363191,-0.210457,-0.016015,-0.269406,...,0.427249,-0.057607,-0.048256,-0.064798,-0.044861,-0.352854,-0.072133,-0.35401,-0.348722,0


**Feature Selection**

In [11]:
correlation_matrix = data.corr()

label_correlation = correlation_matrix['Label'].abs()

sorted_correlation = label_correlation.sort_values(ascending=False)

print("Features sorted by their correlation with the Label:")
print(sorted_correlation)

threshold = 0.1
selected_features = sorted_correlation[sorted_correlation > threshold].index

selected_data = data[selected_features]

data = selected_data

print(f"Selected features based on threshold {threshold}:")
print(selected_features)

Features sorted by their correlation with the Label:
Label               1.000000
Flow Pkts/s         0.500718
Fwd Seg Size Avg    0.405727
Fwd Pkt Len Mean    0.405727
Fwd Pkt Len Max     0.345266
                      ...   
Fwd Pkts/b Avg           NaN
Fwd Blk Rate Avg         NaN
Bwd Byts/b Avg           NaN
Bwd Pkts/b Avg           NaN
Bwd Blk Rate Avg         NaN
Name: Label, Length: 77, dtype: float64
Selected features based on threshold 0.1:
Index(['Label', 'Flow Pkts/s', 'Fwd Seg Size Avg', 'Fwd Pkt Len Mean',
       'Fwd Pkt Len Max', 'Fwd Seg Size Min', 'Bwd Pkt Len Min', 'Pkt Len Min',
       'Flow IAT Min', 'Flow Byts/s', 'Fwd IAT Min', 'Flow IAT Mean',
       'Fwd Pkt Len Std', 'Fwd Pkt Len Min', 'Fwd IAT Mean',
       'Init Bwd Win Byts', 'RST Flag Cnt', 'ECE Flag Cnt', 'ACK Flag Cnt',
       'Idle Min', 'Idle Mean', 'Bwd Pkt Len Std', 'Idle Max', 'Bwd IAT Tot',
       'Flow IAT Max', 'Fwd IAT Std', 'Fwd IAT Max', 'SYN Flag Cnt',
       'Fwd PSH Flags', 'Flow IAT Std', '

**Data Splitting**

In [12]:
train, test = train_test_split(data, test_size=0.01, random_state=12)

X_train = train.drop(columns=['Label'])
y_train = train['Label']

X_test = test.drop(columns=['Label'])
y_test = test['Label']

test['Label'].value_counts()

Label
0    12716
1     6717
3     5731
2       18
Name: count, dtype: int64

In [13]:
model = RandomForestClassifier(
    n_estimators=50,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=3,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=-1,
    random_state=None,
    verbose=1,
    warm_start=False,
    ccp_alpha=0.0,
    max_samples=None
)

hyperparameters = {
    'n_estimators': [125]
    # [50, 75, 100, 125, 150]
}

In [14]:
clf = GridSearchCV(
    estimator=model,
    param_grid=hyperparameters,
    cv=2, # 5
    verbose=1,
    n_jobs=-1
)

In [15]:
clf.fit(X=X_train, y=y_train)

Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   17.8s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   17.8s
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:   55.7s finished
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:   55.7s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 125 out of 125 | elapsed:    2.7s finished
[Parallel(n_jobs=8)]: Done 125 out of 125 | elapsed:    2.7s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   17.8s
[Parall

In [16]:
print("Accuracy score on Validation set: \n")
print(clf.best_score_ )
print("---------------")
print("Best performing hyperparameters on Validation set: ")
print(clf.best_params_)
print("---------------")
print(clf.best_estimator_)

Accuracy score on Validation set: 

0.9947774143628325
---------------
Best performing hyperparameters on Validation set: 
{'n_estimators': 125}
---------------
RandomForestClassifier(max_features=3, n_estimators=125, n_jobs=-1, verbose=1)


In [17]:
data['Label'].value_counts()

Label
0    1254267
1     686012
3     576191
2       1730
Name: count, dtype: int64

In [18]:
model = clf.best_estimator_

predictions = model.predict(X_test)

print("Accuracy: ",accuracy_score(y_test, predictions)*100,"%")
print("Precision: ",precision_score(y_test, predictions, average="macro")*100,"%")
print("Recall: ",recall_score(y_test, predictions, average="macro")*100,"%")

Accuracy:  99.50758478278135 %
Precision:  98.18221240634759 %
Recall:  98.28736359548131 %


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 125 out of 125 | elapsed:    0.0s finished


In [19]:
from sklearn.metrics import confusion_matrix
cf_matrix = confusion_matrix(y_test, predictions)
# import seaborn as sns
# sns.heatmap(cf_matrix, annot=True)
cf_matrix

array([[12632,    60,     0,    24],
       [   14,  6699,     0,     4],
       [    0,     0,    17,     1],
       [   12,     8,     1,  5710]])