In [1]:
# All  import statements needed for the notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.metrics import *

In [2]:
import tensorflow as tf
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Read the dataset from the mounted Drive and discard its first column (by position).
data = pd.read_csv('/content/drive/MyDrive/balancedhalf_data.csv')
data = data.drop(columns=data.columns[0])

# Rows carrying one of these labels are moved out of `data` into `att`.
unknown = ['SSH-Patator','DoS slowloris','DoS Slowhttptest','Bot','Infiltration','Heartbleed']
att = data[data['Label'].isin(unknown)]

# Everything that went into `att` is removed from `data`.
data = data.drop(index=att.index, errors='ignore')

In [5]:
## PreProcessing

# importing required libraries for normalizing data
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
# Names of every numeric column in `data`; these are the columns handed to normalization().
numeric_col = data.select_dtypes(include='number').columns

# Despite the variable name this is a MinMaxScaler, not a StandardScaler:
# with its default settings it rescales each fitted column onto the range [0, 1].
std_scaler = MinMaxScaler()
def normalization(df,att,col):
  """Min-max scale the columns `col` of `df` and apply the same scaling to `att`.

  The module-level `std_scaler` is fitted exactly once, on `df[col]`, and that
  fitted scaler is then used to transform `att[col]`. `att` is therefore scaled
  with parameters learned from `df` only, and after the call `std_scaler` still
  holds the parameters of every column in `col`, so it can be reused on new data.

  Args:
    df: DataFrame the scaler is fitted on; its `col` columns are overwritten.
    att: DataFrame scaled with the parameters learned from `df`; its `col`
      columns are overwritten. May be empty, in which case it is left untouched.
    col: iterable of column names present in both frames.

  Returns:
    The tuple `(df, att)` -- the same two objects that were passed in.
  """
  col = list(col)
  # Nothing to scale: leave both frames (and the scaler) untouched.
  if not col:
    return df, att
  # A single fit over all columns; MinMaxScaler scales each column independently,
  # so the values match a column-by-column fit.
  df[col] = std_scaler.fit_transform(df[col])
  # transform() rejects zero-row input, so an empty `att` is skipped.
  if len(att):
    att[col] = std_scaler.transform(att[col])
  return df,att
# Scale copies of both frames; normalization() fits on the first frame only.
data, att = normalization(data.copy(), att.copy(), numeric_col)

# Collapse every label in the held-out frame into the single class 'unknown'.
att['Label'] = 'unknown'

# Model inputs are shaped (n_samples, n_features, 1). The trailing axis is appended
# with expand_dims so the feature count comes from the frame itself instead of a
# hard-coded 83 (a reshape with a wrong constant can silently mix up rows).
X = np.expand_dims(att.drop(columns='Label').to_numpy(), axis=-1)
y = att['Label']

X_train = np.expand_dims(data.drop(columns='Label').to_numpy(), axis=-1)
y_train = data['Label']

In [6]:
# **Load Model and Predict**
# Only the loading happens in this cell; model.predict() is called in later cells.
# Path, on the mounted Drive, of the saved Keras model file.
MODEL = '/content/drive/MyDrive/models/model_split_softmax_cnn_model.hdf5'
from tensorflow import keras
# Restore the saved model from MODEL.
model = keras.models.load_model(MODEL)

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the known-class samples; random_state=42 makes the split repeatable.
# The *_train_set part is used further down to derive the per-class thresholds,
# the *_test_set part to evaluate them.
# NOTE(review): no `stratify=` is passed, so class proportions may differ between the parts.
X_train_set, X_test_set, y_train_set, y_test_set = train_test_split(X_train, y_train, test_size=0.33, random_state=42)

In [8]:
Y_train_predicted = model.predict(X_train_set)



In [9]:
# Class names in the order used to name the columns of the model's prediction array;
# an argmax index i over that array is read as LABELS[i].
LABELS = ['BENIGN','DDoS','DoS GoldenEye','DoS Hulk','FTP-Patator','PortScan']

In [10]:
y_prediction_index = np.argmax(Y_train_predicted, axis = 1)

In [11]:
df = pd.DataFrame(data=Y_train_predicted,columns=LABELS)

In [12]:
# Class name -> integer class index; used to turn string labels into indices.
labeltoindex = dict([
    ('BENIGN', 0),
    ('DDoS', 1),
    ('DoS Hulk', 3),
    ('DoS GoldenEye', 2),
    ('PortScan', 5),
    ('FTP-Patator', 4),
])

In [13]:
np.vectorize(labeltoindex.get)(y_train_set)

array([5, 0, 3, ..., 0, 1, 0])

In [14]:
# Boolean mask over the training split: True where the predicted class index equals
# the index of the true label. np.vectorize applies labeltoindex.get element-wise to
# translate the label strings into indices before the comparison.
TRUELY_PREDICTED = np.vectorize(labeltoindex.get)(y_train_set) == y_prediction_index

In [15]:
pd.Series(TRUELY_PREDICTED).value_counts()

True     731027
False       430
dtype: int64

In [16]:
df.head()

Unnamed: 0,BENIGN,DDoS,DoS GoldenEye,DoS Hulk,FTP-Patator,PortScan
0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0
3,2.4047529999999997e-26,0.0,4.0527199999999996e-36,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Keep the predicted class index for correctly classified rows and mark every other
# row with the empty string; the '' marker is what a later cell filters on.
# NOTE(review): mixing integers with '' presumably makes np.where return strings, which
# would explain why the column is `object` dtype until it is cast back to int -- confirm.
df['INDEX_WIN'] = np.where( TRUELY_PREDICTED == True , y_prediction_index, '')

In [18]:
df.shape

(731457, 7)

In [19]:
# Keep only the correctly classified rows (the others carry '' in INDEX_WIN).
# The explicit .copy() makes `df` an independent frame, so column assignments made
# on it afterwards do not act on a slice of the unfiltered frame
# (this avoids pandas' SettingWithCopyWarning).
df = df[df['INDEX_WIN'] != ''].copy()

In [20]:
df.shape

(731027, 7)

In [21]:
df['INDEX_WIN'].value_counts()

0    371886
3    154828
5    106584
1     85603
2      6822
4      5304
Name: INDEX_WIN, dtype: int64

In [22]:
df.dtypes

BENIGN           float32
DDoS             float32
DoS GoldenEye    float32
DoS Hulk         float32
FTP-Patator      float32
PortScan         float32
INDEX_WIN         object
dtype: object

In [23]:
# Convert the winning-class column to integers so it can be used as a list index.
df['INDEX_WIN'] = df['INDEX_WIN'].astype('int')

In [24]:
df.dtypes

BENIGN           float32
DDoS             float32
DoS GoldenEye    float32
DoS Hulk         float32
FTP-Patator      float32
PortScan         float32
INDEX_WIN          int64
dtype: object

In [25]:
# Filled in by to_do(): class name -> smallest probability recorded for that class.
threshold_values = {}

def to_do(df):
  """Record and return the minimum probability of one group's winning class.

  The class is identified by the first `INDEX_WIN` value of `df`; the smallest
  value in that class's probability column is stored in the module-level
  `threshold_values` dict under the class name and also returned.

  Args:
    df: frame with one probability column per class name plus an integer
      `INDEX_WIN` column (one group of a groupby on `INDEX_WIN`).

  Returns:
    The minimum of the winning class's probability column within `df`.
  """
  class_names = ['BENIGN','DDoS','DoS GoldenEye','DoS Hulk','FTP-Patator','PortScan']
  winner = class_names[df['INDEX_WIN'].iloc[0]]
  lowest = df[winner].min()
  threshold_values[winner] = lowest
  return lowest

In [26]:
df.groupby(['INDEX_WIN'], group_keys=True).apply(to_do)

INDEX_WIN
0    0.376277
1    1.000000
2    0.975764
3    0.700373
4    1.000000
5    0.910532
dtype: float32

In [27]:
threshold_values

{'BENIGN': 0.37627712,
 'DDoS': 1.0,
 'DoS GoldenEye': 0.97576433,
 'DoS Hulk': 0.7003728,
 'FTP-Patator': 0.9999999,
 'PortScan': 0.9105322}

## Recompute the thresholds with an explicit row-by-row loop to verify the values above

In [28]:
# Starting values for the row-by-row check: every class begins at 1 and is lowered
# whenever a row with a smaller winning probability is found.
# NOTE(review): 1 is presumably used as the upper bound of the model's output
# probabilities -- confirm.
Threshold = {'BENIGN': 1,
 'DDoS': 1.0,
 'DoS GoldenEye': 1,
 'DoS Hulk': 1,
 'FTP-Patator': 1,
 'PortScan': 1}

In [29]:
LABELS = ['BENIGN','DDoS','DoS GoldenEye','DoS Hulk','FTP-Patator','PortScan']
# Lower each class's entry in Threshold to the smallest winning probability found.
# The frame is converted to one array up front: calling df.iloc[i] for every row
# builds a new Series per row and is very slow on a frame of this size, while the
# rows of df.to_numpy() carry the same values.
for row in df.to_numpy():
  # The last column is INDEX_WIN, i.e. the position of the winning class.
  index = int(row[-1])
  value = row[index]
  label = LABELS[index]
  if Threshold[label] > value:
    Threshold[label] = value

In [30]:
arr = df.iloc[1].to_numpy()
arr[-1]

0.0

In [31]:
Threshold

{'BENIGN': 0.3762771189212799,
 'DDoS': 1.0,
 'DoS GoldenEye': 0.9757643342018127,
 'DoS Hulk': 0.7003728151321411,
 'FTP-Patator': 0.9999998807907104,
 'PortScan': 0.9105321764945984}

## Evaluation

In [32]:
y_test_set

1002165    PortScan
1084508    PortScan
1062881    PortScan
86053        BENIGN
803511     DoS Hulk
             ...   
288561       BENIGN
638651         DDoS
654869         DDoS
1065285    PortScan
770726     DoS Hulk
Name: Label, Length: 360270, dtype: object

In [33]:
labeltoindex

{'BENIGN': 0,
 'DDoS': 1,
 'DoS Hulk': 3,
 'DoS GoldenEye': 2,
 'PortScan': 5,
 'FTP-Patator': 4}

In [34]:
## only X_test_set
# Class probabilities for the held-out known-class samples.
Y_test_predicted = model.predict(X_test_set)
total = len(X_test_set)
# Highest probability per sample and the class index it belongs to.
max_probability_test = np.amax(Y_test_predicted, axis = 1)
y_test_prediction_index = np.argmax(Y_test_predicted, axis = 1)
# True labels translated to class indices, then compared with the predictions.
y_test_set_index = np.vectorize(labeltoindex.get)(y_test_set)
RIGHT_BOOLEAN = y_test_prediction_index == y_test_set_index
# Positional array of the true labels. np.asarray (rather than .to_numpy()) keeps
# this cell re-runnable: on a second run y_test_set is already an ndarray, which
# has no .to_numpy() method.
y_test_set = np.asarray(y_test_set)



In [42]:
# Count the test samples that are classified correctly AND whose winning
# probability reaches the threshold recorded for their class.
TruelyPredicted_test = 0
for is_correct, confidence, true_label in zip(RIGHT_BOOLEAN, max_probability_test, y_test_set):
  if not is_correct:
    continue
  if confidence >= threshold_values[true_label]:
    TruelyPredicted_test += 1

# Share of the test split that passes both checks.
Accuracy = TruelyPredicted_test/total
print('Accuracy is ',Accuracy)

Accuracy is  0.9993810197907125


In [43]:
## unknown set
# Shape the held-out rows as (n_samples, n_features, 1). The trailing axis is appended
# with expand_dims so the feature count comes from the frame itself instead of a
# hard-coded 83 (a reshape with a wrong constant can silently mix up rows).
X = np.expand_dims(att.drop(columns='Label').to_numpy(), axis=-1)
predicted = model.predict(X)
# Highest probability per sample and the class index it belongs to.
max_probability = np.amax(predicted, axis = 1)
unknown_index = np.argmax(predicted, axis = 1)




In [45]:
# Number of rows in the unknown set.
TOTAL = len(att)

# truepredicted counts samples whose winning probability stays below the threshold
# of the class the model picked; predictions counts, per picked class, the samples
# that are not below it.
truepredicted = 0
predictions = dict.fromkeys(
    ['BENIGN', 'DDoS', 'DoS GoldenEye', 'DoS Hulk', 'FTP-Patator', 'PortScan'], 0)

for class_idx, confidence in zip(unknown_index, max_probability):
  claimed_label = LABELS[class_idx]
  threshold = threshold_values[claimed_label]
  if confidence < threshold:
    truepredicted += 1
  else:
    predictions[claimed_label] += 1

# Share of the unknown set that falls below the thresholds.
Accuracy = truepredicted/TOTAL
print('Accuracy is ',Accuracy)

Accuracy is  0.14136943504295757


In [38]:
truepredicted

13009

In [39]:
TOTAL

19205

In [40]:
threshold_values

{'BENIGN': 0.37627712,
 'DDoS': 1.0,
 'DoS GoldenEye': 0.97576433,
 'DoS Hulk': 0.7003728,
 'FTP-Patator': 0.9999999,
 'PortScan': 0.9105322}

In [46]:
predictions

{'BENIGN': 6220,
 'DDoS': 0,
 'DoS GoldenEye': 3128,
 'DoS Hulk': 7041,
 'FTP-Patator': 101,
 'PortScan': 0}