In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout,Activation

from keras.layers.convolutional import Conv1D
from keras import backend as K
from keras.layers.core import Lambda
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding

import keras
from keras.utils import np_utils

In [4]:
from sklearn.feature_selection import VarianceThreshold

In [5]:
dataset = pd.read_csv('Usman_test_train_IoT/Combined_Data_IoT.csv')

In [6]:
dataset.columns

Index(['ts', 'date', 'time', 'fridge_temperature', 'temp_condition', 'label',
       'type', 'door_state', 'sphone_signal', 'latitude', 'longitude',
       'FC1_Read_Input_Register', 'FC2_Read_Discrete_Value',
       'FC3_Read_Holding_Register', 'FC4_Read_Coil', 'motion_status',
       'light_status', 'current_temperature', 'thermostat_status',
       'temperature', 'pressure', 'humidity'],
      dtype='object')

In [7]:
Data = dataset.drop(['ts', 'date', 'time'], axis =1)

In [8]:
Data['label'].value_counts()

0    245000
1    156119
Name: label, dtype: int64

In [9]:
Data['type'].value_counts()

normal        245000
backdoor       35000
injection      35000
password       35000
ddos           25000
ransomware     16030
xss             6116
scanning        3973
Name: type, dtype: int64

In [10]:
Data.isna().sum().sum()

0

In [11]:
Data.head()

Unnamed: 0,fridge_temperature,temp_condition,label,type,door_state,sphone_signal,latitude,longitude,FC1_Read_Input_Register,FC2_Read_Discrete_Value,FC3_Read_Holding_Register,FC4_Read_Coil,motion_status,light_status,current_temperature,thermostat_status,temperature,pressure,humidity
0,9.0,1,1,ddos,0,0,4.514077,14.421946,32450,32708,32035,32728,0,0,28.442693,1,35.773605,1.035,46.343618
1,9.25,1,1,ddos,0,0,4.514077,14.421946,32450,32708,32035,32728,0,0,28.442693,1,35.773605,1.035,46.343618
2,12.65,1,1,ddos,0,0,4.514077,14.421946,32450,32708,32035,32728,0,0,28.442693,1,35.773605,1.035,46.343618
3,4.65,0,1,ddos,0,0,4.514077,14.421946,32450,32708,32035,32728,0,0,28.442693,1,35.773605,1.035,46.343618
4,12.65,1,1,ddos,0,0,4.514077,14.421946,32450,32708,32035,32728,0,0,28.442693,1,35.773605,1.035,46.343618


# Model Development (Binary)

In [12]:
# from ctgan import CTGANSynthesizer
# ctgan = CTGANSynthesizer(epochs=10)
# ctgan.fit(Data.drop(['type'], axis = 1), ['label'])

In [13]:
# GAN_IOT = ctgan.sample(600000)
# GAN_IOT

In [14]:
# GAN_IOT.to_csv("GAN_IOT_10Epoches.csv", index=False)

In [15]:
samples = pd.read_csv('GAN_IOT_10Epoches.csv')

In [16]:
samples['label'].value_counts()

0    427817
1    172183
Name: label, dtype: int64

In [17]:
X_G = samples.drop(['label'], axis = 1)
X_G.shape

(600000, 17)

In [18]:
y_G = samples['label']
y_G = y_G.values
y_G.shape

(600000,)

In [19]:
sel = VarianceThreshold(threshold=(.8 * (1 - .8))) #don't change these values
X_T = sel.fit(X_G)

In [20]:
sel_cols = X_T.get_support(indices=True)
X_VT = X_G.iloc[:,sel_cols]
X_VT

Unnamed: 0,fridge_temperature,temp_condition,latitude,longitude,FC1_Read_Input_Register,FC2_Read_Discrete_Value,FC3_Read_Holding_Register,FC4_Read_Coil,current_temperature,thermostat_status,temperature,pressure,humidity
0,10.041658,0,4.550903,14.438127,32454,32713,32040,32728,28.442293,1,35.770101,1.035372,46.348176
1,6.699955,1,4.531272,75.202959,32451,32705,32039,32724,28.737284,0,35.769628,1.035715,46.355070
2,6.700362,1,4.517152,14.403387,32454,32703,32044,32721,28.442653,0,35.771778,1.035797,46.359988
3,6.702047,0,4.524009,14.439772,32451,32703,32043,32726,28.442530,0,35.771197,1.034263,46.343268
4,6.701300,1,4.526589,14.442005,32449,32707,32038,32725,27.159283,0,35.772592,1.036184,38.912467
...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,6.701781,0,4.497448,14.400577,32454,32707,32034,32726,28.442123,0,35.771638,1.035189,46.349511
599996,6.700457,0,4.511919,14.457472,55798,3071,24817,26585,28.441697,1,35.770840,1.035442,46.345211
599997,6.700144,0,1.381538,10.911842,32451,32705,32041,32719,28.442684,1,35.772983,1.035264,46.349470
599998,6.700591,0,4.525477,14.401991,32448,32709,32038,32717,28.442219,1,35.772159,1.035277,46.350916


In [21]:
X_VT.shape

(600000, 13)

In [22]:
scaler = MinMaxScaler()
X_M = scaler.fit_transform(X_VT)
print(X_M)

[[0.70550429 0.         0.03402255 ... 0.53001883 0.54297187 0.43603677]
 [0.44952356 1.         0.03388342 ... 0.53000224 0.54300734 0.4361097 ]
 [0.44955469 1.         0.03378335 ... 0.53007757 0.54301577 0.43616172]
 ...
 [0.44953801 0.         0.01156011 ... 0.53011979 0.54296081 0.43605046]
 [0.44957226 0.         0.03384235 ... 0.53009091 0.54296213 0.43606575]
 [0.44962436 0.         0.03375006 ... 0.41070494 0.63669728 0.38079804]]


In [23]:
# (Smote -> varience threshold -> min max scaler)
trainX, testX, trainY, testY = train_test_split(X_M, y_G, test_size = 0.30, random_state = 5)

In [24]:
print(X_M.shape)
print(y_G.shape)
print(trainX.shape)
print(trainY.shape)
print(testX.shape)
print(testY.shape)

(600000, 13)
(600000,)
(420000, 13)
(420000,)
(180000, 13)
(180000,)


### LR

In [29]:
#Test Train split
lr = LogisticRegression() #for binary
lr.fit(trainX, trainY)
y_predict = lr.predict(testX)
print(accuracy_score(testY, y_predict))
print(classification_report(testY, y_predict))

0.7222277777777778
              precision    recall  f1-score   support

           0       0.73      0.97      0.83    128414
           1       0.58      0.11      0.19     51586

    accuracy                           0.72    180000
   macro avg       0.65      0.54      0.51    180000
weighted avg       0.69      0.72      0.65    180000



### LDA

In [30]:
#Test Train split
lda = LinearDiscriminantAnalysis()
lda.fit(trainX, trainY)
y_predict = lda.predict(testX)
print(accuracy_score(testY, y_predict))
print(classification_report(testY, y_predict))

0.7218222222222223
              precision    recall  f1-score   support

           0       0.73      0.97      0.83    128414
           1       0.57      0.12      0.19     51586

    accuracy                           0.72    180000
   macro avg       0.65      0.54      0.51    180000
weighted avg       0.69      0.72      0.65    180000



### KNN

In [None]:
#Test Train split
#default parameters used in base paper (n_neighbors, default=5) (p, default=2 for Euclidean Distance)
knn = KNeighborsClassifier()
knn.fit(trainX, trainY)
y_predict = knn.predict(testX)
print(accuracy_score(testY, y_predict))
print(classification_report(testY, y_predict))

### RF

In [25]:
#Test Train split
rf = RandomForestClassifier(n_estimators=10,criterion='gini')
rf.fit(trainX, trainY)
y_predict1 = rf.predict(testX)
print(accuracy_score(testY, y_predict1))
print(classification_report(testY, y_predict1))

0.7392944444444445
              precision    recall  f1-score   support

           0       0.76      0.93      0.84    128414
           1       0.60      0.27      0.37     51586

    accuracy                           0.74    180000
   macro avg       0.68      0.60      0.60    180000
weighted avg       0.71      0.74      0.70    180000



### DT (CART)

In [32]:
#Test Train Split
dt = DecisionTreeClassifier(criterion='gini')
dt.fit(trainX, trainY)
y_predict1 = dt.predict(testX)
print(accuracy_score(testY, y_predict1))
print(classification_report(testY, y_predict1))

0.6480055555555555
              precision    recall  f1-score   support

           0       0.76      0.74      0.75    128414
           1       0.39      0.41      0.40     51586

    accuracy                           0.65    180000
   macro avg       0.58      0.58      0.58    180000
weighted avg       0.65      0.65      0.65    180000



### NB

In [33]:
#Test Train Split
nb = GaussianNB()
nb.fit(trainX, trainY)
y_predict1 = nb.predict(testX)
print(accuracy_score(testY, y_predict1))
print(classification_report(testY, y_predict1))

0.7011166666666667
              precision    recall  f1-score   support

           0       0.75      0.87      0.81    128414
           1       0.46      0.28      0.35     51586

    accuracy                           0.70    180000
   macro avg       0.61      0.58      0.58    180000
weighted avg       0.67      0.70      0.68    180000



### SVM

In [None]:
#Test Train Split
svclassifier = SVC(kernel='rbf', gamma='auto')
svclassifier.fit(trainX, trainY)
y_predict = svclassifier.predict(testX)
print(accuracy_score(testY, y_predict))
print(classification_report(testY, y_predict))

### LSTM

In [26]:
X_G.shape

(600000, 17)

In [29]:
X_GV = X_G.values

In [30]:
X1 = X_GV.reshape((-1, 1, 17))

In [33]:
X1.shape

(600000, 1, 17)

In [34]:
(trainX, testX, trainY, testY) = train_test_split(X1, y_G, test_size = 0.30, random_state = 5)

In [35]:
# 1 LSTM Layer (input), 3 Dense Hidden Layers
model = Sequential()
model.add(LSTM(17, input_shape=(1, 17), activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(128, activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(100, activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy"])
model.fit(trainX, trainY, epochs=35, batch_size=64, verbose=2, validation_data=(testX, testY))

Epoch 1/35
6563/6563 - 41s - loss: 0.5999 - accuracy: 0.7127 - val_loss: 0.5952 - val_accuracy: 0.7134
Epoch 2/35
6563/6563 - 40s - loss: 0.5984 - accuracy: 0.7129 - val_loss: 0.5953 - val_accuracy: 0.7134
Epoch 3/35
6563/6563 - 33s - loss: 0.5972 - accuracy: 0.7129 - val_loss: 0.5954 - val_accuracy: 0.7134
Epoch 4/35
6563/6563 - 35s - loss: 0.5975 - accuracy: 0.7129 - val_loss: 0.5983 - val_accuracy: 0.7134
Epoch 5/35
6563/6563 - 33s - loss: 0.5971 - accuracy: 0.7129 - val_loss: 0.5960 - val_accuracy: 0.7134
Epoch 6/35
6563/6563 - 29s - loss: 0.5972 - accuracy: 0.7129 - val_loss: 0.5960 - val_accuracy: 0.7134
Epoch 7/35
6563/6563 - 32s - loss: 0.5967 - accuracy: 0.7129 - val_loss: 0.5939 - val_accuracy: 0.7134
Epoch 8/35
6563/6563 - 35s - loss: 0.5964 - accuracy: 0.7129 - val_loss: 0.5945 - val_accuracy: 0.7134
Epoch 9/35


KeyboardInterrupt: 

In [56]:
loss, accuracy = model.evaluate(testX, testY)
print("Loss:" + str(loss))
print("Accuracy:" + str(accuracy))

Loss:0.5779118537902832
Accuracy:0.679796576499939


In [57]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 17)                2380      
_________________________________________________________________
dropout_4 (Dropout)          (None, 17)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               2304      
_________________________________________________________________
dropout_5 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 100)               12900     
_________________________________________________________________
dropout_6 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)               

### CNN (ConvID)

In [36]:
X_G.shape

(600000, 17)

In [37]:
X_GV = X_G.values

In [38]:
X1 = X_GV.reshape((-1, 1, 17))

In [39]:
X1.shape

(600000, 1, 17)

In [40]:
(trainX, testX, trainY, testY) = train_test_split(X1, y_G, test_size = 0.30, random_state = 5)

In [41]:
nb_filter = 250
filter_length = 3

model = Sequential()
model.add(Conv1D(filters=nb_filter, kernel_size=filter_length, padding='same', activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(128, activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(100, activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy"])
model.fit(trainX, trainY, epochs=35, batch_size=64, verbose=2, validation_data=(testX, testY))

Epoch 1/35
6563/6563 - 41s - loss: 0.6038 - accuracy: 0.7116 - val_loss: 0.6011 - val_accuracy: 0.7134
Epoch 2/35
6563/6563 - 41s - loss: 0.6008 - accuracy: 0.7129 - val_loss: 0.5991 - val_accuracy: 0.7134
Epoch 3/35
6563/6563 - 37s - loss: 0.6007 - accuracy: 0.7129 - val_loss: 0.6004 - val_accuracy: 0.7134
Epoch 4/35
6563/6563 - 37s - loss: 0.6007 - accuracy: 0.7129 - val_loss: 0.6029 - val_accuracy: 0.7134
Epoch 5/35


KeyboardInterrupt: 

In [None]:
loss, accuracy = model.evaluate(testX, testY)
print("Loss:" + str(loss))
print("Accuracy:" + str(accuracy))

### Conv-LSTM

In [42]:
X_G.shape

(600000, 17)

In [43]:
X_GV = X_G.values

In [44]:
X1 = X_GV.reshape((-1, 1, 17))

In [45]:
X1.shape

(600000, 1, 17)

In [46]:
(trainX, testX, trainY, testY) = train_test_split(X1, y_G, test_size = 0.30, random_state = 5)

In [47]:
nb_filter = 250
filter_length = 3

model = Sequential()
model.add(Conv1D(filters=nb_filter, kernel_size=filter_length, padding='same', activation='tanh'))
model.add(Dropout(0.2))
model.add(MaxPooling1D(pool_size=1))
model.add(Dropout(0.2))
model.add(LSTM(17))
model.add(Dropout(0.2))
model.add(Dense(128, activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(100, activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy"])
model.fit(trainX, trainY, epochs=35, batch_size=64, verbose=2, validation_data=(testX, testY))

Epoch 1/35
6563/6563 - 61s - loss: 0.5997 - accuracy: 0.7128 - val_loss: 0.5978 - val_accuracy: 0.7134
Epoch 2/35
6563/6563 - 51s - loss: 0.5988 - accuracy: 0.7129 - val_loss: 0.5975 - val_accuracy: 0.7134
Epoch 3/35
6563/6563 - 50s - loss: 0.5980 - accuracy: 0.7129 - val_loss: 0.5965 - val_accuracy: 0.7134
Epoch 4/35


KeyboardInterrupt: 

In [None]:
loss, accuracy = model.evaluate(testX, testY)
print("Loss:" + str(loss))
print("Accuracy:" + str(accuracy))

In [None]:
nb_filter = 250
filter_length = 3

model = Sequential()
model.add(Conv1D(filters=nb_filter, kernel_size=filter_length, padding='same', activation='tanh'))
model.add(MaxPooling1D(pool_size=1))
model.add(Conv1D(filters=nb_filter, kernel_size=filter_length, padding='same', activation='tanh'))
model.add(MaxPooling1D(pool_size=1))
model.add(Activation('softmax'))
model.add(LSTM(17, return_sequences=True))
model.add(LSTM(17, return_sequences=True))
model.add(Dense(128, activation='tanh'))
model.add(Dense(100, activation='tanh'))
model.add(Dense(64, activation='tanh'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy"])
model.fit(trainX, trainY, epochs=500, batch_size=100, verbose=2, validation_data=(testX, testY))

In [None]:
loss, accuracy = model.evaluate(testX, testY)
print("Loss:" + str(loss))
print("Accuracy:" + str(accuracy))

# Model Development (Multi Class)

In [98]:
XM = Data.drop(['label', 'type','latitude','longitude'], axis = 1)
XM.shape

(401119, 15)

In [99]:
yM = Data['type']
yM = yM.values
yM.shape

(401119,)

In [64]:
# from ctgan import CTGANSynthesizer
# ctgan = CTGANSynthesizer(epochs=10)
# ctgan.fit(Data.drop(['label'], axis = 1), ['type'])

In [65]:
# GAN_IOT_M = ctgan.sample(600000)
# GAN_IOT_M

In [51]:
GAN_IOT_M.to_csv("GAN_IOT_M_10Epoches.csv", index=False)

In [52]:
samples_M = pd.read_csv('GAN_IOT_M_10Epoches.csv')

In [53]:
samples_M['type'].value_counts()

backdoor      283393
xss            59076
password       54409
normal         46782
ddos           46552
injection      43116
scanning       33949
ransomware     32723
Name: type, dtype: int64

In [54]:
# XM = samples_M.drop(['type'], axis = 1)
# XM.shape

(600000, 17)

In [55]:
# yM = samples_M['type']
# yM = yM.values
# yM.shape

(600000,)

In [66]:
# from imblearn.over_sampling import SMOTE 
# smote = SMOTE()
# from collections import Counter

In [84]:
# XM_S = Data.drop(['label', 'type'], axis = 1)
# XM_S.shape

In [85]:
# yM_S = Data['type']
# yM_S = yM_S.values
# yM_S.shape

In [86]:
# XM, yM = smote.fit_resample(XM_S,yM_S)

In [87]:
# print("Before Smote :", Counter(yM_S))
# print("After Smote :", Counter(yM))

In [88]:
# XM.shape

In [100]:
sel = VarianceThreshold(threshold=(.8 * (1 - .8))) #don't change these values
X_T = sel.fit(XM)

In [101]:
sel_cols = X_T.get_support(indices=True)
X_VT = XM.iloc[:,sel_cols]
X_VT

Unnamed: 0,fridge_temperature,FC1_Read_Input_Register,FC2_Read_Discrete_Value,FC3_Read_Holding_Register,FC4_Read_Coil,current_temperature,temperature,pressure,humidity
0,9.00,32450,32708,32035,32728,28.442693,35.773605,1.035000,46.343618
1,9.25,32450,32708,32035,32728,28.442693,35.773605,1.035000,46.343618
2,12.65,32450,32708,32035,32728,28.442693,35.773605,1.035000,46.343618
3,4.65,32450,32708,32035,32728,28.442693,35.773605,1.035000,46.343618
4,12.65,32450,32708,32035,32728,28.442693,35.773605,1.035000,46.343618
...,...,...,...,...,...,...,...,...,...
401114,6.70,32450,32708,32035,32728,28.442693,32.799434,2.204924,37.024913
401115,6.70,32450,32708,32035,32728,28.442693,29.453781,-2.030547,90.297894
401116,6.70,32450,32708,32035,32728,28.442693,47.185992,0.872942,37.687701
401117,6.70,32450,32708,32035,32728,28.442693,43.097037,3.168207,93.647950


In [102]:
X_VT.shape

(401119, 9)

In [103]:
scaler = MinMaxScaler()
X_M = scaler.fit_transform(X_VT)
print(X_M)

[[0.61538462 0.49521571 0.49909209 ... 0.51730735 0.53355618 0.46251056]
 [0.63461538 0.49521571 0.49909209 ... 0.51730735 0.53355618 0.46251056]
 [0.89615385 0.49521571 0.49909209 ... 0.51730735 0.53355618 0.46251056]
 ...
 [0.43846154 0.49521571 0.49909209 ... 0.90452247 0.52695586 0.3756012 ]
 [0.43846154 0.49521571 0.49909209 ... 0.76578681 0.62043775 0.93746771]
 [0.43846154 0.49521571 0.49909209 ... 0.40588822 0.58120503 0.3689465 ]]


In [104]:
(trainX, testX, trainY, testY) = train_test_split(X_M, yM, test_size = 0.30, random_state = 5)

### LR

In [105]:
#Test Train split
lr = LogisticRegression(multi_class='ovr') #for multiclass
lr.fit(trainX, trainY)
y_predict = lr.predict(testX)
print(accuracy_score(testY, y_predict))
print(classification_report(testY, y_predict))

0.610474006116208


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

    backdoor       0.00      0.00      0.00     10449
        ddos       1.00      0.00      0.00      7678
   injection       0.00      0.00      0.00     10462
      normal       0.61      1.00      0.76     73454
    password       0.00      0.00      0.00     10490
  ransomware       0.00      0.00      0.00      4743
    scanning       0.00      0.00      0.00      1212
         xss       0.00      0.00      0.00      1848

    accuracy                           0.61    120336
   macro avg       0.20      0.13      0.10    120336
weighted avg       0.44      0.61      0.46    120336



  _warn_prf(average, modifier, msg_start, len(result))


### LDA

In [106]:
#Test Train split
lda = LinearDiscriminantAnalysis()
lda.fit(trainX, trainY)
y_predict = lda.predict(testX)
print(accuracy_score(testY, y_predict))
print(classification_report(testY, y_predict))

0.6130002659220848


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

    backdoor       0.00      0.00      0.00     10449
        ddos       0.38      0.00      0.00      7678
   injection       0.00      0.00      0.00     10462
      normal       0.62      1.00      0.76     73454
    password       0.51      0.03      0.05     10490
  ransomware       0.00      0.00      0.00      4743
    scanning       0.15      0.08      0.11      1212
         xss       0.00      0.00      0.00      1848

    accuracy                           0.61    120336
   macro avg       0.21      0.14      0.12    120336
weighted avg       0.45      0.61      0.47    120336



  _warn_prf(average, modifier, msg_start, len(result))


### KNN

In [44]:
#Test Train split
knn = KNeighborsClassifier()
knn.fit(trainX, trainY)
y_predict = knn.predict(testX)
print(accuracy_score(testY, y_predict))
print(classification_report(testY, y_predict))

0.5679347826086957
              precision    recall  f1-score   support

    backdoor       0.17      0.55      0.26      7034
        ddos       0.61      0.35      0.45      5125
   injection       0.58      0.36      0.44      6950
      normal       0.75      0.70      0.73     48919
    password       0.67      0.26      0.37      7003
  ransomware       0.76      0.26      0.38      3184
    scanning       0.94      0.26      0.40       810
         xss       0.47      0.12      0.18      1199

    accuracy                           0.57     80224
   macro avg       0.62      0.36      0.40     80224
weighted avg       0.67      0.57      0.59     80224



### RF 

In [107]:
#Test Train split
rf = RandomForestClassifier(n_estimators=10,criterion='gini')
rf.fit(trainX, trainY)
y_predict1 = rf.predict(testX)
print(accuracy_score(testY, y_predict1))
print(classification_report(testY, y_predict1))

0.6795472676505784
              precision    recall  f1-score   support

    backdoor       0.68      0.28      0.40     10449
        ddos       0.96      0.19      0.32      7678
   injection       0.70      0.29      0.41     10462
      normal       0.67      0.96      0.79     73454
    password       0.71      0.27      0.39     10490
  ransomware       0.58      0.17      0.26      4743
    scanning       0.94      0.20      0.33      1212
         xss       0.74      0.19      0.30      1848

    accuracy                           0.68    120336
   macro avg       0.75      0.32      0.40    120336
weighted avg       0.70      0.68      0.62    120336



### DT (CART) 

In [108]:
#Test Train split
dt = DecisionTreeClassifier(criterion='gini')
dt.fit(trainX, trainY)
y_predict1 = dt.predict(testX)
print(accuracy_score(testY, y_predict1))
print(classification_report(testY, y_predict1))

0.6761983113947614
              precision    recall  f1-score   support

    backdoor       0.66      0.28      0.39     10449
        ddos       0.94      0.19      0.31      7678
   injection       0.68      0.29      0.40     10462
      normal       0.67      0.95      0.79     73454
    password       0.66      0.27      0.39     10490
  ransomware       0.53      0.17      0.26      4743
    scanning       0.85      0.21      0.34      1212
         xss       0.68      0.21      0.32      1848

    accuracy                           0.68    120336
   macro avg       0.71      0.32      0.40    120336
weighted avg       0.69      0.68      0.62    120336



### NB 

In [47]:
#Test Train split
nb = GaussianNB()
nb.fit(trainX, trainY)
y_predict1 = nb.predict(testX)
print(accuracy_score(testY, y_predict1))
print(classification_report(testY, y_predict1))

0.07077682489030714


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

    backdoor       0.00      0.00      0.00      7034
        ddos       0.09      1.00      0.16      5125
   injection       0.00      0.00      0.00      6950
      normal       0.00      0.00      0.00     48919
    password       0.00      0.00      0.00      7003
  ransomware       0.04      0.14      0.06      3184
    scanning       0.01      0.14      0.02       810
         xss       0.00      0.00      0.00      1199

    accuracy                           0.07     80224
   macro avg       0.02      0.16      0.03     80224
weighted avg       0.01      0.07      0.01     80224



### SVM

In [74]:
#Test Train split
svclassifier = SVC(kernel='rbf', gamma='auto')
svclassifier.fit(trainX, trainY)
y_predict = svclassifier.predict(testX)
print(accuracy_score(testY, y_predict))
print(classification_report(testY, y_predict))

0.6113257877941763


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

    backdoor       0.00      0.00      0.00      7034
        ddos       1.00      0.02      0.05      5125
   injection       0.00      0.00      0.00      6950
      normal       0.61      1.00      0.76     48919
    password       0.00      0.00      0.00      7003
  ransomware       0.00      0.00      0.00      3184
    scanning       0.00      0.00      0.00       810
         xss       0.00      0.00      0.00      1199

    accuracy                           0.61     80224
   macro avg       0.20      0.13      0.10     80224
weighted avg       0.44      0.61      0.47     80224



### LSTM

In [113]:
XM.shape

(401119, 15)

In [115]:
X_MV = XM.values

In [117]:
X1 = X_MV.reshape((-1,1,15))

In [118]:
X1.shape

(401119, 1, 15)

In [119]:
len(np.unique(yM_G))

8

In [122]:
encoder = LabelEncoder()
encoder.fit(yM)
y_E = encoder.transform(yM)

In [123]:
y_E.shape

(401119,)

In [124]:
(trainX, testX, trainY, testY) = train_test_split(X1, y_E, test_size=0.30, random_state = 5)

In [126]:
# 1 LSTM Layer (input), 3 Dense Hidden Layers
model = Sequential()
model.add(LSTM(17, input_shape=(1, 15), activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(128, activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(100, activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(8,activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=["accuracy"])
model.fit(trainX, trainY, epochs=35, batch_size=64, verbose=2, validation_data=(testX, testY))

Epoch 1/35
4388/4388 - 19s - loss: 1.3616 - accuracy: 0.6107 - val_loss: 1.3472 - val_accuracy: 0.6104
Epoch 2/35
4388/4388 - 16s - loss: 1.3518 - accuracy: 0.6110 - val_loss: 1.3466 - val_accuracy: 0.6104
Epoch 3/35
4388/4388 - 16s - loss: 1.3508 - accuracy: 0.6110 - val_loss: 1.3509 - val_accuracy: 0.6104
Epoch 4/35
4388/4388 - 16s - loss: 1.3500 - accuracy: 0.6110 - val_loss: 1.3573 - val_accuracy: 0.6104
Epoch 5/35
4388/4388 - 16s - loss: 1.3500 - accuracy: 0.6110 - val_loss: 1.3473 - val_accuracy: 0.6104
Epoch 6/35
4388/4388 - 16s - loss: 1.3502 - accuracy: 0.6110 - val_loss: 1.3454 - val_accuracy: 0.6104
Epoch 7/35
4388/4388 - 16s - loss: 1.3479 - accuracy: 0.6110 - val_loss: 1.3447 - val_accuracy: 0.6104
Epoch 8/35
4388/4388 - 16s - loss: 1.3462 - accuracy: 0.6110 - val_loss: 1.3497 - val_accuracy: 0.6104
Epoch 9/35
4388/4388 - 16s - loss: 1.3469 - accuracy: 0.6110 - val_loss: 1.3459 - val_accuracy: 0.6104
Epoch 10/35
4388/4388 - 16s - loss: 1.3451 - accuracy: 0.6110 - val_loss:

KeyboardInterrupt: 

In [None]:
loss, accuracy = model.evaluate(testX, testY)
print("Loss:" + str(loss))
print("Accuracy:" + str(accuracy))

### CNN (Conv1D)

In [133]:
X_M.shape

(401119, 17)

In [134]:
X1 = X_M.reshape((-1,1,17))

In [135]:
X1.shape

(401119, 1, 17)

In [136]:
encoder = LabelEncoder()
encoder.fit(yM_G)l
y_E = encoder.transform(yM_G)

In [137]:
(trainX, testX, trainY, testY) = train_test_split(X1, y_E, test_size = 0.30, random_state = 5)

In [138]:
nb_filter = 250
filter_length = 3

model = Sequential()
model.add(Conv1D(filters=nb_filter, kernel_size=filter_length, padding='same', activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(128, activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(100, activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(8,activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=["accuracy"])
model.fit(trainX, trainY, epochs=35, batch_size=64, verbose=2, validation_data=(testX, testY))

Epoch 1/35
5014/5014 - 21s - loss: 1.3224 - accuracy: 0.6129 - val_loss: 1.2808 - val_accuracy: 0.6189
Epoch 2/35


KeyboardInterrupt: 

In [None]:
loss, accuracy = model.evaluate(testX, testY)
print("Loss:" + str(loss))
print("Accuracy:" + str(accuracy))

### Conv-LSTM

In [140]:
X_M.shape

(401119, 17)

In [141]:
X1 = X_M.reshape((-1,1,17))

In [142]:
X1.shape

(401119, 1, 17)

In [143]:
encoder = LabelEncoder()
encoder.fit(yM_G)
y_E = encoder.transform(yM_G)

In [144]:
(trainX, testX, trainY, testY) = train_test_split(X1, y_E, test_size = 0.30, random_state = 5)

In [145]:
nb_filter = 250
filter_length = 3

model = Sequential()
model.add(Conv1D(filters=nb_filter, kernel_size=filter_length, padding='same', activation='tanh'))
model.add(MaxPooling1D(pool_size=1))
model.add(Conv1D(filters=nb_filter, kernel_size=filter_length, padding='same', activation='tanh'))
model.add(MaxPooling1D(pool_size=1))
model.add(Activation('softmax'))
model.add(LSTM(17, return_sequences=True))
model.add(LSTM(17, return_sequences=True))
model.add(Dense(128, activation='tanh'))
model.add(Dense(100, activation='tanh'))
model.add(Dense(64, activation='tanh'))
model.add(Dense(8,activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=["accuracy"])
model.fit(trainX, trainY, epochs=35, batch_size=64, verbose=2, validation_data=(testX, testY))

Epoch 1/35
5014/5014 - 49s - loss: 1.3253 - accuracy: 0.6137 - val_loss: 1.3003 - val_accuracy: 0.6167
Epoch 2/35


KeyboardInterrupt: 

In [None]:
loss, accuracy = model.evaluate(testX, testY)
print("Loss:" + str(loss))
print("Accuracy:" + str(accuracy))