In [34]:
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [35]:
import numpy as np
import pandas as pd
df = pd.read_csv('/content/NF-ToN-IoT.csv', sep=',', encoding='utf-8')
df.shape

(1379274, 14)

In [36]:
df.dtypes

IPV4_SRC_ADDR                  object
L4_SRC_PORT                     int64
IPV4_DST_ADDR                  object
L4_DST_PORT                     int64
PROTOCOL                        int64
L7_PROTO                      float64
IN_BYTES                        int64
OUT_BYTES                       int64
IN_PKTS                         int64
OUT_PKTS                        int64
TCP_FLAGS                       int64
FLOW_DURATION_MILLISECONDS      int64
Label                           int64
Attack                         object
dtype: object

In [37]:
df.Label.value_counts()

Label
1    1108995
0     270279
Name: count, dtype: int64

In [38]:
df.Attack.value_counts()

Attack
injection     468539
ddos          326345
Benign        270279
password      156299
xss            99944
scanning       21467
dos            17717
backdoor       17247
mitm            1295
ransomware       142
Name: count, dtype: int64

In [39]:
df = df.drop(columns=['IPV4_SRC_ADDR', 'IPV4_DST_ADDR'])

In [40]:
from fastai.tabular.all import df_shrink
df = df_shrink(df, obj2cat=False, int2uint=False)

In [41]:
df.dtypes

L4_SRC_PORT                     int32
L4_DST_PORT                     int32
PROTOCOL                         int8
L7_PROTO                      float32
IN_BYTES                        int32
OUT_BYTES                       int32
IN_PKTS                         int32
OUT_PKTS                        int32
TCP_FLAGS                       int16
FLOW_DURATION_MILLISECONDS      int32
Label                            int8
Attack                         object
dtype: object

In [42]:
df.replace([np.inf, -np.inf], np.nan, inplace=True)
print(df.isna().any(axis=1).sum(), "rows with at least one NaN to remove")
df.dropna(inplace=True)

0 rows with at least one NaN to remove


In [43]:
print(df.duplicated().sum(), "fully duplicate rows to remove")
df.drop_duplicates(inplace=True)
df.reset_index(inplace=True, drop=True)

221280 fully duplicate rows to remove


# Model

In [44]:
data = pd.read_csv("/content/EthicalHackingDataset.csv")

In [45]:
data.head()

Unnamed: 0,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Label,Attack
0,63318,443,6,91.0,181,165,2,1,24,327,0,Benign
1,57442,15600,17,0.0,63,0,1,0,0,0,0,Benign
2,57452,15600,17,0.0,63,0,1,0,0,0,0,Benign
3,138,138,17,10.16,472,0,2,0,0,0,0,Benign
4,51989,15600,17,0.0,63,0,1,0,0,0,0,Benign


In [46]:
num_features=[col for col in data.columns if data[col].dtype!='O']
num_features

['L4_SRC_PORT',
 'L4_DST_PORT',
 'PROTOCOL',
 'L7_PROTO',
 'IN_BYTES',
 'OUT_BYTES',
 'IN_PKTS',
 'OUT_PKTS',
 'TCP_FLAGS',
 'FLOW_DURATION_MILLISECONDS',
 'Label']

In [47]:
num_data = data[num_features]
num_data.head()

Unnamed: 0,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Label
0,63318,443,6,91.0,181,165,2,1,24,327,0
1,57442,15600,17,0.0,63,0,1,0,0,0,0
2,57452,15600,17,0.0,63,0,1,0,0,0,0
3,138,138,17,10.16,472,0,2,0,0,0,0
4,51989,15600,17,0.0,63,0,1,0,0,0,0


In [48]:
y=num_data['Label']
X=num_data.drop('Label',axis=1)
cols=X.columns
cols

Index(['L4_SRC_PORT', 'L4_DST_PORT', 'PROTOCOL', 'L7_PROTO', 'IN_BYTES',
       'OUT_BYTES', 'IN_PKTS', 'OUT_PKTS', 'TCP_FLAGS',
       'FLOW_DURATION_MILLISECONDS'],
      dtype='object')

In [49]:
num_data['Label'].unique()

array([0, 1])

In [50]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.8,random_state=0)

In [51]:
#Import Libraries
from sklearn.preprocessing import StandardScaler

In [52]:
#Creating a StandardScaler object to normalize the X train and test set feature data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [53]:
X_train = np.array(X_train)
y_train = np.array(y_train).reshape(-1, 1)

In [54]:
#Make sure both arrays have correct dimensions
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(926395, 10)
(926395, 1)
(231599, 10)
(231599,)


In [55]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import BatchNormalization

In [56]:
# initializing ann
model = Sequential()

# adding the first input layer and the first hidden layer
model.add(Dense(units=32, activation='relu', input_dim=10))

# add hidden layer with dropout
model.add(Dense(units=24, activation='relu'))
model.add(Dropout(0.2))

# add hidden layer with dropout
model.add(Dense(units=11, activation='relu'))
model.add(Dropout(0.2))

# add hidden layer
model.add(Dense(units=8, activation='relu'))

# add an output layer (2 classes in the target variable)
model.add(Dense(units=2, activation='softmax'))

# compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [58]:
model.fit(X_train, y_train, epochs=50, batch_size=30)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x797db3d99c90>

In [59]:
acc = model.evaluate(X_test, y_test)[1]

print(f'Accuracy of model is {acc}')

Accuracy of model is 0.9913557767868042


In [60]:
y_pred = np.argmax(model.predict(X_test), axis=1)
y_pred



array([1, 1, 1, ..., 0, 1, 0])

In [61]:
results_df = pd.DataFrame({
    'Actual Value': y_test,
    'Predicted Value': y_pred
})

# Display the DataFrame
results_df

Unnamed: 0,Actual Value,Predicted Value
241655,1,1
83742,1,1
486752,1,1
309356,1,1
1114565,1,1
...,...,...
177284,1,1
145311,1,1
813227,0,0
470855,1,1


In [62]:
acc = model.evaluate(X_test, y_test)[1]

print(f'Accuracy of model is {acc}')

Accuracy of model is 0.9913557767868042


In [63]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      0.97      0.97     39683
           1       0.99      1.00      0.99    191916

    accuracy                           0.99    231599
   macro avg       0.99      0.98      0.98    231599
weighted avg       0.99      0.99      0.99    231599



In [64]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_35 (Dense)            (None, 32)                352       
                                                                 
 dense_36 (Dense)            (None, 24)                792       
                                                                 
 dropout_14 (Dropout)        (None, 24)                0         
                                                                 
 dense_37 (Dense)            (None, 11)                275       
                                                                 
 dropout_15 (Dropout)        (None, 11)                0         
                                                                 
 dense_38 (Dense)            (None, 8)                 96        
                                                                 
 dense_39 (Dense)            (None, 2)                