# Cleaning

In [11]:
import zipfile

# Unpack the zipped NF-ToN-IoT CSV into /content so pandas can read it directly
with zipfile.ZipFile('/content/NF-ToN-IoT.csv.zip', mode='r') as dataset_archive:
    dataset_archive.extractall(path='/content')

In [12]:
import numpy as np # linear algebra
import pandas as pd
# Read the extracted flow records and show the frame's (rows, columns)
df = pd.read_csv(
    '/content/NF-ToN-IoT.csv',
    sep=',',
    encoding='utf-8',
)
df.shape

(1379274, 14)

In [13]:
# Inspect the dtype pandas inferred for every column of the raw frame
df.dtypes

IPV4_SRC_ADDR                  object
L4_SRC_PORT                     int64
IPV4_DST_ADDR                  object
L4_DST_PORT                     int64
PROTOCOL                        int64
L7_PROTO                      float64
IN_BYTES                        int64
OUT_BYTES                       int64
IN_PKTS                         int64
OUT_PKTS                        int64
TCP_FLAGS                       int64
FLOW_DURATION_MILLISECONDS      int64
Label                           int64
Attack                         object
dtype: object

In [14]:
# Count how many flows belong to each attack category
df['Attack'].value_counts()

Attack
injection     468539
ddos          326345
Benign        270279
password      156299
xss            99944
scanning       21467
dos            17717
backdoor       17247
mitm            1295
ransomware       142
Name: count, dtype: int64

In [15]:
# Encode each attack category name as an integer class id (Benign = 0).
# The encoded column is assigned back to `df` instead of calling
# `df['Attack'].replace(..., inplace=True)`: mutating a selected column in place is
# chained assignment, which emits a FutureWarning in pandas 2.x and leaves `df`
# unchanged once Copy-on-Write is enabled.
attack_codes = {
    'Benign': 0,
    'ddos': 1,
    'injection': 2,
    'password': 3,
    'xss': 4,
    'scanning': 5,
    'dos': 6,
    'backdoor': 7,
    'mitm': 8,
    'ransomware': 9,
}
encoded_attack = df['Attack'].map(attack_codes)

# `map` yields NaN for any value missing from the mapping; fail loudly rather than
# letting un-encoded categories slip through to the model.
unmapped_values = df.loc[encoded_attack.isna(), 'Attack'].unique()
if len(unmapped_values) > 0:
    raise ValueError(
        "Unexpected values in 'Attack' (cell already run, or a new category?): "
        f"{list(unmapped_values)}"
    )

df['Attack'] = encoded_attack.astype('int64')


In [16]:
# Re-check the class distribution now that categories are integer codes
df['Attack'].value_counts()

Attack
2    468539
1    326345
0    270279
3    156299
4     99944
5     21467
6     17717
7     17247
8      1295
9       142
Name: count, dtype: int64

In [17]:
# Confirm the dtype of `Attack` after the category names were replaced with integers
df.dtypes

IPV4_SRC_ADDR                  object
L4_SRC_PORT                     int64
IPV4_DST_ADDR                  object
L4_DST_PORT                     int64
PROTOCOL                        int64
L7_PROTO                      float64
IN_BYTES                        int64
OUT_BYTES                       int64
IN_PKTS                         int64
OUT_PKTS                        int64
TCP_FLAGS                       int64
FLOW_DURATION_MILLISECONDS      int64
Label                           int64
Attack                          int64
dtype: object

In [18]:
# Remove the source and destination IP address columns from the frame
df = df.drop(['IPV4_SRC_ADDR', 'IPV4_DST_ADDR'], axis=1)

In [19]:
# Downcast the numeric columns to smaller dtypes; the dtypes listing that follows shows
# int64 -> int32/int16/int8 and float64 -> float32.
# presumably obj2cat=False leaves object columns unconverted and int2uint=False keeps
# integer columns signed — verify against the fastai `df_shrink` documentation
from fastai.tabular.all import df_shrink
df = df_shrink(df, obj2cat=False, int2uint=False)

In [20]:
# Verify the column dtypes produced by the df_shrink call
df.dtypes

L4_SRC_PORT                     int32
L4_DST_PORT                     int32
PROTOCOL                         int8
L7_PROTO                      float32
IN_BYTES                        int32
OUT_BYTES                       int32
IN_PKTS                         int32
OUT_PKTS                        int32
TCP_FLAGS                       int16
FLOW_DURATION_MILLISECONDS      int32
Label                            int8
Attack                           int8
dtype: object

In [21]:
# Treat +/- infinity as missing, report how many rows are affected, then drop them
df = df.replace([np.inf, -np.inf], np.nan)
rows_with_nan = df.isna().any(axis=1).sum()
print(rows_with_nan, "rows with at least one NaN to remove")
df = df.dropna()

0 rows with at least one NaN to remove


In [22]:
# Report and remove rows that are exact copies of an earlier row, then renumber the index
duplicate_row_count = df.duplicated().sum()
print(duplicate_row_count, "fully duplicate rows to remove")
df = df.drop_duplicates().reset_index(drop=True)

221280 fully duplicate rows to remove


In [25]:
# Preview the first rows of the cleaned, de-duplicated frame
df.head()

Unnamed: 0,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Label,Attack
0,63318,443,6,91.0,181,165,2,1,24,327,0,0
1,57442,15600,17,0.0,63,0,1,0,0,0,0,0
2,57452,15600,17,0.0,63,0,1,0,0,0,0,0
3,138,138,17,10.16,472,0,2,0,0,0,0,0
4,51989,15600,17,0.0,63,0,1,0,0,0,0,0


# Model

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
# NOTE(review): with no category or module argument this filter silences every warning
# raised after this point, including deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
# Load the dataset used for modelling.
# NOTE(review): the preview of this frame shows `Attack` as text labels (e.g. 'Benign'),
# whereas the Cleaning section integer-encodes that column, and no visible cell writes
# this file — TODO confirm how EthicalHackingDataset.csv was produced.
data = pd.read_csv("/content/EthicalHackingDataset/EthicalHackingDataset.csv")

In [None]:
# Preview the first rows of the modelling dataset
data.head()

Unnamed: 0,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Label,Attack
0,63318,443,6,91.0,181,165,2,1,24,327,0,Benign
1,57442,15600,17,0.0,63,0,1,0,0,0,0,Benign
2,57452,15600,17,0.0,63,0,1,0,0,0,0,Benign
3,138,138,17,10.16,472,0,2,0,0,0,0,Benign
4,51989,15600,17,0.0,63,0,1,0,0,0,0,Benign


In [None]:
# Collect the name of every column whose dtype is not `object`
# (this list still includes the `Label` target, which is split off later)
num_features = []
for column_name, column_dtype in data.dtypes.items():
    if column_dtype != 'O':
        num_features.append(column_name)
num_features

['L4_SRC_PORT',
 'L4_DST_PORT',
 'PROTOCOL',
 'L7_PROTO',
 'IN_BYTES',
 'OUT_BYTES',
 'IN_PKTS',
 'OUT_PKTS',
 'TCP_FLAGS',
 'FLOW_DURATION_MILLISECONDS',
 'Label']

In [None]:
# Keep only the non-object columns selected in `num_features` and preview the result
num_data = data.loc[:, num_features]
num_data.head()

Unnamed: 0,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Label
0,63318,443,6,91.0,181,165,2,1,24,327,0
1,57442,15600,17,0.0,63,0,1,0,0,0,0
2,57452,15600,17,0.0,63,0,1,0,0,0,0
3,138,138,17,10.16,472,0,2,0,0,0,0
4,51989,15600,17,0.0,63,0,1,0,0,0,0


In [None]:
# Separate the predictors (every column except `Label`) from the binary `Label` target
X = num_data.drop(columns='Label')
y = num_data['Label']

# Remember the feature names for later reference
cols = X.columns
cols

Index(['L4_SRC_PORT', 'L4_DST_PORT', 'PROTOCOL', 'L7_PROTO', 'IN_BYTES',
       'OUT_BYTES', 'IN_PKTS', 'OUT_PKTS', 'TCP_FLAGS',
       'FLOW_DURATION_MILLISECONDS'],
      dtype='object')

**Splitting Dataset**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified on `y` — consider `stratify=y` given the
# class imbalance visible in the classification report.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=0,
)

In [None]:
#Import Libraries
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize the features: learn the scaling statistics from the training rows only,
# then apply that same transformation to both the training and the test rows.
# Note the naming: lowercase x_train/x_test are the unscaled frames, uppercase
# X_train/X_test are the scaled arrays.
scaler = StandardScaler().fit(x_train)
X_train = scaler.transform(x_train)
X_test = scaler.transform(x_test)

In [None]:
# Print the first five rows of the standardized training features
print('X Train: \n' , X_train[:5])

X Train: 
 [[ 4.74375952e-01 -3.81869997e-01  3.02531623e+00 -5.22095922e-01
  -8.31700954e-03 -2.52754266e-02 -6.38225549e-03 -7.13139232e-03
  -2.60446149e+00 -6.01232244e-02]
 [-3.23829856e-02 -3.81869997e-01  3.02531623e+00 -5.22095922e-01
  -8.29670903e-03 -2.52440866e-02 -6.38225549e-03 -7.13139232e-03
  -2.60446149e+00 -6.01077469e-02]
 [-6.08692283e-01 -3.80119145e-01 -3.25483859e-01 -4.65896553e-01
  -5.85049736e-03  3.65920611e-03 -5.07297236e-03 -4.18362913e-03
   5.89598440e-01 -5.95815122e-02]
 [-5.66070552e-01 -3.56579906e-01 -3.25483859e-01  1.89447696e+00
  -8.35253543e-03 -2.58003712e-02 -6.38225549e-03 -7.72094496e-03
  -2.36786446e+00 -6.01232244e-02]
 [ 9.96644803e-01  2.33512873e+00 -3.25483859e-01 -4.65896553e-01
  -4.62231640e-03  8.46989241e-03 -3.43636844e-03 -6.46313296e-04
   5.89598440e-01 -5.60990766e-02]]


In [None]:
# List the distinct values of the target column to confirm this is a binary problem
num_data['Label'].unique()

array([0, 1])

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the standardized training features.
# random_state is fixed (matching the seed used for the train/test split) so the fitted
# tree is repeatable: scikit-learn randomly permutes the features when searching for a
# split, so an unseeded tree can differ from run to run.
classifier = DecisionTreeClassifier(random_state=0)
classifier.fit(X_train, y_train)

In [None]:
# Predict the label of every row in the scaled held-out test set
y_pred = classifier.predict(X_test)

In [None]:
# Put each held-out label next to the model's prediction, keeping y_test's row index
results_df = (
    y_test
    .to_frame(name='Actual Value')
    .assign(**{'Predicted Value': y_pred})
)

# Show the comparison table
results_df

Unnamed: 0,Actual Value,Predicted Value
241655,1,1
83742,1,1
486752,1,1
309356,1,1
1114565,1,1
...,...,...
177284,1,1
145311,1,1
813227,0,0
470855,1,1


In [None]:
# Compute the fraction of test rows whose predicted label matches the true label
from sklearn.metrics import accuracy_score

# scikit-learn metrics take (y_true, y_pred); accuracy is symmetric so the value is the
# same either way, but this order matches classification_report and stays correct if
# the metric is later swapped for an asymmetric one (precision, recall, ...).
print(accuracy_score(y_test, y_pred))

0.9992443836113282




In [None]:
# Print per-class precision, recall, F1 and support for the held-out test set
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     39683
           1       1.00      1.00      1.00    191916

    accuracy                           1.00    231599
   macro avg       1.00      1.00      1.00    231599
weighted avg       1.00      1.00      1.00    231599

