In [4]:
from pytorch_tabnet.tab_model import TabNetClassifier
from xgboost import XGBClassifier

import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import pandas as pd
import numpy as np
np.random.seed(0)


import os
import wget
from pathlib import Path
import shutil
import gzip

from matplotlib import pyplot as plt
%matplotlib inline

# Load the dataset


In [5]:
df = pd.read_csv('DNN-EdgeIIoT-dataset.csv', low_memory=False)

# Prepare data


In [6]:
print("Number of Rows: ", len(df.axes[0]))
print("Number of Columns: ", len(df.axes[1]))
print("-------------------------------------")
drop_columns = ["frame.time", "ip.src_host", "ip.dst_host", "arp.src.proto_ipv4","arp.dst.proto_ipv4", 
                "http.file_data","http.request.full_uri","icmp.transmit_timestamp",
                "http.request.uri.query", "tcp.options","tcp.payload","tcp.srcport",
                "tcp.dstport", "udp.port", "mqtt.msg"]
df.drop(drop_columns, axis=1, inplace=True)
#remove no values lines
df.dropna(axis=0, how='any', inplace=True)
#remove duplicates
df.drop_duplicates(subset=None, keep="first", inplace=True)
#shuffles the dataset
#df = shuffle(df)
#remove one of the types, Attack_type or Attack_label
#df.pop("Attack_type")
print("Number of Rows: ", len(df.axes[0]))
print("Number of Columns: ", len(df.axes[1]))

Number of Rows:  2219201
Number of Columns:  63
-------------------------------------
Number of Rows:  1909671
Number of Columns:  48


In [7]:
categorical_columns = []
categorical_dims =  {}
for col in df.columns[df.dtypes == object]:
    print(col, df[col].nunique())
    l_enc = LabelEncoder()
    df[col] = l_enc.fit_transform(df[col].values)
    categorical_columns.append(col)
    categorical_dims[col] = len(l_enc.classes_)

http.request.method 9
http.referer 5
http.request.version 13
dns.qry.name.len 10
mqtt.conack.flags 13
mqtt.protoname 3
mqtt.topic 3
Attack_type 15


# Split dataset into train and test

In [18]:
#split data
X = df.iloc[:,0:46]
#for binary class
Y_B = df.iloc[:,46]
#for multiclass 
Y = df.iloc[:,47]

# split data into train, test and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=7)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=7) # 0.25 x 0.8 = 0.2
print("train X: ", len(X_train))
print("test X: ", len(X_test))
print("val X: ", len(X_val))


train X:  1145802
test X:  381935
val X:  381934


# Algorithm

In [19]:
clf_xgb = XGBClassifier(max_depth=8,
    learning_rate=0.1,
    n_estimators=400,
    verbosity=0,
    silent=None,
    objective="multi:softmax",
    booster='gbtree',
    n_jobs=-1,
    nthread=None,
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    subsample=0.7,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=1,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    base_score=0.5,
    random_state=0,
    seed=None,
    num_class= len(Y.unique()))
le = LabelEncoder()
y_valid = le.fit_transform(y_val)
clf_xgb.fit(X_train, y_train,
            eval_set=[(X_val, y_valid)],
            early_stopping_rounds=40,
            verbose=10)



[0]	validation_0-mlogloss:2.00152
[10]	validation_0-mlogloss:0.56402
[20]	validation_0-mlogloss:0.22237
[30]	validation_0-mlogloss:0.10260
[40]	validation_0-mlogloss:0.05846
[50]	validation_0-mlogloss:0.04183
[60]	validation_0-mlogloss:0.03548
[70]	validation_0-mlogloss:0.03300
[80]	validation_0-mlogloss:0.03199
[90]	validation_0-mlogloss:0.03158
[100]	validation_0-mlogloss:0.03140
[110]	validation_0-mlogloss:0.03134
[120]	validation_0-mlogloss:0.03132
[130]	validation_0-mlogloss:0.03133
[140]	validation_0-mlogloss:0.03137
[150]	validation_0-mlogloss:0.03142
[160]	validation_0-mlogloss:0.03148


In [20]:
clf_xgb.save_model("xgboost_Y.json")

In [21]:
preds_valid = np.array(clf_xgb.predict_proba(X_val, ))
valid_acc = accuracy_score(y_pred=np.argmax(preds_valid, axis=1) + 1, y_true=y_val)
print(valid_acc)

preds_test = np.array(clf_xgb.predict_proba(X_test))
test_acc = accuracy_score(y_pred=np.argmax(preds_test, axis=1) + 1, y_true=y_test)
print(test_acc)

0.001167741023318165
0.0012436671161323261
