# Categorization of Attacks using Botnet-IoT Dataset

In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.tree import DecisionTreeClassifier 
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report

%matplotlib inline

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Training Dataset
data = pd.read_csv("UNSW_2018_IoT_Botnet_Final_10_best_Training.csv")
data.head()

Unnamed: 0,pkSeqID,proto,saddr,sport,daddr,dport,seq,stddev,N_IN_Conn_P_SrcIP,min,state_number,mean,N_IN_Conn_P_DstIP,drate,srate,max,attack,category,subcategory
0,3142762,udp,192.168.100.150,6551,192.168.100.3,80,251984,1.900363,100,0.0,4,2.687519,100,0.0,0.494549,4.031619,1,DDoS,UDP
1,2432264,tcp,192.168.100.150,5532,192.168.100.3,80,256724,0.078003,38,3.85693,3,3.934927,100,0.0,0.256493,4.012924,1,DDoS,TCP
2,1976315,tcp,192.168.100.147,27165,192.168.100.3,80,62921,0.268666,100,2.9741,3,3.341429,100,0.0,0.29488,3.609205,1,DDoS,TCP
3,1240757,udp,192.168.100.150,48719,192.168.100.3,80,99168,1.823185,63,0.0,4,3.222832,63,0.0,0.461435,4.942302,1,DoS,UDP
4,3257991,udp,192.168.100.147,22461,192.168.100.3,80,105063,0.822418,100,2.979995,4,3.983222,100,0.0,1.002999,4.994452,1,DDoS,UDP


In [4]:
# General Information about the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2934817 entries, 0 to 2934816
Data columns (total 19 columns):
 #   Column             Dtype  
---  ------             -----  
 0   pkSeqID            int64  
 1   proto              object 
 2   saddr              object 
 3   sport              object 
 4   daddr              object 
 5   dport              object 
 6   seq                int64  
 7   stddev             float64
 8   N_IN_Conn_P_SrcIP  int64  
 9   min                float64
 10  state_number       int64  
 11  mean               float64
 12  N_IN_Conn_P_DstIP  int64  
 13  drate              float64
 14  srate              float64
 15  max                float64
 16  attack             int64  
 17  category           object 
 18  subcategory        object 
dtypes: float64(6), int64(6), object(7)
memory usage: 425.4+ MB


## Exploratory Data Analysis

In [5]:
data.describe()

Unnamed: 0,pkSeqID,seq,stddev,N_IN_Conn_P_SrcIP,min,state_number,mean,N_IN_Conn_P_DstIP,drate,srate,max,attack
count,2934817.0,2934817.0,2934817.0,2934817.0,2934817.0,2934817.0,2934817.0,2934817.0,2934817.0,2934817.0,2934817.0,2934817.0
mean,1834209.0,121297.3,0.8869639,82.54997,1.017208,3.134219,2.230471,92.45766,0.4303064,3.12829,3.019269,0.9998739
std,1059058.0,75787.0,0.8036391,24.39019,1.483551,1.187107,1.517766,18.16651,56.23304,784.5494,1.860915,0.0112275
min,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,917109.0,54847.0,0.029997,69.0,0.0,3.0,0.181934,100.0,0.0,0.155845,0.280417,1.0
50%,1834316.0,117737.0,0.792575,100.0,0.0,4.0,2.689973,100.0,0.0,0.28378,4.008429,1.0
75%,2751250.0,184870.0,1.74522,100.0,2.147949,4.0,3.565061,100.0,0.0,0.488,4.292426,1.0
max,3668522.0,262211.0,2.496763,100.0,4.980471,11.0,4.981882,100.0,58823.53,1000000.0,4.999999,1.0


In [6]:
data["proto"].value_counts()

udp          1596819
tcp          1330598
icmp            7228
arp              166
ipv6-icmp          6
Name: proto, dtype: int64

In [7]:
data["saddr"].value_counts()

192.168.100.147              761360
192.168.100.148              738642
192.168.100.150              712260
192.168.100.149              711466
192.168.100.3                  6609
192.168.100.5                  4107
192.168.100.6                   272
192.168.100.7                    34
192.168.100.4                    17
192.168.100.1                    14
192.168.100.27                    9
192.168.100.46                    8
fe80::250:56ff:febe:254           5
192.168.100.55                    3
fe80::250:56ff:febe:c038          2
fe80::2c6a:ff9b:7e14:166a         2
fe80::c0c0:aa20:45b9:bdd9         2
fe80::250:56ff:febe:26db          2
fe80::250:56ff:febe:89ee          2
fe80::250:56ff:febe:e9d9          1
Name: saddr, dtype: int64

In [8]:
data["sport"].value_counts()

0x0303    7156
80        3220
1822       878
60541      869
1216       868
          ... 
56775       31
39305       30
18992       30
0x000d      10
0x0011       8
Name: sport, Length: 65541, dtype: int64

In [9]:
data["daddr"].value_counts()

192.168.100.3      1900562
192.168.100.5       361192
192.168.100.7       332161
192.168.100.6       329679
192.168.100.150       3040
                    ...   
216.239.36.10            1
192.55.83.30             1
205.251.194.154          1
205.251.199.61           1
198.41.0.4               1
Name: daddr, Length: 81, dtype: int64

In [10]:
data["dport"].value_counts()

80       2858794
1           5379
3306        3757
53           275
-1           166
          ...   
8557           1
25567          1
3763           1
7754           1
4531           1
Name: dport, Length: 6906, dtype: int64

In [11]:
data["category"].value_counts()

DDoS              1541315
DoS               1320148
Reconnaissance      72919
Normal                370
Theft                  65
Name: category, dtype: int64

In [12]:
data["attack"].value_counts()

1    2934447
0        370
Name: attack, dtype: int64

In [13]:
data["subcategory"].value_counts()

UDP                  1584650
TCP                  1274843
Service_Scan           58626
OS_Fingerprint         14293
HTTP                    1970
Normal                   370
Keylogging                59
Data_Exfiltration          6
Name: subcategory, dtype: int64

### Extracting 10 best features

In [14]:
ten_best_features = data[['seq','stddev','N_IN_Conn_P_SrcIP', 'min', 'state_number', 'mean', 'N_IN_Conn_P_DstIP',
       'drate', 'srate', 'max']]
target_features = data[['attack','category','subcategory']]

## Data Preprocessing

In [15]:
# Label Encoding the target columns
le = LabelEncoder()
target_features['category'] = le.fit_transform(target_features['category'])
target_features['subcategory'] = le.fit_transform(target_features['subcategory'])

target_features.head()

Unnamed: 0,attack,category,subcategory
0,1,0,7
1,1,0,6
2,1,0,6
3,1,1,7
4,1,0,7


In [16]:
# Train-test Split
X_train, X_test, y_train, y_test = train_test_split(ten_best_features,target_features)

In [17]:
# Scaling the data
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Defining the ML Model Classes

In [18]:
class RandomForest:
    def __init__(self,max_depth):
        self.rfc_attack = RandomForestClassifier(max_depth=max_depth)
        self.rfc_category = RandomForestClassifier(max_depth=max_depth)
        self.rfc_subcategory = RandomForestClassifier(max_depth=max_depth)
    
    def fit(self,X_train,y_train):
        self.rfc_attack.fit(X_train,y_train['attack'])
        
        features_category = np.concatenate((X_train,np.array(y_train['attack']).reshape(-1,1)),axis=1)
        self.rfc_category.fit(features_category,y_train['category'])
        
        features_subcategory = np.concatenate((features_category,np.array(y_train['category']).reshape(-1,1)),axis=1)
        self.rfc_subcategory.fit(features_subcategory,y_train['subcategory'])
        
    def predict(self,X_test):
        predict_attack = self.rfc_attack.predict(X_test)
        
        test_category = np.concatenate((X_test,predict_attack.reshape(-1,1)),axis=1)
        predict_category = self.rfc_category.predict(test_category)
        
        test_subcategory = np.concatenate((test_category,predict_category.reshape(-1,1)),axis=1)
        predict_subcategory = self.rfc_subcategory.predict(test_subcategory)
        
        return pd.DataFrame({'attack':predict_attack,'category':predict_category,'subcategory':predict_subcategory})

In [19]:
class NaiveBayes:
    def __init__(self):
        self.nb_attack = GaussianNB()
        self.nb_category = GaussianNB()
        self.nb_subcategory = GaussianNB()
    
    def fit(self,X_train,y_train):
        self.nb_attack.fit(X_train,y_train['attack'])
        
        features_category = np.concatenate((X_train,np.array(y_train['attack']).reshape(-1,1)),axis=1)
        self.nb_category.fit(features_category,y_train['category'])
        
        features_subcategory = np.concatenate((features_category,np.array(y_train['category']).reshape(-1,1)),axis=1)
        self.nb_subcategory.fit(features_subcategory,y_train['subcategory'])
        
    def predict(self,X_test):
        predict_attack = self.nb_attack.predict(X_test)
        
        test_category = np.concatenate((X_test,predict_attack.reshape(-1,1)),axis=1)
        predict_category = self.nb_category.predict(test_category)
        
        test_subcategory = np.concatenate((test_category,predict_category.reshape(-1,1)),axis=1)
        predict_subcategory = self.nb_subcategory.predict(test_subcategory)
        
        return pd.DataFrame({'attack':predict_attack,'category':predict_category,'subcategory':predict_subcategory})

In [20]:
class DecisionTree:
    def __init__(self,criterion,max_depth=5):
        self.dtree_attack = DecisionTreeClassifier(criterion=criterion,max_depth=max_depth)
        self.dtree_category = DecisionTreeClassifier(criterion=criterion,max_depth=max_depth)
        self.dtree_subcategory = DecisionTreeClassifier(criterion=criterion,max_depth=max_depth)
    
    def fit(self,X_train,y_train):
        self.dtree_attack.fit(X_train,y_train['attack'])
        
        features_category = np.concatenate((X_train,np.array(y_train['attack']).reshape(-1,1)),axis=1)
        self.dtree_category.fit(features_category,y_train['category'])
        
        features_subcategory = np.concatenate((features_category,np.array(y_train['category']).reshape(-1,1)),axis=1)
        self.dtree_subcategory.fit(features_subcategory,y_train['subcategory'])
        
    def predict(self,X_test):
        predict_attack = self.dtree_attack.predict(X_test)
        
        test_category = np.concatenate((X_test,predict_attack.reshape(-1,1)),axis=1)
        predict_category = self.dtree_category.predict(test_category)
        
        test_subcategory = np.concatenate((test_category,predict_category.reshape(-1,1)),axis=1)
        predict_subcategory = self.dtree_subcategory.predict(test_subcategory)
        
        return pd.DataFrame({'attack':predict_attack,'category':predict_category,'subcategory':predict_subcategory})

In [21]:
class GradientBoost:
    def __init__(self):
        self.xgb_attack = XGBClassifier()
        self.xgb_category = XGBClassifier()
        self.xgb_subcategory = XGBClassifier()
    
    def fit(self,X_train,y_train):
        self.xgb_attack.fit(X_train,y_train['attack'])
        
        features_category = np.concatenate((X_train,np.array(y_train['attack']).reshape(-1,1)),axis=1)
        self.xgb_category.fit(features_category,y_train['category'])
        
        features_subcategory = np.concatenate((features_category,np.array(y_train['category']).reshape(-1,1)),axis=1)
        self.xgb_subcategory.fit(features_subcategory,y_train['subcategory'])
        
    def predict(self,X_test):
        predict_attack = self.xgb_attack.predict(X_test)
        
        test_category = np.concatenate((X_test,predict_attack.reshape(-1,1)),axis=1)
        predict_category = self.xgb_category.predict(test_category)
        
        test_subcategory = np.concatenate((test_category,predict_category.reshape(-1,1)),axis=1)
        predict_subcategory = self.xgb_subcategory.predict(test_subcategory)
        
        return pd.DataFrame({'attack':predict_attack,'category':predict_category,'subcategory':predict_subcategory})

### Validation of ML Models on Training Dataset

**Random Forest**

In [22]:
rf = RandomForest(max_depth=3)
rf.fit(X_train,y_train)

predictions_rfc = rf.predict(X_test)

In [24]:
print(confusion_matrix(y_test['attack'],predictions_rfc['attack']))
print(classification_report(y_test['attack'],predictions_rfc['attack']))

[[     6     99]
 [     0 733600]]
              precision    recall  f1-score   support

           0       1.00      0.06      0.11       105
           1       1.00      1.00      1.00    733600

    accuracy                           1.00    733705
   macro avg       1.00      0.53      0.55    733705
weighted avg       1.00      1.00      1.00    733705



In [25]:
print(confusion_matrix(y_test['category'],predictions_rfc['category']))
print(classification_report(y_test['category'],predictions_rfc['category']))

[[378810   6434      0      0      0]
 [ 50115 280055      0      4      0]
 [     0     59      0     46      0]
 [ 10148   1978      0   6047      0]
 [     0      5      0      4      0]]
              precision    recall  f1-score   support

           0       0.86      0.98      0.92    385244
           1       0.97      0.85      0.91    330174
           2       0.00      0.00      0.00       105
           3       0.99      0.33      0.50     18173
           4       0.00      0.00      0.00         9

    accuracy                           0.91    733705
   macro avg       0.56      0.43      0.46    733705
weighted avg       0.91      0.91      0.90    733705



In [26]:
print(confusion_matrix(y_test['subcategory'],predictions_rfc['subcategory']))
print(classification_report(y_test['subcategory'],predictions_rfc['subcategory']))

[[     0      0      0      0      0      1      0      1]
 [     0      0      0      0      0      3    514      0]
 [     0      0      0      0      0      4      1      2]
 [     0      0      0      0      0     47     54      4]
 [     0      0      0      0      0    172   3397      2]
 [     0      0      0      0      0   6885   7713      4]
 [     0      0      0      0      0      1 318095    242]
 [     0      0      0      0      0      0      7 396556]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00       517
           2       0.00      0.00      0.00         7
           3       0.00      0.00      0.00       105
           4       0.00      0.00      0.00      3571
           5       0.97      0.47      0.63     14602
           6       0.96      1.00      0.98    318338
           7       1.00      1.00      1.00    396563

    accuracy                           

In [27]:
rf.rfc_attack.score(X_test,y_test['attack'])

0.9998650683857954

In [28]:
rf.rfc_category.score(np.concatenate((X_test,np.array(predictions_rfc['attack']).reshape(-1,1)),axis=1),y_test['category'])

0.9062388834749661

In [29]:
rf.rfc_subcategory.score(np.concatenate((X_test,np.array(predictions_rfc['attack']).reshape(-1,1),
                                    np.array(predictions_rfc['category']).reshape(-1,1)),axis=1),y_test['subcategory'])

0.9834143150176161

**Naive Bayes**

In [23]:
nb = NaiveBayes()
nb.fit(X_train,y_train)

predictions_nb = nb.predict(X_test)

In [30]:
print(confusion_matrix(y_test['attack'],predictions_nb['attack']))
print(classification_report(y_test['attack'],predictions_nb['attack']))

[[    96      9]
 [  2955 730645]]
              precision    recall  f1-score   support

           0       0.03      0.91      0.06       105
           1       1.00      1.00      1.00    733600

    accuracy                           1.00    733705
   macro avg       0.52      0.96      0.53    733705
weighted avg       1.00      1.00      1.00    733705



In [31]:
print(confusion_matrix(y_test['category'],predictions_nb['category']))
print(classification_report(y_test['category'],predictions_nb['category']))

[[368591  16479     41    133      0]
 [183019 146012    786    355      2]
 [     0      6     96      3      0]
 [ 10673   1534   2121   3845      0]
 [     0      0      7      0      2]]
              precision    recall  f1-score   support

           0       0.66      0.96      0.78    385244
           1       0.89      0.44      0.59    330174
           2       0.03      0.91      0.06       105
           3       0.89      0.21      0.34     18173
           4       0.50      0.22      0.31         9

    accuracy                           0.71    733705
   macro avg       0.59      0.55      0.42    733705
weighted avg       0.77      0.71      0.68    733705



In [32]:
print(confusion_matrix(y_test['subcategory'],predictions_nb['subcategory']))
print(classification_report(y_test['subcategory'],predictions_nb['subcategory']))

[[     0      0      0      2      0      0      0      0]
 [     0    337      0     11      0    156     13      0]
 [     1      0      1      5      0      0      0      0]
 [     0      1      0     96      2      0      6      0]
 [     0    199      0    136      1    147   3088      0]
 [     0    555      0   1985    370   3327   8365      0]
 [     0    827      2    807      5    327 316370      0]
 [     0      0      0      9      0      0     20 396534]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.18      0.65      0.28       517
           2       0.33      0.14      0.20         7
           3       0.03      0.91      0.06       105
           4       0.00      0.00      0.00      3571
           5       0.84      0.23      0.36     14602
           6       0.96      0.99      0.98    318338
           7       1.00      1.00      1.00    396563

    accuracy                           

**Decision Tree (Information Gain)**

In [33]:
dtree_ig = DecisionTree('entropy')
dtree_ig.fit(X_train,y_train)
predictions_dtree_ig = dtree_ig.predict(X_test)

In [34]:
print(confusion_matrix(y_test['attack'],predictions_dtree_ig['attack']))
print(classification_report(y_test['attack'],predictions_dtree_ig['attack']))

[[    36     69]
 [     2 733598]]
              precision    recall  f1-score   support

           0       0.95      0.34      0.50       105
           1       1.00      1.00      1.00    733600

    accuracy                           1.00    733705
   macro avg       0.97      0.67      0.75    733705
weighted avg       1.00      1.00      1.00    733705



In [35]:
print(confusion_matrix(y_test['category'],predictions_dtree_ig['category']))
print(classification_report(y_test['category'],predictions_dtree_ig['category']))

[[339124  45579      0    541      0]
 [ 11483 318632      0     59      0]
 [     9     18     27     51      0]
 [   350   7533      2  10288      0]
 [     0      1      0      8      0]]
              precision    recall  f1-score   support

           0       0.97      0.88      0.92    385244
           1       0.86      0.97      0.91    330174
           2       0.93      0.26      0.40       105
           3       0.94      0.57      0.71     18173
           4       0.00      0.00      0.00         9

    accuracy                           0.91    733705
   macro avg       0.74      0.53      0.59    733705
weighted avg       0.92      0.91      0.91    733705



In [36]:
print(confusion_matrix(y_test['subcategory'],predictions_dtree_ig['subcategory']))
print(classification_report(y_test['subcategory'],predictions_dtree_ig['subcategory']))

[[     0      0      1      0      0      1      0      0]
 [     0    354      0      0      0    110     52      1]
 [     0      1      2      0      0      4      0      0]
 [     0      0      0     29      8     41     12     15]
 [     0      0      0      4    862    567   2118     20]
 [     0     15      0      0    245   8615   5708     19]
 [     0      3      0      0      1    479 317853      2]
 [     0      0      0     10      0      0      5 396548]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.95      0.68      0.80       517
           2       0.67      0.29      0.40         7
           3       0.67      0.28      0.39       105
           4       0.77      0.24      0.37      3571
           5       0.88      0.59      0.71     14602
           6       0.98      1.00      0.99    318338
           7       1.00      1.00      1.00    396563

    accuracy                           

In [37]:
dtree_ig.dtree_attack.score(X_test,y_test['attack'])

0.9999032308625401

In [38]:
dtree_ig.dtree_category.score(np.concatenate((X_test,np.array(predictions_dtree_ig['attack']).reshape(-1,1)),axis=1),y_test['category'])

0.9105444286191317

In [39]:
dtree_ig.dtree_subcategory.score(np.concatenate((X_test,np.array(predictions_dtree_ig['attack']).reshape(-1,1),
                                        np.array(predictions_dtree_ig['category']).reshape(-1,1)),axis=1),y_test['subcategory'])

0.9871310676634342

**Decision Tree (Gini Index)**

In [40]:
dtree_gini = DecisionTree('gini')
dtree_gini.fit(X_train,y_train)
predictions_dtree_gini = dtree_gini.predict(X_test)

In [41]:
print(confusion_matrix(y_test['attack'],predictions_dtree_gini['attack']))
print(classification_report(y_test['attack'],predictions_dtree_gini['attack']))

[[    46     59]
 [    16 733584]]
              precision    recall  f1-score   support

           0       0.74      0.44      0.55       105
           1       1.00      1.00      1.00    733600

    accuracy                           1.00    733705
   macro avg       0.87      0.72      0.78    733705
weighted avg       1.00      1.00      1.00    733705



In [42]:
print(confusion_matrix(y_test['category'],predictions_dtree_gini['category']))
print(classification_report(y_test['category'],predictions_dtree_gini['category']))

[[338047  46712      0    485      0]
 [  8416 321746      1     11      0]
 [     0     25     37     43      0]
 [    77   7458      7  10631      0]
 [     0      3      0      6      0]]
              precision    recall  f1-score   support

           0       0.98      0.88      0.92    385244
           1       0.86      0.97      0.91    330174
           2       0.82      0.35      0.49       105
           3       0.95      0.58      0.72     18173
           4       0.00      0.00      0.00         9

    accuracy                           0.91    733705
   macro avg       0.72      0.56      0.61    733705
weighted avg       0.92      0.91      0.91    733705



In [43]:
print(confusion_matrix(y_test['subcategory'],predictions_dtree_gini['subcategory']))
print(classification_report(y_test['subcategory'],predictions_dtree_gini['subcategory']))

[[     0      0      1      0      0      1      0      0]
 [     0    429      0      0      0     10     78      0]
 [     0      0      2      0      0      5      0      0]
 [     0     15      0      4     11     67      8      0]
 [     0     13      0      0    776    719   2063      0]
 [     0     13      1      1     83   9060   5444      0]
 [     0     58      0      0      1    486 317793      0]
 [     0      0      0      2      0      0      6 396555]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.81      0.83      0.82       517
           2       0.50      0.29      0.36         7
           3       0.57      0.04      0.07       105
           4       0.89      0.22      0.35      3571
           5       0.88      0.62      0.73     14602
           6       0.98      1.00      0.99    318338
           7       1.00      1.00      1.00    396563

    accuracy                           

In [44]:
dtree_gini.dtree_attack.score(X_test,y_test['attack'])

0.999897779080148

In [45]:
dtree_gini.dtree_category.score(np.concatenate((X_test,np.array(predictions_dtree_gini['attack']).reshape(-1,1)),axis=1),y_test['category'])

0.913801868598415

In [46]:
dtree_gini.dtree_subcategory.score(np.concatenate((X_test,np.array(predictions_dtree_gini['attack']).reshape(-1,1),
                                        np.array(predictions_dtree_gini['category']).reshape(-1,1)),axis=1),y_test['subcategory'])

0.9876162762963316

**Gradient Boost**

In [47]:
xgb = GradientBoost()
xgb.fit(X_train,y_train)
predictions_xgb = xgb.predict(X_test)

In [None]:
print(confusion_matrix(y_test['attack'],predictions_xgb['attack']))
print(classification_report(y_test['attack'],predictions_xgb['attack']))

In [None]:
print(confusion_matrix(y_test['category'],predictions_xgb['category']))
print(classification_report(y_test['category'],predictions_xgb['category']))

In [None]:
print(confusion_matrix(y_test['subcategory'],predictions_xgb['subcategory']))
print(classification_report(y_test['subcategory'],predictions_xgb['subcategory']))

In [None]:
xgb.xgb_attack.score(X_test,y_test['attack'])

In [None]:
xgb.xgb_category.score(np.concatenate((X_test,np.array(predictions_xgb['attack']).reshape(-1,1)),axis=1),y_test['category'])

In [None]:
xgb.xgb_subcategory.score(np.concatenate((X_test,np.array(predictions_xgb['attack']).reshape(-1,1),
                                        np.array(predictions_xgb['category']).reshape(-1,1)),axis=1),y_test['subcategory'])

### Test Data

In [None]:
test_data = pd.read_csv("UNSW_2018_IoT_Botnet_Final_10_best_Testing.csv")
test_data.head()

In [None]:
# Extracting the ten-best features from test set
test_ten_best = test_data[ten_best_features.columns]
test_labels = test_data[['attack','category','subcategory']]
test_ten_best.head()

**Using complete training data and test data**

In [None]:
# Standard Scaling
train_data = sc.fit_transform(ten_best_features)
test = sc.transform(test_ten_best)

In [None]:
ten_best_features = data[['seq','stddev','N_IN_Conn_P_SrcIP', 'min', 'state_number', 'mean', 'N_IN_Conn_P_DstIP',
       'drate', 'srate', 'max']]
target_features = data[['attack','category','subcategory']]

In [None]:
# Label Encoding

target_features['category'] = le.fit_transform(target_features['category'])
test_labels['category'] = le.transform(test_labels['category'])

target_features['subcategory'] = le.fit_transform(target_features['subcategory'])
test_labels['subcategory'] = le.transform(test_labels['subcategory'])

### Training ML Models on Complete Data and Testing on test set

**Random Forest**

In [None]:
rf_clf = RandomForest(max_depth=5)
rf_clf.fit(train_data,target_features)
predictions_rf = rf_clf.predict(test)

In [None]:
print("Random Forest: Attack\n")
print(confusion_matrix(test_labels['attack'],predictions_rf['attack']),"\n")
print(classification_report(test_labels['attack'],predictions_rf['attack']))

In [None]:
print("Random Forest: Category\n")
print(confusion_matrix(test_labels['category'],predictions_rf['category']))
print(classification_report(test_labels['category'],predictions_rf['category']))

In [None]:
print("Random Forest: Subcategory\n")
print(confusion_matrix(test_labels['subcategory'],predictions_rf['subcategory']))
print(classification_report(test_labels['subcategory'],predictions_rf['subcategory']))

**Naive Bayes**

In [None]:
nb_clf = NaiveBayes()
nb_clf.fit(train_data,target_features)
predictions_nb = nb_clf.predict(test)

In [None]:
print("Naive Bayes: Attack\n")
print(confusion_matrix(test_labels['attack'],predictions_nb['attack']))
print(classification_report(test_labels['attack'],predictions_nb['attack']))

In [None]:
print("Naive Bayes: Category\n")
print(confusion_matrix(test_labels['category'],predictions_nb['category']),"\n")
print(classification_report(test_labels['category'],predictions_nb['category']))

In [None]:
print("Naive Bayes: Subcategory\n")
print(confusion_matrix(test_labels['subcategory'],predictions_nb['subcategory']))
print(classification_report(test_labels['subcategory'],predictions_nb['subcategory']))

**Decision Tree (Information Gain)**

In [None]:
dtree_ig_clf = DecisionTree('entropy')
dtree_ig_clf.fit(train_data,target_features)
predictions_ig = dtree_ig_clf.predict(test)

In [None]:
print("Decision Tree (Information Gain): Attack\n")
print(confusion_matrix(test_labels['attack'],predictions_ig['attack']),"\n")
print(classification_report(test_labels['attack'],predictions_ig['attack']))

In [None]:
print("Decision Tree (Information Gain): Category\n")
print(confusion_matrix(test_labels['category'],predictions_ig['category']))
print(classification_report(test_labels['category'],predictions_ig['category']))

In [None]:
print("Decision Tree (Information Gain): Subcategory\n")
print(confusion_matrix(test_labels['subcategory'],predictions_ig['subcategory']),"\n")
print(classification_report(test_labels['subcategory'],predictions_ig['subcategory']))

**Decision Tree (Gini Index)**

In [None]:
dtree_gi_clf = DecisionTree('gini')
dtree_gi_clf.fit(train_data,target_features)
predictions_gi = dtree_gi_clf.predict(test)

In [None]:
print("Decision Tree (Gini Index): Attack\n")
print(confusion_matrix(test_labels['attack'],predictions_gi['attack']),"\n")
print(classification_report(test_labels['attack'],predictions_gi['attack']))

In [None]:
print("Decision Tree (Gini Index): Category\n")
print(confusion_matrix(test_labels['category'],predictions_gi['category']))
print(classification_report(test_labels['category'],predictions_gi['category']))

In [None]:
print("Decision Tree (Gini Index): Subcategory\n")
print(confusion_matrix(test_labels['subcategory'],predictions_gi['subcategory']),"\n")
print(classification_report(test_labels['subcategory'],predictions_gi['subcategory']))

**Gradient Boost**

In [None]:
xgb_clf = GradientBoost()
xgb_clf.fit(train_data,target_features)
predictions_gb = xgb_clf.predict(test)

In [None]:
print("Gradient Boost: Attack\n")
print(confusion_matrix(test_labels['attack'],predictions_gb['attack']),"\n")
print(classification_report(test_labels['attack'],predictions_gb['attack']))

In [None]:
print("Gradient Boost: Category\n")
print(confusion_matrix(test_labels['category'],predictions_gb['category']),"\n")
print(classification_report(test_labels['category'],predictions_gb['category']))

In [None]:
print("Gradient Boost: Subcategory\n")
print(confusion_matrix(test_labels['subcategory'],predictions_gb['subcategory']),"\n")
print(classification_report(test_labels['subcategory'],predictions_gb['subcategory']))