In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the preprocessed dataset from the repository's data folder.
csv_path = '../../data/preprocessed_data.csv'
data = pd.read_csv(csv_path)

# Preview the first five rows (head() defaults to n=5).
data.head()

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,su_attempted,num_root,...,service_smtp,flag_S0,flag_SF,flag_other,land_1,logged_in_1,root_shell_1,is_host_login_1,is_guest_login_1,attack_category
0,-0.110249,-0.007679,-0.004919,-0.089486,-0.007736,-0.095076,-0.027023,-0.011664,-0.024437,-0.012385,...,0,0,1,0,0,0,0,0,0,1
1,-0.110249,-0.007737,-0.004919,-0.089486,-0.007736,-0.095076,-0.027023,-0.011664,-0.024437,-0.012385,...,0,0,1,0,0,0,0,0,0,1
2,-0.110249,-0.007762,-0.004919,-0.089486,-0.007736,-0.095076,-0.027023,-0.011664,-0.024437,-0.012385,...,0,1,0,0,0,0,0,0,0,0
3,-0.110249,-0.007723,-0.002891,-0.089486,-0.007736,-0.095076,-0.027023,-0.011664,-0.024437,-0.012385,...,0,0,1,0,0,1,0,0,0,1
4,-0.110249,-0.007728,-0.004814,-0.089486,-0.007736,-0.095076,-0.027023,-0.011664,-0.024437,-0.012385,...,0,0,1,0,0,1,0,0,0,1


In [3]:
# Separate the label column from the feature columns.
target_column = 'attack_category'
y = data[target_column]
X = data.drop(columns=[target_column])

<br/>
<br/>
<br/>

## TRAIN TEST SPLIT

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. `stratify=y` keeps the class
# proportions of `y` in both partitions, and the fixed seed makes the
# partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [5]:
# Show the feature names together with the (rows, columns) size of the training set.
(X_train.columns, X_train.shape)

(Index(['duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent', 'hot',
        'num_failed_logins', 'num_compromised', 'su_attempted', 'num_root',
        'num_file_creations', 'num_shells', 'num_access_files', 'count',
        'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
        'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
        'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
        'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
        'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
        'dst_host_serror_rate', 'dst_host_srv_serror_rate',
        'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'protocol_type_tcp',
        'protocol_type_udp', 'service_http', 'service_other', 'service_private',
        'service_smtp', 'flag_S0', 'flag_SF', 'flag_other', 'land_1',
        'logged_in_1', 'root_shell_1', 'is_host_login_1', 'is_guest_login_1'],
       dtype='object'),
 (88181, 46))

In [6]:
# (rows, columns) size of the held-out test set.
X_test.shape

(37792, 46)

<br/>
<br/>

### REINDEXING DATA

In [7]:
# Replace the shuffled row labels left by the split with a fresh 0..n-1 index
# on every partition; drop=True discards the old index instead of keeping it
# as a column.
X_train, y_train, X_test, y_test = (
    part.reset_index(drop=True)
    for part in (X_train, y_train, X_test, y_test)
)

<br/>
<br/>
<br/>

## MODEL

In [8]:
from xgboost import XGBClassifier

# Train a gradient-boosted tree classifier with library-default hyperparameters.
# The seed is pinned (same value as the train/test split) so that a
# Restart & Run All reproduces the model even if sampling options such as
# `subsample` or `colsample_bytree` are enabled later.
xgb_classifier = XGBClassifier(random_state=42)
xgb_classifier.fit(X_train, y_train)

In [9]:
# Predicted class label for every row of the held-out test set.
y_pred = xgb_classifier.predict(X_test)

In [10]:
from sklearn.metrics import classification_report

# Ground-truth labels of the test set; the following evaluation cells reuse this alias.
y_true = y_test

# Display names for the report rows.
# NOTE(review): assumes the encoded labels sort in the order
# dos, normal, probe, r2l, u2r — confirm against the preprocessing step.
target_names = ['dos', 'normal', 'probe', 'r2l', 'u2r']

# Per-class precision / recall / F1 and support as a text table.
report = classification_report(
    y_true,
    y_pred,
    target_names=target_names,
)
print(report)

              precision    recall  f1-score   support

         dos       1.00      1.00      1.00     13778
      normal       1.00      1.00      1.00     20479
       probe       1.00      1.00      1.00      3497
         r2l       0.95      0.84      0.89        25
         u2r       0.88      0.54      0.67        13

    accuracy                           1.00     37792
   macro avg       0.97      0.88      0.91     37792
weighted avg       1.00      1.00      1.00     37792



In [11]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted classes.
cnf_mat = confusion_matrix(y_true, y_pred)

# Wrap the raw count matrix in a labelled frame so it renders as a readable table.
pd.DataFrame(
    cnf_mat,
    index=target_names,
    columns=target_names,
)

Unnamed: 0,dos,normal,probe,r2l,u2r
dos,13776,2,0,0,0
normal,4,20470,4,0,1
probe,0,10,3487,0,0
r2l,0,4,0,21,0
u2r,0,5,0,1,7


In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Overall fraction of test rows that were classified correctly.
accuracy = accuracy_score(y_true, y_pred)

# average=None yields one score per class instead of a single aggregated value.
precision, recall, f1score = (
    score_fn(y_true, y_pred, average=None)
    for score_fn in (precision_score, recall_score, f1_score)
)

In [13]:
# Print each metric on its own line: overall accuracy first, then the
# per-class precision, recall and F1 arrays.
print(accuracy, precision, recall, f1score, sep="\n")

0.9991797205757832
[0.99970972 0.99897516 0.9988542  0.95454545 0.875     ]
[0.99985484 0.99956053 0.99714041 0.84       0.53846154]
[0.99978228 0.99926776 0.99799657 0.89361702 0.66666667]


In [14]:
# Collect the evaluation results in a plain, JSON-serialisable dictionary.
# The per-class arrays are converted with .tolist() because NumPy arrays
# cannot be written by the json module directly.
# NOTE(review): this rebinds `data`, which held the loaded DataFrame in the
# data-loading cell; re-running earlier cells after this one needs a fresh load.
data = {
    "model": "XGBoost",
    "accuracy": accuracy,
    "precision": precision.tolist(),
    "recall": recall.tolist(),
    "F1_score": f1score.tolist(),
    "class_names": target_names,
}

data

{'model': 'XGBoost',
 'accuracy': 0.9991797205757832,
 'precision': [0.9997097242380262,
  0.9989751598262652,
  0.9988541965052994,
  0.9545454545454546,
  0.875],
 'recall': [0.9998548410509508,
  0.99956052541628,
  0.9971404060623391,
  0.84,
  0.5384615384615384],
 'F1_score': [0.9997822773786198,
  0.9992677568952892,
  0.9979965655409273,
  0.8936170212765958,
  0.6666666666666667],
 'class_names': ['dos', 'normal', 'probe', 'r2l', 'u2r']}

In [15]:
import json
from pathlib import Path

# Write the metrics dictionary to disk as JSON.
output_path = Path("../../model_performances/XGBoost.json")

# Create the target folder first so a fresh checkout does not fail with
# FileNotFoundError; this is a no-op when the folder already exists.
output_path.parent.mkdir(parents=True, exist_ok=True)

with output_path.open("w") as outfile:
    json.dump(data, outfile)