In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('../../data/preprocessed_data.csv')
data.head(5)

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,su_attempted,num_root,...,service_smtp,flag_S0,flag_SF,flag_other,land_1,logged_in_1,root_shell_1,is_host_login_1,is_guest_login_1,attack_category
0,-0.110249,-0.007679,-0.004919,-0.089486,-0.007736,-0.095076,-0.027023,-0.011664,-0.024437,-0.012385,...,0,0,1,0,0,0,0,0,0,1
1,-0.110249,-0.007737,-0.004919,-0.089486,-0.007736,-0.095076,-0.027023,-0.011664,-0.024437,-0.012385,...,0,0,1,0,0,0,0,0,0,1
2,-0.110249,-0.007762,-0.004919,-0.089486,-0.007736,-0.095076,-0.027023,-0.011664,-0.024437,-0.012385,...,0,1,0,0,0,0,0,0,0,0
3,-0.110249,-0.007723,-0.002891,-0.089486,-0.007736,-0.095076,-0.027023,-0.011664,-0.024437,-0.012385,...,0,0,1,0,0,1,0,0,0,1
4,-0.110249,-0.007728,-0.004814,-0.089486,-0.007736,-0.095076,-0.027023,-0.011664,-0.024437,-0.012385,...,0,0,1,0,0,1,0,0,0,1


In [3]:
X = data.drop(['attack_category'],axis=1)
y = data['attack_category']

<br/>
<br/>
<br/>
<br/>

### TRAIN TEST SPLIT

In [4]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split( X , y , test_size = 0.3 , random_state = 42 , stratify = y )

<br/>
<br/>
<br/>

#### REINDEXING DATA

In [5]:
X_train = X_train.reset_index(drop = True)
y_train = y_train.reset_index(drop = True)

X_test = X_test.reset_index(drop = True)
y_test = y_test.reset_index(drop = True)

<br/>
<br/>
<br/>

### MODEL

In [8]:
from sklearn.svm import SVC
svc_classifier = SVC()
svc_classifier.fit(X_train,y_train)

In [9]:
y_pred = svc_classifier.predict(X_test)

In [10]:
from sklearn.metrics import classification_report
y_true = y_test
target_names = [ 'dos', 'normal', 'probe' , 'r2l' , 'u2r' ]
report = classification_report(y_true, y_pred, target_names=target_names)
print(report)

              precision    recall  f1-score   support

         dos       1.00      1.00      1.00     13778
      normal       0.99      1.00      0.99     20479
       probe       0.98      0.98      0.98      3497
         r2l       1.00      0.56      0.72        25
         u2r       1.00      0.15      0.27        13

    accuracy                           0.99     37792
   macro avg       0.99      0.74      0.79     37792
weighted avg       0.99      0.99      0.99     37792



In [11]:
from sklearn.metrics import confusion_matrix
cnf_mat = confusion_matrix(y_true,y_pred)
pd.DataFrame(cnf_mat , index = target_names , columns = target_names)

Unnamed: 0,dos,normal,probe,r2l,u2r
dos,13745,33,0,0,0
normal,16,20387,76,0,0
probe,1,67,3429,0,0
r2l,3,8,0,14,0
u2r,0,11,0,0,2


<br/>
<br/>
<br/>
<br/>

#### APPLYING GRID SEARCH CV

In [14]:
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV( SVC( random_state = 42 ) , {
    'kernel':[ 'linear' , 'poly' , 'rbf' , 'sigmoid' ],
    'degree' : [3,4,5]
} , cv=5 ,return_train_score = False , verbose=5)

clf.fit(X,y)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END ...........degree=3, kernel=linear;, score=0.984 total time= 1.3min
[CV 2/5] END ...........degree=3, kernel=linear;, score=0.985 total time= 1.2min
[CV 3/5] END ...........degree=3, kernel=linear;, score=0.985 total time= 1.2min
[CV 4/5] END ...........degree=3, kernel=linear;, score=0.984 total time= 1.2min
[CV 5/5] END ...........degree=3, kernel=linear;, score=0.985 total time= 1.2min
[CV 1/5] END .............degree=3, kernel=poly;, score=0.993 total time=  45.1s
[CV 2/5] END .............degree=3, kernel=poly;, score=0.994 total time=  42.3s
[CV 3/5] END .............degree=3, kernel=poly;, score=0.993 total time=  39.2s
[CV 4/5] END .............degree=3, kernel=poly;, score=0.993 total time= 4.1min
[CV 5/5] END .............degree=3, kernel=poly;, score=0.995 total time=  42.0s
[CV 1/5] END ..............degree=3, kernel=rbf;, score=0.994 total time=  36.2s
[CV 2/5] END ..............degree=3, kernel=rbf;

In [15]:
df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_degree,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,69.317918,2.819109,3.430796,0.216113,3,linear,"{'degree': 3, 'kernel': 'linear'}",0.983687,0.9846,0.984878,0.983964,0.984877,0.984401,0.000489,6
1,79.497359,81.281094,3.354601,0.101855,3,poly,"{'degree': 3, 'kernel': 'poly'}",0.992697,0.993927,0.993292,0.992776,0.99484,0.993507,0.000799,4
2,28.269793,0.334343,8.069202,0.102891,3,rbf,"{'degree': 3, 'kernel': 'rbf'}",0.993848,0.995158,0.994443,0.994642,0.995396,0.994697,0.000546,1
3,505.69136,104.978674,23.421012,1.062929,3,sigmoid,"{'degree': 3, 'kernel': 'sigmoid'}",0.841556,0.842191,0.840881,0.850758,0.853537,0.845785,0.005285,10
4,71.988813,0.560521,3.419392,0.028401,4,linear,"{'degree': 4, 'kernel': 'linear'}",0.983687,0.9846,0.984878,0.983964,0.984877,0.984401,0.000489,6
5,53.648406,1.605187,4.276992,0.119041,4,poly,"{'degree': 4, 'kernel': 'poly'}",0.987696,0.989998,0.992975,0.98964,0.990911,0.990244,0.001722,5
6,32.525797,0.549437,8.093607,0.088217,4,rbf,"{'degree': 4, 'kernel': 'rbf'}",0.993848,0.995158,0.994443,0.994642,0.995396,0.994697,0.000546,1
7,520.952249,109.033516,23.928609,0.575718,4,sigmoid,"{'degree': 4, 'kernel': 'sigmoid'}",0.841556,0.842191,0.840881,0.850758,0.853537,0.845785,0.005285,10
8,72.06031,0.64221,3.409999,0.04583,5,linear,"{'degree': 5, 'kernel': 'linear'}",0.983687,0.9846,0.984878,0.983964,0.984877,0.984401,0.000489,6
9,95.228,61.37248,4.961004,0.186,5,poly,"{'degree': 5, 'kernel': 'poly'}",0.981742,0.983171,0.985394,0.982694,0.984084,0.983417,0.001244,9


In [16]:
df = df[['param_kernel' , 'param_degree' , 'mean_test_score']]
df.sort_values(by=['mean_test_score'] , ascending=False)

Unnamed: 0,param_kernel,param_degree,mean_test_score
2,rbf,3,0.994697
6,rbf,4,0.994697
10,rbf,5,0.994697
1,poly,3,0.993507
5,poly,4,0.990244
0,linear,3,0.984401
4,linear,4,0.984401
8,linear,5,0.984401
9,poly,5,0.983417
3,sigmoid,3,0.845785


<br/>
<br/>
<br/>
<br/>

#### FINAL MODEL

In [7]:
from sklearn.svm import SVC
final_model = SVC(kernel='rbf')
final_model.fit(X_train,y_train)

In [8]:
y_pred = final_model.predict(X_test)

In [10]:
from sklearn.metrics import classification_report,confusion_matrix
y_true = y_test
target_names = [ 'dos', 'normal', 'probe' , 'r2l' , 'u2r' ]
report = classification_report(y_true, y_pred, target_names=target_names)
print(report)

              precision    recall  f1-score   support

         dos       1.00      1.00      1.00     13778
      normal       0.99      1.00      0.99     20479
       probe       0.98      0.98      0.98      3497
         r2l       1.00      0.56      0.72        25
         u2r       1.00      0.15      0.27        13

    accuracy                           0.99     37792
   macro avg       0.99      0.74      0.79     37792
weighted avg       0.99      0.99      0.99     37792



In [11]:
conf_mat = confusion_matrix(y_true,y_pred)
pd.DataFrame(conf_mat , index = target_names , columns = target_names)

Unnamed: 0,dos,normal,probe,r2l,u2r
dos,13745,33,0,0,0
normal,16,20387,76,0,0
probe,1,67,3429,0,0
r2l,3,8,0,14,0
u2r,0,11,0,0,2


In [12]:
from sklearn.metrics import accuracy_score , precision_score , recall_score , f1_score 
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score( y_true , y_pred , average = None )
recall = recall_score( y_true , y_pred , average = None )
f1score = f1_score( y_true , y_pred , average = None )


print(accuracy)
print(precision)
print(recall)
print(f1score)

0.9943109652836579
[0.99854704 0.99419682 0.97831669 1.         1.        ]
[0.99760488 0.99550759 0.98055476 0.56       0.15384615]
[0.99807574 0.99485178 0.97943445 0.71794872 0.26666667]


In [13]:
data = {
    "model" : "Support Vector Classifier",
    "kernel" : 'rbf' ,
    "accuracy" : accuracy ,
    "precision" : precision.tolist() ,
    "recall" : recall.tolist() ,
    "F1_score" : f1score.tolist() ,
    "class_names" : target_names
}

data

{'model': 'Support Vector Classifier',
 'kernel': 'rbf',
 'accuracy': 0.9943109652836579,
 'precision': [0.998547039593171,
  0.9941968204427972,
  0.9783166904422254,
  1.0,
  1.0],
 'recall': [0.997604877340688,
  0.9955075931441965,
  0.9805547612239062,
  0.56,
  0.15384615384615385],
 'F1_score': [0.9980757361217005,
  0.9948517750396486,
  0.9794344473007712,
  0.717948717948718,
  0.2666666666666667],
 'class_names': ['dos', 'normal', 'probe', 'r2l', 'u2r']}

In [14]:
import json
with open("../../model_performances/support_vector_classifier.json", "w") as outfile:
    json.dump(data, outfile)