In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('../../data/preprocessed_data.csv')
data.head(5)

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,su_attempted,num_root,...,service_smtp,flag_S0,flag_SF,flag_other,land_1,logged_in_1,root_shell_1,is_host_login_1,is_guest_login_1,attack_category
0,-0.110249,-0.007679,-0.004919,-0.089486,-0.007736,-0.095076,-0.027023,-0.011664,-0.024437,-0.012385,...,0,0,1,0,0,0,0,0,0,1
1,-0.110249,-0.007737,-0.004919,-0.089486,-0.007736,-0.095076,-0.027023,-0.011664,-0.024437,-0.012385,...,0,0,1,0,0,0,0,0,0,1
2,-0.110249,-0.007762,-0.004919,-0.089486,-0.007736,-0.095076,-0.027023,-0.011664,-0.024437,-0.012385,...,0,1,0,0,0,0,0,0,0,0
3,-0.110249,-0.007723,-0.002891,-0.089486,-0.007736,-0.095076,-0.027023,-0.011664,-0.024437,-0.012385,...,0,0,1,0,0,1,0,0,0,1
4,-0.110249,-0.007728,-0.004814,-0.089486,-0.007736,-0.095076,-0.027023,-0.011664,-0.024437,-0.012385,...,0,0,1,0,0,1,0,0,0,1


In [3]:
X = data.drop(['attack_category'],axis=1)
y = data['attack_category']

<br/>
<br/>
<br/>
<br/>

### TRAIN TEST SPLIT

In [4]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split( X , y , test_size = 0.3 , random_state = 42 , stratify = y )

<br/>
<br/>
<br/>

#### REINDEXING DATA

In [5]:
X_train = X_train.reset_index(drop = True)
y_train = y_train.reset_index(drop = True)

X_test = X_test.reset_index(drop = True)
y_test = y_test.reset_index(drop = True)

<br/>
<br/>
<br/>

### MODEL

In [6]:
from sklearn.ensemble import AdaBoostClassifier
adaboost_classifier = AdaBoostClassifier()
adaboost_classifier.fit(X_train,y_train)

In [7]:
y_pred = adaboost_classifier.predict(X_test)

In [8]:
from sklearn.metrics import classification_report
y_true = y_test
target_names = [ 'dos', 'normal', 'probe' , 'r2l' , 'u2r' ]
report = classification_report(y_true, y_pred, target_names=target_names)
print(report)

              precision    recall  f1-score   support

         dos       0.78      0.86      0.82     13778
      normal       0.90      0.82      0.86     20479
       probe       0.69      0.77      0.73      3497
         r2l       0.67      0.08      0.14        25
         u2r       0.07      0.23      0.11        13

    accuracy                           0.83     37792
   macro avg       0.62      0.55      0.53     37792
weighted avg       0.84      0.83      0.83     37792



In [9]:
from sklearn.metrics import confusion_matrix
cnf_mat = confusion_matrix(y_true,y_pred)
pd.DataFrame(cnf_mat , index = target_names , columns = target_names)

Unnamed: 0,dos,normal,probe,r2l,u2r
dos,11870,1772,136,0,0
normal,2571,16826,1041,1,40
probe,776,41,2680,0,0
r2l,0,23,0,2,0
u2r,0,9,1,0,3


<br/>
<br/>
<br/>
<br/>

#### APPLYING GRID SEARCH CV

In [10]:
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV( AdaBoostClassifier( random_state = 42 ) , {
    'learning_rate':[0.01,0.5,0.1],
    'n_estimators' : [300,400,500],
} , cv=5 ,return_train_score = False,verbose=5 )

clf.fit(X,y)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END learning_rate=0.01, n_estimators=300;, score=0.934 total time=  41.9s
[CV 2/5] END learning_rate=0.01, n_estimators=300;, score=0.869 total time=  40.9s
[CV 3/5] END learning_rate=0.01, n_estimators=300;, score=0.932 total time=  41.0s
[CV 4/5] END learning_rate=0.01, n_estimators=300;, score=0.929 total time=  41.8s
[CV 5/5] END learning_rate=0.01, n_estimators=300;, score=0.934 total time=  40.9s
[CV 1/5] END learning_rate=0.01, n_estimators=400;, score=0.938 total time=  55.3s
[CV 2/5] END learning_rate=0.01, n_estimators=400;, score=0.838 total time=  55.0s
[CV 3/5] END learning_rate=0.01, n_estimators=400;, score=0.932 total time=  54.2s
[CV 4/5] END learning_rate=0.01, n_estimators=400;, score=0.934 total time=  54.7s
[CV 5/5] END learning_rate=0.01, n_estimators=400;, score=0.934 total time=  54.5s
[CV 1/5] END learning_rate=0.01, n_estimators=500;, score=0.952 total time= 1.1min
[CV 2/5] END learning_rate=

In [11]:
df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,39.027046,0.402953,2.387994,0.063774,0.01,300,"{'learning_rate': 0.01, 'n_estimators': 300}",0.934074,0.869022,0.931574,0.929467,0.933714,0.91957,0.025328,1
1,51.740222,0.338012,3.091386,0.122234,0.01,400,"{'learning_rate': 0.01, 'n_estimators': 400}",0.937686,0.837865,0.932248,0.93431,0.934349,0.915292,0.038753,2
2,64.893193,0.6058,3.836199,0.186118,0.01,500,"{'learning_rate': 0.01, 'n_estimators': 500}",0.951617,0.57178,0.937885,0.932405,0.950822,0.868902,0.148745,3
3,38.848393,0.102981,2.270606,0.04796,0.5,300,"{'learning_rate': 0.5, 'n_estimators': 300}",0.740544,0.526057,0.743759,0.57474,0.776891,0.672398,0.101595,5
4,51.887222,0.118059,3.03698,0.073105,0.5,400,"{'learning_rate': 0.5, 'n_estimators': 400}",0.744354,0.516888,0.764715,0.700643,0.739581,0.693236,0.09058,4
5,64.564548,0.213351,3.760203,0.036718,0.5,500,"{'learning_rate': 0.5, 'n_estimators': 500}",0.742092,0.499623,0.767494,0.526991,0.728546,0.652949,0.115027,6
6,38.757786,0.06658,2.255414,0.027205,0.1,300,"{'learning_rate': 0.1, 'n_estimators': 300}",0.601667,0.547371,0.577853,0.604906,0.602247,0.586809,0.02201,7
7,51.921392,0.091241,3.053407,0.052607,0.1,400,"{'learning_rate': 0.1, 'n_estimators': 400}",0.587537,0.549157,0.566263,0.562753,0.565492,0.56624,0.012314,8
8,65.327779,0.514669,3.839209,0.120947,0.1,500,"{'learning_rate': 0.1, 'n_estimators': 500}",0.54229,0.54876,0.56404,0.539057,0.546837,0.548197,0.008621,9


In [12]:
df = df[['param_learning_rate' , 'param_n_estimators' , 'mean_test_score']]
df.sort_values(by=['mean_test_score'] , ascending=False)

Unnamed: 0,param_learning_rate,param_n_estimators,mean_test_score
0,0.01,300,0.91957
1,0.01,400,0.915292
2,0.01,500,0.868902
4,0.5,400,0.693236
3,0.5,300,0.672398
5,0.5,500,0.652949
6,0.1,300,0.586809
7,0.1,400,0.56624
8,0.1,500,0.548197


<br/>
<br/>
<br/>
<br/>

#### FINAL MODEL

In [13]:
final_model = AdaBoostClassifier( learning_rate=0.01 , n_estimators=300 )
final_model.fit(X_train,y_train)

In [14]:
y_pred = final_model.predict(X_test)

In [15]:
y_true = y_test
target_names = [ 'dos', 'normal', 'probe' , 'r2l' , 'u2r' ]
report = classification_report(y_true, y_pred, target_names=target_names)
print(report)

              precision    recall  f1-score   support

         dos       0.93      0.91      0.92     13778
      normal       0.93      1.00      0.96     20479
       probe       0.97      0.69      0.81      3497
         r2l       0.00      0.00      0.00        25
         u2r       0.00      0.00      0.00        13

    accuracy                           0.93     37792
   macro avg       0.57      0.52      0.54     37792
weighted avg       0.93      0.93      0.93     37792



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
conf_mat = confusion_matrix(y_true,y_pred)
pd.DataFrame(conf_mat , index = target_names , columns = target_names)

Unnamed: 0,dos,normal,probe,r2l,u2r
dos,12503,1267,8,0,0
normal,47,20379,53,0,0
probe,917,170,2410,0,0
r2l,3,22,0,0,0
u2r,0,12,1,0,0


In [17]:
from sklearn.metrics import accuracy_score , precision_score , recall_score , f1_score 
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score( y_true , y_pred , average = None )
recall = recall_score( y_true , y_pred , average = None )
f1score = f1_score( y_true , y_pred , average = None )


print(accuracy)
print(precision)
print(recall)
print(f1score)

0.933848433530906
[0.92821084 0.93267735 0.97491909 0.         0.        ]
[0.90746117 0.99511695 0.68916214 0.         0.        ]
[0.91771873 0.96288596 0.80750544 0.         0.        ]


  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
data = {
    "model" : "AdaBoost Classifier",
    "learning_rate" : 0.01,
    "n_estimators" : 300,
    "accuracy" : accuracy ,
    "precision" : precision.tolist() ,
    "recall" : recall.tolist() ,
    "F1_score" : f1score.tolist() ,
    "class_names" : target_names
}

data

{'model': 'AdaBoost Classifier',
 'learning_rate': 0.01,
 'n_estimators': 300,
 'accuracy': 0.933848433530906,
 'precision': [0.9282108389012621,
  0.9326773455377574,
  0.9749190938511327,
  0.0,
  0.0],
 'recall': [0.9074611699811294,
  0.9951169490697788,
  0.6891621389762653,
  0.0,
  0.0],
 'F1_score': [0.9177187316500294,
  0.9628859647050486,
  0.8075054447981236,
  0.0,
  0.0],
 'class_names': ['dos', 'normal', 'probe', 'r2l', 'u2r']}

In [19]:
import json
with open("../../model_performances/adaboost_classifier.json", "w") as outfile:
    json.dump(data, outfile)