In [38]:
import time
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Data Processing

In [56]:
# Load the dataset. The 'attack' column is the class label (the code below
# treats -1 as the negative class and 1 as the positive class).
data = pd.read_csv("data.csv")
target = data['attack']

# Build the feature matrix by removing the label and the 'packet' column.
# A non-mutating drop keeps this cell safe to re-run and avoids `inplace=True`.
# NOTE(review): 'packet' is presumably an identifier rather than a feature -- confirm.
data = data.drop(columns=['packet', 'attack'])

# The original cell called `data.sample(frac=1)` and discarded the result, so
# it never shuffled anything. It is removed: train_test_split already shuffles
# by default, and random_state pins that shuffle.
# NOTE(review): test_size=0.8 trains on 20% and tests on 80% of the rows --
# confirm this is intended and not a swapped train/test ratio.
train_data, test_data, train_target, test_target = train_test_split(
    data, target, test_size=0.8, random_state=50)

In [57]:
def PrintStats(y_pred, test_target):
    """Print the misclassification count and the confusion-matrix tallies.

    A true label of -1 is the negative class; any other true label counts as
    positive. A prediction counts as positive only when it equals 1.

    Args:
        y_pred: Predicted labels, indexed with 0..n-1.
        test_target: True labels; must provide ``.shape``, ``.to_numpy()``
            and element-wise ``!=`` against ``y_pred`` (e.g. a pandas Series).
    """
    total = test_target.shape[0]
    mislabeled = (test_target != y_pred).sum()
    print("Number of mislabeled points out of a total %d points : %d"
      % (total, mislabeled))

    actual = test_target.to_numpy()

    # (true label is negative, prediction is positive) -> tally bucket
    bucket_for = {
        (True, True): "fp",
        (True, False): "tn",
        (False, True): "tp",
        (False, False): "fn",
    }
    tally = dict.fromkeys(("tp", "tn", "fp", "fn"), 0)
    for idx, truth in enumerate(actual):
        key = (bool(truth == -1), bool(y_pred[idx] == 1))
        tally[bucket_for[key]] += 1

    print("True Positives :", tally["tp"])
    print("True Negatives :", tally["tn"])
    print("False Positives :", tally["fp"])
    print("False Negatives :", tally["fn"])

In [58]:
def mode(sample):
    """Return every value that occurs most often in ``sample``.

    Args:
        sample: Iterable of hashable values.

    Returns:
        A list of the most frequent value(s), in order of first appearance.
        It holds more than one element when several values tie for the
        highest count, and is empty when ``sample`` is empty.
    """
    counts = Counter(sample)
    if not counts:
        # The original raised IndexError on empty input.
        return []
    # Compute the top frequency once instead of once per distinct value.
    highest = max(counts.values())
    return [value for value, freq in counts.items() if freq == highest]

# Decision Tree, KNN and Gaussian Naive Bayes

In [59]:
# Majority-vote ensemble of a decision tree, a 2-nearest-neighbours classifier
# and Gaussian naive Bayes. The timer covers fitting, prediction and voting.
# NOTE(review): DecisionTreeClassifier is unseeded, so results can vary
# between runs -- consider passing random_state.
start = time.time()

model1 = DecisionTreeClassifier()
model2 = KNeighborsClassifier(n_neighbors = 2)
model3 = GaussianNB()

model1.fit(train_data, train_target)
model2.fit(train_data, train_target)
model3.fit(train_data, train_target)

pred1 = model1.predict(test_data)
pred2 = model2.predict(test_data)
pred3 = model3.predict(test_data)

# One vote per model per test row. mode() returns *all* tied labels, so take
# the first one to guarantee exactly one prediction per row; the array is
# built in a single pass instead of by repeated np.append, which copies the
# whole array on every call.
final_pred = np.array([mode(votes)[0] for votes in zip(pred1, pred2, pred3)])

end = time.time()
print("The time taken is: ",(end-start)*1000,"millisec")
PrintStats(final_pred, test_target)

The time taken is:  540.3978824615479 millisec
Number of mislabeled points out of a total 15098 points : 6
True Positives : 14749
True Negatives : 343
False Positives : 6
False Negatives : 0


In [60]:
# Share of test rows whose voted label (final_pred) equals the true label, as a percentage.
print("The accuracy is: ",accuracy_score(test_target, final_pred)*100,"%")

The accuracy is:  99.96025963703802 %


In [61]:
# Per-class precision, recall, f1-score and support for the voted predictions in final_pred.
print(classification_report(test_target, final_pred))

              precision    recall  f1-score   support

          -1       1.00      0.98      0.99       349
           1       1.00      1.00      1.00     14749

   micro avg       1.00      1.00      1.00     15098
   macro avg       1.00      0.99      1.00     15098
weighted avg       1.00      1.00      1.00     15098



# Random Forest, Decision Tree and Quadratic SVM

In [62]:
# Majority-vote ensemble of a 100-tree random forest, a decision tree and an
# SVM with a degree-2 polynomial kernel. The timer covers fitting, prediction
# and voting.
# NOTE(review): the forest and the tree are unseeded, so results can vary
# between runs -- consider passing random_state.
start = time.time()

model1 = RandomForestClassifier(n_estimators = 100)
model2 = DecisionTreeClassifier()
model3 = SVC(kernel='poly', degree=2, gamma='scale')

model1.fit(train_data, train_target)
model2.fit(train_data, train_target)
model3.fit(train_data, train_target)

pred1 = model1.predict(test_data)
pred2 = model2.predict(test_data)
pred3 = model3.predict(test_data)

# One vote per model per test row. mode() returns *all* tied labels, so take
# the first one to guarantee exactly one prediction per row; the array is
# built in a single pass instead of by repeated np.append, which copies the
# whole array on every call.
final_pred = np.array([mode(votes)[0] for votes in zip(pred1, pred2, pred3)])

end = time.time()
print("The time taken is: ",(end-start)*1000,"millisec")
PrintStats(final_pred, test_target)

The time taken is:  409.99722480773926 millisec
Number of mislabeled points out of a total 15098 points : 8
True Positives : 14749
True Negatives : 341
False Positives : 8
False Negatives : 0


In [63]:
# Share of test rows whose voted label (final_pred) equals the true label, as a percentage.
print("The accuracy is: ",accuracy_score(test_target, final_pred)*100,"%")

The accuracy is:  99.94701284938402 %


In [64]:
# Per-class precision, recall, f1-score and support for the voted predictions in final_pred.
print(classification_report(test_target, final_pred))

              precision    recall  f1-score   support

          -1       1.00      0.98      0.99       349
           1       1.00      1.00      1.00     14749

   micro avg       1.00      1.00      1.00     15098
   macro avg       1.00      0.99      0.99     15098
weighted avg       1.00      1.00      1.00     15098



# Random Forest and Decision Tree

In [65]:
# Two-model vote between a 100-tree random forest and a decision tree.
# The timer covers fitting, prediction and voting.
# NOTE(review): both models are unseeded, so results can vary between runs --
# consider passing random_state.
start = time.time()

model1 = RandomForestClassifier(n_estimators = 100)
model2 = DecisionTreeClassifier()

model1.fit(train_data, train_target)
model2.fit(train_data, train_target)

pred1 = model1.predict(test_data)
pred2 = model2.predict(test_data)

# With only two voters every disagreement is a tie, and mode() then returns
# BOTH labels. The original appended both, making final_pred longer than
# test_target and shifting every later prediction out of alignment. Taking
# the first tied label (the random forest's vote, since it is counted first)
# guarantees exactly one prediction per test row.
# NOTE(review): with this tie-break the result always equals pred1; a real
# two-model ensemble would need e.g. averaged predict_proba scores.
final_pred = np.array([mode(votes)[0] for votes in zip(pred1, pred2)])

end = time.time()
print("The time taken is: ",(end-start)*1000,"millisec")
PrintStats(final_pred, test_target)

The time taken is:  387.87841796875 millisec
Number of mislabeled points out of a total 15098 points : 8
True Positives : 14749
True Negatives : 341
False Positives : 8
False Negatives : 0


In [66]:
# Share of test rows whose voted label (final_pred) equals the true label, as a percentage.
print("The accuracy is: ",accuracy_score(test_target, final_pred)*100,"%")

The accuracy is:  99.94701284938402 %


In [67]:
# Per-class precision, recall, f1-score and support for the voted predictions in final_pred.
print(classification_report(test_target, final_pred))

              precision    recall  f1-score   support

          -1       1.00      0.98      0.99       349
           1       1.00      1.00      1.00     14749

   micro avg       1.00      1.00      1.00     15098
   macro avg       1.00      0.99      0.99     15098
weighted avg       1.00      1.00      1.00     15098

