# DNS Tunneling Detection

Dataset credits: https://github.com/chuayupeng/dns-tunnelling-detection

In [2]:
import time
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Data Processing

In [3]:
# Load the raw table. 'attack' is the label column; 'packet' is excluded from
# the feature set.
raw_data = pd.read_csv("data.csv")
target = raw_data['attack']

# Build the feature frame without mutating the loaded frame, so this cell can
# be reasoned about (and re-run) independently of later cells.
data = raw_data.drop(columns=['packet', 'attack'])

# The previous `data.sample(frac=1)` call discarded its result and therefore
# never shuffled anything. No separate shuffle is needed: train_test_split
# shuffles by default, and random_state makes that shuffle reproducible.
train_data, test_data, train_target, test_target = train_test_split(
    data, target, test_size=0.2, random_state=50
)

In [4]:
def PrintStats(y_pred, test_target):
    """Print the misclassification count and the four confusion-matrix tallies.

    Label interpretation: a true label of -1 counts as negative and any other
    true label as positive; a prediction of 1 counts as positive and any other
    prediction as negative.

    Args:
        y_pred: Predicted labels, indexable by position 0..n-1.
        test_target: True labels. Must provide ``.shape``, elementwise ``!=``
            against ``y_pred`` and ``.to_numpy()`` (e.g. a pandas Series).
    """
    total_points = test_target.shape[0]
    mislabeled = (test_target != y_pred).sum()
    print("Number of mislabeled points out of a total %d points : %d"
          % (total_points, mislabeled))

    actual = test_target.to_numpy()

    # Tallies keyed by (actual is negative, predicted is positive).
    tally = {
        (False, True): 0,   # true positives
        (True, False): 0,   # true negatives
        (True, True): 0,    # false positives
        (False, False): 0,  # false negatives
    }
    for idx in range(actual.shape[0]):
        actual_is_negative = bool(actual[idx] == -1)
        predicted_positive = bool(y_pred[idx] == 1)
        tally[(actual_is_negative, predicted_positive)] += 1

    print("True Positives :", tally[(False, True)])
    print("True Negatives :", tally[(True, False)])
    print("False Positives :", tally[(True, True)])
    print("False Negatives :", tally[(False, False)])

# Gaussian Naive Bayes Classifier

In [5]:
# Time the combined fit + predict step for Gaussian Naive Bayes.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
gnb = GaussianNB()
y_pred = gnb.fit(train_data, train_target).predict(test_data)
end = time.perf_counter()
print("The time taken is: ",(end-start)*1000,"millisec")
PrintStats(y_pred, test_target)

The time taken is:  6.436347961425781 millisec
Number of mislabeled points out of a total 3775 points : 72
True Positives : 3686
True Negatives : 17
False Positives : 72
False Negatives : 0


In [6]:
# Share of test samples that Gaussian Naive Bayes labelled correctly, as a percentage.
print("The accuracy is: ", accuracy_score(y_true=test_target, y_pred=y_pred) * 100, "%")

The accuracy is:  98.09271523178808 %


In [7]:
# Per-class precision, recall, F1 and support for the Gaussian Naive Bayes predictions.
print(classification_report(y_true=test_target, y_pred=y_pred))

              precision    recall  f1-score   support

          -1       1.00      0.19      0.32        89
           1       0.98      1.00      0.99      3686

   micro avg       0.98      0.98      0.98      3775
   macro avg       0.99      0.60      0.66      3775
weighted avg       0.98      0.98      0.97      3775



# Multinomial Naive Bayes Classifier

In [8]:
# Time the combined fit + predict step for Multinomial Naive Bayes.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
MultiNB = MultinomialNB()
y_pred = MultiNB.fit(train_data, train_target).predict(test_data)
end = time.perf_counter()
print("The time taken is: ",(end-start)*1000,"millisec")
PrintStats(y_pred, test_target)

The time taken is:  5.5179595947265625 millisec
Number of mislabeled points out of a total 3775 points : 81
True Positives : 3686
True Negatives : 8
False Positives : 81
False Negatives : 0


In [9]:
# Share of test samples that Multinomial Naive Bayes labelled correctly, as a percentage.
print("The accuracy is: ", accuracy_score(y_true=test_target, y_pred=y_pred) * 100, "%")

The accuracy is:  97.8543046357616 %


In [10]:
# Per-class precision, recall, F1 and support for the Multinomial Naive Bayes predictions.
print(classification_report(y_true=test_target, y_pred=y_pred))

              precision    recall  f1-score   support

          -1       1.00      0.09      0.16        89
           1       0.98      1.00      0.99      3686

   micro avg       0.98      0.98      0.98      3775
   macro avg       0.99      0.54      0.58      3775
weighted avg       0.98      0.98      0.97      3775



# Bernoulli Naive Bayes Classifier

In [11]:
# Time the combined fit + predict step for Bernoulli Naive Bayes.
# binarize=0.05 thresholds every feature at 0.05 before fitting.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
BernNB = BernoulliNB(binarize = 0.05)
y_pred = BernNB.fit(train_data, train_target).predict(test_data)
end = time.perf_counter()
print("The time taken is: ",(end-start)*1000,"millisec")
PrintStats(y_pred, test_target)

The time taken is:  12.42685317993164 millisec
Number of mislabeled points out of a total 3775 points : 89
True Positives : 3686
True Negatives : 0
False Positives : 89
False Negatives : 0


In [12]:
# Share of test samples that Bernoulli Naive Bayes labelled correctly, as a percentage.
print("The accuracy is: ", accuracy_score(y_true=test_target, y_pred=y_pred) * 100, "%")

The accuracy is:  97.64238410596026 %


In [13]:
# Per-class precision, recall, F1 and support for the Bernoulli Naive Bayes predictions.
# In the recorded run the model never predicts -1, so that class's precision is
# undefined and is reported as 0.00.
print(classification_report(y_true=test_target, y_pred=y_pred))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        89
           1       0.98      1.00      0.99      3686

   micro avg       0.98      0.98      0.98      3775
   macro avg       0.49      0.50      0.49      3775
weighted avg       0.95      0.98      0.96      3775



  'precision', 'predicted', average, warn_for)


# Random Forest Classifier

In [14]:
# Time the combined fit + predict step for a 100-tree random forest.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
# random_state pins the forest's bootstrap and feature sampling so a
# Restart & Run All reproduces the same model (50 matches the split seed).
clf = RandomForestClassifier(n_estimators = 100, random_state = 50)
clf.fit(train_data, train_target)
y_pred = clf.predict(test_data)
end = time.perf_counter()
print("The time taken is: ",(end-start)*1000,"millisec")
PrintStats(y_pred, test_target)

The time taken is:  313.8902187347412 millisec
Number of mislabeled points out of a total 3775 points : 0
True Positives : 3686
True Negatives : 89
False Positives : 0
False Negatives : 0


In [15]:
# Share of test samples that the random forest labelled correctly, as a percentage.
print("The accuracy is: ", accuracy_score(y_true=test_target, y_pred=y_pred) * 100, "%")

The accuracy is:  100.0 %


In [16]:
# Per-class precision, recall, F1 and support for the random forest predictions.
print(classification_report(y_true=test_target, y_pred=y_pred))

              precision    recall  f1-score   support

          -1       1.00      1.00      1.00        89
           1       1.00      1.00      1.00      3686

   micro avg       1.00      1.00      1.00      3775
   macro avg       1.00      1.00      1.00      3775
weighted avg       1.00      1.00      1.00      3775



# Decision Tree Classifier

In [17]:
# Time the combined fit + predict step for a single decision tree.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
# random_state fixes the tree's internal randomisation so a Restart & Run All
# reproduces the same tree (50 matches the split seed).
clf = DecisionTreeClassifier(random_state = 50)
clf.fit(train_data, train_target)
y_pred = clf.predict(test_data)
end = time.perf_counter()
print("The time taken is: ",(end-start)*1000,"millisec")
PrintStats(y_pred, test_target)

The time taken is:  15.103816986083984 millisec
Number of mislabeled points out of a total 3775 points : 0
True Positives : 3686
True Negatives : 89
False Positives : 0
False Negatives : 0


In [18]:
# Share of test samples that the decision tree labelled correctly, as a percentage.
print("The accuracy is: ", accuracy_score(y_true=test_target, y_pred=y_pred) * 100, "%")

The accuracy is:  100.0 %


In [19]:
# Per-class precision, recall, F1 and support for the decision tree predictions.
print(classification_report(y_true=test_target, y_pred=y_pred))

              precision    recall  f1-score   support

          -1       1.00      1.00      1.00        89
           1       1.00      1.00      1.00      3686

   micro avg       1.00      1.00      1.00      3775
   macro avg       1.00      1.00      1.00      3775
weighted avg       1.00      1.00      1.00      3775



# Multi-Layer Perceptron

In [20]:
# Time the combined fit + predict step for a multi-layer perceptron with two
# hidden layers (15 and 10 units), trained with the L-BFGS solver.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15, 10), random_state=1)
clf.fit(train_data, train_target)
y_pred = clf.predict(test_data)
end = time.perf_counter()
print("The time taken is: ",(end-start)*1000,"millisec")
PrintStats(y_pred, test_target)

The time taken is:  224.2448329925537 millisec
Number of mislabeled points out of a total 3775 points : 89
True Positives : 3686
True Negatives : 0
False Positives : 89
False Negatives : 0


In [21]:
# Share of test samples that the multi-layer perceptron labelled correctly, as a percentage.
print("The accuracy is: ", accuracy_score(y_true=test_target, y_pred=y_pred) * 100, "%")

The accuracy is:  97.64238410596026 %


In [22]:
# Per-class precision, recall, F1 and support for the multi-layer perceptron predictions.
# In the recorded run the model never predicts -1, so that class's precision is
# undefined and is reported as 0.00.
print(classification_report(y_true=test_target, y_pred=y_pred))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        89
           1       0.98      1.00      0.99      3686

   micro avg       0.98      0.98      0.98      3775
   macro avg       0.49      0.50      0.49      3775
weighted avg       0.95      0.98      0.96      3775



  'precision', 'predicted', average, warn_for)


# Linear Support Vector Machine 

In [23]:
# Time the combined fit + predict step for a linear support vector classifier.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
# random_state seeds the solver's data shuffling so repeated runs give the
# same model (50 matches the split seed).
SVML = LinearSVC(random_state = 50)
y_pred = SVML.fit(train_data, train_target).predict(test_data)
end = time.perf_counter()
print("The time taken is: ",(end-start)*1000,"millisec")
PrintStats(y_pred, test_target)

The time taken is:  514.93239402771 millisec
Number of mislabeled points out of a total 3775 points : 89
True Positives : 3686
True Negatives : 0
False Positives : 89
False Negatives : 0




In [24]:
# Share of test samples that the linear SVM labelled correctly, as a percentage.
print("The accuracy is: ", accuracy_score(y_true=test_target, y_pred=y_pred) * 100, "%")

The accuracy is:  97.64238410596026 %


In [25]:
# Per-class precision, recall, F1 and support for the linear SVM predictions.
# In the recorded run the model never predicts -1, so that class's precision is
# undefined and is reported as 0.00.
print(classification_report(y_true=test_target, y_pred=y_pred))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        89
           1       0.98      1.00      0.99      3686

   micro avg       0.98      0.98      0.98      3775
   macro avg       0.49      0.50      0.49      3775
weighted avg       0.95      0.98      0.96      3775



  'precision', 'predicted', average, warn_for)


# Quadratic Support Vector Machine

In [26]:
# Time the combined fit + predict step for an SVM with a degree-2 polynomial
# kernel (the "quadratic" SVM).
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
SVMQ = SVC(kernel='poly', degree=2, gamma='scale')
y_pred = SVMQ.fit(train_data, train_target).predict(test_data)
end = time.perf_counter()
print("The time taken is: ",(end-start)*1000,"millisec")
PrintStats(y_pred, test_target)

The time taken is:  86.66110038757324 millisec
Number of mislabeled points out of a total 3775 points : 2
True Positives : 3686
True Negatives : 87
False Positives : 2
False Negatives : 0


In [27]:
# Share of test samples that the quadratic SVM labelled correctly, as a percentage.
print("The accuracy is: ", accuracy_score(y_true=test_target, y_pred=y_pred) * 100, "%")

The accuracy is:  99.94701986754967 %


In [28]:
# Per-class precision, recall, F1 and support for the quadratic SVM predictions.
print(classification_report(y_true=test_target, y_pred=y_pred))

              precision    recall  f1-score   support

          -1       1.00      0.98      0.99        89
           1       1.00      1.00      1.00      3686

   micro avg       1.00      1.00      1.00      3775
   macro avg       1.00      0.99      0.99      3775
weighted avg       1.00      1.00      1.00      3775



# K Nearest Neighbours

In [29]:
# Time the combined fit + predict step for a 2-nearest-neighbours classifier.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
KNN = KNeighborsClassifier(n_neighbors = 2)
y_pred = KNN.fit(train_data, train_target).predict(test_data)
end = time.perf_counter()
print("The time taken is: ",(end-start)*1000,"millisec")
PrintStats(y_pred, test_target)

The time taken is:  109.70950126647949 millisec
Number of mislabeled points out of a total 3775 points : 0
True Positives : 3686
True Negatives : 89
False Positives : 0
False Negatives : 0


In [30]:
# Share of test samples that k-nearest neighbours labelled correctly, as a percentage.
print("The accuracy is: ", accuracy_score(y_true=test_target, y_pred=y_pred) * 100, "%")

The accuracy is:  100.0 %


In [31]:
# Per-class precision, recall, F1 and support for the k-nearest-neighbours predictions.
print(classification_report(y_true=test_target, y_pred=y_pred))

              precision    recall  f1-score   support

          -1       1.00      1.00      1.00        89
           1       1.00      1.00      1.00      3686

   micro avg       1.00      1.00      1.00      3775
   macro avg       1.00      1.00      1.00      3775
weighted avg       1.00      1.00      1.00      3775

