# DNS Tunneling Detection

Dataset credits: https://github.com/chuayupeng/dns-tunnelling-detection

In [1]:
import time
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Data Processing

In [2]:
# Load the labelled DNS capture from the CSV file shipped with the notebook.
data = pd.read_csv("data.csv")

# The 'attack' column holds the class label that the classifiers predict.
target = data['attack']

# Keep only the feature columns: the label and the 'packet' column are dropped.
# A new frame is assigned instead of mutating in place.
data = data.drop(columns=['packet', 'attack'])

# The original cell called data.sample(frac=1) and discarded the result, so it
# never shuffled anything. It is removed: train_test_split already shuffles the
# rows by default, reproducibly via random_state=0.
# NOTE(review): test_size=0.8 holds out 80% of the rows for testing and trains
# on the remaining 20% - confirm this ratio is intentional.
train_data, test_data, train_target, test_target = train_test_split(
    data, target, test_size=0.8, random_state=0
)

In [3]:
def PrintStats(y_pred, test_target):
    """Print the misclassification count and the confusion-matrix cells.

    Labels are paired by position. A true label of -1 is the negative class
    and any other true label counts as positive; a prediction of 1 is
    positive and any other prediction counts as negative.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels (list, NumPy array or pandas Series).
    test_target : array-like
        True labels, in the same order and of the same length as ``y_pred``.

    Raises
    ------
    ValueError
        If ``y_pred`` and ``test_target`` differ in length.
    """
    # Materialise both inputs as plain lists so entries are always paired by
    # position. Indexing a pandas Series with an integer looks the value up by
    # index label, which silently pairs the wrong rows once the index has been
    # shuffled (as it is after a train/test split).
    actual = list(test_target)
    predicted = list(y_pred)
    if len(actual) != len(predicted):
        raise ValueError(
            "y_pred and test_target must have the same length (got %d and %d)"
            % (len(predicted), len(actual))
        )

    mislabeled = sum(1 for truth, guess in zip(actual, predicted) if truth != guess)
    print("Number of mislabeled points out of a total %d points : %d"
          % (len(actual), mislabeled))

    tp = tn = fp = fn = 0
    for truth, guess in zip(actual, predicted):
        predicted_positive = guess == 1
        if truth == -1:
            # Actual negative: a positive prediction is a false alarm.
            if predicted_positive:
                fp += 1
            else:
                tn += 1
        else:
            # Actual positive: a non-positive prediction is a miss.
            if predicted_positive:
                tp += 1
            else:
                fn += 1

    print("True Positives :", tp)
    print("True Negatives :", tn)
    print("False Positives :", fp)
    print("False Negatives :", fn)

# Gaussian Naive Bayes Classifier

In [38]:
# Gaussian Naive Bayes: the model is built before the clock starts, so the
# timed region covers training on the train split and prediction on the test
# split only.
gnb = GaussianNB()
start = time.time()
gnb.fit(train_data, train_target)
y_pred = gnb.predict(test_data)
end = time.time()
elapsed_ms = (end - start) * 1000
print("The time taken is: ", elapsed_ms, "millisec")
PrintStats(y_pred, test_target)

The time taken is:  6.273031234741211 millisec
Number of mislabeled points out of a total 15098 points : 170
True Positives : 14760
True Negatives : 168
False Positives : 170
False Negatives : 0


In [39]:
# Overall accuracy of the Gaussian Naive Bayes predictions, as a percentage.
accuracy_pct = accuracy_score(test_target, y_pred) * 100
print("The accuracy is: ", accuracy_pct, "%")

The accuracy is:  98.87402304941052 %


In [40]:
# Per-class precision, recall and F1 for the Gaussian Naive Bayes predictions.
report = classification_report(test_target, y_pred)
print(report)

              precision    recall  f1-score   support

          -1       1.00      0.50      0.66       338
           1       0.99      1.00      0.99     14760

   micro avg       0.99      0.99      0.99     15098
   macro avg       0.99      0.75      0.83     15098
weighted avg       0.99      0.99      0.99     15098



# Multinomial Naive Bayes Classifier

In [41]:
# Multinomial Naive Bayes: the timed region covers model construction,
# training on the train split and prediction on the test split.
start = time.time()
MultiNB = MultinomialNB()
MultiNB.fit(train_data, train_target)
y_pred = MultiNB.predict(test_data)
end = time.time()
elapsed_ms = (end - start) * 1000
print("The time taken is: ", elapsed_ms, "millisec")
PrintStats(y_pred, test_target)

The time taken is:  8.806943893432617 millisec
Number of mislabeled points out of a total 15098 points : 1926
True Positives : 12897
True Negatives : 275
False Positives : 63
False Negatives : 1863


In [42]:
# Overall accuracy of the Multinomial Naive Bayes predictions, as a percentage.
accuracy_pct = accuracy_score(test_target, y_pred) * 100
print("The accuracy is: ", accuracy_pct, "%")

The accuracy is:  87.24334348920387 %


In [43]:
# Per-class precision, recall and F1 for the Multinomial Naive Bayes predictions.
report = classification_report(test_target, y_pred)
print(report)

              precision    recall  f1-score   support

          -1       0.13      0.81      0.22       338
           1       1.00      0.87      0.93     14760

   micro avg       0.87      0.87      0.87     15098
   macro avg       0.56      0.84      0.58     15098
weighted avg       0.98      0.87      0.91     15098



# Bernoulli Naive Bayes Classifier

In [44]:
# Bernoulli Naive Bayes with features binarized at a 0.05 threshold. The timed
# region covers model construction, training and prediction.
start = time.time()
BernNB = BernoulliNB(binarize=0.05)
BernNB.fit(train_data, train_target)
y_pred = BernNB.predict(test_data)
end = time.time()
elapsed_ms = (end - start) * 1000
print("The time taken is: ", elapsed_ms, "millisec")
PrintStats(y_pred, test_target)

The time taken is:  5.345344543457031 millisec
Number of mislabeled points out of a total 15098 points : 338
True Positives : 14760
True Negatives : 0
False Positives : 338
False Negatives : 0


In [45]:
# Overall accuracy of the Bernoulli Naive Bayes predictions, as a percentage.
accuracy_pct = accuracy_score(test_target, y_pred) * 100
print("The accuracy is: ", accuracy_pct, "%")

The accuracy is:  97.76129288647503 %


In [46]:
# Per-class precision, recall and F1 for the Bernoulli Naive Bayes predictions.
report = classification_report(test_target, y_pred)
print(report)

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       338
           1       0.98      1.00      0.99     14760

   micro avg       0.98      0.98      0.98     15098
   macro avg       0.49      0.50      0.49     15098
weighted avg       0.96      0.98      0.97     15098



  'precision', 'predicted', average, warn_for)


# Random Forest Classifier

In [47]:
# Random forest of 100 trees. The timed region covers model construction,
# training on the train split and prediction on the test split.
start = time.time()
# random_state pins the bootstrap sampling and per-split feature selection so
# that a fresh "Restart & Run All" reproduces the same forest; the original
# cell left the estimator unseeded.
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(train_data, train_target)
y_pred = clf.predict(test_data)
end = time.time()
print("The time taken is: ", (end - start) * 1000, "millisec")
PrintStats(y_pred, test_target)

The time taken is:  188.950777053833 millisec
Number of mislabeled points out of a total 15098 points : 0
True Positives : 14760
True Negatives : 338
False Positives : 0
False Negatives : 0


In [48]:
# Overall accuracy of the random forest predictions, as a percentage.
accuracy_pct = accuracy_score(test_target, y_pred) * 100
print("The accuracy is: ", accuracy_pct, "%")

The accuracy is:  100.0 %


In [49]:
# Per-class precision, recall and F1 for the random forest predictions.
report = classification_report(test_target, y_pred)
print(report)

              precision    recall  f1-score   support

          -1       1.00      1.00      1.00       338
           1       1.00      1.00      1.00     14760

   micro avg       1.00      1.00      1.00     15098
   macro avg       1.00      1.00      1.00     15098
weighted avg       1.00      1.00      1.00     15098



# Decision Tree Classifier

In [50]:
# Single decision tree with default hyperparameters. The timed region covers
# model construction, training and prediction.
start = time.time()
# random_state makes the tree reproducible across kernel restarts: scikit-learn
# permutes the candidate features at each split, so an unseeded tree can differ
# between runs when several splits are equally good.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(train_data, train_target)
y_pred = clf.predict(test_data)
end = time.time()
print("The time taken is: ", (end - start) * 1000, "millisec")
PrintStats(y_pred, test_target)

The time taken is:  8.431434631347656 millisec
Number of mislabeled points out of a total 15098 points : 0
True Positives : 14760
True Negatives : 338
False Positives : 0
False Negatives : 0


In [51]:
# Overall accuracy of the decision tree predictions, as a percentage.
accuracy_pct = accuracy_score(test_target, y_pred) * 100
print("The accuracy is: ", accuracy_pct, "%")

The accuracy is:  100.0 %


In [52]:
# Per-class precision, recall and F1 for the decision tree predictions.
report = classification_report(test_target, y_pred)
print(report)

              precision    recall  f1-score   support

          -1       1.00      1.00      1.00       338
           1       1.00      1.00      1.00     14760

   micro avg       1.00      1.00      1.00     15098
   macro avg       1.00      1.00      1.00     15098
weighted avg       1.00      1.00      1.00     15098



# Multi Layer Perceptron

In [53]:
# Multi-layer perceptron with two hidden layers (15 and 10 units), trained with
# the L-BFGS solver, alpha=1e-5 and a fixed seed. The timed region covers model
# construction, training and prediction.
start = time.time()
clf = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(15, 10),
    random_state=1,
)
y_pred = clf.fit(train_data, train_target).predict(test_data)
end = time.time()
elapsed_ms = (end - start) * 1000
print("The time taken is: ", elapsed_ms, "millisec")
PrintStats(y_pred, test_target)

The time taken is:  77.53133773803711 millisec
Number of mislabeled points out of a total 15098 points : 338
True Positives : 14760
True Negatives : 0
False Positives : 338
False Negatives : 0


In [54]:
# Overall accuracy of the multi-layer perceptron predictions, as a percentage.
accuracy_pct = accuracy_score(test_target, y_pred) * 100
print("The accuracy is: ", accuracy_pct, "%")

The accuracy is:  97.76129288647503 %


In [55]:
# Per-class precision, recall and F1 for the multi-layer perceptron predictions.
report = classification_report(test_target, y_pred)
print(report)

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       338
           1       0.98      1.00      0.99     14760

   micro avg       0.98      0.98      0.98     15098
   macro avg       0.49      0.50      0.49     15098
weighted avg       0.96      0.98      0.97     15098



  'precision', 'predicted', average, warn_for)


# Linear Support Vector Machine 

In [56]:
# Linear support vector classifier. The timed region covers model construction,
# training on the train split and prediction on the test split.
start = time.time()
# random_state seeds the data shuffling done by the solver so repeated runs
# give the same model; the original cell left it unseeded.
# NOTE(review): the features are passed in unscaled; if the classifier keeps
# predicting a single class, standardising the inputs may help - to be confirmed.
SVML = LinearSVC(random_state=0)
y_pred = SVML.fit(train_data, train_target).predict(test_data)
end = time.time()
print("The time taken is: ", (end - start) * 1000, "millisec")
PrintStats(y_pred, test_target)

The time taken is:  121.28448486328125 millisec
Number of mislabeled points out of a total 15098 points : 338
True Positives : 14760
True Negatives : 0
False Positives : 338
False Negatives : 0




In [57]:
# Overall accuracy of the linear SVM predictions, as a percentage.
accuracy_pct = accuracy_score(test_target, y_pred) * 100
print("The accuracy is: ", accuracy_pct, "%")

The accuracy is:  97.76129288647503 %


In [58]:
# Per-class precision, recall and F1 for the linear SVM predictions.
report = classification_report(test_target, y_pred)
print(report)

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       338
           1       0.98      1.00      0.99     14760

   micro avg       0.98      0.98      0.98     15098
   macro avg       0.49      0.50      0.49     15098
weighted avg       0.96      0.98      0.97     15098



  'precision', 'predicted', average, warn_for)


# Quadratic Support Vector Machine

In [59]:
# Support vector classifier with a degree-2 polynomial kernel and
# gamma='scale'. The timed region covers model construction, training and
# prediction.
start = time.time()
SVMQ = SVC(kernel='poly', degree=2, gamma='scale')
SVMQ.fit(train_data, train_target)
y_pred = SVMQ.predict(test_data)
end = time.time()
elapsed_ms = (end - start) * 1000
print("The time taken is: ", elapsed_ms, "millisec")
PrintStats(y_pred, test_target)

The time taken is:  42.67311096191406 millisec
Number of mislabeled points out of a total 15098 points : 0
True Positives : 14760
True Negatives : 338
False Positives : 0
False Negatives : 0


In [60]:
# Overall accuracy of the quadratic SVM predictions, as a percentage.
accuracy_pct = accuracy_score(test_target, y_pred) * 100
print("The accuracy is: ", accuracy_pct, "%")

The accuracy is:  100.0 %


In [61]:
# Per-class precision, recall and F1 for the quadratic SVM predictions.
report = classification_report(test_target, y_pred)
print(report)

              precision    recall  f1-score   support

          -1       1.00      1.00      1.00       338
           1       1.00      1.00      1.00     14760

   micro avg       1.00      1.00      1.00     15098
   macro avg       1.00      1.00      1.00     15098
weighted avg       1.00      1.00      1.00     15098



# K Nearest Neighbours

In [62]:
# k-nearest-neighbours classifier with k = 2. The timed region covers model
# construction, fitting on the train split and prediction on the test split.
start = time.time()
KNN = KNeighborsClassifier(n_neighbors=2)
KNN.fit(train_data, train_target)
y_pred = KNN.predict(test_data)
end = time.time()
elapsed_ms = (end - start) * 1000
print("The time taken is: ", elapsed_ms, "millisec")
PrintStats(y_pred, test_target)

The time taken is:  325.3171443939209 millisec
Number of mislabeled points out of a total 15098 points : 4
True Positives : 14760
True Negatives : 334
False Positives : 4
False Negatives : 0


In [63]:
# Overall accuracy of the k-nearest-neighbours predictions, as a percentage.
accuracy_pct = accuracy_score(test_target, y_pred) * 100
print("The accuracy is: ", accuracy_pct, "%")

The accuracy is:  99.97350642469202 %


In [64]:
# Per-class precision, recall and F1 for the k-nearest-neighbours predictions.
report = classification_report(test_target, y_pred)
print(report)

              precision    recall  f1-score   support

          -1       1.00      0.99      0.99       338
           1       1.00      1.00      1.00     14760

   micro avg       1.00      1.00      1.00     15098
   macro avg       1.00      0.99      1.00     15098
weighted avg       1.00      1.00      1.00     15098

