In [144]:
import json
import math
from collections import Counter
from os.path import join
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from numpy import *
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
from tqdm.notebook import tqdm, trange


In [145]:
df1_benign = pd.read_csv('DoHBrw-2020/benign-chrome.csv', delimiter=',')
df2_benign = pd.read_csv('DoHBrw-2020/benign-firefox.csv', delimiter=',')
df1_benign.append(df2_benign)
df1_benign['DoH'] = 0 # benign
df1_benign = df1_benign.rename(columns={'DoH': 'labels'})


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [146]:
df1_malic = pd.read_csv('DoHBrw-2020/mal-iodine.csv', delimiter=',')
df1_malic['DoH'] = 1  # 'iodine'  malicious
df2_malic = pd.read_csv('DoHBrw-2020/mal-dns2tcp.csv', delimiter=',')
df2_malic['DoH'] = 2  # 'dns2tcp'  malicious
df3_malic = pd.read_csv('DoHBrw-2020/mal-dnscat2.csv', delimiter=',')
df3_malic['DoH'] = 3  # 'dnscat2'  malicious
df1_malic = pd.concat([df1_malic, df2_malic, df3_malic], ignore_index=True)
df1_malic = df1_malic.rename(columns={'DoH': 'labels'})


In [147]:
data = shuffle(pd.concat([df1_benign, df1_malic,], ignore_index=True))

In [148]:
data.isnull().sum()

SourceIP                                     0
DestinationIP                                0
SourcePort                                   0
DestinationPort                              0
TimeStamp                                    0
Duration                                     0
FlowBytesSent                                0
FlowSentRate                                 0
FlowBytesReceived                            0
FlowReceivedRate                             0
PacketLengthVariance                         0
PacketLengthStandardDeviation                0
PacketLengthMean                             0
PacketLengthMedian                           0
PacketLengthMode                             0
PacketLengthSkewFromMedian                   0
PacketLengthSkewFromMode                     0
PacketLengthCoefficientofVariation           0
PacketTimeVariance                           0
PacketTimeStandardDeviation                  0
PacketTimeMean                               0
PacketTimeMed

In [149]:
data = data.dropna()

In [150]:
data.isnull().sum()

SourceIP                                  0
DestinationIP                             0
SourcePort                                0
DestinationPort                           0
TimeStamp                                 0
Duration                                  0
FlowBytesSent                             0
FlowSentRate                              0
FlowBytesReceived                         0
FlowReceivedRate                          0
PacketLengthVariance                      0
PacketLengthStandardDeviation             0
PacketLengthMean                          0
PacketLengthMedian                        0
PacketLengthMode                          0
PacketLengthSkewFromMedian                0
PacketLengthSkewFromMode                  0
PacketLengthCoefficientofVariation        0
PacketTimeVariance                        0
PacketTimeStandardDeviation               0
PacketTimeMean                            0
PacketTimeMedian                          0
PacketTimeMode                  

In [151]:
data.describe()

Unnamed: 0,SourcePort,DestinationPort,Duration,FlowBytesSent,FlowSentRate,FlowBytesReceived,FlowReceivedRate,PacketLengthVariance,PacketLengthStandardDeviation,PacketLengthMean,...,PacketTimeCoefficientofVariation,ResponseTimeTimeVariance,ResponseTimeTimeStandardDeviation,ResponseTimeTimeMean,ResponseTimeTimeMedian,ResponseTimeTimeMode,ResponseTimeTimeSkewFromMedian,ResponseTimeTimeSkewFromMode,ResponseTimeTimeCoefficientofVariation,labels
count,789902.0,789902.0,789902.0,789902.0,789902.0,789902.0,789902.0,789902.0,789902.0,789902.0,...,789902.0,789902.0,789902.0,789902.0,789902.0,789902.0,789902.0,789902.0,789902.0,789902.0
mean,49905.111524,4364.93325,22.739928,23884.38,5385.105,49969.5,19688.25,141600.0,268.195247,240.570897,...,1.079537,1.612518,0.254415,0.322211,0.284951,0.152449,-2.901909,-2.054077,0.802235,0.618629
std,16315.47274,13452.900826,36.023505,117453.5,138019.3,340188.5,129432.8,357183.8,263.953405,228.806545,...,0.589054,12.834192,1.244103,1.777555,1.968318,1.451103,4.577078,4.976145,1.64524,0.965247
min,443.0,443.0,8e-06,54.0,1.04778,54.0,0.5163631,0.0,0.0,54.666667,...,0.023508,0.0,0.0,4e-06,2e-06,-1e-06,-10.0,-10.0,0.0,0.0
25%,49535.0,443.0,0.078984,108.0,91.57143,66.0,185.0387,30.25,5.5,60.5,...,0.600785,0.0,0.0,0.013115,0.015176,3.2e-05,-10.0,-10.0,0.0,0.0
50%,54327.0,443.0,2.97132,1257.0,940.2016,4033.0,1814.837,45469.3,213.235318,190.589572,...,1.0,5.1e-05,0.007142,0.019989,0.018127,0.002423,-1.240163,0.739685,0.528094,0.0
75%,59481.0,443.0,33.742902,2399.0,3595.006,7175.75,4359.889,231610.5,481.259289,315.26087,...,1.50292,0.000216,0.014705,0.037631,0.027851,0.025048,0.589735,1.25108,0.934175,2.0
max,65534.0,65534.0,158.049566,11256330.0,71852710.0,52680110.0,7612113.0,81092990.0,9005.16479,3833.906977,...,8.777577,506.320247,22.501561,45.013642,45.013642,45.013642,2.970716,7.096569,73.66547,3.0


In [152]:
counts = data.labels.value_counts()
counts

0    540216
2    167318
1     46542
3     35826
Name: labels, dtype: int64

In [153]:
counts = data.SourcePort.value_counts()
counts

443      63792
49872      289
49894      265
58615      250
49880      249
         ...  
37120        4
37976        4
37644        4
33702        4
36994        4
Name: SourcePort, Length: 24573, dtype: int64

In [154]:
data['SourceIP']

139551    192.168.20.191
658361    192.168.20.212
779961    192.168.20.144
692809    192.168.20.144
128467    192.168.20.191
               ...      
341346    192.168.20.191
488934    192.168.20.191
467538    192.168.20.191
168661    192.168.20.191
191587    192.168.20.191
Name: SourceIP, Length: 789902, dtype: object

In [155]:
le=LabelEncoder()
data['SourceIP'] = le.fit_transform(data['SourceIP'])
data['DestinationIP'] = le.fit_transform(data['DestinationIP'])
data['SourcePort'] = le.fit_transform(data['SourcePort'])
data['DestinationPort'] = le.fit_transform(data['DestinationPort'])

In [156]:
data['SourceIP']

139551    2537
658361    2546
779961    2536
692809    2536
128467    2537
          ... 
341346    2537
488934    2537
467538    2537
168661    2537
191587    2537
Name: SourceIP, Length: 789902, dtype: int32

In [157]:
data.to_excel("DoHBrw-2020/data.xlsx") 
data

Unnamed: 0,SourceIP,DestinationIP,SourcePort,DestinationPort,TimeStamp,Duration,FlowBytesSent,FlowSentRate,FlowBytesReceived,FlowReceivedRate,...,PacketTimeCoefficientofVariation,ResponseTimeTimeVariance,ResponseTimeTimeStandardDeviation,ResponseTimeTimeMean,ResponseTimeTimeMedian,ResponseTimeTimeMode,ResponseTimeTimeSkewFromMedian,ResponseTimeTimeSkewFromMode,ResponseTimeTimeCoefficientofVariation,labels
139551,2537,12387,23579,0,2020-01-12 06:32:41,0.918400,1690,1840.156794,4961,5401.785714,...,0.527991,1.774736e-07,0.000421,0.181384,0.181155,0.180942,1.630759,1.049193,0.002323,0
658361,2546,23554,15497,0,2020-03-31 02:11:04,33.725058,1875,55.596643,4895,145.144302,...,1.673093,6.661896e-05,0.008162,0.010205,0.016292,0.000127,-2.237307,1.234740,0.799808,2
779961,2536,23078,3437,0,2020-03-24 23:41:29,120.531256,156317,1296.900117,368206,3054.859065,...,0.585809,2.522675e-05,0.005023,0.014710,0.015345,0.015319,-0.379134,-0.121201,0.341437,3
692809,2536,23554,5036,0,2020-03-31 02:53:53,1.904093,1085,569.825108,4100,2153.256170,...,0.584690,3.085527e-04,0.017566,0.015738,0.011411,0.000018,0.738828,0.894899,1.116166,2
128467,2537,4706,15561,0,2020-01-12 05:55:53,0.015354,108,7033.997655,60,3907.776475,...,0.707107,0.000000e+00,0.000000,0.015336,0.015336,0.015336,-10.000000,-10.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341346,2537,12450,19142,0,2020-01-14 19:09:29,1.051604,2266,2154.803519,1118,1063.137835,...,0.526692,7.386969e-05,0.008595,0.211288,0.206427,0.206125,1.696734,0.600716,0.040678,0
488934,2537,13234,16662,0,2020-01-13 15:03:35,28.481115,217,7.619084,211,7.408418,...,0.632169,7.526827e-05,0.008676,0.012334,0.018469,0.018469,-2.121320,-0.707107,0.703380,0
467538,2537,8010,11659,0,2020-01-13 23:36:12,0.196234,2532,12902.962789,4850,24715.390809,...,0.937268,6.104157e-06,0.002471,0.017867,0.016843,0.016372,1.243393,0.605102,0.138281,0
168661,2537,5804,16908,0,2020-01-12 04:15:15,0.020610,108,5240.174672,60,2911.208151,...,0.707107,0.000000e+00,0.000000,0.020593,0.020593,0.020593,-10.000000,-10.000000,0.000000,0


In [174]:
X = data.drop(["TimeStamp","labels"],axis=1)
y = data['labels'].values

In [159]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [160]:
X_train, X_test, y_train, y_test \
    = train_test_split(X, y, test_size=0.2, random_state=1)

X_train, X_val, y_train, y_val \
    = train_test_split(X_train, y_train, test_size=0.25, random_state=1)  # 0.25 x 0.8 = 0.2


In [186]:
def displayClasificationResults(z, y_test, y_pred, numClasses=4):

    print("Number of mislabeled points out of a total %d points : %d"
          % (y_test.shape[0], (y_test != y_pred).sum()))
    accurecy = round(100-(((y_test != y_pred).sum()/y_test.shape[0])*100), 2)
    print(f"Accurecy is {accurecy}%")
    #'micro', 'macro', 'weighted', 'samples'
    precision, recall, fscore, support = precision_recall_fscore_support(
        y_test, y_pred, average='weighted')
    precision *= 100
    recall *= 100
    fscore *= 100
    print(f"Precision = {round(precision,2)}%")
    print(f"Recall = {round(recall,2)}%")
    print(f"Fscore = {round(fscore,2)}%")

    if numClasses == 2:
        x = ['benign', 'malicious']
        y = ['benign', 'malicious']
    else:
        x = ['benign', 'iodine', 'dns2tcp', 'dnscat2']
        y = ['benign', 'iodine', 'dns2tcp', 'dnscat2']

    # change each element of z to type string for annotations
    z_text = [[str(y) for y in x] for x in z]

    # set up figure
    fig = ff.create_annotated_heatmap(
        z, x=x, y=y, annotation_text=z_text, colorscale='Viridis')

    # add title
    fig.update_layout(title_text='<i><b>Confusion matrix</b></i>',
                      #xaxis = dict(title='x'),
                      #yaxis = dict(title='x')
                      )

    # add custom xaxis title
    fig.add_annotation(dict(font=dict(color="black", size=14),
                            x=0.5,
                            y=-0.15,
                            showarrow=False,
                            text="Predicted value",
                            xref="paper",
                            yref="paper"))

    # add custom yaxis title
    fig.add_annotation(dict(font=dict(color="black", size=14),
                            x=-0.35,
                            y=0.5,
                            showarrow=False,
                            text="Real value",
                            textangle=-90,
                            xref="paper",
                            yref="paper"))

    # adjust margins to make room for yaxis title
    fig.update_layout(margin=dict(t=50, l=200))

    # add colorbar
    fig['data'][0]['showscale'] = True
    fig.show()


In [162]:
rfc_4_classification = RandomForestClassifier(n_estimators=500, random_state=1)
y_pred = rfc_4_classification.fit(X_train, y_train).predict(X_train)
z = confusion_matrix(y_train, y_pred)
displayClasificationResults(z, y_train, y_pred)


Number of mislabeled points out of a total 473940 points : 0
Accurecy is 100.0%
Precision = 100.0%
Recall = 100.0%
Fscore = 100.0%


In [169]:
y_pred = rfc_4_classification.predict(X_test)
z = confusion_matrix(y_test, y_pred)
displayClasificationResults(z, y_test, y_pred)

Number of mislabeled points out of a total 157981 points : 441
Accurecy is 99.72%
Precision = 99.72%
Recall = 99.72%
Fscore = 99.72%


In [168]:
y_pred = rfc_4_classification.predict(X_val)
z = confusion_matrix(y_val, y_pred)
displayClasificationResults(z, y_val, y_pred)

Number of mislabeled points out of a total 157981 points : 452
Accurecy is 99.71%
Precision = 99.72%
Recall = 99.71%
Fscore = 99.71%


In [None]:
for x, y in zip(X_val, y_val):
    y_pred = rfc_4_classification.predict(x.reshape(1, -1))
    print(y_pred[0])
    print(y)
    debug = 1


In [181]:
data.loc[data["labels"] == 2] = 1
data.loc[data["labels"] == 3] = 1
y = data['labels'].values

In [182]:
X_train, X_test, y_train, y_test \
    = train_test_split(X, y, test_size=0.2, random_state=1)

X_train, X_val, y_train, y_val \
    = train_test_split(X_train, y_train, test_size=0.25, random_state=1)  # 0.25 x 0.8 = 0.2


In [187]:
rfc_Binary_classification = RandomForestClassifier(
    n_estimators=500, random_state=1)
y_pred = rfc_Binary_classification.fit(X_train, y_train).predict(X_train)
z = confusion_matrix(y_train, y_pred)
displayClasificationResults(z, y_train, y_pred, 2)


Number of mislabeled points out of a total 473940 points : 0
Accurecy is 100.0%
Precision = 100.0%
Recall = 100.0%
Fscore = 100.0%


In [188]:
y_pred = rfc_Binary_classification.predict(X_test)
z = confusion_matrix(y_test, y_pred)
displayClasificationResults(z, y_test, y_pred, 2)


Number of mislabeled points out of a total 157981 points : 3
Accurecy is 100.0%
Precision = 100.0%
Recall = 100.0%
Fscore = 100.0%


In [189]:
y_pred = rfc_Binary_classification.predict(X_val)
z = confusion_matrix(y_val, y_pred)
displayClasificationResults(z, y_val, y_pred, 2)


Number of mislabeled points out of a total 157981 points : 0
Accurecy is 100.0%
Precision = 100.0%
Recall = 100.0%
Fscore = 100.0%
