# DNS over HTTPS Experiments
This notebook serves to run all the experiments for our work on the CIRA-CIC-DoHBrw-2020 dataset. This notebook will train and validate 9 machine learning models and 2 deep learning models. Additionally, the experiments will determine how the performance of these models changes as we increase the size of the feature set.

In [1]:
# Import the dataset saved on the google drive
from google.colab import drive

# Graphing capabilities
import matplotlib.pyplot as plt

# Data management
import pandas as pd
import numpy as np

# For stratified 10-fold cross validation
from sklearn.model_selection import StratifiedKFold

# Scikit-Learn ML Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

# Keras-TensorFlow DNN Model
from keras.models import Sequential
from keras.layers import BatchNormalization, Dense, Dropout
from keras.regularizers import l2

# Fast.ai DNN Model
from fastai.tabular import *

# Normalization
from keras.utils import normalize, to_categorical

print('Imports complete.')

Imports complete.


In [120]:
# Objects used to help manage the metrics data
class Metric:
    def __init__(self, name, fold):
        self.name = name
        self.fold_num = fold
        self.values = {}

    def __str__(self):
        return str({self.name: self.values})

    def __repr__(self):
        return str({self.name: self.values})

    def addValue(self, m_type, value):
        if m_type != None and value != None:
            self.values[m_type] = value

    def getValue(self, m_type):
        if m_type in self.values:
            return self.values[m_type]

    def getName(self):
        return self.name

    def getMeasures(self):
        return self.values.keys()

    def getValues(self):
        return self.values

    def containsType(self, m_type):
        if m_type in self.values:
            return True
        else:
            return False

    def getModelWithMeasure(self, m_type):
        new_metric = Metric(self.name)
        new_metric.addValue(m_type, self.values[m_type])

        return new_metric

class MetricsManager:
    def __init__(self):
        self.metrics_list = []
    
    def getMetrics(self, model_name='all', m_type='all'):
        if model_name == 'all' and m_type == 'all':
            return self.metrics_list
        else:
            # Return the metrics requested and remove any of the none values that have shown up
            return list(filter(None, map( lambda m : m.getModelWithMeasure(m_type) if (m.getName() == model_name or model_name == 'all') and (m.containsType(m_type) or m_type == 'all') else None, self.metrics_list)))

    def addMetric(self, metric):
        self.metrics_list.append(metric)

    def printMeasures(self, model='all', metrics='all'):
        metrics = self.getMetrics(model_name=model, m_type=metrics)

        measurements = []
        for metric in metrics:
            metric_measures = metric.getMeasures()
            for measure in metric_measures:
                if measure not in measurements:
                    measurements.append(measure)

        print('{:10}'.format('model'), end='')
        for measure in measurements:
            print('{:6}'.format(measure), end='')
        print('\n', end='')
        print('-------'*(len(measurements)+1))

        for metric in metrics:
            print('{:9}'.format(metric.getName()), end='')
            metric_values = metric.getValues()
            for measure in measurements:
                if measure in metric_values:
                    # Edit this next line to calculate the values across all of the folds!
                    print('{:6.2f}'.format(100*metric_values[measure]), end='')
                else:
                    print(' '*6, end='')
            print('\n', end='')

In [121]:
# Metric manager tests
mm = MetricsManager()

m = Metric('rf', fold=1)
m.addValue('acc', 0.97)
m.addValue('time', 0.99)
mm.addMetric(m)

m = Metric('rf', fold=2)
m.addValue('acc', 0.95)
m.addValue('time', 0.99)
mm.addMetric(m)

m = Metric('rf', fold=3)
m.addValue('acc', 0.93)
m.addValue('time', 0.99)
mm.addMetric(m)

m = Metric('dt', fold=1)
m.addValue('time', 0.75)
mm.addMetric(m)

m = Metric('xgboost', fold=1)
m.addValue('acc', 0.99)
mm.addMetric(m)

mm.printMeasures()

model     acc   time  
---------------------
rf        97.00 99.00
rf        95.00 99.00
rf        93.00 99.00
dt              75.00
xgboost   99.00      


In [2]:
def train_and_eval_on(X, y, feature_set):
    """
    train_and_eval_on function
        Description: This function will train all the models on the given feature set of the X (data) for predicting y (target)

        Args: 
            X => pd.DataFrame object containing the data
            y => pd.Series object containings the target classifications
            feature_set => list of features in X to use for training

        Returns:
            metrics => dictionary where the model names are the key and a list of accuracies across all folds is the value
                    Keys:
                        Random Forest => rf
                        Decision Tree => dt
                        k-Nearest Neighbors => knn
                        Support Vector Machine => svm
                        Logistic Regression => lr
                        Linear Discriminant Analysis => lda
                        AdaBoost => ab
                        Naive Bayes => nb
                        Keras-TensorFlow => keras
                        Fast.ai => fastai
    """
    metrics = {'rf':[],
                'dt':[],
                'knn':[],
                'svm':[],
                'lr':[],
                'lda':[],
                'ab':[],
                'nb':[],
                'keras':[],
                'fastai':[]}

    # Select the given features within the data
    X = X[feature_set]

    print('Training with {} features'.format(len(X.columns)))

    # Create stratified, 10-fold cross validation object
    random_state = 0
    sss = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)

    i=1

    # Experiment with 10-fold cross validation
    for train_idx, test_idx in sss.split(X, y):

        print('fold num {}'.format(i))
        i+=1

        # Split the data into the training and testing sets
        print('splitting data')
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        # Random Forest Model
        print('creating rf')
        rf = RandomForestClassifier(random_state=random_state)
        print('fitting rf')
        rf.fit(X_train, y_train)
        print('scoring rf')
        score = rf.score(X_test, y_test)
        metrics['rf'].append(score)

        # Decision Tree Model
        dt = DecisionTreeClassifier(random_state=random_state)
        dt.fit(X_train, y_train)
        score = dt.score(X_test, y_test)
        metrics['dt'].append(score)

        # k-Nearest Neighbors Model
        knn = KNeighborsClassifier()
        knn.fit(X_train, y_train)
        score = knn.score(X_test, y_test)
        metrics['knn'].append(score)

        # Support Vector Machine Model
        svm = SVC(random_state=random_state)
        svm.fit(X_train, y_train)
        score = svm.score(X_test, y_test)
        metrics['svm'].append(score)

        # Logistic Regression Model
        lr = LogisticRegression(random_state=random_state)
        lr.fit(X_train, y_train)
        score = lr.score(X_test, y_test)
        metrics['lr'].append(score)

        # Linear Discriminant Analysis Model
        lda = LinearDiscriminantAnalysis()
        lda.fit(X_train, y_train)
        score = lda.score(X_test, y_test)
        metrics['lda'].append(score)

        # AdaBoost Model
        ab = AdaBoostClassifier(random_state=random_state)
        ab.fit(X_train, y_train)
        score = ab.score(X_test, y_test)
        metrics['ab'].append(score)

        # Naive Bayes Model
        nb = GaussianNB()
        nb.fit(X_train, y_train)
        score = nb.score(X_test, y_test)
        metrics['nb'].append(score)

        # Keras-TensorFlow DNN Model
        dnn_keras = Sequential(layers=[
                                 Dense(128, kernel_regularizer=l2(0.001), activation='relu',input_shape=(len(X_train.columns),)),
                                 BatchNormalization(),
                                 Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
                                 BatchNormalization(),
                                 Dense(y_train.nunique(), activation='softmax')
        ])
        dnn_keras.compile(
            optimizer='adam', 
            loss='categorical_crossentropy', 
            metrics=['accuracy'])
        dnn_keras.fit(X_train, pd.get_dummies(y_train), epochs=100, verbose=0, batch_size=512)
        _, score = dnn_keras.evaluate(X_test, pd.get_dummies(y_test), verbose=0)
        metrics['keras'].append(score)

        # Fast.ai DNN Model
        data_fold = (TabularList.from_df(df, path=path, cont_names=X_train.columns, procs=[Categorify, Normalize])
                     .split_by_idxs(train_idx, test_idx)
                     .label_from_df(cols=dep_var)
                     .databunch(num_workers=0))
        dnn_fastai = tabular_learner(data_fold, layers=[200, 100], metrics=accuracy)
        dnn_fastai.fit_one_cycle(cyc_len=10, callbacks=None)
        _, score = dnn_fastai.validate()
        metrics['fastai'].append(score)

    return metrics

In [3]:
def show_graph(figure, feature_count, metrics_dict, exp_type=''):
  """
  show_graph function

    Description: This function will take the metrics dictionary provided and update the graph already to show the most recent results

    Args:
      figure => matplotlib.pyplot.figure object
      metrics_dict => dictionary of metrics as described in `train_and_eval_on` function
      exp_type => string indicating the type of experiment to change the title of the graph

    Returns:
      nothing
  """
  # Reorganize the data so we have all of the random forest metrics with increasing features side by side
  reorganized_dictionary = {}

  for feature_vals in metrics_dict.keys():
    for key in metrics_dict[feature_vals].keys():
      # If a given model is not in the new dictionary, add it
      if key not in reorganized_dictionary:
        reorganized_dictionary[key] = {}

      # If there isn't a specific feature number in the model dictionary, add it
      if feature_vals not in reorganized_dictionary[key]:
        reorganized_dictionary[key][feature_vals] = []

      # If there is anything to the record, add it
      if len( metrics_dict[feature_vals][key] ) > 0:
        accuracies = metrics_dict[feature_vals][key]
        mean = np.mean(accuracies)
        std = np.std(accuracies)

        #print('Accuracies: {}'.format(accuracies))
        #print('Mean: {}'.format(mean))
        #print('Std: {}'.format(std))

        reorganized_dictionary[key][feature_vals].append( [mean, std] ) 

  #print('Models: {}'.format( list(reorganized_dictionary.keys()) ))

  for model in reorganized_dictionary.keys():
    # The x-axis will have the feature_count
    xs = []

    # The y-axis will have the accuracy for that feature_count value
    ys = []

    # The y-axis will also have the std for these accuracies since they are accumulated over 10 folds
    yerrs = []

    for x in reorganized_dictionary[model].keys():
      if len(reorganized_dictionary[model][x]) > 0:
        xs.append(x)
        ys.append(reorganized_dictionary[model][x][0][0])
        yerrs.append(reorganized_dictionary[model][x][0][1])
    #print('xs: {}'.format(xs))
    #print('ys: {}'.format(ys))
    if len(xs) > 0:
      plt.errorbar(x=xs, y=ys, yerr=yerrs, label=model)

  #print(reorganized_dictionary)
  if exp_type == 'multi':
    plt.title('Multi-class Classification Model Accuracies with Increasing Features')
  elif exp_type == 'binary':
    plt.title('Binary Classification Model Accuracies with Increasing Features')
  plt.ylabel('Accuracy')
  plt.xlabel('Number of Features')

  plt.xticks(xs[4::5])

  plt.legend()
  plt.show()


In [4]:
def get_data(path, layer=0, nans=False):
    """ get_data function
        Description: This function will take the given path and user-defined layer from the dataset, import the datafiles, and then return the combined pandas DataFrame
        Arguments:
            path => string, path to the directory containing the l1-doh.csv, l1-nonhod.csv, etc files.
            layer => int, the level of layer desired. This will change the dataset that is imported. Values can be 1 or 2. Default is 0.
            nans => boolean, Whether the user wants NaNs in the data or wants them removed. This function will automatically remove all rows with Nan values.
        Returns:
            df => pandas.DataFrame, contains complete data
        Raises:
            AttributeError for incorrect layer number
            Any additional read errors are raised to the user
    """
    import pandas as pd

    if layer not in [1,2]:
        raise AttributeError('Must provide valid layer for dataset: layer equals 1 or 2')
    else:

        # Select the files that the user has chosen
        filenames = []
        if layer == 1:
            filenames.append('l1-doh.csv')
            filenames.append('l1-nondoh.csv')
        else:
            filenames.append('l2-benign.csv')
            filenames.append('l2-malicious.csv')

        # Read the files into dataframes
        df0 = pd.read_csv(path + '/' + filenames[0])
        df1 = pd.read_csv(path + '/' + filenames[1])

        df = pd.concat([df0, df1])

        # Remove any rows with Nan values
        if not nans:
            df.dropna(axis='index', inplace=True)

        return df

In [5]:
def balance_data(df, label_column):
    labels = df[label_column].unique()
    sample_length_list = []
    for i in range(len(labels)):
        samples = df.loc[ df[label_column] == labels[i] ]
        sample_length_list.append( len(samples) )
        #print('Number of {} samples: {}'.format(labels[i], len( samples )))

    random_state = 0
    smallest_count = min(sample_length_list)
    dfs = []
    for i in range(len(labels)):
        #dfs.append( df.loc[ df[label_column] == labels[i] ].sample(smallest_count) )

        # We are only sampling 40 purely for testing reasons to help speed up the dev process!
        # Uncomment the line above this to actually run the complete tests
        dfs.append( df.loc[ df[label_column] == labels[i] ].sample(40) )

    return pd.concat(dfs)

In [6]:
# Set up google drive access
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## Layer 1 Experiments: DoH or nonDoH

In [7]:
path = '/content/gdrive/My Drive/doh_dataset/Total-CSVs'
df = get_data(path=path, layer=1)
df.head()

Unnamed: 0,SourceIP,DestinationIP,SourcePort,DestinationPort,TimeStamp,Duration,FlowBytesSent,FlowSentRate,FlowBytesReceived,FlowReceivedRate,PacketLengthVariance,PacketLengthStandardDeviation,PacketLengthMean,PacketLengthMedian,PacketLengthMode,PacketLengthSkewFromMedian,PacketLengthSkewFromMode,PacketLengthCoefficientofVariation,PacketTimeVariance,PacketTimeStandardDeviation,PacketTimeMean,PacketTimeMedian,PacketTimeMode,PacketTimeSkewFromMedian,PacketTimeSkewFromMode,PacketTimeCoefficientofVariation,ResponseTimeTimeVariance,ResponseTimeTimeStandardDeviation,ResponseTimeTimeMean,ResponseTimeTimeMedian,ResponseTimeTimeMode,ResponseTimeTimeSkewFromMedian,ResponseTimeTimeSkewFromMode,ResponseTimeTimeCoefficientofVariation,Label
0,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:49:11,95.08155,62311,655.342703,65358,687.388878,7474.676771,86.456213,135.673751,102.0,54,1.168467,0.944683,0.637236,670.585814,25.895672,45.065277,48.811292,1.49506,-0.433974,1.682529,0.574626,0.001053,0.032457,0.027624,0.026854,0.026822,0.071187,0.024715,1.174948,DoH
1,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:50:52,122.309318,93828,767.136973,101232,827.672018,10458.118598,102.264943,141.245474,114.0,54,0.799261,0.853132,0.724023,708.465878,26.617022,52.287903,48.830314,31.719656,0.389704,0.772748,0.509047,0.00117,0.0342,0.024387,0.021043,0.026981,0.293297,-0.075845,1.402382,DoH
2,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:52:55,120.958413,38784,320.639127,38236,316.108645,7300.293933,85.441758,133.715278,89.0,54,1.570027,0.932978,0.638983,1358.911235,36.863413,50.316114,39.770747,0.417528,0.858198,1.353607,0.732636,0.000785,0.028021,0.029238,0.026921,0.026855,0.248064,0.085061,0.958348,DoH
3,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:54:56,110.50108,61993,561.017141,69757,631.278898,8499.282518,92.191553,139.123548,114.0,54,0.817544,0.923333,0.66266,1118.135436,33.438532,51.693726,34.882495,13.280934,1.508251,1.148758,0.646859,0.000411,0.020274,0.019925,0.019268,0.026918,0.097199,-0.344926,1.017535,DoH
4,176.103.130.131,192.168.20.191,443,50749,2020-01-14 15:56:46,54.229891,83641,1542.341289,76804,1416.266907,8052.745751,89.737092,138.91342,114.0,114,0.83288,0.277627,0.645993,341.696613,18.485038,36.435619,49.822561,7.342519,-2.172613,1.573873,0.507334,0.079079,0.281209,0.02593,4.7e-05,2.1e-05,0.276133,0.092135,10.844829,DoH


In [8]:
bad_columns = ['SourceIP', 'DestinationIP', 'SourcePort', 'DestinationPort', 'TimeStamp']
df.drop(labels=bad_columns, axis='columns', inplace=True)

In [9]:
# The target classifications are in the 'Label' columns, 
#  thus this is the independent variable!
dep_var = 'Label'
df[dep_var].value_counts()

NonDoH    889809
DoH       269299
Name: Label, dtype: int64

In [10]:
# Balance the data out
df = balance_data(df, dep_var)

In [11]:
df[dep_var].value_counts()

NonDoH    40
DoH       40
Name: Label, dtype: int64

In [12]:
# Split up the data into the data (X) and classifications (y)
X = df.loc[:, df.columns != dep_var]
y = df[dep_var]

In [13]:
best_features_layer1 = ['Duration', 'ResponseTimeTimeSkewFromMedian', 'ResponseTimeTimeMode',
       'ResponseTimeTimeMedian', 'ResponseTimeTimeMean',
       'PacketTimeSkewFromMedian', 'PacketTimeMode', 'PacketTimeMedian',
       'PacketTimeMean', 'ResponseTimeTimeSkewFromMode', 'PacketTimeVariance',
       'PacketLengthCoefficientofVariation', 'PacketTimeStandardDeviation',
       'PacketLengthMode', 'PacketLengthMedian', 'PacketLengthMean',
       'FlowBytesSent', 'ResponseTimeTimeCoefficientofVariation',
       'PacketLengthStandardDeviation', 'PacketLengthVariance',
       'PacketTimeCoefficientofVariation', 'FlowReceivedRate',
       'ResponseTimeTimeStandardDeviation', 'PacketLengthSkewFromMode',
       'FlowBytesReceived', 'PacketLengthSkewFromMedian', 'FlowSentRate',
       'ResponseTimeTimeVariance', 'PacketTimeSkewFromMode']
print('These are the best 4 features for layer 1: {}'.format(best_features_layer1[:4]))
print('These are the worst 4 features for layer 1: {}'.format(best_features_layer1[-4:]))

These are the best 4 features for layer 1: ['Duration', 'ResponseTimeTimeSkewFromMedian', 'ResponseTimeTimeMode', 'ResponseTimeTimeMedian']
These are the worst 4 features for layer 1: ['PacketLengthSkewFromMedian', 'FlowSentRate', 'ResponseTimeTimeVariance', 'PacketTimeSkewFromMode']


In [14]:
l1_performance_metrics = train_and_eval_on(X=X, y=y, feature_set=best_features_layer1)

Training with 29 features
fold num 1
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


epoch,train_loss,valid_loss,accuracy,time
0,0.763505,0.719691,0.125,00:00
1,0.736045,0.710602,0.5,00:00
2,0.663918,0.682709,0.5,00:00
3,0.585562,0.648778,0.5,00:00
4,0.510923,0.618839,0.5,00:00
5,0.452656,0.597687,0.5,00:00
6,0.407562,0.583574,0.5,00:00
7,0.368575,0.574431,0.5,00:00
8,0.337745,0.567568,0.5,00:00
9,0.312184,0.560807,0.5,00:00


fold num 2
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


epoch,train_loss,valid_loss,accuracy,time
0,0.688491,0.750789,0.625,00:00
1,0.657975,0.726984,0.625,00:00
2,0.607513,0.667507,0.625,00:00
3,0.54245,0.597342,0.875,00:00
4,0.482278,0.539155,0.875,00:00
5,0.435472,0.49375,0.875,00:00
6,0.394493,0.457822,0.875,00:00
7,0.359265,0.427534,0.875,00:00
8,0.330125,0.403633,0.875,00:00
9,0.306924,0.382882,0.875,00:00


fold num 3
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


epoch,train_loss,valid_loss,accuracy,time
0,0.824086,0.731713,0.25,00:00
1,0.796759,0.699738,0.5,00:00
2,0.714676,0.624083,0.75,00:00
3,0.622219,0.555828,0.75,00:00
4,0.551426,0.504607,0.75,00:00
5,0.495427,0.463444,1.0,00:00
6,0.449614,0.430484,1.0,00:00
7,0.407989,0.402542,1.0,00:00
8,0.384162,0.381251,1.0,00:00
9,0.359379,0.367135,1.0,00:00


fold num 4
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


epoch,train_loss,valid_loss,accuracy,time
0,0.848267,0.712276,0.125,00:00
1,0.825166,0.690281,0.625,00:00
2,0.752003,0.636151,0.75,00:00
3,0.656364,0.579293,0.75,00:00
4,0.577328,0.532369,0.75,00:00
5,0.512488,0.495334,0.75,00:00
6,0.462175,0.466126,0.75,00:00
7,0.421713,0.443621,0.75,00:00
8,0.390073,0.425493,0.75,00:00
9,0.361256,0.411583,0.75,00:00


fold num 5
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.738838,0.685991,0.5,00:00
1,0.7124,0.671038,0.625,00:00
2,0.649803,0.641079,0.75,00:00
3,0.573205,0.608957,0.625,00:00
4,0.507126,0.578245,0.75,00:00
5,0.449694,0.549315,0.75,00:00
6,0.403822,0.522115,0.875,00:00
7,0.367265,0.499518,0.875,00:00
8,0.337754,0.482922,0.875,00:00
9,0.312997,0.471254,0.875,00:00


fold num 6
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.665208,0.756005,0.5,00:00
1,0.643841,0.701293,0.625,00:00
2,0.582678,0.588364,0.875,00:00
3,0.510514,0.50597,1.0,00:00
4,0.450794,0.454726,1.0,00:00
5,0.403948,0.421819,1.0,00:00
6,0.365121,0.39654,1.0,00:00
7,0.333433,0.375878,1.0,00:00
8,0.308734,0.360195,1.0,00:00
9,0.284925,0.345185,1.0,00:00


fold num 7
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.734548,0.693564,0.5,00:00
1,0.724736,0.680052,0.5,00:00
2,0.654591,0.649404,0.5,00:00
3,0.566621,0.61245,0.75,00:00
4,0.496375,0.577474,1.0,00:00
5,0.440042,0.545597,1.0,00:00
6,0.399533,0.516883,1.0,00:00
7,0.364953,0.490343,1.0,00:00
8,0.33637,0.466052,1.0,00:00
9,0.312864,0.445581,1.0,00:00


fold num 8
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.701726,0.689592,0.5,00:00
1,0.6863,0.665223,0.75,00:00
2,0.62646,0.619417,0.75,00:00
3,0.549387,0.572381,0.75,00:00
4,0.475432,0.543194,0.75,00:00
5,0.424095,0.521679,0.75,00:00
6,0.3836,0.505811,0.75,00:00
7,0.350401,0.493771,0.75,00:00
8,0.319141,0.485316,0.75,00:00
9,0.297673,0.479662,0.75,00:00


fold num 9
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.782374,0.701275,0.5,00:00
1,0.80867,0.689528,0.5,00:00
2,0.731007,0.657548,0.75,00:00
3,0.6496,0.621754,0.875,00:00
4,0.568791,0.591207,1.0,00:00
5,0.508456,0.568061,1.0,00:00
6,0.45842,0.549293,0.875,00:00
7,0.419321,0.532197,0.875,00:00
8,0.384223,0.51692,0.875,00:00
9,0.355049,0.503692,0.875,00:00


fold num 10
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.718501,0.684047,0.75,00:00
1,0.680452,0.676453,0.75,00:00
2,0.621594,0.662603,0.5,00:00
3,0.537689,0.643198,0.5,00:00
4,0.474006,0.620446,0.625,00:00
5,0.421147,0.595665,0.625,00:00
6,0.37694,0.566859,0.625,00:00
7,0.342895,0.540508,0.75,00:00
8,0.314697,0.51981,0.75,00:00
9,0.291453,0.503996,0.75,00:00


In [15]:
print('Models\tAccuracy (%)')
for key in l1_performance_metrics.keys():
  avg = np.average(l1_performance_metrics[key])
  std = np.std(l1_performance_metrics[key])
  print('{}\t{:.2f}\u00B1{:.2f}'.format(key, avg*100, std*100))

Models	Accuracy (%)
rf	95.00±6.12
dt	91.25±12.56
knn	76.25±8.75
svm	52.50±5.00
lr	75.00±15.81
lda	80.00±13.92
ab	96.25±8.00
nb	61.25±18.07
keras	72.50±20.77
fastai	83.75±14.84


## Layer 2 Experiments: Benign-DoH or Malicious-DoH

In [16]:
path = '/content/gdrive/My Drive/doh_dataset/Total-CSVs'
df = get_data(path=path, layer=2)
df.head()

Unnamed: 0,SourceIP,DestinationIP,SourcePort,DestinationPort,TimeStamp,Duration,FlowBytesSent,FlowSentRate,FlowBytesReceived,FlowReceivedRate,PacketLengthVariance,PacketLengthStandardDeviation,PacketLengthMean,PacketLengthMedian,PacketLengthMode,PacketLengthSkewFromMedian,PacketLengthSkewFromMode,PacketLengthCoefficientofVariation,PacketTimeVariance,PacketTimeStandardDeviation,PacketTimeMean,PacketTimeMedian,PacketTimeMode,PacketTimeSkewFromMedian,PacketTimeSkewFromMode,PacketTimeCoefficientofVariation,ResponseTimeTimeVariance,ResponseTimeTimeStandardDeviation,ResponseTimeTimeMean,ResponseTimeTimeMedian,ResponseTimeTimeMode,ResponseTimeTimeSkewFromMedian,ResponseTimeTimeSkewFromMode,ResponseTimeTimeCoefficientofVariation,Label
0,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:49:11,95.08155,62311,655.342703,65358,687.388878,7474.676771,86.456213,135.673751,102.0,54,1.168467,0.944683,0.637236,670.585814,25.895672,45.065277,48.811292,1.49506,-0.433974,1.682529,0.574626,0.001053,0.032457,0.027624,0.026854,0.026822,0.071187,0.024715,1.174948,Benign
1,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:50:52,122.309318,93828,767.136973,101232,827.672018,10458.118598,102.264943,141.245474,114.0,54,0.799261,0.853132,0.724023,708.465878,26.617022,52.287903,48.830314,31.719656,0.389704,0.772748,0.509047,0.00117,0.0342,0.024387,0.021043,0.026981,0.293297,-0.075845,1.402382,Benign
2,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:52:55,120.958413,38784,320.639127,38236,316.108645,7300.293933,85.441758,133.715278,89.0,54,1.570027,0.932978,0.638983,1358.911235,36.863413,50.316114,39.770747,0.417528,0.858198,1.353607,0.732636,0.000785,0.028021,0.029238,0.026921,0.026855,0.248064,0.085061,0.958348,Benign
3,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:54:56,110.50108,61993,561.017141,69757,631.278898,8499.282518,92.191553,139.123548,114.0,54,0.817544,0.923333,0.66266,1118.135436,33.438532,51.693726,34.882495,13.280934,1.508251,1.148758,0.646859,0.000411,0.020274,0.019925,0.019268,0.026918,0.097199,-0.344926,1.017535,Benign
4,176.103.130.131,192.168.20.191,443,50749,2020-01-14 15:56:46,54.229891,83641,1542.341289,76804,1416.266907,8052.745751,89.737092,138.91342,114.0,114,0.83288,0.277627,0.645993,341.696613,18.485038,36.435619,49.822561,7.342519,-2.172613,1.573873,0.507334,0.079079,0.281209,0.02593,4.7e-05,2.1e-05,0.276133,0.092135,10.844829,Benign


In [17]:
bad_columns = ['SourceIP', 'DestinationIP', 'SourcePort', 'DestinationPort', 'TimeStamp']
df.drop(labels=bad_columns, axis='columns', inplace=True)

In [18]:
# The target classifications are in the 'Label' columns, 
#  thus this is the independent variable!
dep_var = 'Label'
df[dep_var].value_counts()

Malicious    249553
Benign        19746
Name: Label, dtype: int64

In [19]:
# Balance the data out
df = balance_data(df, dep_var)

In [20]:
df[dep_var].value_counts()

Benign       40
Malicious    40
Name: Label, dtype: int64

In [21]:
# Split up the data into the data (X) and classifications (y)
X = df.loc[:, df.columns != dep_var]
y = df[dep_var]

In [22]:
best_features_layer2 = ['PacketLengthStandardDeviation', 'PacketLengthCoefficientofVariation',
       'FlowReceivedRate', 'PacketLengthMean', 'Duration',
       'PacketTimeSkewFromMedian', 'FlowSentRate', 'PacketLengthVariance',
       'PacketTimeMean', 'PacketTimeStandardDeviation',
       'ResponseTimeTimeMedian', 'PacketTimeMedian',
       'ResponseTimeTimeSkewFromMode', 'ResponseTimeTimeMean',
       'ResponseTimeTimeMode', 'PacketTimeCoefficientofVariation',
       'ResponseTimeTimeSkewFromMedian', 'PacketTimeMode', 'FlowBytesSent',
       'FlowBytesReceived', 'PacketLengthMode',
       'ResponseTimeTimeCoefficientofVariation', 'PacketLengthSkewFromMedian',
       'PacketTimeVariance', 'PacketLengthMedian', 'PacketTimeSkewFromMode',
       'ResponseTimeTimeStandardDeviation', 'ResponseTimeTimeVariance',
       'PacketLengthSkewFromMode']
print('These are the best 4 features for layer 2: {}'.format(best_features_layer2[:4]))
print('These are the worst 4 features for layer 2: {}'.format(best_features_layer2[-4:]))

These are the best 4 features for layer 2: ['PacketLengthStandardDeviation', 'PacketLengthCoefficientofVariation', 'FlowReceivedRate', 'PacketLengthMean']
These are the worst 4 features for layer 2: ['PacketTimeSkewFromMode', 'ResponseTimeTimeStandardDeviation', 'ResponseTimeTimeVariance', 'PacketLengthSkewFromMode']


In [23]:
l2_performance_metrics = train_and_eval_on(X=X, y=y, feature_set=best_features_layer2)

Training with 29 features
fold num 1
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.864865,0.663477,0.5,00:00
1,0.840592,0.642989,0.5,00:00
2,0.764643,0.606912,0.625,00:00
3,0.673161,0.551644,0.875,00:00
4,0.589611,0.498427,0.875,00:00
5,0.523795,0.450043,0.875,00:00
6,0.475013,0.414351,0.875,00:00
7,0.433419,0.388733,0.875,00:00
8,0.403051,0.370804,0.875,00:00
9,0.377784,0.358133,0.875,00:00


fold num 2
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.786815,0.645801,0.75,00:00
1,0.77505,0.628772,0.875,00:00
2,0.715777,0.60548,1.0,00:00
3,0.629206,0.586204,1.0,00:00
4,0.558845,0.569946,0.75,00:00
5,0.505716,0.561574,0.75,00:00
6,0.46032,0.556643,0.75,00:00
7,0.421733,0.553934,0.75,00:00
8,0.391342,0.554231,0.75,00:00
9,0.371796,0.552751,0.75,00:00


fold num 3
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.65025,0.632535,0.75,00:00
1,0.638593,0.60996,0.75,00:00
2,0.596155,0.569405,0.875,00:00
3,0.528437,0.518224,0.875,00:00
4,0.47301,0.471864,0.875,00:00
5,0.425461,0.429701,0.875,00:00
6,0.387033,0.394897,0.875,00:00
7,0.35372,0.368303,0.875,00:00
8,0.327632,0.347277,0.875,00:00
9,0.30454,0.331263,0.875,00:00


fold num 4
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.800364,0.672972,0.625,00:00
1,0.822092,0.651087,0.625,00:00
2,0.742899,0.601389,0.875,00:00
3,0.653098,0.550488,1.0,00:00
4,0.573415,0.512485,0.875,00:00
5,0.515631,0.485,0.875,00:00
6,0.46855,0.464935,0.875,00:00
7,0.434335,0.446535,0.875,00:00
8,0.399143,0.431692,0.875,00:00
9,0.374072,0.417924,0.875,00:00


fold num 5
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.766414,0.712117,0.5,00:00
1,0.770804,0.702833,0.5,00:00
2,0.705316,0.678271,0.625,00:00
3,0.613857,0.65477,0.5,00:00
4,0.52958,0.633168,0.625,00:00
5,0.46707,0.610183,0.625,00:00
6,0.422179,0.593326,0.75,00:00
7,0.385901,0.580872,0.75,00:00
8,0.353635,0.572332,0.75,00:00
9,0.329718,0.56469,0.75,00:00


fold num 6
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.750017,0.698597,0.5,00:00
1,0.73183,0.706383,0.5,00:00
2,0.668298,0.72156,0.5,00:00
3,0.586487,0.736557,0.625,00:00
4,0.526868,0.754222,0.625,00:00
5,0.475596,0.768034,0.625,00:00
6,0.431904,0.782019,0.625,00:00
7,0.393676,0.795669,0.625,00:00
8,0.364434,0.809436,0.75,00:00
9,0.338492,0.823186,0.75,00:00


fold num 7
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.82614,0.735759,0.5,00:00
1,0.819939,0.719876,0.5,00:00
2,0.74644,0.6771,0.5,00:00
3,0.658979,0.62905,0.625,00:00
4,0.58282,0.586212,0.75,00:00
5,0.520852,0.534769,0.875,00:00
6,0.472897,0.502367,0.875,00:00
7,0.436712,0.480565,0.875,00:00
8,0.40123,0.463273,0.875,00:00
9,0.375813,0.449791,0.875,00:00


fold num 8
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.691606,0.71392,0.625,00:00
1,0.674626,0.706844,0.5,00:00
2,0.617416,0.686693,0.625,00:00
3,0.521179,0.665986,0.375,00:00
4,0.467105,0.64175,0.375,00:00
5,0.415805,0.617797,0.625,00:00
6,0.373535,0.597463,0.625,00:00
7,0.340037,0.581871,0.625,00:00
8,0.312351,0.568088,0.625,00:00
9,0.287539,0.557506,0.625,00:00


fold num 9
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.794582,0.701135,0.5,00:00
1,0.79534,0.682705,0.5,00:00
2,0.730666,0.647014,0.625,00:00
3,0.641925,0.604692,0.625,00:00
4,0.564237,0.562739,0.75,00:00
5,0.503182,0.518107,0.875,00:00
6,0.454655,0.480612,0.875,00:00
7,0.415882,0.452627,0.875,00:00
8,0.387556,0.432113,0.875,00:00
9,0.361245,0.416922,0.875,00:00


fold num 10
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.760042,0.700135,0.5,00:00
1,0.745668,0.680611,0.625,00:00
2,0.665301,0.645296,0.75,00:00
3,0.582763,0.592992,0.75,00:00
4,0.513412,0.553226,0.75,00:00
5,0.454625,0.51489,0.75,00:00
6,0.410662,0.491252,0.75,00:00
7,0.376298,0.474201,0.75,00:00
8,0.344788,0.461599,0.75,00:00
9,0.320469,0.452464,0.75,00:00


In [24]:
print('Models\tAccuracy (%)')
for key in l2_performance_metrics.keys():
  avg = np.average(l2_performance_metrics[key])
  std = np.std(l2_performance_metrics[key])
  print('{}\t{:.2f}\u00B1{:.2f}'.format(key, avg*100, std*100))

Models	Accuracy (%)
rf	92.50±10.00
dt	92.50±12.75
knn	75.00±14.79
svm	58.75±18.58
lr	72.50±13.46
lda	75.00±11.18
ab	92.50±11.46
nb	67.50±11.46
keras	71.25±20.95
fastai	80.00±8.29
