# DNS over HTTPS Experiments
This notebook serves to run all the experiments for our work on the CIRA-CIC-DoHBrw-2020 dataset. This notebook will train and validate 9 machine learning models and 2 deep learning models. Additionally, the experiments will determine how the performance of these models changes as we increase the size of the feature set.

In [137]:
# Import the dataset saved on the google drive
from google.colab import drive

# Graphing capabilities
import matplotlib.pyplot as plt

# Data management
import pandas as pd
import numpy as np

# For stratified 10-fold cross validation
from sklearn.model_selection import StratifiedKFold

# Scikit-Learn ML Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

# Keras-TensorFlow DNN Model
from keras.models import Sequential
from keras.layers import BatchNormalization, Dense, Dropout
from keras.regularizers import l2

# Fast.ai DNN Model
from fastai.tabular import *

# Normalization
from keras.utils import normalize, to_categorical

print('Imports complete.')

Imports complete.


In [187]:
# Objects used to help manage the metrics data
class Metric:
    def __init__(self, name, fold):
        self.name = name
        self.fold_num = fold
        self.values = {}

    def __str__(self):
        return str({self.name: self.values})

    def __repr__(self):
        return str({self.name: self.values})

    def addValue(self, m_type, value):
        if m_type != None and value != None:
            self.values[m_type] = value

    def getValue(self, m_type):
        if m_type in self.values:
            return self.values[m_type]

    def getName(self):
        return self.name

    def getMeasures(self):
        return self.values.keys()

    def getValues(self):
        return self.values

    def containsType(self, m_type):
        if m_type in self.values:
            return True
        else:
            return False

    def getModelWithMeasure(self, m_type):
        new_metric = Metric(self.name, fold=self.fold_num)
        new_metric.addValue(m_type, self.values[m_type])

        return new_metric

class MetricsManager:
    def __init__(self):
        self.metrics_list = []
    
    def getMetrics(self, model_name='all', m_type='all'):
        if model_name == 'all' and m_type == 'all':
            return self.metrics_list
        else:
            # Return the metrics requested and remove any of the none values that have shown up
            return list(filter(None, map( lambda m : m.getModelWithMeasure(m_type) if (m.getName() == model_name or model_name == 'all') and (m.containsType(m_type) or m_type == 'all') else None, self.metrics_list)))

    def addMetric(self, metric):
        self.metrics_list.append(metric)

    def printMeasures(self, model='all', metrics='all'):
        metrics = self.getMetrics(model_name=model, m_type=metrics)

        measurements = []
        for metric in metrics:
            metric_measures = metric.getMeasures()
            for measure in metric_measures:
                if measure not in measurements:
                    measurements.append(measure)

        print('{:10}'.format('model'), end='')
        for measure in measurements:
            print('{:11}'.format(measure), end='')
        print('\n', end='')
        print('-------'*(len(measurements)+1))

        printed_models = []
        for metric in metrics:
            metric_name = metric.getName()
            
            if metric_name not in printed_models:
                print('{:9}'.format(metric_name), end='')
                metric_values = metric.getValues()
                for measure in measurements:
                    if measure in metric_values:
                        # Edit this next line to calculate the values across all of the folds!
                        #print('{:6.2f}'.format(100*metric_values[measure]), end='')

                        # grab all of the metrics with the same key and calculate the mean pm std dev
                        vals = []
                        for m in metrics:
                            if m.getName() == metric_name:
                                vals.append(m.getValues()[measure])
                        #print('vals for {}:{}'.format(metric_name, vals))
                        print('{:6.2f}\u00B1{:6<.2f}'.format(100*np.mean(vals), 100*np.std(vals)), end='')
                    else:
                        print(' '*11, end='')
                printed_models.append(metric_name)
                print('\n', end='')

In [191]:
# Metric manager tests
mm = MetricsManager()

m = Metric('rf', fold=1)
m.addValue('acc', 0.97)
m.addValue('time', 0.99)
mm.addMetric(m)

m = Metric('rf', fold=2)
m.addValue('acc', 0.95)
m.addValue('time', 0.99)
mm.addMetric(m)

m = Metric('rf', fold=3)
m.addValue('acc', 0.93)
m.addValue('time', 0.99)
mm.addMetric(m)

m = Metric('dt', fold=1)
m.addValue('time', 0.75)
mm.addMetric(m)

m = Metric('xgboost', fold=1)
m.addValue('acc', 0.99)
mm.addMetric(m)

mm.printMeasures(metrics='acc')

model     acc        
--------------
rf        95.00±1.63
xgboost   99.00±0.00


In [140]:
def train_and_eval_on(X, y, feature_set):
    """
    train_and_eval_on function
        Description: This function will train all the models on the given feature set of the X (data) for predicting y (target)

        Args: 
            X => pd.DataFrame object containing the data
            y => pd.Series object containings the target classifications
            feature_set => list of features in X to use for training

        Returns:
            metrics => dictionary where the model names are the key and a list of accuracies across all folds is the value
                    Keys:
                        Random Forest => rf
                        Decision Tree => dt
                        k-Nearest Neighbors => knn
                        Support Vector Machine => svm
                        Logistic Regression => lr
                        Linear Discriminant Analysis => lda
                        AdaBoost => ab
                        Naive Bayes => nb
                        Keras-TensorFlow => keras
                        Fast.ai => fastai
    """
    metrics = {'rf':[],
                'dt':[],
                'knn':[],
                'svm':[],
                'lr':[],
                'lda':[],
                'ab':[],
                'nb':[],
                'keras':[],
                'fastai':[]}

    # Select the given features within the data
    X = X[feature_set]

    print('Training with {} features'.format(len(X.columns)))

    # Create stratified, 10-fold cross validation object
    random_state = 0
    sss = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)

    i=1

    # Experiment with 10-fold cross validation
    for train_idx, test_idx in sss.split(X, y):

        print('fold num {}'.format(i))
        i+=1

        # Split the data into the training and testing sets
        print('splitting data')
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        # Random Forest Model
        print('creating rf')
        rf = RandomForestClassifier(random_state=random_state)
        print('fitting rf')
        rf.fit(X_train, y_train)
        print('scoring rf')
        score = rf.score(X_test, y_test)
        metrics['rf'].append(score)

        # Decision Tree Model
        dt = DecisionTreeClassifier(random_state=random_state)
        dt.fit(X_train, y_train)
        score = dt.score(X_test, y_test)
        metrics['dt'].append(score)

        # k-Nearest Neighbors Model
        knn = KNeighborsClassifier()
        knn.fit(X_train, y_train)
        score = knn.score(X_test, y_test)
        metrics['knn'].append(score)

        # Support Vector Machine Model
        svm = SVC(random_state=random_state)
        svm.fit(X_train, y_train)
        score = svm.score(X_test, y_test)
        metrics['svm'].append(score)

        # Logistic Regression Model
        lr = LogisticRegression(random_state=random_state)
        lr.fit(X_train, y_train)
        score = lr.score(X_test, y_test)
        metrics['lr'].append(score)

        # Linear Discriminant Analysis Model
        lda = LinearDiscriminantAnalysis()
        lda.fit(X_train, y_train)
        score = lda.score(X_test, y_test)
        metrics['lda'].append(score)

        # AdaBoost Model
        ab = AdaBoostClassifier(random_state=random_state)
        ab.fit(X_train, y_train)
        score = ab.score(X_test, y_test)
        metrics['ab'].append(score)

        # Naive Bayes Model
        nb = GaussianNB()
        nb.fit(X_train, y_train)
        score = nb.score(X_test, y_test)
        metrics['nb'].append(score)

        # Keras-TensorFlow DNN Model
        dnn_keras = Sequential(layers=[
                                 Dense(128, kernel_regularizer=l2(0.001), activation='relu',input_shape=(len(X_train.columns),)),
                                 BatchNormalization(),
                                 Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
                                 BatchNormalization(),
                                 Dense(y_train.nunique(), activation='softmax')
        ])
        dnn_keras.compile(
            optimizer='adam', 
            loss='categorical_crossentropy', 
            metrics=['accuracy'])
        dnn_keras.fit(X_train, pd.get_dummies(y_train), epochs=100, verbose=0, batch_size=512)
        _, score = dnn_keras.evaluate(X_test, pd.get_dummies(y_test), verbose=0)
        metrics['keras'].append(score)

        # Fast.ai DNN Model
        data_fold = (TabularList.from_df(df, path=path, cont_names=X_train.columns, procs=[Categorify, Normalize])
                     .split_by_idxs(train_idx, test_idx)
                     .label_from_df(cols=dep_var)
                     .databunch(num_workers=0))
        dnn_fastai = tabular_learner(data_fold, layers=[200, 100], metrics=accuracy)
        dnn_fastai.fit_one_cycle(cyc_len=10, callbacks=None)
        _, score = dnn_fastai.validate()
        metrics['fastai'].append(score)

    return metrics

In [141]:
def show_graph(figure, feature_count, metrics_dict, exp_type=''):
  """
  show_graph function

    Description: This function will take the metrics dictionary provided and update the graph already to show the most recent results

    Args:
      figure => matplotlib.pyplot.figure object
      metrics_dict => dictionary of metrics as described in `train_and_eval_on` function
      exp_type => string indicating the type of experiment to change the title of the graph

    Returns:
      nothing
  """
  # Reorganize the data so we have all of the random forest metrics with increasing features side by side
  reorganized_dictionary = {}

  for feature_vals in metrics_dict.keys():
    for key in metrics_dict[feature_vals].keys():
      # If a given model is not in the new dictionary, add it
      if key not in reorganized_dictionary:
        reorganized_dictionary[key] = {}

      # If there isn't a specific feature number in the model dictionary, add it
      if feature_vals not in reorganized_dictionary[key]:
        reorganized_dictionary[key][feature_vals] = []

      # If there is anything to the record, add it
      if len( metrics_dict[feature_vals][key] ) > 0:
        accuracies = metrics_dict[feature_vals][key]
        mean = np.mean(accuracies)
        std = np.std(accuracies)

        #print('Accuracies: {}'.format(accuracies))
        #print('Mean: {}'.format(mean))
        #print('Std: {}'.format(std))

        reorganized_dictionary[key][feature_vals].append( [mean, std] ) 

  #print('Models: {}'.format( list(reorganized_dictionary.keys()) ))

  for model in reorganized_dictionary.keys():
    # The x-axis will have the feature_count
    xs = []

    # The y-axis will have the accuracy for that feature_count value
    ys = []

    # The y-axis will also have the std for these accuracies since they are accumulated over 10 folds
    yerrs = []

    for x in reorganized_dictionary[model].keys():
      if len(reorganized_dictionary[model][x]) > 0:
        xs.append(x)
        ys.append(reorganized_dictionary[model][x][0][0])
        yerrs.append(reorganized_dictionary[model][x][0][1])
    #print('xs: {}'.format(xs))
    #print('ys: {}'.format(ys))
    if len(xs) > 0:
      plt.errorbar(x=xs, y=ys, yerr=yerrs, label=model)

  #print(reorganized_dictionary)
  if exp_type == 'multi':
    plt.title('Multi-class Classification Model Accuracies with Increasing Features')
  elif exp_type == 'binary':
    plt.title('Binary Classification Model Accuracies with Increasing Features')
  plt.ylabel('Accuracy')
  plt.xlabel('Number of Features')

  plt.xticks(xs[4::5])

  plt.legend()
  plt.show()


In [142]:
def get_data(path, layer=0, nans=False):
    """ get_data function
        Description: This function will take the given path and user-defined layer from the dataset, import the datafiles, and then return the combined pandas DataFrame
        Arguments:
            path => string, path to the directory containing the l1-doh.csv, l1-nonhod.csv, etc files.
            layer => int, the level of layer desired. This will change the dataset that is imported. Values can be 1 or 2. Default is 0.
            nans => boolean, Whether the user wants NaNs in the data or wants them removed. This function will automatically remove all rows with Nan values.
        Returns:
            df => pandas.DataFrame, contains complete data
        Raises:
            AttributeError for incorrect layer number
            Any additional read errors are raised to the user
    """
    import pandas as pd

    if layer not in [1,2]:
        raise AttributeError('Must provide valid layer for dataset: layer equals 1 or 2')
    else:

        # Select the files that the user has chosen
        filenames = []
        if layer == 1:
            filenames.append('l1-doh.csv')
            filenames.append('l1-nondoh.csv')
        else:
            filenames.append('l2-benign.csv')
            filenames.append('l2-malicious.csv')

        # Read the files into dataframes
        df0 = pd.read_csv(path + '/' + filenames[0])
        df1 = pd.read_csv(path + '/' + filenames[1])

        df = pd.concat([df0, df1])

        # Remove any rows with Nan values
        if not nans:
            df.dropna(axis='index', inplace=True)

        return df

In [143]:
def balance_data(df, label_column):
    labels = df[label_column].unique()
    sample_length_list = []
    for i in range(len(labels)):
        samples = df.loc[ df[label_column] == labels[i] ]
        sample_length_list.append( len(samples) )
        #print('Number of {} samples: {}'.format(labels[i], len( samples )))

    random_state = 0
    smallest_count = min(sample_length_list)
    dfs = []
    for i in range(len(labels)):
        #dfs.append( df.loc[ df[label_column] == labels[i] ].sample(smallest_count) )

        # We are only sampling 40 purely for testing reasons to help speed up the dev process!
        # Uncomment the line above this to actually run the complete tests
        dfs.append( df.loc[ df[label_column] == labels[i] ].sample(40) )

    return pd.concat(dfs)

In [144]:
# Set up google drive access
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Layer 1 Experiments: DoH or nonDoH

In [145]:
path = '/content/gdrive/My Drive/doh_dataset/Total-CSVs'
df = get_data(path=path, layer=1)
df.head()

Unnamed: 0,SourceIP,DestinationIP,SourcePort,DestinationPort,TimeStamp,Duration,FlowBytesSent,FlowSentRate,FlowBytesReceived,FlowReceivedRate,PacketLengthVariance,PacketLengthStandardDeviation,PacketLengthMean,PacketLengthMedian,PacketLengthMode,PacketLengthSkewFromMedian,PacketLengthSkewFromMode,PacketLengthCoefficientofVariation,PacketTimeVariance,PacketTimeStandardDeviation,PacketTimeMean,PacketTimeMedian,PacketTimeMode,PacketTimeSkewFromMedian,PacketTimeSkewFromMode,PacketTimeCoefficientofVariation,ResponseTimeTimeVariance,ResponseTimeTimeStandardDeviation,ResponseTimeTimeMean,ResponseTimeTimeMedian,ResponseTimeTimeMode,ResponseTimeTimeSkewFromMedian,ResponseTimeTimeSkewFromMode,ResponseTimeTimeCoefficientofVariation,Label
0,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:49:11,95.08155,62311,655.342703,65358,687.388878,7474.676771,86.456213,135.673751,102.0,54,1.168467,0.944683,0.637236,670.585814,25.895672,45.065277,48.811292,1.49506,-0.433974,1.682529,0.574626,0.001053,0.032457,0.027624,0.026854,0.026822,0.071187,0.024715,1.174948,DoH
1,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:50:52,122.309318,93828,767.136973,101232,827.672018,10458.118598,102.264943,141.245474,114.0,54,0.799261,0.853132,0.724023,708.465878,26.617022,52.287903,48.830314,31.719656,0.389704,0.772748,0.509047,0.00117,0.0342,0.024387,0.021043,0.026981,0.293297,-0.075845,1.402382,DoH
2,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:52:55,120.958413,38784,320.639127,38236,316.108645,7300.293933,85.441758,133.715278,89.0,54,1.570027,0.932978,0.638983,1358.911235,36.863413,50.316114,39.770747,0.417528,0.858198,1.353607,0.732636,0.000785,0.028021,0.029238,0.026921,0.026855,0.248064,0.085061,0.958348,DoH
3,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:54:56,110.50108,61993,561.017141,69757,631.278898,8499.282518,92.191553,139.123548,114.0,54,0.817544,0.923333,0.66266,1118.135436,33.438532,51.693726,34.882495,13.280934,1.508251,1.148758,0.646859,0.000411,0.020274,0.019925,0.019268,0.026918,0.097199,-0.344926,1.017535,DoH
4,176.103.130.131,192.168.20.191,443,50749,2020-01-14 15:56:46,54.229891,83641,1542.341289,76804,1416.266907,8052.745751,89.737092,138.91342,114.0,114,0.83288,0.277627,0.645993,341.696613,18.485038,36.435619,49.822561,7.342519,-2.172613,1.573873,0.507334,0.079079,0.281209,0.02593,4.7e-05,2.1e-05,0.276133,0.092135,10.844829,DoH


In [146]:
bad_columns = ['SourceIP', 'DestinationIP', 'SourcePort', 'DestinationPort', 'TimeStamp']
df.drop(labels=bad_columns, axis='columns', inplace=True)

In [147]:
# The target classifications are in the 'Label' columns, 
#  thus this is the independent variable!
dep_var = 'Label'
df[dep_var].value_counts()

NonDoH    889809
DoH       269299
Name: Label, dtype: int64

In [148]:
# Balance the data out
df = balance_data(df, dep_var)

In [149]:
df[dep_var].value_counts()

NonDoH    40
DoH       40
Name: Label, dtype: int64

In [150]:
# Split up the data into the data (X) and classifications (y)
X = df.loc[:, df.columns != dep_var]
y = df[dep_var]

In [151]:
best_features_layer1 = ['Duration', 'ResponseTimeTimeSkewFromMedian', 'ResponseTimeTimeMode',
       'ResponseTimeTimeMedian', 'ResponseTimeTimeMean',
       'PacketTimeSkewFromMedian', 'PacketTimeMode', 'PacketTimeMedian',
       'PacketTimeMean', 'ResponseTimeTimeSkewFromMode', 'PacketTimeVariance',
       'PacketLengthCoefficientofVariation', 'PacketTimeStandardDeviation',
       'PacketLengthMode', 'PacketLengthMedian', 'PacketLengthMean',
       'FlowBytesSent', 'ResponseTimeTimeCoefficientofVariation',
       'PacketLengthStandardDeviation', 'PacketLengthVariance',
       'PacketTimeCoefficientofVariation', 'FlowReceivedRate',
       'ResponseTimeTimeStandardDeviation', 'PacketLengthSkewFromMode',
       'FlowBytesReceived', 'PacketLengthSkewFromMedian', 'FlowSentRate',
       'ResponseTimeTimeVariance', 'PacketTimeSkewFromMode']
print('These are the best 4 features for layer 1: {}'.format(best_features_layer1[:4]))
print('These are the worst 4 features for layer 1: {}'.format(best_features_layer1[-4:]))

These are the best 4 features for layer 1: ['Duration', 'ResponseTimeTimeSkewFromMedian', 'ResponseTimeTimeMode', 'ResponseTimeTimeMedian']
These are the worst 4 features for layer 1: ['PacketLengthSkewFromMedian', 'FlowSentRate', 'ResponseTimeTimeVariance', 'PacketTimeSkewFromMode']


In [152]:
l1_performance_metrics = train_and_eval_on(X=X, y=y, feature_set=best_features_layer1)

Training with 29 features
fold num 1
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.658,0.662643,0.5,00:00
1,0.647074,0.632397,0.5,00:00
2,0.586271,0.581118,0.75,00:00
3,0.521237,0.538254,0.875,00:00
4,0.464888,0.508344,0.875,00:00
5,0.415401,0.481748,0.875,00:00
6,0.375769,0.454852,0.875,00:00
7,0.344257,0.430754,0.875,00:00
8,0.318929,0.412161,0.875,00:00
9,0.297068,0.399837,0.875,00:00


fold num 2
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.873688,0.68823,0.5,00:00
1,0.855985,0.676121,0.5,00:00
2,0.780844,0.659493,0.5,00:00
3,0.67256,0.63951,0.5,00:00
4,0.588202,0.620155,0.75,00:00
5,0.520592,0.602809,0.875,00:00
6,0.4707,0.586704,0.875,00:00
7,0.428249,0.574017,0.875,00:00
8,0.393187,0.5634,0.875,00:00
9,0.362478,0.554159,0.875,00:00


fold num 3
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.919794,0.704007,0.5,00:00
1,0.886671,0.700284,0.5,00:00
2,0.785696,0.693243,0.625,00:00
3,0.677714,0.690487,0.625,00:00
4,0.589734,0.69497,0.75,00:00
5,0.524984,0.704155,0.75,00:00
6,0.47415,0.716629,0.75,00:00
7,0.432469,0.727801,0.75,00:00
8,0.3989,0.735077,0.75,00:00
9,0.369746,0.741759,0.75,00:00


fold num 4
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.739352,0.783164,0.75,00:00
1,0.712927,0.63311,0.875,00:00
2,0.640778,0.814626,0.875,00:00
3,0.556504,1.213004,0.875,00:00
4,0.495412,1.638979,0.875,00:00
5,0.450401,2.152503,0.875,00:00
6,0.407617,2.592076,0.875,00:00
7,0.376706,3.005668,0.875,00:00
8,0.349197,3.458602,0.875,00:00
9,0.326651,3.793485,0.875,00:00


fold num 5
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.77881,0.681202,0.625,00:00
1,0.767223,0.667533,0.625,00:00
2,0.691385,0.6401,0.625,00:00
3,0.606668,0.612028,0.75,00:00
4,0.537829,0.589467,0.75,00:00
5,0.48231,0.571795,0.75,00:00
6,0.436918,0.555896,0.625,00:00
7,0.398758,0.542364,0.625,00:00
8,0.369591,0.528532,0.75,00:00
9,0.343074,0.514229,0.75,00:00


fold num 6
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.785396,0.686237,0.375,00:00
1,0.762292,0.665147,0.5,00:00
2,0.694068,0.621248,0.625,00:00
3,0.612529,0.572217,1.0,00:00
4,0.545689,0.526508,1.0,00:00
5,0.487308,0.484023,1.0,00:00
6,0.444148,0.445635,1.0,00:00
7,0.406765,0.412376,1.0,00:00
8,0.376899,0.385627,1.0,00:00
9,0.354078,0.363483,1.0,00:00


fold num 7
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.819633,1.013445,0.5,00:00
1,0.81217,0.9595,0.625,00:00
2,0.732619,0.781004,0.5,00:00
3,0.63536,0.644601,0.75,00:00
4,0.562504,0.583277,0.75,00:00
5,0.499758,0.554323,0.75,00:00
6,0.456722,0.538389,0.75,00:00
7,0.420981,0.52744,0.75,00:00
8,0.390646,0.520111,0.75,00:00
9,0.367486,0.513839,0.75,00:00


fold num 8
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.847742,0.712203,0.25,00:00
1,0.833421,0.69272,0.5,00:00
2,0.764226,0.651678,0.625,00:00
3,0.671594,0.604861,0.875,00:00
4,0.588863,0.562497,0.875,00:00
5,0.533738,0.531381,0.75,00:00
6,0.488072,0.506413,0.75,00:00
7,0.448847,0.488103,0.75,00:00
8,0.413027,0.474062,0.75,00:00
9,0.386314,0.464377,0.75,00:00


fold num 9
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.59057,0.691346,0.5,00:00
1,0.577987,0.667692,0.625,00:00
2,0.520377,0.620836,0.75,00:00
3,0.469259,0.578696,1.0,00:00
4,0.423119,0.540191,1.0,00:00
5,0.382558,0.498017,1.0,00:00
6,0.349029,0.465772,1.0,00:00
7,0.322417,0.437575,1.0,00:00
8,0.299716,0.411787,1.0,00:00
9,0.282959,0.392138,1.0,00:00


fold num 10
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.703666,0.668829,0.75,00:00
1,0.683672,0.642324,0.75,00:00
2,0.627842,0.603521,0.75,00:00
3,0.560688,0.562382,0.75,00:00
4,0.49955,0.524358,0.75,00:00
5,0.456132,0.499535,0.75,00:00
6,0.419005,0.483894,0.75,00:00
7,0.385424,0.473334,0.75,00:00
8,0.358382,0.467251,0.75,00:00
9,0.337172,0.464479,0.75,00:00


In [153]:
print('Models\tAccuracy (%)')
for key in l1_performance_metrics.keys():
  avg = np.average(l1_performance_metrics[key])
  std = np.std(l1_performance_metrics[key])
  print('{}\t{:.2f}\u00B1{:.2f}'.format(key, avg*100, std*100))

Models	Accuracy (%)
rf	93.75±8.39
dt	88.75±10.38
knn	85.00±12.25
svm	67.50±11.46
lr	75.00±17.68
lda	77.50±13.46
ab	91.25±8.00
nb	72.50±10.90
keras	73.75±18.07
fastai	83.75±9.76


## Layer 2 Experiments: Benign-DoH or Malicious-DoH

In [154]:
path = '/content/gdrive/My Drive/doh_dataset/Total-CSVs'
df = get_data(path=path, layer=2)
df.head()

Unnamed: 0,SourceIP,DestinationIP,SourcePort,DestinationPort,TimeStamp,Duration,FlowBytesSent,FlowSentRate,FlowBytesReceived,FlowReceivedRate,PacketLengthVariance,PacketLengthStandardDeviation,PacketLengthMean,PacketLengthMedian,PacketLengthMode,PacketLengthSkewFromMedian,PacketLengthSkewFromMode,PacketLengthCoefficientofVariation,PacketTimeVariance,PacketTimeStandardDeviation,PacketTimeMean,PacketTimeMedian,PacketTimeMode,PacketTimeSkewFromMedian,PacketTimeSkewFromMode,PacketTimeCoefficientofVariation,ResponseTimeTimeVariance,ResponseTimeTimeStandardDeviation,ResponseTimeTimeMean,ResponseTimeTimeMedian,ResponseTimeTimeMode,ResponseTimeTimeSkewFromMedian,ResponseTimeTimeSkewFromMode,ResponseTimeTimeCoefficientofVariation,Label
0,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:49:11,95.08155,62311,655.342703,65358,687.388878,7474.676771,86.456213,135.673751,102.0,54,1.168467,0.944683,0.637236,670.585814,25.895672,45.065277,48.811292,1.49506,-0.433974,1.682529,0.574626,0.001053,0.032457,0.027624,0.026854,0.026822,0.071187,0.024715,1.174948,Benign
1,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:50:52,122.309318,93828,767.136973,101232,827.672018,10458.118598,102.264943,141.245474,114.0,54,0.799261,0.853132,0.724023,708.465878,26.617022,52.287903,48.830314,31.719656,0.389704,0.772748,0.509047,0.00117,0.0342,0.024387,0.021043,0.026981,0.293297,-0.075845,1.402382,Benign
2,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:52:55,120.958413,38784,320.639127,38236,316.108645,7300.293933,85.441758,133.715278,89.0,54,1.570027,0.932978,0.638983,1358.911235,36.863413,50.316114,39.770747,0.417528,0.858198,1.353607,0.732636,0.000785,0.028021,0.029238,0.026921,0.026855,0.248064,0.085061,0.958348,Benign
3,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:54:56,110.50108,61993,561.017141,69757,631.278898,8499.282518,92.191553,139.123548,114.0,54,0.817544,0.923333,0.66266,1118.135436,33.438532,51.693726,34.882495,13.280934,1.508251,1.148758,0.646859,0.000411,0.020274,0.019925,0.019268,0.026918,0.097199,-0.344926,1.017535,Benign
4,176.103.130.131,192.168.20.191,443,50749,2020-01-14 15:56:46,54.229891,83641,1542.341289,76804,1416.266907,8052.745751,89.737092,138.91342,114.0,114,0.83288,0.277627,0.645993,341.696613,18.485038,36.435619,49.822561,7.342519,-2.172613,1.573873,0.507334,0.079079,0.281209,0.02593,4.7e-05,2.1e-05,0.276133,0.092135,10.844829,Benign


In [155]:
bad_columns = ['SourceIP', 'DestinationIP', 'SourcePort', 'DestinationPort', 'TimeStamp']
df.drop(labels=bad_columns, axis='columns', inplace=True)

In [156]:
# The target classifications are in the 'Label' columns, 
#  thus this is the independent variable!
dep_var = 'Label'
df[dep_var].value_counts()

Malicious    249553
Benign        19746
Name: Label, dtype: int64

In [157]:
# Balance the data out
df = balance_data(df, dep_var)

In [158]:
df[dep_var].value_counts()

Benign       40
Malicious    40
Name: Label, dtype: int64

In [159]:
# Split up the data into the data (X) and classifications (y)
X = df.loc[:, df.columns != dep_var]
y = df[dep_var]

In [160]:
best_features_layer2 = ['PacketLengthStandardDeviation', 'PacketLengthCoefficientofVariation',
       'FlowReceivedRate', 'PacketLengthMean', 'Duration',
       'PacketTimeSkewFromMedian', 'FlowSentRate', 'PacketLengthVariance',
       'PacketTimeMean', 'PacketTimeStandardDeviation',
       'ResponseTimeTimeMedian', 'PacketTimeMedian',
       'ResponseTimeTimeSkewFromMode', 'ResponseTimeTimeMean',
       'ResponseTimeTimeMode', 'PacketTimeCoefficientofVariation',
       'ResponseTimeTimeSkewFromMedian', 'PacketTimeMode', 'FlowBytesSent',
       'FlowBytesReceived', 'PacketLengthMode',
       'ResponseTimeTimeCoefficientofVariation', 'PacketLengthSkewFromMedian',
       'PacketTimeVariance', 'PacketLengthMedian', 'PacketTimeSkewFromMode',
       'ResponseTimeTimeStandardDeviation', 'ResponseTimeTimeVariance',
       'PacketLengthSkewFromMode']
print('These are the best 4 features for layer 2: {}'.format(best_features_layer2[:4]))
print('These are the worst 4 features for layer 2: {}'.format(best_features_layer2[-4:]))

These are the best 4 features for layer 2: ['PacketLengthStandardDeviation', 'PacketLengthCoefficientofVariation', 'FlowReceivedRate', 'PacketLengthMean']
These are the worst 4 features for layer 2: ['PacketTimeSkewFromMode', 'ResponseTimeTimeStandardDeviation', 'ResponseTimeTimeVariance', 'PacketLengthSkewFromMode']


In [161]:
l2_performance_metrics = train_and_eval_on(X=X, y=y, feature_set=best_features_layer2)

Training with 29 features
fold num 1
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.656338,0.670049,0.5,00:00
1,0.620491,0.653161,0.5,00:00
2,0.580415,0.628044,0.625,00:00
3,0.514519,0.597889,0.625,00:00
4,0.450856,0.565691,0.625,00:00
5,0.403949,0.539517,0.625,00:00
6,0.365541,0.521436,0.625,00:00
7,0.331803,0.507643,0.625,00:00
8,0.309029,0.497451,0.625,00:00
9,0.288026,0.48949,0.625,00:00


fold num 2
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.970978,0.711229,0.5,00:00
1,0.943185,0.686186,0.5,00:00
2,0.850576,0.623388,0.875,00:00
3,0.736611,0.56182,1.0,00:00
4,0.643459,0.51881,0.875,00:00
5,0.580251,0.493924,0.875,00:00
6,0.524095,0.483016,0.875,00:00
7,0.478432,0.476715,0.875,00:00
8,0.443575,0.468903,0.875,00:00
9,0.413214,0.459397,0.875,00:00


fold num 3
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.699498,0.712801,0.5,00:00
1,0.690307,0.689839,0.5,00:00
2,0.630675,0.639822,0.625,00:00
3,0.561794,0.588795,0.875,00:00
4,0.494656,0.533335,0.875,00:00
5,0.442199,0.487644,1.0,00:00
6,0.40637,0.451466,0.875,00:00
7,0.373923,0.422004,0.875,00:00
8,0.348986,0.399465,0.875,00:00
9,0.325742,0.382056,0.875,00:00


fold num 4
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.621601,0.699438,0.5,00:00
1,0.619555,0.692524,0.375,00:00
2,0.571143,0.680645,0.375,00:00
3,0.50165,0.6678,0.375,00:00
4,0.447568,0.655256,0.75,00:00
5,0.396754,0.63947,0.75,00:00
6,0.361894,0.626264,0.75,00:00
7,0.335146,0.61601,0.875,00:00
8,0.310183,0.608273,0.875,00:00
9,0.289665,0.604695,0.875,00:00


fold num 5
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.929395,0.711182,0.375,00:00
1,0.909437,0.683128,0.5,00:00
2,0.834777,0.627549,0.875,00:00
3,0.716317,0.573808,0.875,00:00
4,0.623072,0.537263,0.875,00:00
5,0.553336,0.510507,0.875,00:00
6,0.49933,0.491829,0.875,00:00
7,0.453091,0.475526,0.875,00:00
8,0.417937,0.4612,0.875,00:00
9,0.389205,0.448009,0.875,00:00


fold num 6
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.799218,0.665199,0.875,00:00
1,0.779173,0.663154,0.75,00:00
2,0.713417,0.647216,0.625,00:00
3,0.622408,0.604103,0.75,00:00
4,0.55531,0.563059,0.625,00:00
5,0.493167,0.540462,0.625,00:00
6,0.445596,0.528139,0.625,00:00
7,0.408524,0.519605,0.625,00:00
8,0.37809,0.512991,0.625,00:00
9,0.353213,0.507728,0.625,00:00


fold num 7
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.691257,0.691702,0.375,00:00
1,0.67067,0.675533,0.625,00:00
2,0.600166,0.639832,0.625,00:00
3,0.517166,0.602529,0.75,00:00
4,0.463032,0.571104,0.875,00:00
5,0.415927,0.545262,0.875,00:00
6,0.3756,0.523614,0.875,00:00
7,0.340569,0.505204,0.875,00:00
8,0.312825,0.488544,0.875,00:00
9,0.290826,0.474576,0.875,00:00


fold num 8
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.777086,0.711782,0.375,00:00
1,0.732147,0.683152,0.5,00:00
2,0.661201,0.631233,0.75,00:00
3,0.588882,0.590016,0.875,00:00
4,0.528521,0.55969,0.875,00:00
5,0.473312,0.532616,0.875,00:00
6,0.431649,0.509771,0.875,00:00
7,0.40031,0.490961,0.875,00:00
8,0.371226,0.475106,0.875,00:00
9,0.347155,0.461835,0.875,00:00


fold num 9
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.863605,0.695153,0.625,00:00
1,0.817289,0.67138,0.75,00:00
2,0.745293,0.619203,0.875,00:00
3,0.650546,0.56768,0.875,00:00
4,0.576367,0.526989,0.875,00:00
5,0.522479,0.496423,0.875,00:00
6,0.477576,0.471768,0.875,00:00
7,0.439974,0.451651,0.875,00:00
8,0.407385,0.433811,0.875,00:00
9,0.380431,0.417228,0.875,00:00


fold num 10
splitting data
creating rf
fitting rf
scoring rf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.922671,0.678825,0.5,00:00
1,0.911193,0.67221,0.375,00:00
2,0.814869,0.662346,0.625,00:00
3,0.692902,0.66321,0.75,00:00
4,0.605226,0.67455,0.75,00:00
5,0.533592,0.695053,0.75,00:00
6,0.479474,0.720936,0.75,00:00
7,0.438493,0.750095,0.75,00:00
8,0.400723,0.774683,0.75,00:00
9,0.371209,0.794913,0.75,00:00


In [162]:
print('Models\tAccuracy (%)')
for key in l2_performance_metrics.keys():
  avg = np.average(l2_performance_metrics[key])
  std = np.std(l2_performance_metrics[key])
  print('{}\t{:.2f}\u00B1{:.2f}'.format(key, avg*100, std*100))

Models	Accuracy (%)
rf	88.75±10.38
dt	92.50±10.00
knn	75.00±12.50
svm	75.00±11.18
lr	73.75±14.20
lda	72.50±10.90
ab	93.75±11.52
nb	80.00±17.85
keras	63.75±14.20
fastai	81.25±10.08
