# DNS over HTTPS Experiments
This notebook runs all the experiments for our work on the CIRA-CIC-DoHBrw-2020 dataset. It trains and validates 8 machine learning models and 2 deep learning models using stratified 10-fold cross-validation. Additionally, the experiments determine how the performance of these models changes as we increase the size of the feature set.

In [1]:
# Import the dataset saved on the google drive
from google.colab import drive

# Graphing capabilities
import matplotlib.pyplot as plt

# Data management
import pandas as pd
import numpy as np

# For stratified 10-fold cross validation
from sklearn.model_selection import StratifiedKFold

# Scikit-Learn ML Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

# Keras-TensorFlow DNN Model
from keras.models import Sequential
from keras.layers import BatchNormalization, Dense, Dropout
from keras.regularizers import l2

# Fast.ai DNN Model
from fastai.tabular import *

# Normalization
from keras.utils import normalize, to_categorical

print('Imports complete.')

Imports complete.


In [2]:
# Objects used to help manage the metrics data
class Metric:
    """
    Metric class
        Description: Holds the named measurements (e.g. 'acc', 'time') recorded for
          one model on one cross-validation fold.

        Attributes:
            name => string key of the model the measurements belong to (e.g. 'rf')
            fold_num => fold identifier supplied by the caller
            values => dictionary mapping a measure name to its recorded value
    """

    def __init__(self, name, fold):
        self.name = name
        self.fold_num = fold
        self.values = {}

    def __str__(self):
        return str({self.name: self.values})

    def __repr__(self):
        return str({self.name: self.values})

    def addValue(self, m_type, value):
        """Record `value` under the measure `m_type`; calls with a None name or value are ignored."""
        # Identity checks: `!= None` breaks on values that overload comparison (e.g. arrays)
        if m_type is not None and value is not None:
            self.values[m_type] = value

    def getValue(self, m_type):
        """Return the value stored for `m_type`, or None when it was never recorded."""
        if m_type in self.values:
            return self.values[m_type]
        return None

    def getName(self):
        """Return the model key this metric belongs to."""
        return self.name

    def getMeasures(self):
        """Return a view of the measure names recorded so far."""
        return self.values.keys()

    def getValues(self):
        """Return the underlying {measure: value} dictionary (not a copy)."""
        return self.values

    def containsType(self, m_type):
        """
        Return True when the requested measure(s) have been recorded.

        Args:
            m_type => a measure name (str) or a list/tuple of measure names

        Returns:
            True if every requested measure is present, False otherwise
            (including when m_type is neither a string nor a list/tuple)
        """
        if isinstance(m_type, (list, tuple)):
            return all(m in self.values for m in m_type)
        if isinstance(m_type, str):
            return m_type in self.values
        return False

    def getModelWithMeasure(self, m_type):
        """
        Build a new Metric (same name and fold) restricted to the requested measures.

        Args:
            m_type => a measure name (str), a list/tuple of measure names, or the
                      string 'all' to copy every recorded measure

        Returns:
            A new Metric object, or None when m_type is not a string or list/tuple

        Raises:
            KeyError when a requested measure has not been recorded
        """
        if isinstance(m_type, str):
            # 'all' is the wildcard used by MetricsManager; only treat it as such
            # when no measure is literally named 'all'
            if m_type == 'all' and 'all' not in self.values:
                wanted = list(self.values)
            else:
                wanted = [m_type]
        elif isinstance(m_type, (list, tuple)):
            wanted = list(m_type)
        else:
            return None

        new_metric = Metric(self.name, fold=self.fold_num)
        for m in wanted:
            new_metric.addValue(m, self.values[m])

        return new_metric

class MetricsManager:
    """
    MetricsManager class
        Description: Collects Metric objects (one per model per fold) and prints a
          per-model summary of each measure as mean ± standard deviation.
    """

    def __init__(self):
        self.metrics_list = []

    def getMetrics(self, model_name='all', m_type='all'):
        """
        Return the stored metrics filtered by model and/or measure.

        Args:
            model_name => model key to keep, or 'all' for every model
            m_type => measure name, list of measure names, or 'all' for every measure

        Returns:
            A list of Metric objects. With both filters set to 'all' the internal list
            itself is returned. With m_type='all' the stored objects for the requested
            model are returned. Otherwise new Metric objects restricted to the requested
            measures are returned, and metrics lacking any requested measure are skipped.
        """
        if model_name == 'all' and m_type == 'all':
            return self.metrics_list

        selected = []
        for metric in self.metrics_list:
            if model_name != 'all' and metric.getName() != model_name:
                continue

            if m_type == 'all':
                # Nothing to strip, so do not ask the metric for a measure literally named 'all'
                selected.append(metric)
            elif metric.containsType(m_type):
                restricted = metric.getModelWithMeasure(m_type)
                if restricted is not None:
                    selected.append(restricted)
        return selected

    def addMetric(self, metric):
        """Append a Metric object to the collection."""
        self.metrics_list.append(metric)

    def printMeasures(self, model='all', metrics='all'):
        """
        Print a table with one row per model and one column per measure, each cell
        showing the mean ± standard deviation of that measure over the model's folds.

        Args:
            model => model key to print, or 'all'
            metrics => measure name, list of measure names, or 'all'

        Returns:
            Nothing
        """
        selected = self.getMetrics(model_name=model, m_type=metrics)

        # Column order is the order in which measures are first seen; values are
        # grouped per model in a single pass so folds lacking a measure are simply skipped
        measurements = []
        values_by_model = {}
        for metric in selected:
            per_measure = values_by_model.setdefault(metric.getName(), {})
            for measure, value in metric.getValues().items():
                if measure not in measurements:
                    measurements.append(measure)
                per_measure.setdefault(measure, []).append(value)

        print('{:10}'.format('model'), end='')
        for measure in measurements:
            print('{:11}'.format(measure), end='')
        print('\n', end='')
        print('-------'*(len(measurements)+1))

        # Dictionaries keep insertion order, so models print in first-seen order
        for metric_name, per_measure in values_by_model.items():
            print('{:9}'.format(metric_name), end='')
            for measure in measurements:
                if measure in per_measure:
                    vals = per_measure[measure]
                    print('{:6.2f}\u00B1{:.2f}'.format(np.mean(vals), np.std(vals)), end='')
                else:
                    print(' '*11, end='')
            print('\n', end='')

In [3]:
# Metric manager tests and examples
mm = MetricsManager()

# One entry per recorded fold: (model key, fold number, {measure: value})
example_records = [
    ('rf', 1, {'acc': 0.97, 'time': 0.99}),
    ('rf', 2, {'acc': 0.95, 'time': 0.99}),
    ('rf', 3, {'acc': 0.93, 'time': 0.99}),
    ('dt', 1, {'time': 0.75}),
    ('xgboost', 1, {'acc': 0.99, 'time': 50}),
]

for model_key, fold_number, measures in example_records:
    m = Metric(model_key, fold=fold_number)
    for measure_name, measure_value in measures.items():
        m.addValue(measure_name, measure_value)
    mm.addMetric(m)

# Only the 'time' measure is requested, so the 'acc' column is left out
mm.printMeasures(metrics=['time'])

model     time       
--------------
rf         0.99±0.00
dt         0.75±0.00
xgboost   50.00±0.00


In [4]:
def train_and_eval_on(X, y, feature_set, metrics_manager, n_splits=10, random_state=0):
    """
    train_and_eval_on function
        Description: This function trains every model on the given feature set of X (data) for predicting y (target)
          using stratified k-fold cross validation, and adds one accuracy Metric per model per fold to the
          MetricsManager object supplied by the caller.

        Args: 
            X => pd.DataFrame object containing the data
            y => pd.Series object containing the target classifications (same row order as X)
            feature_set => list of features in X to use for training
            metrics_manager => MetricsManager object (custom) that receives the results
            n_splits => number of cross validation folds (default 10)
            random_state => seed used for the fold split and for the seeded scikit-learn models (default 0)

        Returns:
            Nothing

        Notes:
            Accuracies are stored as percentages under the 'acc' measure, and folds are numbered from 1.
            The Fast.ai data is built from X and y directly, so no notebook-level variables are required.
        
        Keys used for the manager:
                        Random Forest => rf
                        Decision Tree => dt
                        k-Nearest Neighbors => knn
                        Support Vector Machine => svm
                        Logistic Regression => lr
                        Linear Discriminant Analysis => lda
                        AdaBoost => ab
                        Naive Bayes => nb
                        Keras-TensorFlow => keras
                        Fast.ai => fastai
    """

    # Select the given features within the data
    X = X[feature_set]

    print('Training with {} features'.format(len(X.columns)))

    def record_accuracy(model_key, fold, score):
        # Store the fold accuracy (as a percentage) in the manager passed by the caller
        m = Metric(model_key, fold=fold)
        m.addValue('acc', 100*score)
        metrics_manager.addMetric(m)

    # Factories (not instances) so every fold trains a brand new, unfitted model
    sklearn_models = [
        ('rf', lambda: RandomForestClassifier(random_state=random_state)),
        ('dt', lambda: DecisionTreeClassifier(random_state=random_state)),
        ('knn', lambda: KNeighborsClassifier()),
        ('svm', lambda: SVC(random_state=random_state)),
        ('lr', lambda: LogisticRegression(random_state=random_state)),
        ('lda', lambda: LinearDiscriminantAnalysis()),
        ('ab', lambda: AdaBoostClassifier(random_state=random_state)),
        ('nb', lambda: GaussianNB()),
    ]

    # Frame used by Fast.ai: selected features plus the label column, re-indexed 0..n-1 so the
    # positional fold indices from StratifiedKFold line up with its rows
    label_col = y.name if y.name is not None else 'Label'
    fold_df = pd.concat([X.reset_index(drop=True),
                         y.reset_index(drop=True).rename(label_col)], axis=1)

    # Create stratified k-fold cross validation object
    sss = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Experiment with k-fold cross validation
    for fold, (train_idx, test_idx) in enumerate(sss.split(X, y), start=1):

        print('fold num {}'.format(fold))

        # Split the data into the training and testing sets
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Scikit-Learn models: fit on the training split, score on the held-out split
        for model_key, make_model in sklearn_models:
            model = make_model()
            model.fit(X_train, y_train)
            record_accuracy(model_key, fold, model.score(X_test, y_test))

        # Keras-TensorFlow DNN Model
        dnn_keras = Sequential(layers=[
                                 Dense(128, kernel_regularizer=l2(0.001), activation='relu',input_shape=(len(X_train.columns),)),
                                 BatchNormalization(),
                                 Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
                                 BatchNormalization(),
                                 Dense(y_train.nunique(), activation='softmax')
        ])
        dnn_keras.compile(
            optimizer='adam', 
            loss='categorical_crossentropy', 
            metrics=['accuracy'])
        # Labels are one-hot encoded to match the softmax output / categorical cross-entropy loss
        dnn_keras.fit(X_train, pd.get_dummies(y_train), epochs=100, verbose=0, batch_size=512)
        _, score = dnn_keras.evaluate(X_test, pd.get_dummies(y_test), verbose=0)
        record_accuracy('keras', fold, score)

        # Fast.ai DNN Model
        data_fold = (TabularList.from_df(fold_df, cont_names=list(X.columns), procs=[Categorify, Normalize])
                     .split_by_idxs(train_idx, test_idx)
                     .label_from_df(cols=label_col)
                     .databunch(num_workers=0))
        dnn_fastai = tabular_learner(data_fold, layers=[200, 100], metrics=accuracy)
        dnn_fastai.fit_one_cycle(cyc_len=10, callbacks=None)
        _, score = dnn_fastai.validate()
        # validate() hands back the metric as a tensor; store a plain float like the other models
        record_accuracy('fastai', fold, float(score))

In [5]:
def show_graph(figure, feature_count, metrics_dict, exp_type=''):
  """
  show_graph function

    Description: This function takes the metrics dictionary provided and draws one error-bar line per model,
      showing mean accuracy (with standard deviation) against the number of features.

    Args:
      figure => matplotlib.pyplot.figure object (accepted for compatibility; not used by the function)
      feature_count => accepted for compatibility; not used by the function
      metrics_dict => dictionary of the form {feature count: {model key: list of accuracies}}
      exp_type => string indicating the type of experiment ('multi' or 'binary') to set the title of the graph

    Returns:
      nothing
  """
  # Reorganize the data so each model maps to {feature count: (mean, std)}
  stats_by_model = {}
  # Feature counts that have at least one data point, in the order they appear in metrics_dict
  plotted_feature_counts = []

  for feature_vals, per_model in metrics_dict.items():
    for key, accuracies in per_model.items():
      model_stats = stats_by_model.setdefault(key, {})

      # If there is anything to record, summarise it across the folds
      if len(accuracies) > 0:
        model_stats[feature_vals] = (np.mean(accuracies), np.std(accuracies))
        if feature_vals not in plotted_feature_counts:
          plotted_feature_counts.append(feature_vals)

  plotted_any = False
  for model, model_stats in stats_by_model.items():
    # x-axis: feature counts; y-axis: mean accuracy; error bars: std over the folds
    xs = list(model_stats.keys())
    ys = [model_stats[x][0] for x in xs]
    yerrs = [model_stats[x][1] for x in xs]

    if len(xs) > 0:
      plt.errorbar(x=xs, y=ys, yerr=yerrs, label=model)
      plotted_any = True

  if exp_type == 'multi':
    plt.title('Multi-class Classification Model Accuracies with Increasing Features')
  elif exp_type == 'binary':
    plt.title('Binary Classification Model Accuracies with Increasing Features')
  plt.ylabel('Accuracy')
  plt.xlabel('Number of Features')

  # Label every fifth feature count; with fewer than five points keep matplotlib's default ticks
  # instead of clearing the axis (and never depend on a loop variable that may not exist)
  ticks = plotted_feature_counts[4::5]
  if len(ticks) > 0:
    plt.xticks(ticks)

  # A legend without any plotted line only produces a warning
  if plotted_any:
    plt.legend()
  plt.show()


In [6]:
def get_data(path, layer=0, nans=False):
    """ get_data function
        Description: Reads the two CSV files that make up the requested dataset layer from the given
            directory and returns them stacked into a single pandas DataFrame.
        Arguments:
            path => string, path to the directory containing the l1-doh.csv, l1-nondoh.csv, l2-benign.csv
                    and l2-malicious.csv files.
            layer => int, which layer of the dataset to load: 1 (DoH / nonDoH) or 2 (benign / malicious).
                     The default of 0 is not a valid layer, so a layer must always be passed.
            nans => boolean, True keeps rows containing NaN values; False (default) drops every row
                    that contains at least one NaN.
        Returns:
            df => pandas.DataFrame, the rows of the first file followed by the rows of the second.
                  The original per-file row indices are kept, so index labels can repeat.
        Raises:
            AttributeError for incorrect layer number
            Any additional read errors are raised to the user
    """
    import pandas as pd

    # Guard clause: reject anything that is not layer 1 or layer 2
    if layer not in [1,2]:
        raise AttributeError('Must provide valid layer for dataset: layer equals 1 or 2')

    # Pick the pair of files belonging to the requested layer
    filenames = ['l1-doh.csv', 'l1-nondoh.csv'] if layer == 1 else ['l2-benign.csv', 'l2-malicious.csv']

    # Read both files and stack them in order
    frames = [pd.read_csv(path + '/' + name) for name in filenames]
    df = pd.concat(frames)

    # Keep the NaN rows only when the caller asked for them
    if nans:
        return df

    return df.dropna(axis='index')

In [7]:
def balance_data(df, label_column, n_per_class=40, random_state=0):
    """ balance_data function
        Description: Down-samples every class in the label column to the same number of rows and
            returns the sampled rows of each class stacked together (classes in order of first appearance).
        Arguments:
            df => pandas.DataFrame, the data to balance
            label_column => string, name of the column holding the class labels
            n_per_class => int or None, number of rows to draw per class. The default of 40 keeps the
                           small development-sized sample; pass None to use the size of the smallest
                           class (the complete experiment). The value is capped at the smallest class
                           size so sampling can never ask for more rows than a class has.
            random_state => seed passed to DataFrame.sample so the same rows are drawn on every run
        Returns:
            pandas.DataFrame with the same number of rows for every class (empty if df has no labels)
    """
    # Rows with a missing label cannot be assigned to a class, so they are not sampled
    labels = df[label_column].dropna().unique()
    class_frames = [df.loc[df[label_column] == label] for label in labels]

    if len(class_frames) == 0:
        return df.iloc[0:0].copy()

    smallest_count = min(len(frame) for frame in class_frames)
    if n_per_class is None:
        sample_size = smallest_count
    else:
        sample_size = min(n_per_class, smallest_count)

    return pd.concat([frame.sample(sample_size, random_state=random_state) for frame in class_frames])

In [8]:
# Mount Google Drive in the Colab runtime at /content/gdrive; the dataset
# path used in the experiment cells below lives under this mount point
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Layer 1 Experiments: DoH or nonDoH

In [9]:
# Directory on the mounted Drive that holds the dataset CSV files
path = '/content/gdrive/My Drive/doh_dataset/Total-CSVs'
# Load the layer 1 files (l1-doh.csv and l1-nondoh.csv); rows with NaN values are dropped by default
df = get_data(path=path, layer=1)
df.head()

Unnamed: 0,SourceIP,DestinationIP,SourcePort,DestinationPort,TimeStamp,Duration,FlowBytesSent,FlowSentRate,FlowBytesReceived,FlowReceivedRate,PacketLengthVariance,PacketLengthStandardDeviation,PacketLengthMean,PacketLengthMedian,PacketLengthMode,PacketLengthSkewFromMedian,PacketLengthSkewFromMode,PacketLengthCoefficientofVariation,PacketTimeVariance,PacketTimeStandardDeviation,PacketTimeMean,PacketTimeMedian,PacketTimeMode,PacketTimeSkewFromMedian,PacketTimeSkewFromMode,PacketTimeCoefficientofVariation,ResponseTimeTimeVariance,ResponseTimeTimeStandardDeviation,ResponseTimeTimeMean,ResponseTimeTimeMedian,ResponseTimeTimeMode,ResponseTimeTimeSkewFromMedian,ResponseTimeTimeSkewFromMode,ResponseTimeTimeCoefficientofVariation,Label
0,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:49:11,95.08155,62311,655.342703,65358,687.388878,7474.676771,86.456213,135.673751,102.0,54,1.168467,0.944683,0.637236,670.585814,25.895672,45.065277,48.811292,1.49506,-0.433974,1.682529,0.574626,0.001053,0.032457,0.027624,0.026854,0.026822,0.071187,0.024715,1.174948,DoH
1,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:50:52,122.309318,93828,767.136973,101232,827.672018,10458.118598,102.264943,141.245474,114.0,54,0.799261,0.853132,0.724023,708.465878,26.617022,52.287903,48.830314,31.719656,0.389704,0.772748,0.509047,0.00117,0.0342,0.024387,0.021043,0.026981,0.293297,-0.075845,1.402382,DoH
2,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:52:55,120.958413,38784,320.639127,38236,316.108645,7300.293933,85.441758,133.715278,89.0,54,1.570027,0.932978,0.638983,1358.911235,36.863413,50.316114,39.770747,0.417528,0.858198,1.353607,0.732636,0.000785,0.028021,0.029238,0.026921,0.026855,0.248064,0.085061,0.958348,DoH
3,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:54:56,110.50108,61993,561.017141,69757,631.278898,8499.282518,92.191553,139.123548,114.0,54,0.817544,0.923333,0.66266,1118.135436,33.438532,51.693726,34.882495,13.280934,1.508251,1.148758,0.646859,0.000411,0.020274,0.019925,0.019268,0.026918,0.097199,-0.344926,1.017535,DoH
4,176.103.130.131,192.168.20.191,443,50749,2020-01-14 15:56:46,54.229891,83641,1542.341289,76804,1416.266907,8052.745751,89.737092,138.91342,114.0,114,0.83288,0.277627,0.645993,341.696613,18.485038,36.435619,49.822561,7.342519,-2.172613,1.573873,0.507334,0.079079,0.281209,0.02593,4.7e-05,2.1e-05,0.276133,0.092135,10.844829,DoH


In [10]:
# Address, port and timestamp columns are removed before modelling
bad_columns = ['SourceIP', 'DestinationIP', 'SourcePort', 'DestinationPort', 'TimeStamp']
df = df.drop(columns=bad_columns)

In [11]:
# The target classifications are in the 'Label' column,
#  thus this is the dependent variable!
dep_var = 'Label'
df[dep_var].value_counts()

NonDoH    889809
DoH       269299
Name: Label, dtype: int64

In [12]:
# Balance the data out: sample the same number of rows from every class
df = balance_data(df, dep_var)

In [13]:
# Confirm that every class now has the same number of rows
df[dep_var].value_counts()

NonDoH    40
DoH       40
Name: Label, dtype: int64

In [14]:
# Separate the target classifications (y) from the remaining feature columns (X)
y = df[dep_var]
X = df.drop(columns=[dep_var])

In [15]:
# Layer 1 feature ranking, ordered from best (first) to worst (last);
# the prints below show the four entries at each end of the ranking
best_features_layer1 = ['Duration', 'ResponseTimeTimeSkewFromMedian', 'ResponseTimeTimeMode',
       'ResponseTimeTimeMedian', 'ResponseTimeTimeMean',
       'PacketTimeSkewFromMedian', 'PacketTimeMode', 'PacketTimeMedian',
       'PacketTimeMean', 'ResponseTimeTimeSkewFromMode', 'PacketTimeVariance',
       'PacketLengthCoefficientofVariation', 'PacketTimeStandardDeviation',
       'PacketLengthMode', 'PacketLengthMedian', 'PacketLengthMean',
       'FlowBytesSent', 'ResponseTimeTimeCoefficientofVariation',
       'PacketLengthStandardDeviation', 'PacketLengthVariance',
       'PacketTimeCoefficientofVariation', 'FlowReceivedRate',
       'ResponseTimeTimeStandardDeviation', 'PacketLengthSkewFromMode',
       'FlowBytesReceived', 'PacketLengthSkewFromMedian', 'FlowSentRate',
       'ResponseTimeTimeVariance', 'PacketTimeSkewFromMode']
print('These are the best 4 features for layer 1: {}'.format(best_features_layer1[:4]))
print('These are the worst 4 features for layer 1: {}'.format(best_features_layer1[-4:]))

These are the best 4 features for layer 1: ['Duration', 'ResponseTimeTimeSkewFromMedian', 'ResponseTimeTimeMode', 'ResponseTimeTimeMedian']
These are the worst 4 features for layer 1: ['PacketLengthSkewFromMedian', 'FlowSentRate', 'ResponseTimeTimeVariance', 'PacketTimeSkewFromMode']


In [16]:
# Start from an empty manager so only the layer 1 results are collected
mm = MetricsManager()

# Train and evaluate every model on the full layer 1 feature ranking
train_and_eval_on(X=X, y=y, feature_set=best_features_layer1, metrics_manager=mm)

Training with 29 features
fold num 1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


epoch,train_loss,valid_loss,accuracy,time
0,0.615438,1.580577,0.375,00:00
1,0.604253,1.252201,0.5,00:00
2,0.558038,0.561066,0.875,00:00
3,0.511953,0.466901,0.75,00:00
4,0.469057,0.426965,0.75,00:00
5,0.431724,0.401478,0.75,00:00
6,0.395923,0.38556,0.875,00:00
7,0.371257,0.374945,0.875,00:00
8,0.350126,0.367318,0.875,00:00
9,0.331463,0.36087,0.875,00:00


fold num 2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


epoch,train_loss,valid_loss,accuracy,time
0,0.644672,0.757726,0.375,00:00
1,0.636418,0.739809,0.5,00:00
2,0.597778,0.703476,0.5,00:00
3,0.544875,0.663264,0.625,00:00
4,0.489977,0.637309,0.5,00:00
5,0.451808,0.619137,0.75,00:00
6,0.421687,0.603152,0.75,00:00
7,0.391087,0.588845,0.75,00:00
8,0.372392,0.577951,0.75,00:00
9,0.348001,0.569759,0.75,00:00


fold num 3


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


epoch,train_loss,valid_loss,accuracy,time
0,0.882083,0.736766,0.125,00:00
1,0.876716,0.712057,0.375,00:00
2,0.795624,0.650845,0.875,00:00
3,0.69904,0.587196,0.875,00:00
4,0.624317,0.542047,0.875,00:00
5,0.560329,0.506923,0.875,00:00
6,0.516704,0.482158,0.875,00:00
7,0.479217,0.463935,0.875,00:00
8,0.449055,0.450576,0.875,00:00
9,0.425161,0.438851,0.875,00:00


fold num 4


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


epoch,train_loss,valid_loss,accuracy,time
0,0.8273,0.710074,0.75,00:00
1,0.803003,0.685075,0.75,00:00
2,0.735166,0.6284,0.875,00:00
3,0.651323,0.575915,0.875,00:00
4,0.568397,0.533332,0.875,00:00
5,0.525353,0.510306,0.875,00:00
6,0.491574,0.496277,0.875,00:00
7,0.456872,0.486378,0.875,00:00
8,0.427218,0.477961,0.875,00:00
9,0.408727,0.471124,0.875,00:00


fold num 5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.724292,0.788631,0.5,00:00
1,0.711576,0.743522,0.75,00:00
2,0.665292,0.642782,0.75,00:00
3,0.603812,0.558146,0.75,00:00
4,0.559903,0.49672,0.875,00:00
5,0.523346,0.452766,0.875,00:00
6,0.48453,0.423493,0.875,00:00
7,0.456119,0.401452,0.875,00:00
8,0.431924,0.384318,0.875,00:00
9,0.412804,0.371104,0.875,00:00


fold num 6


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.586985,0.667321,0.75,00:00
1,0.586774,0.655268,0.75,00:00
2,0.555455,0.639091,0.625,00:00
3,0.497547,0.620791,0.625,00:00
4,0.458519,0.607666,0.75,00:00
5,0.425794,0.594653,0.75,00:00
6,0.395469,0.582599,0.75,00:00
7,0.36926,0.571476,0.75,00:00
8,0.348396,0.561005,0.75,00:00
9,0.329563,0.552752,0.75,00:00


fold num 7


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.82452,0.715964,0.375,00:00
1,0.820558,0.705384,0.5,00:00
2,0.738375,0.678903,0.5,00:00
3,0.646842,0.64658,0.625,00:00
4,0.588153,0.620009,0.75,00:00
5,0.53517,0.60066,0.75,00:00
6,0.494264,0.585076,0.75,00:00
7,0.461517,0.572231,0.75,00:00
8,0.436136,0.56124,0.875,00:00
9,0.414326,0.551516,0.875,00:00


fold num 8


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.714852,0.664647,0.625,00:00
1,0.70665,0.64829,0.625,00:00
2,0.654672,0.62199,0.75,00:00
3,0.577305,0.587957,0.75,00:00
4,0.526831,0.56111,0.75,00:00
5,0.479313,0.541985,0.75,00:00
6,0.434793,0.52651,0.75,00:00
7,0.406443,0.514618,0.75,00:00
8,0.377304,0.505323,0.75,00:00
9,0.360103,0.498513,0.75,00:00


fold num 9


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.771193,0.73965,0.25,00:00
1,0.759561,0.70407,0.375,00:00
2,0.699544,0.663828,0.75,00:00
3,0.625955,0.602857,1.0,00:00
4,0.570407,0.553245,1.0,00:00
5,0.528422,0.519904,1.0,00:00
6,0.492197,0.497672,1.0,00:00
7,0.459243,0.481357,1.0,00:00
8,0.433193,0.466176,0.875,00:00
9,0.411372,0.453464,0.875,00:00


fold num 10


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.744218,0.691945,0.375,00:00
1,0.728977,0.681417,0.5,00:00
2,0.669456,0.666431,0.5,00:00
3,0.591472,0.654881,0.75,00:00
4,0.526573,0.656751,0.875,00:00
5,0.475614,0.665935,0.875,00:00
6,0.435316,0.678615,0.875,00:00
7,0.40315,0.68859,0.875,00:00
8,0.374671,0.693007,0.875,00:00
9,0.34997,0.693356,0.875,00:00


In [17]:
# Per-model accuracy shown as mean ± standard deviation over the folds
mm.printMeasures()

model     acc        
--------------
rf        87.50±11.18
dt        77.50±12.25
knn       77.50±9.35
svm       61.25±15.26
lr        75.00±12.50
lda       76.25±17.18
ab        81.25±13.98
nb        77.50±10.90
keras     66.25±19.41
fastai    83.75±5.73


## Layer 2 Experiments: Benign-DoH or Malicious-DoH

In [18]:
# Directory on the mounted Drive that holds the dataset CSV files
path = '/content/gdrive/My Drive/doh_dataset/Total-CSVs'
# Load the layer 2 files (l2-benign.csv and l2-malicious.csv); rows with NaN values are dropped by default
df = get_data(path=path, layer=2)
df.head()

Unnamed: 0,SourceIP,DestinationIP,SourcePort,DestinationPort,TimeStamp,Duration,FlowBytesSent,FlowSentRate,FlowBytesReceived,FlowReceivedRate,PacketLengthVariance,PacketLengthStandardDeviation,PacketLengthMean,PacketLengthMedian,PacketLengthMode,PacketLengthSkewFromMedian,PacketLengthSkewFromMode,PacketLengthCoefficientofVariation,PacketTimeVariance,PacketTimeStandardDeviation,PacketTimeMean,PacketTimeMedian,PacketTimeMode,PacketTimeSkewFromMedian,PacketTimeSkewFromMode,PacketTimeCoefficientofVariation,ResponseTimeTimeVariance,ResponseTimeTimeStandardDeviation,ResponseTimeTimeMean,ResponseTimeTimeMedian,ResponseTimeTimeMode,ResponseTimeTimeSkewFromMedian,ResponseTimeTimeSkewFromMode,ResponseTimeTimeCoefficientofVariation,Label
0,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:49:11,95.08155,62311,655.342703,65358,687.388878,7474.676771,86.456213,135.673751,102.0,54,1.168467,0.944683,0.637236,670.585814,25.895672,45.065277,48.811292,1.49506,-0.433974,1.682529,0.574626,0.001053,0.032457,0.027624,0.026854,0.026822,0.071187,0.024715,1.174948,Benign
1,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:50:52,122.309318,93828,767.136973,101232,827.672018,10458.118598,102.264943,141.245474,114.0,54,0.799261,0.853132,0.724023,708.465878,26.617022,52.287903,48.830314,31.719656,0.389704,0.772748,0.509047,0.00117,0.0342,0.024387,0.021043,0.026981,0.293297,-0.075845,1.402382,Benign
2,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:52:55,120.958413,38784,320.639127,38236,316.108645,7300.293933,85.441758,133.715278,89.0,54,1.570027,0.932978,0.638983,1358.911235,36.863413,50.316114,39.770747,0.417528,0.858198,1.353607,0.732636,0.000785,0.028021,0.029238,0.026921,0.026855,0.248064,0.085061,0.958348,Benign
3,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:54:56,110.50108,61993,561.017141,69757,631.278898,8499.282518,92.191553,139.123548,114.0,54,0.817544,0.923333,0.66266,1118.135436,33.438532,51.693726,34.882495,13.280934,1.508251,1.148758,0.646859,0.000411,0.020274,0.019925,0.019268,0.026918,0.097199,-0.344926,1.017535,Benign
4,176.103.130.131,192.168.20.191,443,50749,2020-01-14 15:56:46,54.229891,83641,1542.341289,76804,1416.266907,8052.745751,89.737092,138.91342,114.0,114,0.83288,0.277627,0.645993,341.696613,18.485038,36.435619,49.822561,7.342519,-2.172613,1.573873,0.507334,0.079079,0.281209,0.02593,4.7e-05,2.1e-05,0.276133,0.092135,10.844829,Benign


In [19]:
# Address, port and timestamp columns are removed before modelling
bad_columns = ['SourceIP', 'DestinationIP', 'SourcePort', 'DestinationPort', 'TimeStamp']
df = df.drop(columns=bad_columns)

In [20]:
# The target classifications are in the 'Label' column,
#  thus this is the dependent variable!
dep_var = 'Label'
df[dep_var].value_counts()

Malicious    249553
Benign        19746
Name: Label, dtype: int64

In [21]:
# Balance the data out: sample the same number of rows from every class
df = balance_data(df, dep_var)

In [22]:
# Confirm that every class now has the same number of rows
df[dep_var].value_counts()

Benign       40
Malicious    40
Name: Label, dtype: int64

In [23]:
# Separate the target classifications (y) from the remaining feature columns (X)
y = df[dep_var]
X = df.drop(columns=[dep_var])

In [24]:
# Layer 2 feature ranking, ordered from best (first) to worst (last);
# the prints below show the four entries at each end of the ranking
best_features_layer2 = ['PacketLengthStandardDeviation', 'PacketLengthCoefficientofVariation',
       'FlowReceivedRate', 'PacketLengthMean', 'Duration',
       'PacketTimeSkewFromMedian', 'FlowSentRate', 'PacketLengthVariance',
       'PacketTimeMean', 'PacketTimeStandardDeviation',
       'ResponseTimeTimeMedian', 'PacketTimeMedian',
       'ResponseTimeTimeSkewFromMode', 'ResponseTimeTimeMean',
       'ResponseTimeTimeMode', 'PacketTimeCoefficientofVariation',
       'ResponseTimeTimeSkewFromMedian', 'PacketTimeMode', 'FlowBytesSent',
       'FlowBytesReceived', 'PacketLengthMode',
       'ResponseTimeTimeCoefficientofVariation', 'PacketLengthSkewFromMedian',
       'PacketTimeVariance', 'PacketLengthMedian', 'PacketTimeSkewFromMode',
       'ResponseTimeTimeStandardDeviation', 'ResponseTimeTimeVariance',
       'PacketLengthSkewFromMode']
print('These are the best 4 features for layer 2: {}'.format(best_features_layer2[:4]))
print('These are the worst 4 features for layer 2: {}'.format(best_features_layer2[-4:]))

These are the best 4 features for layer 2: ['PacketLengthStandardDeviation', 'PacketLengthCoefficientofVariation', 'FlowReceivedRate', 'PacketLengthMean']
These are the worst 4 features for layer 2: ['PacketTimeSkewFromMode', 'ResponseTimeTimeStandardDeviation', 'ResponseTimeTimeVariance', 'PacketLengthSkewFromMode']


In [25]:
# Fresh metrics collector for this run; it is filled in by train_and_eval_on.
mm = MetricsManager()

# Evaluate on the layer-2 feature ranking declared in this section.
# best_features_layer1 is a different ranking and must not be reused here.
train_and_eval_on(X=X, y=y, feature_set=best_features_layer2, metrics_manager=mm)

Training with 29 features
fold num 1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.703125,0.684023,0.5,00:00
1,0.686927,0.66985,0.625,00:00
2,0.62903,0.640672,0.75,00:00
3,0.560555,0.610262,0.875,00:00
4,0.492214,0.578148,0.875,00:00
5,0.440126,0.548016,0.875,00:00
6,0.400985,0.523115,0.875,00:00
7,0.366734,0.502937,0.875,00:00
8,0.33644,0.48665,0.875,00:00
9,0.312656,0.473583,0.875,00:00


fold num 2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.812059,0.785382,0.125,00:00
1,0.807775,0.802859,0.375,00:00
2,0.73816,0.841363,0.375,00:00
3,0.646848,0.882718,0.625,00:00
4,0.571459,0.918108,0.625,00:00
5,0.508967,0.952194,0.75,00:00
6,0.461502,0.987399,0.75,00:00
7,0.423304,1.038832,0.75,00:00
8,0.395221,1.089454,0.75,00:00
9,0.371233,1.127551,0.75,00:00


fold num 3


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,1.06775,0.684581,0.375,00:00
1,1.022085,0.667485,0.375,00:00
2,0.943431,0.640187,0.75,00:00
3,0.80989,0.631933,0.625,00:00
4,0.700292,0.643792,0.75,00:00
5,0.60968,0.665861,0.75,00:00
6,0.544037,0.689083,0.75,00:00
7,0.490901,0.713269,0.75,00:00
8,0.449663,0.734598,0.75,00:00
9,0.415629,0.751422,0.75,00:00


fold num 4


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.696429,0.680623,0.5,00:00
1,0.658842,0.66823,0.625,00:00
2,0.595931,0.637231,0.875,00:00
3,0.524255,0.580582,0.875,00:00
4,0.467769,0.522569,0.875,00:00
5,0.423788,0.472764,0.875,00:00
6,0.384971,0.431364,0.875,00:00
7,0.350972,0.400276,0.875,00:00
8,0.323533,0.376076,0.875,00:00
9,0.301945,0.357335,0.875,00:00


fold num 5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.788765,0.675188,0.75,00:00
1,0.798184,0.65629,0.75,00:00
2,0.735561,0.607955,0.875,00:00
3,0.640123,0.564926,0.875,00:00
4,0.557815,0.535773,0.875,00:00
5,0.490223,0.515814,0.875,00:00
6,0.44047,0.501209,0.875,00:00
7,0.401167,0.490118,0.875,00:00
8,0.367554,0.479512,0.875,00:00
9,0.343414,0.471583,0.875,00:00


fold num 6


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.633494,0.687524,0.5,00:00
1,0.628188,0.667317,0.5,00:00
2,0.573642,0.624011,0.875,00:00
3,0.507296,0.575495,0.875,00:00
4,0.448944,0.533182,0.875,00:00
5,0.401257,0.500136,0.875,00:00
6,0.362757,0.474319,0.875,00:00
7,0.331335,0.4534,0.875,00:00
8,0.302085,0.436885,0.875,00:00
9,0.280072,0.424677,0.875,00:00


fold num 7


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.710757,0.673855,0.625,00:00
1,0.690152,0.641046,0.75,00:00
2,0.643563,0.573969,1.0,00:00
3,0.563814,0.503573,1.0,00:00
4,0.489343,0.441103,1.0,00:00
5,0.442848,0.38887,1.0,00:00
6,0.406777,0.346488,1.0,00:00
7,0.374454,0.313499,1.0,00:00
8,0.350101,0.286089,1.0,00:00
9,0.328655,0.261076,1.0,00:00


fold num 8


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.657711,0.71482,0.375,00:00
1,0.624958,0.713939,0.25,00:00
2,0.57465,0.71185,0.5,00:00
3,0.509827,0.71075,0.5,00:00
4,0.446886,0.708419,0.5,00:00
5,0.401683,0.704156,0.5,00:00
6,0.362533,0.700283,0.5,00:00
7,0.328866,0.698371,0.5,00:00
8,0.303913,0.699861,0.5,00:00
9,0.285539,0.703726,0.5,00:00


fold num 9


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.576706,0.662447,0.75,00:00
1,0.564742,0.647446,0.75,00:00
2,0.514644,0.61731,0.75,00:00
3,0.454404,0.581426,0.75,00:00
4,0.39518,0.54428,0.875,00:00
5,0.356811,0.509347,0.875,00:00
6,0.327881,0.477086,0.875,00:00
7,0.303935,0.448383,0.875,00:00
8,0.283622,0.424917,0.875,00:00
9,0.266536,0.405914,0.875,00:00


fold num 10


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




epoch,train_loss,valid_loss,accuracy,time
0,0.830582,0.672954,0.625,00:00
1,0.869137,0.667157,0.625,00:00
2,0.81846,0.660723,0.75,00:00
3,0.721832,0.659414,0.75,00:00
4,0.626003,0.659431,0.75,00:00
5,0.557452,0.653232,0.75,00:00
6,0.502395,0.644923,0.75,00:00
7,0.459387,0.635146,0.75,00:00
8,0.424484,0.628392,0.75,00:00
9,0.395135,0.624691,0.75,00:00


In [26]:
# Print the per-model summary table gathered in mm during the run above.
mm.printMeasures()

model     acc        
--------------
rf        91.25±11.25
dt        86.25±10.38
knn       76.25±11.79
svm       76.25±11.79
lr        77.50±10.90
lda       73.75±16.25
ab        93.75±8.39
nb        76.25±11.79
keras     80.00±8.29
fastai    81.25±12.81
