In [1]:
# Import the experiment objects
from alexandria.experiment import Experiment, Experiments

# Data management
import pandas as pd

print('Imports complete.')

Imports complete.


## Import the dataset

In [2]:
# Set up this function to automatically read the data and import it
def get_data(path, layer=0, nans=False):
    """ get_data function
        Description: This function will take the given path and user-defined layer from the dataset, import the datafiles, and then return the combined pandas DataFrame
        Arguments:
            path => string, path to the directory containing the l1-doh.csv, l1-nonhod.csv, etc files.
            layer => int, the level of layer desired. This will change the dataset that is imported. Values can be 1 or 2. Default is 0.
            nans => boolean, Whether the user wants NaNs in the data or wants them removed. This function will automatically remove all rows with Nan values.
        Returns:
            df => pandas.DataFrame, contains complete data
        Raises:
            AttributeError for incorrect layer number
            Any additional read errors are raised to the user
    """
    import pandas as pd

    if layer not in [1,2]:
        raise AttributeError('Must provide valid layer for dataset: layer equals 1 or 2')
    else:

        # Select the files that the user has chosen
        filenames = []
        if layer == 1:
            filenames.append('l1-doh.csv')
            filenames.append('l1-nondoh.csv')
        else:
            filenames.append('l2-benign.csv')
            filenames.append('l2-malicious.csv')

        # Read the files into dataframes
        df0 = pd.read_csv(path + '/' + filenames[0])
        df1 = pd.read_csv(path + '/' + filenames[1])

        df = pd.concat([df0, df1])

        # Remove any rows with Nan values
        if not nans:
            df.dropna(axis='index', inplace=True)

        return df

In [3]:
path = '/media/notclaytonjohnson/Seagate Portable Drive/Data/doh_dataset/Total-CSVs'

# Get the data for the DoH vs NonDoH classifications
df = get_data(path=path, layer=1)
df.head()

Unnamed: 0,SourceIP,DestinationIP,SourcePort,DestinationPort,TimeStamp,Duration,FlowBytesSent,FlowSentRate,FlowBytesReceived,FlowReceivedRate,...,PacketTimeCoefficientofVariation,ResponseTimeTimeVariance,ResponseTimeTimeStandardDeviation,ResponseTimeTimeMean,ResponseTimeTimeMedian,ResponseTimeTimeMode,ResponseTimeTimeSkewFromMedian,ResponseTimeTimeSkewFromMode,ResponseTimeTimeCoefficientofVariation,Label
0,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:49:11,95.08155,62311,655.342703,65358,687.388878,...,0.574626,0.001053,0.032457,0.027624,0.026854,0.026822,0.071187,0.024715,1.174948,DoH
1,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:50:52,122.309318,93828,767.136973,101232,827.672018,...,0.509047,0.00117,0.0342,0.024387,0.021043,0.026981,0.293297,-0.075845,1.402382,DoH
2,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:52:55,120.958413,38784,320.639127,38236,316.108645,...,0.732636,0.000785,0.028021,0.029238,0.026921,0.026855,0.248064,0.085061,0.958348,DoH
3,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:54:56,110.50108,61993,561.017141,69757,631.278898,...,0.646859,0.000411,0.020274,0.019925,0.019268,0.026918,0.097199,-0.344926,1.017535,DoH
4,176.103.130.131,192.168.20.191,443,50749,2020-01-14 15:56:46,54.229891,83641,1542.341289,76804,1416.266907,...,0.507334,0.079079,0.281209,0.02593,4.7e-05,2.1e-05,0.276133,0.092135,10.844829,DoH


In [4]:
# These are the columns that could lead to overfitting of the models
overfitting_cols = [
    'SourceIP',
    'DestinationIP',
    'SourcePort',
    'DestinationPort',
    'TimeStamp'
]

# This is the target column
target_col = 'Label'

# These are the data columns
data_cols = list(  set(df.columns) - set(overfitting_cols) - set(target_col) )

print('Data Columns: {} total'.format(len(data_cols)))
for col in data_cols: print('\t' + col) 

print('Target Column:')
print('\t' + target_col)

Data Columns: 30 total
	PacketTimeVariance
	PacketLengthMode
	FlowBytesSent
	Label
	ResponseTimeTimeSkewFromMode
	ResponseTimeTimeStandardDeviation
	ResponseTimeTimeVariance
	PacketLengthStandardDeviation
	ResponseTimeTimeCoefficientofVariation
	PacketTimeMedian
	PacketLengthVariance
	PacketTimeSkewFromMedian
	PacketLengthSkewFromMedian
	PacketTimeCoefficientofVariation
	PacketLengthMean
	PacketLengthSkewFromMode
	ResponseTimeTimeMode
	ResponseTimeTimeSkewFromMedian
	PacketTimeMode
	PacketLengthMedian
	PacketLengthCoefficientofVariation
	ResponseTimeTimeMean
	ResponseTimeTimeMedian
	FlowBytesReceived
	PacketTimeMean
	FlowSentRate
	FlowReceivedRate
	PacketTimeSkewFromMode
	PacketTimeStandardDeviation
	Duration
Target Column:
	L


In [5]:
# This is the ordered list of features from lowest p-value to highest p-value (best to worst)
#  acquired from the 'chi2_feature_selection' notebook
data_cols = [
    'Duration', 
    'ResponseTimeTimeSkewFromMedian', 
    'ResponseTimeTimeMode',
    'ResponseTimeTimeMedian', 
    'ResponseTimeTimeMean',
    'PacketTimeSkewFromMedian', 
    'PacketTimeMode', 
    'PacketTimeMedian',
    'PacketTimeMean', 
    'ResponseTimeTimeSkewFromMode', 
    'PacketTimeVariance',
    'PacketLengthCoefficientofVariation', 
    'PacketTimeStandardDeviation',
    'PacketLengthMode', 
    'PacketLengthMedian', 
    'PacketLengthMean',
    'FlowBytesSent', 
    'ResponseTimeTimeCoefficientofVariation',
    'PacketLengthStandardDeviation', 
    'PacketLengthVariance',
    'PacketTimeCoefficientofVariation', 
    'FlowReceivedRate',
    'ResponseTimeTimeStandardDeviation', 
    'PacketLengthSkewFromMode',
    'FlowBytesReceived', 
    'PacketLengthSkewFromMedian', 
    'FlowSentRate',
    'ResponseTimeTimeVariance', 
    'PacketTimeSkewFromMode'
]

In [6]:
# Create the experiments object
exps = Experiments()

# Set up the models we want to use in the experiment
models = [
    'rf',
    'dt',
    'knn',
    'da',
    'nb',
    'ab',
    'gb',
    'lr'
]

In [7]:
for i in range(1, len(data_cols)+1 ):
    print('Experiment {} has {}\n'.format(i, data_cols[:i]))
    
    # Create the experiment for these features
    exp = Experiment(
        name='Experiment with top {} features'.format(i),
        dataset=df,
        xlabels=data_cols[:i],
        ylabels=target_col,
        models=models
    )
    
    # Add the experiment object to the experiments object
    exps.addExperiment(exp)

Experiment 1 has ['Duration']

Experiment 2 has ['Duration', 'ResponseTimeTimeSkewFromMedian']

Experiment 3 has ['Duration', 'ResponseTimeTimeSkewFromMedian', 'ResponseTimeTimeMode']

Experiment 4 has ['Duration', 'ResponseTimeTimeSkewFromMedian', 'ResponseTimeTimeMode', 'ResponseTimeTimeMedian']



Library unspecified, using default library: 'scikit-learn'


Experiment 5 has ['Duration', 'ResponseTimeTimeSkewFromMedian', 'ResponseTimeTimeMode', 'ResponseTimeTimeMedian', 'ResponseTimeTimeMean']

Experiment 6 has ['Duration', 'ResponseTimeTimeSkewFromMedian', 'ResponseTimeTimeMode', 'ResponseTimeTimeMedian', 'ResponseTimeTimeMean', 'PacketTimeSkewFromMedian']

Experiment 7 has ['Duration', 'ResponseTimeTimeSkewFromMedian', 'ResponseTimeTimeMode', 'ResponseTimeTimeMedian', 'ResponseTimeTimeMean', 'PacketTimeSkewFromMedian', 'PacketTimeMode']

Experiment 8 has ['Duration', 'ResponseTimeTimeSkewFromMedian', 'ResponseTimeTimeMode', 'ResponseTimeTimeMedian', 'ResponseTimeTimeMean', 'PacketTimeSkewFromMedian', 'PacketTimeMode', 'PacketTimeMedian']

Experiment 9 has ['Duration', 'ResponseTimeTimeSkewFromMedian', 'ResponseTimeTimeMode', 'ResponseTimeTimeMedian', 'ResponseTimeTimeMean', 'PacketTimeSkewFromMedian', 'PacketTimeMode', 'PacketTimeMedian', 'PacketTimeMean']

Experiment 10 has ['Duration', 'ResponseTimeTimeSkewFromMedian', 'ResponseTimeTim

Experiment 28 has ['Duration', 'ResponseTimeTimeSkewFromMedian', 'ResponseTimeTimeMode', 'ResponseTimeTimeMedian', 'ResponseTimeTimeMean', 'PacketTimeSkewFromMedian', 'PacketTimeMode', 'PacketTimeMedian', 'PacketTimeMean', 'ResponseTimeTimeSkewFromMode', 'PacketTimeVariance', 'PacketLengthCoefficientofVariation', 'PacketTimeStandardDeviation', 'PacketLengthMode', 'PacketLengthMedian', 'PacketLengthMean', 'FlowBytesSent', 'ResponseTimeTimeCoefficientofVariation', 'PacketLengthStandardDeviation', 'PacketLengthVariance', 'PacketTimeCoefficientofVariation', 'FlowReceivedRate', 'ResponseTimeTimeStandardDeviation', 'PacketLengthSkewFromMode', 'FlowBytesReceived', 'PacketLengthSkewFromMedian', 'FlowSentRate', 'ResponseTimeTimeVariance']

Experiment 29 has ['Duration', 'ResponseTimeTimeSkewFromMedian', 'ResponseTimeTimeMode', 'ResponseTimeTimeMedian', 'ResponseTimeTimeMean', 'PacketTimeSkewFromMedian', 'PacketTimeMode', 'PacketTimeMedian', 'PacketTimeMean', 'ResponseTimeTimeSkewFromMode', 'Pac

In [8]:
# The metrics we want to collect for this multiclass problem
metrics = ['accuracy', 'recall', 'precision', 'auc']

In [None]:
# Train and evaluate all of the experiments using 10-fold cv
exps.trainCV(metrics=metrics, nfolds=10, reportduring=True)

Running Experiment with top 1 features...done (this took 5266.02 sec)

Experiment with top 1 features
name                                  Accuracy       Recall         Precision      AUC
------------------------------------  -------------  -------------  -------------  -------------
sklearn.random forest                 0.8665±0.0269  0.8665±0.0269  0.8647±0.0344  0.8637±0.0683
sklearn.decision tree                 0.8598±0.0264  0.8598±0.0264  0.8596±0.0344  0.7993±0.0636
sklearn.k neighbors                   0.9021±0.0309  0.9021±0.0309  0.9001±0.0313  0.8711±0.0686
sklearn.discriminant analysis.Linear  0.8069±0.0234  0.8069±0.0234  0.7952±0.0301  0.8463±0.0473
sklearn.naive bayes.Gaussian          0.8059±0.0240  0.8059±0.0240  0.7940±0.0304  0.8463±0.0473
sklearn.adaboost                      0.9004±0.0307  0.9004±0.0307  0.9035±0.0250  0.9006±0.0655
sklearn.gradient boost                0.9071±0.0318  0.9071±0.0318  0.9087±0.0277  0.9118±0.0592
sklearn.logistic regression        