In [11]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

print('Imports complete.')

Imports complete.


In [4]:
# Import the data
# Set up this function to automatically read the data and import it
def get_data(path, layer=0, nans=False):
    """ get_data function
        Description: Reads the two CSV files that make up the requested layer of the
            dataset from the given directory and returns them stacked into a single
            pandas DataFrame.
        Arguments:
            path => string or os.PathLike, directory containing the l1-doh.csv,
                l1-nondoh.csv, l2-benign.csv and l2-malicious.csv files.
            layer => int, which layer of the dataset to load. 1 loads l1-doh.csv and
                l1-nondoh.csv; 2 loads l2-benign.csv and l2-malicious.csv. The default
                of 0 is not a valid layer, so a layer must always be passed explicitly.
            nans => boolean, True keeps rows that contain NaN values; False (the
                default) drops every row that contains at least one NaN.
        Returns:
            df => pandas.DataFrame, rows of the first file followed by rows of the
                second. Each file's own row labels are kept, so index values can repeat.
        Raises:
            AttributeError for incorrect layer number
            Any additional read errors are raised to the user
    """
    import os
    import pandas as pd

    # Guard clause: reject anything other than the two known layers up front.
    if layer not in (1, 2):
        raise AttributeError('Must provide valid layer for dataset: layer equals 1 or 2')

    # Select the pair of files that belongs to the requested layer
    if layer == 1:
        filenames = ('l1-doh.csv', 'l1-nondoh.csv')
    else:
        filenames = ('l2-benign.csv', 'l2-malicious.csv')

    # os.path.join accepts both plain strings and pathlib.Path objects and copes
    # with a trailing separator on `path`.
    frames = [pd.read_csv(os.path.join(path, name)) for name in filenames]
    df = pd.concat(frames)

    # Remove any rows with NaN values unless the caller asked to keep them
    if not nans:
        df = df.dropna(axis='index')

    return df

In [5]:
# Directory that holds the dataset CSV files.
# NOTE(review): absolute, machine-specific location -- adjust before running elsewhere.
path = '/media/notclaytonjohnson/Seagate Portable Drive/Data/doh_dataset/Total-CSVs'

# Load the layer-1 files (DoH vs NonDoH) and preview the first rows
df = get_data(path, layer=1)
df.head()

Unnamed: 0,SourceIP,DestinationIP,SourcePort,DestinationPort,TimeStamp,Duration,FlowBytesSent,FlowSentRate,FlowBytesReceived,FlowReceivedRate,...,PacketTimeCoefficientofVariation,ResponseTimeTimeVariance,ResponseTimeTimeStandardDeviation,ResponseTimeTimeMean,ResponseTimeTimeMedian,ResponseTimeTimeMode,ResponseTimeTimeSkewFromMedian,ResponseTimeTimeSkewFromMode,ResponseTimeTimeCoefficientofVariation,Label
0,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:49:11,95.08155,62311,655.342703,65358,687.388878,...,0.574626,0.001053,0.032457,0.027624,0.026854,0.026822,0.071187,0.024715,1.174948,DoH
1,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:50:52,122.309318,93828,767.136973,101232,827.672018,...,0.509047,0.00117,0.0342,0.024387,0.021043,0.026981,0.293297,-0.075845,1.402382,DoH
2,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:52:55,120.958413,38784,320.639127,38236,316.108645,...,0.732636,0.000785,0.028021,0.029238,0.026921,0.026855,0.248064,0.085061,0.958348,DoH
3,192.168.20.191,176.103.130.131,50749,443,2020-01-14 15:54:56,110.50108,61993,561.017141,69757,631.278898,...,0.646859,0.000411,0.020274,0.019925,0.019268,0.026918,0.097199,-0.344926,1.017535,DoH
4,176.103.130.131,192.168.20.191,443,50749,2020-01-14 15:56:46,54.229891,83641,1542.341289,76804,1416.266907,...,0.507334,0.079079,0.281209,0.02593,4.7e-05,2.1e-05,0.276133,0.092135,10.844829,DoH


In [8]:
# Tally how many rows carry each value of the 'Label' column.
df['Label'].value_counts()

NonDoH    889809
DoH       269299
Name: Label, dtype: int64

In [7]:
# Feature columns ranked by p-value, smallest (best) first and largest (worst)
#  last; the ranking comes from the 'chi2_feature_selection' notebook.
data_cols = [
    'Duration',
    'ResponseTimeTimeSkewFromMedian',
    'ResponseTimeTimeMode',
    'ResponseTimeTimeMedian',
    'ResponseTimeTimeMean',
    'PacketTimeSkewFromMedian',
    'PacketTimeMode',
    'PacketTimeMedian',
    'PacketTimeMean',
    'ResponseTimeTimeSkewFromMode',
]

# Column that holds the class label of each row
target_col = 'Label'

In [None]:
# Code adapted from https://towardsdatascience.com/feature-selection-correlation-and-p-value-da8921bfb3cf
#
# Overlay the per-class distribution of every selected feature, one feature per
# panel. The axes-level `sns.histplot` is used here rather than the figure-level
# `sns.displot`: `displot` opens a brand-new figure on every call and ignores the
# current subplot, which would leave this grid empty.

n_cols = 4
# Ceiling division: just enough rows to fit every feature (at least one row).
n_rows = max(1, -(-len(data_cols) // n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 4 * n_rows), squeeze=False)
panels = axes.ravel()

# Class label -> colour used for that class in every panel
class_colors = {'NonDoH': 'g', 'DoH': 'r'}

for ax, col in zip(panels, data_cols):
    for class_name, color in class_colors.items():
        sns.histplot(
            # Values of `col` for the rows of this class; passed as a plain array
            # so the row labels of `df` play no part in the plot.
            x=df.loc[df[target_col] == class_name, col].to_numpy(),
            # Normalise each class separately so the two shapes can be compared
            # even though the classes have very different sample counts.
            stat='density',
            bins=50,
            color=color,
            alpha=0.5,
            label=class_name,
            ax=ax,
        )
    ax.set_xlabel(col)
    ax.legend(loc='best')

# Hide the panels left over when the feature count does not fill the grid
for ax in panels[len(data_cols):]:
    ax.set_visible(False)

fig.suptitle('DoH vs NonDoH Feature Distributions')
fig.tight_layout()
fig.subplots_adjust(top=0.95)
plt.show()