## Case Study 5 - Firewall Data
##### Jason McDonald

This case study uses a dataset that consists of firewall traffic moving across a network.  We're to create a ml model that can auto classify based on the Action, and take such action in response (API).

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import time as time

#sklearn and intel acceleration library
from sklearnex import patch_sklearn
patch_sklearn()
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


##### Data
The data must be imported, followed by checking basic metrics of the columns.

In [2]:
df = pd.read_csv('log2.csv')
df.describe()

Unnamed: 0,Source Port,Destination Port,NAT Source Port,NAT Destination Port,Bytes,Bytes Sent,Bytes Received,Packets,Elapsed Time (sec),pkts_sent,pkts_received
count,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0
mean,49391.969343,10577.385812,19282.972761,2671.04993,97123.95,22385.8,74738.15,102.866,65.833577,41.39953,61.466505
std,15255.712537,18466.027039,21970.689669,9739.162278,5618439.0,3828139.0,2463208.0,5133.002,302.461762,3218.871288,2223.332271
min,0.0,0.0,0.0,0.0,60.0,60.0,0.0,1.0,0.0,1.0,0.0
25%,49183.0,80.0,0.0,0.0,66.0,66.0,0.0,1.0,0.0,1.0,0.0
50%,53776.5,445.0,8820.5,53.0,168.0,90.0,79.0,2.0,15.0,1.0,1.0
75%,58638.0,15000.0,38366.25,443.0,752.25,210.0,449.0,6.0,30.0,3.0,2.0
max,65534.0,65535.0,65535.0,65535.0,1269359000.0,948477200.0,320881800.0,1036116.0,10824.0,747520.0,327208.0


##### Missing Data Check

In [3]:
missing_values = df.isnull().sum()
print(missing_values)

Source Port             0
Destination Port        0
NAT Source Port         0
NAT Destination Port    0
Action                  0
Bytes                   0
Bytes Sent              0
Bytes Received          0
Packets                 0
Elapsed Time (sec)      0
pkts_sent               0
pkts_received           0
dtype: int64


##### Response Variable

In [4]:
action_counts = df['Action'].value_counts()
color_list = ['#92B2F7', '#FA7D7A', '#F5B790', '#F7CE6F']
fig = go.Figure()
#create a horizontal bar for each class in the firewall data.
for idx, (action, count) in enumerate(action_counts.items()):
    fig.add_trace(
        go.Bar(
            y=[action],
            x=[count],
            name=action,
            orientation='h',
            text=[count],
            textposition='auto',
            marker_color=color_list[idx % len(color_list)]
        )
    )

fig.update_layout(
    title_text='Frequency of Each Class in Action Taken',
    title_x=0.5, 
    xaxis_title='Frequency',
    yaxis_title='',
    annotations=[
        dict(
            x=-.35,
            y=0.5,
            showarrow=False,
            text="Action<br>Taken",
            textangle=0,
            xref='paper',
            yref='paper',
            font=dict(
                size=14,
            ),
        )
    ],
    yaxis={'categoryorder': 'total ascending'},
    autosize=False,
    width=500,
    height=500,
    margin=dict(l=50, r=50, b=100, t=100, pad=4),
    plot_bgcolor='white', 
    xaxis=dict(gridcolor='lightgrey')
)

fig.show()

Due to the extreme lack of reset-both class and that the effective result is expected to be the same as drop, the reset-both class will be replaced with drop.

In [5]:
#keep an original copy just in case we need to use it later
df_original = df.copy()
#replace the reset-both class with drop due to such low occurrences (54)
df['Action'] = df['Action'].replace('reset-both', 'drop')

### Features

##### Numerical Features

In [6]:
#create a function that can generate a histogram of each numerical column
def generate_violin_plots(df, columns):
    fig = make_subplots(rows=2, cols=4)
    for i, column in enumerate(columns):
        violin = go.Violin(
            y=df[column],
            name=column,
            box_visible=True,
            meanline_visible=True,
            points='outliers'
        )
        fig.add_trace(violin, row=i//4+1, col=i%4+1)
        #overlay a scatterplot of outliers
        Q90 = df[column].quantile(0.90)
        IQR = Q90
        outliers = df[(df[column] > (Q90 + 1.5 * IQR))][column]
        scatter = go.Scatter(
            x=[column]*len(outliers),
            y=outliers,
            mode='markers',
            name='outliers',
            marker=dict(
                color='#9467bd',
                size=3,
                line=dict(
                    color='#9467bd',
                    width=2
                )
            ),
            showlegend=False
        )

        fig.add_trace(scatter, row=i//4+1, col=i%4+1)

    fig.update_layout(
        title_text='Distribution of Numercial Features in the Firewall Activity Dataset<br>(Outliers Highlighted Over Each Individual Plot)',
        title_x=0.5, 
        height=600,
        width=900,
        legend=dict(
            x=0.8,  #legend x location
            y=0.02,  #legend y location
            traceorder="normal",
            font=dict(
                family="sans-serif",
                size=12,
                color="black"
            ),
            bgcolor="white",
            bordercolor="Black",
            borderwidth=1 
        ),
        plot_bgcolor='white',
        annotations=[
            dict(
                x=-0.1, 
                y=0.01, 
                text="Log(10) of # Occurrences",
                showarrow=False,
                textangle=-90,
                xref="paper",
                yref="paper",
                font=dict(size=14)
            ),
            dict(
                x=-0.1, 
                y=.99, 
                text="Log(10) of # Occurrences",
                showarrow=False,
                textangle=-90,
                xref="paper",
                yref="paper",
                font=dict(size=14)
            )
        ]
    )
    #change the background and color of the gridlines
    for i in range(1, 3):  #rows
        for j in range(1, 5):  #columns
            fig.update_yaxes(showline=True, linewidth=2, linecolor='lightgrey', gridcolor='lightgrey', row=i, col=j, type='log')
            fig.update_xaxes(showline=True, linewidth=2, linecolor='lightgrey', gridcolor='lightgrey', row=i, col=j)

    fig.show()

In [7]:
numerical_columns = ['Bytes', 'Bytes Sent', 'Bytes Received', 'Packets', 'Elapsed Time (sec)', 'pkts_sent', 'pkts_received']
generate_violin_plots(df,  numerical_columns)

##### Categorical Features

In [8]:
cat_columns = ['Source Port', 'Destination Port', 'NAT Source Port', 'NAT Destination Port']
#convert the category columns to type category in the df
for column in cat_columns:
    df[column] = df[column].astype(str).astype('category')

In [9]:
#Function to make a subplot containing the category plots of the top 8 values in each
def plot_top_categories(df, columns):
    fig = make_subplots(rows=2, cols=2, subplot_titles=columns)

    for i, column in enumerate(columns):
        #count each category
        category_count = df[column].value_counts().nlargest(8)

        #make the charts for each subplot
        bar = go.Bar(x=category_count.index, y=category_count.values, 
                     text=category_count.values,
                     textposition='auto',
                     name=column,
                     marker_color='#92B2F7') 

        fig.add_trace(bar, row=i//2+1, col=i%2+1)

    fig.update_layout(
        title_text='Categorical Features: Top 8 Ports<br>Shown on Log(10) Scale',
        title_x=0.5, 
        height=600,
        width=800,
        paper_bgcolor='white',
        plot_bgcolor='white',
        showlegend=False,
        annotations=[
            dict(
                x=-0.1, 
                y=0.01, 
                text="Log(10) of # Occurrences",
                showarrow=False,
                textangle=-90,
                xref="paper",
                yref="paper",
                font=dict(size=14)
            ),
            dict(
                x=-0.1, 
                y=.60, 
                text="Log(10) of # Occurrences",
                showarrow=False,
                textangle=-90,
                xref="paper",
                yref="paper",
                font=dict(size=14)
            )
        ]
    )

    for i in range(1, 3):  #rows
        for j in range(1, 3):  #columns
            fig.update_yaxes(type="log", showline=False, linewidth=1, linecolor='lightgrey', gridcolor='lightgrey', row=i, col=j)
            fig.update_xaxes(title_text=columns[(i-1)*2+j-1], row=i, col=j)

    fig.show()

In [10]:
plot_top_categories(df, cat_columns)

##### Categorical Features per Action - Top 8

In [11]:
def plot_category_action_heatmaps(df, columns):
    matrix_all = pd.DataFrame()

    for i, column in enumerate(columns):
        # Create count matrix
        matrix = df.groupby([column, 'Action']).size().unstack(fill_value=0)

        # Get the top 8 categories based on the sum of the rows
        top_categories = matrix.sum(axis=1).nlargest(8).index.tolist()

        # Filter matrix to only include top 8 categories
        matrix = matrix.loc[top_categories]

        # Add 'Action' column to matrix
        matrix['facet'] = column

        # Append the filtered matrix to matrix_all
        matrix_all = pd.concat([matrix_all, matrix])

    matrix_all['Port'] = matrix_all.index
    matrix_all = matrix_all.reset_index()
    matrix_all.index.name = None
    matrix_all = matrix_all.drop('index', axis=1)
    facet_array = matrix_all.to_numpy()


    label_to_int = {'Source Port': 0,'Destination Port': 1, 'NAT Source Port': 2, 'NAT Destination Port':3}
    labels = [0,1,2,3]


    #facet_array = array[:,:4]
    #port = array[:,4]

    #port = port.reshape(-1,1)

    for i in range(len(facet_array)):
        facet_array[i, 3] = label_to_int[facet_array[i, 3]]



    arr_by_label = []
    ports = []
    for value in labels:
        # get rows where the fourth column is equal to the current unique value
        mask = facet_array[:, 3] == value

        # select those rows and exclude the fourth column
        sub_array = facet_array[mask, :3]
        sub_port_array = facet_array[mask, 4]
        
        # add this array to the list
        arr_by_label.append(sub_array)
        ports.append(sub_port_array)

    # convert the list to a numpy array for convenience
    arr_by_label = np.array(arr_by_label)
    ports = np.array(ports)



    fig = px.imshow(arr_by_label, 
                     facet_col=0,
                     color_continuous_scale='Blues',
                     #aspect='auto',
                     facet_col_spacing=.03,
                     text_auto='.0f',
                 )

    fig.update_yaxes(matches=None)
    fig.for_each_yaxis(lambda yaxis: yaxis.update(showticklabels=True))
    original_y_vals = list(range(8))
    # corresponding new y axis labels from the 'port' array

    # update y-axis for each subplot
    for i in range(1, 5):  # adjust as needed for your number of facets
        new_y_labels = [ports[i-1, j] for j in original_y_vals]
        fig.layout[f'yaxis{i}'].update(tickvals=original_y_vals, ticktext=new_y_labels,tickfont_family="Arial Black")
        fig.layout[f'xaxis{i}'].update(tickvals=[0,1,2], ticktext=['Allow', 'Deny', 'Drop'],tickfont_family="Arial Black")
    #fig.for_each_annotation(lambda a: a.update(text=(key for key, value in label_to_int.items() if str(value) == a.text.split("=")[-1])))
    
    #fig.for_each_annotation(lambda a: a.update(text="<b>" + next((key for key, value in label_to_int.items() if str(value) == a.text.split("=")[-1]), None) + "</b>"))
    #label_to_int = {'Source Port': 0,'Destination Port': 1, 'NAT Source Port': 2, 'NAT Destination Port':3}
    fig.update_layout(
        title='Heatmap of Action by Top 8 Ports, by Type of Activity',
        title_x=0.5,
        height=500,
        width=1000,
        paper_bgcolor='white',
        plot_bgcolor='white',
        annotations=[
            dict(
                x=-0.05, 
                y=0.4, 
                text="Network Port",
                showarrow=False,
                textangle=-90,
                xref="paper",
                yref="paper",
                font=dict(size=16)
            ),
            dict(
                x=0.47, 
                y=-0.20, 
                text="Cybersecurity Action",
                showarrow=False,
                textangle=0,
                xref="paper",
                yref="paper",
                font=dict(size=16)
            ),
            dict(
                x=0.11,
                y=1,
                text="Source Port",
                showarrow=False,
                textangle=0,
                xref="paper",
                yref="paper",
                font=dict(size=14)
            ),
            dict(
                x=0.37,
                y=1,
                text="Destination Port",
                showarrow=False,
                textangle=0,
                xref="paper",
                yref="paper",
                font=dict(size=14)
            )
        ]
        
    )
    fig.add_annotation(
                x=0.7,
                y=1.06,
                text="NAT Source Port",
                showarrow=False,
                textangle=0,
                xref="paper",
                yref="paper",
                font=dict(size=14)
            )
    fig.add_annotation(
                x=0.95,
                y=1.06,
                text="NAT Destination Port",
                showarrow=False,
                textangle=0,
                xref="paper",
                yref="paper",
                font=dict(size=14)
                )
    fig.update_coloraxes(showscale=False)
    fig.show()






In [12]:
plot_category_action_heatmaps(df, cat_columns)

### Building SKLearn Pipelines to Standardize the Data and Fit a Model

In [13]:
#split into X for all features, and y for the classes
X = df.drop('Action', axis=1).values
y = df['Action'].values

In [14]:
#Setup a training and test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=123)
results = []
X_timer = np.array([X_test[0]])

For timing each model per prediction, the X_timer contains a single row from the test array to predict.  This will be run through a timing function to determine the amount of time in nano seconds for a single prediction to be made.

##### SVC Linear Kernel

In [16]:
svc_lin_pl = make_pipeline(StandardScaler(), SVC(kernel='linear', class_weight='balanced'))
svc_lin_pl.fit(X_train, y_train)

In [17]:
acc = svc_lin_pl.score(X_test, y_test)
start_time = time.perf_counter_ns()
svc_lin_pl.predict(X_timer)
end_time = time.perf_counter_ns()
execution_time = end_time - start_time
results.append({'SVC - Linear Kernel': [acc, execution_time]})

##### SVC Poly Kernel

In [20]:
svc_poly_pl = make_pipeline(StandardScaler(), SVC(kernel='poly', class_weight='balanced', coef0= 10.0))
svc_poly_pl.fit(X_train, y_train)

In [21]:
acc = svc_poly_pl.score(X_test, y_test)
start_time = time.perf_counter_ns()
svc_poly_pl.predict(X_timer)
end_time = time.perf_counter_ns()
execution_time = end_time - start_time
results.append({'SVC - Poly Kernel': [acc, execution_time]})

##### SVC RBF Kernel

In [22]:
svc_rbf_pl = make_pipeline(StandardScaler(), SVC(kernel='rbf', class_weight='balanced'))
svc_rbf_pl.fit(X_train, y_train)

In [23]:
acc = svc_rbf_pl.score(X_test, y_test)
start_time = time.perf_counter_ns()
svc_rbf_pl.predict(X_timer)
end_time = time.perf_counter_ns()
execution_time = end_time - start_time
results.append({'SVC - RBF Kernel': [acc, execution_time]})

##### SGDClassifier with Hinge Loss Function

In [24]:
sgd_hinge_pl = make_pipeline(StandardScaler(), SGDClassifier(loss='hinge', penalty='elasticnet', alpha=.000001, l1_ratio=.90, class_weight='balanced'))
sgd_hinge_pl.fit(X_train, y_train)

In [25]:
acc = sgd_hinge_pl.score(X_test, y_test)
start_time = time.perf_counter_ns()
sgd_hinge_pl.predict(X_timer)
end_time = time.perf_counter_ns()
execution_time = end_time - start_time
results.append({'SGD - Hinge Loss': [acc, execution_time]})

##### SGD Classifier with Log Loss

In [26]:
sgd_log_pl = make_pipeline(StandardScaler(), SGDClassifier(loss='log_loss', penalty='elasticnet', alpha=.00000001, l1_ratio=.5, class_weight='balanced', learning_rate='optimal'))
sgd_log_pl.fit(X_train, y_train)

In [27]:
acc = sgd_log_pl.score(X_test, y_test)
start_time = time.perf_counter_ns()
sgd_log_pl.predict(X_timer)
end_time = time.perf_counter_ns()
execution_time = end_time - start_time
results.append({'SGD - Log Loss': [acc, execution_time]})

##### SGD Classifier with Squared Hinge

In [28]:
sgd_h2_pl = make_pipeline(StandardScaler(), SGDClassifier(loss='squared_hinge', penalty='elasticnet', l1_ratio=.5, class_weight='balanced', learning_rate='optimal'))
sgd_h2_pl.fit(X_train, y_train)


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.



In [29]:
acc = sgd_h2_pl.score(X_test, y_test)
start_time = time.perf_counter_ns()
sgd_h2_pl.predict(X_timer)
end_time = time.perf_counter_ns()
execution_time = end_time - start_time
results.append({'SGD - Squared Hinge': [acc, execution_time]})

##### SGDClassifier with Modified Huber Loss

In [30]:
sgd_mh_pl = make_pipeline(StandardScaler(), SGDClassifier(loss='modified_huber', penalty='elasticnet', alpha=.000001, l1_ratio=.5, class_weight='balanced'))
sgd_mh_pl.fit(X_train, y_train)

In [31]:
acc = sgd_mh_pl.score(X_test, y_test)
start_time = time.perf_counter_ns()
sgd_mh_pl.predict(X_timer)
end_time = time.perf_counter_ns()
execution_time = end_time - start_time
results.append({'SGD - Modified Huber': [acc, execution_time]})

# Output the final results

In [34]:
#Output the results table
model_names = [list(res.keys())[0] for res in results]
accuracies = [list(res.values())[0][0] for res in results]  # get the first element of the list
times = [list(res.values())[0][1] for res in results]  # get the second element of the list

#Make it into a DataFrame
df = pd.DataFrame(list(zip(model_names, accuracies, times)), columns=['Model Type', 'Accuracy', 'Time (ns)'])

#print the DataFrame to console in pretty format
print(df.to_string(index=False))

          Model Type  Accuracy  Time (ns)
 SVC - Linear Kernel  0.988301     944200
   SVC - Poly Kernel  0.989217    1313500
    SVC - RBF Kernel  0.984944     466500
    SGD - Hinge Loss  0.985351     245700
      SGD - Log Loss  0.991963     194400
 SGD - Squared Hinge  0.859207     198900
SGD - Modified Huber  0.976399     207300
