In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the firewall log from a CSV file in the working directory.
df = pd.read_csv("log2.csv")
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
df.info()

In [None]:


# Tally the log entries per firewall action.
# Counts are looked up by label instead of being unpacked positionally:
# value_counts() orders its result by descending frequency, so a positional
# unpack mislabels the counts as soon as the classes rank differently, and
# fails outright when the number of classes is not exactly four.
# NOTE(review): the label spellings are assumed from the printed captions --
# confirm against df['Action'].unique(). A missing label raises KeyError
# rather than silently reporting a wrong number.
action_counts = df['Action'].value_counts()
allow = action_counts['allow']
deny = action_counts['deny']
drop = action_counts['drop']
reset_both = action_counts['reset-both']

print('Number of allowed logs: ', allow)
print('Number of denied logs: ', deny)
print('Number of dropped logs: ', drop)
print('Number of reset-both logs: ', reset_both)

print('\n')
# Share of each action as a percentage of all log entries, to one decimal.
total_logs = len(df)
print('% of allowed logs', round(allow / total_logs * 100, 1), '%')
print('% of denied logs', round(deny / total_logs * 100, 1), '%')
print('% of dropped logs', round(drop / total_logs * 100, 1), '%')
print('% of reset-both logs', round(reset_both / total_logs * 100, 1), '%')

In [None]:
# Horizontal bar chart of the number of log entries per action.
df['Action'].value_counts().plot.barh()

The classes to predict are heavily imbalanced.

In [None]:
# Missing-value check: number of null entries in each column.
df.isna().sum()
# Nothing is missing, so no imputation is needed.

In [None]:
# Summary statistics of every column, computed separately for each action.
grouped_by_action = df.groupby(by=['Action'])
description = grouped_by_action.describe()

In [None]:
# Absolute pairwise correlations between the numeric columns, as a heatmap.
# The numeric columns are selected explicitly: since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# when it meets one (presumably the case for the 'Action' label column --
# confirm with df.dtypes). On an all-numeric frame this selection is a no-op.
c = df.select_dtypes(include='number').corr().abs()
sns.heatmap(c, cmap=sns.color_palette("Blues", as_cmap=True))

'Bytes Sent', 'Bytes Received', 'pkts_sent' and 'pkts_received' can be discarded, since 'Bytes' and 'Packets' are the totals of the respective pairs.

I am also going to discard 'Packets' in favour of 'Bytes', as the two are highly correlated. I'll keep 'Bytes' since it is more detailed than 'Packets' (one packet consists of multiple bytes).

None of the port variables should be treated as continuous, but their ranges are probably still interesting to look at.

In [None]:
# Distribution of source ports, coloured by action.
sns.displot(data=df, x="Source Port", hue="Action")
plt.show()

In [None]:
# Per-action summary statistics of the 'Source Port' column.
description['Source Port']

All drop actions seem to occur on high source ports (minimum 49156). The minimum source port for reset-both actions is 1024.

In [None]:
# Distribution of destination ports, coloured by action.
sns.displot(data=df, x="Destination Port", hue="Action")
plt.show()

In [None]:
# Per-action summary statistics of the 'Destination Port' column.
description['Destination Port']

Most actions seem to have a very low destination port. All drop actions occur on destination port 445.

In [None]:
# Distribution of NAT source ports, coloured by action.
sns.displot(data=df, x="NAT Source Port", hue="Action")
plt.show()

In [None]:
# Per-action summary statistics of the 'NAT Source Port' column.
description['NAT Source Port']

Allowed actions seem to be uniformly distributed over NAT Source Ports. All dropped NAT Source Ports are equal to 0. Most deny and reset-both actions have NAT Source Ports of 0.

In [None]:
# Distribution of NAT destination ports, coloured by action.
sns.displot(data=df, x="NAT Destination Port", hue="Action")
plt.show()

In [None]:
# Per-action summary statistics of the 'NAT Destination Port' column.
description['NAT Destination Port']

Allowed actions seem to be uniformly distributed over NAT Destination Ports. All dropped NAT Destination Ports are equal to 0. Most deny and reset-both actions have NAT Destination Ports of 0.

In [None]:
# Selected feature columns, mapped from their original headers to
# snake_case names. The mapping's insertion order fixes the column order.
feature_names = {
    'Source Port': 'source_port',
    'Destination Port': 'destination_port',
    'NAT Source Port': 'nat_source_port',
    'NAT Destination Port': 'nat_destination_port',
    'Bytes': 'bytes',
    'Elapsed Time (sec)': 'elapsed_time',
}
cols = list(feature_names)
df_features = df[cols].rename(columns=feature_names)

In [None]:
# Absolute pairwise correlations between the selected features, as a heatmap.
blues = sns.color_palette("Blues", as_cmap=True)
c = df_features.corr().abs()
sns.heatmap(c, cmap=blues)

In [None]:
# Materialise the feature matrix and the target labels as NumPy arrays.
X = np.array(df_features)
y = np.array(df['Action'])
# Show the feature matrix dimensions as (rows, columns).
X.shape