# Data preparation

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Constructing the network packets dataset

### First set

First, we need to import the first set of before-after packets.

In [2]:
before_firewall_all_1 = pd.read_csv('before_firewall_1_flags.csv')
after_firewall_all_1 = pd.read_csv('after_firewall_1_flags.csv')

The firewall can change the Identification of some packets after filtering.

See the case of ip.src==95.163.255.65 - 3 TCP packets that should have the 'Identification' number of 63179, 63180 and 63181 now have the 'Identification' number of 64462, 37223 and 22683 respectively.

Therefore, 'Identification' cannot be trusted. So we need to drop the column.

In [3]:
before_firewall_all_1

Unnamed: 0,Source,Destination,Protocol,Source Port,Destination Port,Flags,Content Type
0,164.132.230.244,193.231.20.40,TLSv1.2,38394,443,0x018,Application Data
1,82.77.127.28,193.231.20.40,TLSv1.2,55123,443,0x018,Alert
2,82.77.127.28,193.231.20.40,TCP,55123,443,0x011,
3,82.77.127.28,193.231.20.40,TCP,55126,443,0x002,
4,66.249.66.88,193.231.20.40,TCP,44678,80,0x002,
5,82.77.127.28,193.231.20.40,TCP,55126,443,0x010,
6,82.77.127.28,193.231.20.40,TLSv1.2,55126,443,0x018,Handshake
7,66.249.66.86,193.231.20.40,TCP,56993,80,0x002,
8,164.132.230.244,193.231.20.40,TCP,38394,443,0x010,
9,164.132.230.244,193.231.20.40,TCP,38394,443,0x010,


In [4]:
after_firewall_all_1

Unnamed: 0,Source,Destination,Protocol,Source Port,Destination Port,Flags,Content Type
0,52.176.91.137,193.231.20.40,TCP,32848,80,0x010,
1,52.176.91.137,193.231.20.40,TCP,32848,80,0x010,
2,52.176.91.137,193.231.20.40,TCP,32848,80,0x011,
3,52.176.91.137,193.231.20.40,TCP,32986,80,0x002,
4,82.77.127.28,193.231.20.40,TLSv1.2,55120,443,0x018,"Change Cipher Spec,Handshake"
5,52.176.91.137,193.231.20.40,TCP,32986,80,0x010,
6,52.176.91.137,193.231.20.40,TCP,32986,80,0x018,
7,82.77.127.28,193.231.20.40,TCP,55120,443,0x010,
8,82.77.127.28,193.231.20.40,TLSv1.2,55120,443,0x018,Application Data
9,46.229.168.132,193.231.20.40,TCP,5372,80,0x002,


In [5]:
before_firewall_all_1.shape

(315744, 7)

In [6]:
before_dropped = before_firewall_all_1.drop_duplicates()
after_dropped = after_firewall_all_1.drop_duplicates()

In [7]:
after_dropped.shape

(59733, 7)

In [8]:
before_dropped.shape

(60063, 7)

In [9]:
before_dropped.Protocol.unique()

array(['TLSv1.2', 'TCP', 'HTTP', 'TLSv1', 'SSLv2', 'ICMP', 'HTTP/XML'],
      dtype=object)

In [10]:
after_dropped.Protocol.unique()

array(['TCP', 'TLSv1.2', 'HTTP', 'TLSv1', 'SSLv2', 'SSHv2', 'HTTP/XML',
       'ICMP'], dtype=object)

In [26]:
df_merged_on_exist = pd.merge(before_dropped, after_dropped, 
                              on=['Source','Destination','Protocol','Source Port','Destination Port','Flags','Content Type'], 
                              how='left', 
                              indicator='Exist')

In [27]:
df_merged_on_exist

Unnamed: 0,Source,Destination,Protocol,Source Port,Destination Port,Flags,Content Type,Exist
0,164.132.230.244,193.231.20.40,TLSv1.2,38394,443,0x018,Application Data,both
1,82.77.127.28,193.231.20.40,TLSv1.2,55123,443,0x018,Alert,both
2,82.77.127.28,193.231.20.40,TCP,55123,443,0x011,,both
3,82.77.127.28,193.231.20.40,TCP,55126,443,0x002,,both
4,66.249.66.88,193.231.20.40,TCP,44678,80,0x002,,both
5,82.77.127.28,193.231.20.40,TCP,55126,443,0x010,,both
6,82.77.127.28,193.231.20.40,TLSv1.2,55126,443,0x018,Handshake,both
7,66.249.66.86,193.231.20.40,TCP,56993,80,0x002,,both
8,164.132.230.244,193.231.20.40,TCP,38394,443,0x010,,both
9,164.132.230.244,193.231.20.40,TLSv1.2,38394,443,0x018,Alert,both


In [28]:
df_merged_on_exist.Exist.unique()

[both, left_only]
Categories (2, object): [both, left_only]

Only the packets with 'Exist'=='left_only' are the ones dropped.

In [29]:
df_merged_on_exist[df_merged_on_exist["Exist"]=='both']

Unnamed: 0,Source,Destination,Protocol,Source Port,Destination Port,Flags,Content Type,Exist
0,164.132.230.244,193.231.20.40,TLSv1.2,38394,443,0x018,Application Data,both
1,82.77.127.28,193.231.20.40,TLSv1.2,55123,443,0x018,Alert,both
2,82.77.127.28,193.231.20.40,TCP,55123,443,0x011,,both
3,82.77.127.28,193.231.20.40,TCP,55126,443,0x002,,both
4,66.249.66.88,193.231.20.40,TCP,44678,80,0x002,,both
5,82.77.127.28,193.231.20.40,TCP,55126,443,0x010,,both
6,82.77.127.28,193.231.20.40,TLSv1.2,55126,443,0x018,Handshake,both
7,66.249.66.86,193.231.20.40,TCP,56993,80,0x002,,both
8,164.132.230.244,193.231.20.40,TCP,38394,443,0x010,,both
9,164.132.230.244,193.231.20.40,TLSv1.2,38394,443,0x018,Alert,both


In [30]:
rejected_packets_1_no_duplicates = df_merged_on_exist[df_merged_on_exist["Exist"]=='left_only']

In [31]:
rejected_packets_1_no_duplicates

Unnamed: 0,Source,Destination,Protocol,Source Port,Destination Port,Flags,Content Type,Exist
21,104.238.118.103,193.231.20.40,TCP,48030,443,0x011,,left_only
88,95.163.255.67,193.231.20.40,TCP,44497,443,0x011,,left_only
116,192.0.78.33,193.231.20.40,TLSv1.2,443,36216,0x010,Handshake,left_only
125,54.146.176.100,193.231.20.40,TLSv1.2,443,60686,0x010,Handshake,left_only
131,95.163.255.65,193.231.20.40,TCP,59275,443,0x011,,left_only
348,178.138.99.219,193.231.20.40,TLSv1.2,38682,443,0x019,Alert,left_only
412,69.46.36.14,193.231.20.40,TLSv1.2,443,39188,0x010,Handshake,left_only
601,192.0.78.32,193.231.20.40,TLSv1.2,443,47474,0x010,Handshake,left_only
654,69.46.36.14,193.231.20.40,TLSv1.2,443,39196,0x010,Handshake,left_only
808,192.0.78.33,193.231.20.40,TLSv1.2,443,36254,0x010,Handshake,left_only


In [17]:
rejected_packets_1_final = rejected_packets_1_no_duplicates.drop(columns=['Exist'])

In [34]:
rejected_packets_1_final

Unnamed: 0,Source,Destination,Protocol,Source Port,Destination Port,Flags,Content Type
21,104.238.118.103,193.231.20.40,TCP,48030,443,0x011,
88,95.163.255.67,193.231.20.40,TCP,44497,443,0x011,
116,192.0.78.33,193.231.20.40,TLSv1.2,443,36216,0x010,Handshake
125,54.146.176.100,193.231.20.40,TLSv1.2,443,60686,0x010,Handshake
131,95.163.255.65,193.231.20.40,TCP,59275,443,0x011,
348,178.138.99.219,193.231.20.40,TLSv1.2,38682,443,0x019,Alert
412,69.46.36.14,193.231.20.40,TLSv1.2,443,39188,0x010,Handshake
601,192.0.78.32,193.231.20.40,TLSv1.2,443,47474,0x010,Handshake
654,69.46.36.14,193.231.20.40,TLSv1.2,443,39196,0x010,Handshake
808,192.0.78.33,193.231.20.40,TLSv1.2,443,36254,0x010,Handshake


### Second set

In [19]:
before_firewall_all_2 = pd.read_csv('before_firewall_2_flags.csv')
after_firewall_all_2 = pd.read_csv('after_firewall_2_flags.csv')

In [20]:
before_dropped_2 = before_firewall_all_2.drop_duplicates()
after_dropped_2 = after_firewall_all_2.drop_duplicates()

In [21]:
before_dropped_2.Protocol.unique()

array(['TCP', 'TLSv1.2', 'HTTP', 'ICMP', 'DNS', 'NTP', 'TLSv1', 'TLSv1.1',
       'ESP', 'SSLv2', 'HTTP/XML', 'SIP', 'UDP'], dtype=object)

In [22]:
df_merged_on_exist_2 = pd.merge(before_dropped_2, after_dropped_2, 
                              on=['Source','Destination','Protocol','Source Port','Destination Port','Flags','Content Type'], 
                              how='left', 
                              indicator='Exist')

In [23]:
rejected_packets_2_no_duplicates = df_merged_on_exist_2[df_merged_on_exist_2["Exist"]=='left_only']

In [24]:
rejected_packets_2_no_duplicates

Unnamed: 0,Source,Destination,Protocol,Source Port,Destination Port,Flags,Content Type,Exist
4,78.157.210.66,193.231.20.40,TCP,59218.0,80.0,0x011,,left_only
80,69.46.36.14,193.231.20.40,TLSv1.2,443.0,44788.0,0x010,Handshake,left_only
93,188.27.151.47,193.231.20.40,ICMP,,,,,left_only
173,86.124.142.222,193.231.20.40,TCP,23677.0,80.0,0x011,,left_only
201,86.124.142.222,193.231.20.40,TCP,23685.0,80.0,0x011,,left_only
354,54.36.148.207,193.231.20.40,TCP,39046.0,443.0,0x004,,left_only
657,220.181.124.1,193.231.20.40,TCP,42420.0,443.0,0x004,,left_only
716,216.58.208.42,193.231.20.40,TLSv1.2,443.0,40172.0,0x010,Handshake,left_only
726,216.58.207.74,193.231.20.40,TLSv1.2,443.0,56242.0,0x010,Handshake,left_only
731,216.58.207.74,193.231.20.40,TLSv1.2,443.0,56242.0,0x010,Application Data,left_only


In [32]:
rejected_packets_2_final = rejected_packets_2_no_duplicates.drop(columns=['Exist'])

In [33]:
rejected_packets_2_final

Unnamed: 0,Source,Destination,Protocol,Source Port,Destination Port,Flags,Content Type
4,78.157.210.66,193.231.20.40,TCP,59218.0,80.0,0x011,
80,69.46.36.14,193.231.20.40,TLSv1.2,443.0,44788.0,0x010,Handshake
93,188.27.151.47,193.231.20.40,ICMP,,,,
173,86.124.142.222,193.231.20.40,TCP,23677.0,80.0,0x011,
201,86.124.142.222,193.231.20.40,TCP,23685.0,80.0,0x011,
354,54.36.148.207,193.231.20.40,TCP,39046.0,443.0,0x004,
657,220.181.124.1,193.231.20.40,TCP,42420.0,443.0,0x004,
716,216.58.208.42,193.231.20.40,TLSv1.2,443.0,40172.0,0x010,Handshake
726,216.58.207.74,193.231.20.40,TLSv1.2,443.0,56242.0,0x010,Handshake
731,216.58.207.74,193.231.20.40,TLSv1.2,443.0,56242.0,0x010,Application Data


In [37]:
concat = pd.concat([rejected_packets_1_final,rejected_packets_2_final],axis=0)
concat

Unnamed: 0,Source,Destination,Protocol,Source Port,Destination Port,Flags,Content Type
21,104.238.118.103,193.231.20.40,TCP,48030.0,443.0,0x011,
88,95.163.255.67,193.231.20.40,TCP,44497.0,443.0,0x011,
116,192.0.78.33,193.231.20.40,TLSv1.2,443.0,36216.0,0x010,Handshake
125,54.146.176.100,193.231.20.40,TLSv1.2,443.0,60686.0,0x010,Handshake
131,95.163.255.65,193.231.20.40,TCP,59275.0,443.0,0x011,
348,178.138.99.219,193.231.20.40,TLSv1.2,38682.0,443.0,0x019,Alert
412,69.46.36.14,193.231.20.40,TLSv1.2,443.0,39188.0,0x010,Handshake
601,192.0.78.32,193.231.20.40,TLSv1.2,443.0,47474.0,0x010,Handshake
654,69.46.36.14,193.231.20.40,TLSv1.2,443.0,39196.0,0x010,Handshake
808,192.0.78.33,193.231.20.40,TLSv1.2,443.0,36254.0,0x010,Handshake


In [38]:
concat.drop_duplicates()

Unnamed: 0,Source,Destination,Protocol,Source Port,Destination Port,Flags,Content Type
21,104.238.118.103,193.231.20.40,TCP,48030.0,443.0,0x011,
88,95.163.255.67,193.231.20.40,TCP,44497.0,443.0,0x011,
116,192.0.78.33,193.231.20.40,TLSv1.2,443.0,36216.0,0x010,Handshake
125,54.146.176.100,193.231.20.40,TLSv1.2,443.0,60686.0,0x010,Handshake
131,95.163.255.65,193.231.20.40,TCP,59275.0,443.0,0x011,
348,178.138.99.219,193.231.20.40,TLSv1.2,38682.0,443.0,0x019,Alert
412,69.46.36.14,193.231.20.40,TLSv1.2,443.0,39188.0,0x010,Handshake
601,192.0.78.32,193.231.20.40,TLSv1.2,443.0,47474.0,0x010,Handshake
654,69.46.36.14,193.231.20.40,TLSv1.2,443.0,39196.0,0x010,Handshake
808,192.0.78.33,193.231.20.40,TLSv1.2,443.0,36254.0,0x010,Handshake
