# Data preparation

In [35]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

## Importing the network packets exported from Wireshark

First, we need to import all the sets of before-after packets.

In [2]:
before_firewall_all_1 = pd.read_csv('Data/Set 1/before_firewall_1_flags.csv')
after_firewall_all_1 = pd.read_csv('Data/Set 1/after_firewall_1_flags.csv')

In [3]:
before_firewall_all_2 = pd.read_csv('Data/Set 2/before_firewall_2_flags.csv')
after_firewall_all_2 = pd.read_csv('Data/Set 2/after_firewall_2_flags.csv')

In [4]:
before_firewall_all_3 = pd.read_csv('Data/Set 3/before_firewall_3_flags.csv')
after_firewall_all_3 = pd.read_csv('Data/Set 3/after_firewall_3_flags.csv')

In [5]:
before_firewall_all_4 = pd.read_csv('Data/Set 4/before_firewall_4_flags.csv')
after_firewall_all_4 = pd.read_csv('Data/Set 4/after_firewall_4_flags.csv')

In [6]:
before_firewall_all_5 = pd.read_csv('Data/Set 5/before_firewall_5_flags.csv')
after_firewall_all_5 = pd.read_csv('Data/Set 5/after_firewall_5_flags.csv')

In [7]:
before_firewall_all_6 = pd.read_csv('Data/Set 6/before_firewall_6_flags.csv')
after_firewall_all_6 = pd.read_csv('Data/Set 6/after_firewall_6_flags.csv')

In [8]:
before_firewall_all_7 = pd.read_csv('Data/Set 7/before_firewall_7_flags.csv')
after_firewall_all_7 = pd.read_csv('Data/Set 7/after_firewall_7_flags.csv')

In [9]:
before_firewall_all_8 = pd.read_csv('Data/Set 8/before_firewall_8_flags.csv')
after_firewall_all_8 = pd.read_csv('Data/Set 8/after_firewall_8_flags.csv')

## Automatization process...

In [10]:
def separate_packets(before_firewall, after_firewall):
    """Separates the rejected/dropped packets from the accepted ones"""
    
    # Drop duplicates from both dataframes
    before_firewall_no_duplicates = before_firewall.drop_duplicates()
    after_firewall_no_duplicates = after_firewall.drop_duplicates()
    
    # Left join to select only dropped/rejected packets
    df_left_join = pd.merge(before_firewall_no_duplicates, after_firewall_no_duplicates, 
                              on=['Source','Destination','Protocol','Source Port','Destination Port','Flags','Content Type'], 
                              how='left', 
                              indicator='Exist')
    
    # Separate the packets
    dropped_packets = df_left_join[df_left_join["Exist"]=="left_only"]
    accepted_packets = df_left_join[df_left_join["Exist"]=="both"]
    
    # Drop "Exist" column
    dropped_packets = dropped_packets.drop(columns=["Exist"])
    accepted_packets = accepted_packets.drop(columns=["Exist"])
    
    # Add label
    dropped_packets["Accepted"] = 0
    accepted_packets["Accepted"] = 1
    
    return dropped_packets, accepted_packets

In [11]:
after_firewall_all_8

Unnamed: 0,Source,Destination,Protocol,Source Port,Destination Port,Flags,Content Type
0,148.70.231.231,193.231.20.40,TCP,49453.0,80.0,0x010,
1,148.70.231.231,193.231.20.40,HTTP/XML,49453.0,80.0,0x018,
2,207.46.13.4,193.231.20.40,TCP,5020.0,80.0,0x0c2,
3,207.46.13.4,193.231.20.40,TCP,5020.0,80.0,0x010,
4,207.46.13.4,193.231.20.40,HTTP,5020.0,80.0,0x018,
5,148.70.231.231,193.231.20.40,TCP,49453.0,80.0,0x010,
6,148.70.231.231,193.231.20.40,TCP,49453.0,80.0,0x011,
7,148.70.231.231,193.231.20.40,TCP,49471.0,80.0,0x002,
8,207.46.13.4,193.231.20.40,TCP,5020.0,80.0,0x010,
9,207.46.13.4,193.231.20.40,TCP,5020.0,80.0,0x011,


In [12]:
df_packets = pd.DataFrame()

before_firewall_array = [before_firewall_all_1, before_firewall_all_2, before_firewall_all_3, 
                         before_firewall_all_4, before_firewall_all_5,before_firewall_all_6,
                        before_firewall_all_7,before_firewall_all_8]

after_firewall_array = [after_firewall_all_1, after_firewall_all_2,after_firewall_all_3,
                        after_firewall_all_4,after_firewall_all_5,after_firewall_all_6,
                       after_firewall_all_7,after_firewall_all_8]

In [13]:
def prepare_dataset(before_firewall_list, after_firewall_list):
    """Prepares the full dataset containing all the dropped and accepted packets with their labels"""
    df_rejected = pd.DataFrame()
    df_accepted = pd.DataFrame()

    for i in range(0, len(before_firewall_list)):
        dropped_pkts, accepted_pkts = separate_packets(before_firewall_list[i], after_firewall_list[i])
        df_rejected = pd.concat([df_rejected, dropped_pkts],axis=0).drop_duplicates()
        df_accepted = pd.concat([df_accepted, accepted_pkts], axis=0).drop_duplicates()
        
    return df_rejected, df_accepted

In [14]:
df_rejected, df_accepted = prepare_dataset(before_firewall_array,after_firewall_array)

In [15]:
df_rejected

Unnamed: 0,Source,Destination,Protocol,Source Port,Destination Port,Flags,Content Type,Accepted
21,104.238.118.103,193.231.20.40,TCP,48030.0,443.0,0x011,,0
89,95.163.255.67,193.231.20.40,TCP,44497.0,443.0,0x011,,0
117,192.0.78.33,193.231.20.40,TLSv1.2,443.0,36216.0,0x010,Handshake,0
126,54.146.176.100,193.231.20.40,TLSv1.2,443.0,60686.0,0x010,Handshake,0
132,95.163.255.65,193.231.20.40,TCP,59275.0,443.0,0x011,,0
349,178.138.99.219,193.231.20.40,TLSv1.2,38682.0,443.0,0x019,Alert,0
413,69.46.36.14,193.231.20.40,TLSv1.2,443.0,39188.0,0x010,Handshake,0
602,192.0.78.32,193.231.20.40,TLSv1.2,443.0,47474.0,0x010,Handshake,0
655,69.46.36.14,193.231.20.40,TLSv1.2,443.0,39196.0,0x010,Handshake,0
809,192.0.78.33,193.231.20.40,TLSv1.2,443.0,36254.0,0x010,Handshake,0


In [16]:
df_accepted

Unnamed: 0,Source,Destination,Protocol,Source Port,Destination Port,Flags,Content Type,Accepted
0,164.132.230.244,193.231.20.40,TLSv1.2,38394.0,443.0,0x018,Application Data,1
1,82.77.127.28,193.231.20.40,TLSv1.2,55123.0,443.0,0x018,Alert,1
2,82.77.127.28,193.231.20.40,TCP,55123.0,443.0,0x011,,1
3,82.77.127.28,193.231.20.40,TCP,55126.0,443.0,0x002,,1
4,66.249.66.88,193.231.20.40,TCP,44678.0,80.0,0x002,,1
5,82.77.127.28,193.231.20.40,TCP,55126.0,443.0,0x010,,1
6,82.77.127.28,193.231.20.40,TLSv1.2,55126.0,443.0,0x018,Handshake,1
7,66.249.66.86,193.231.20.40,TCP,56993.0,80.0,0x002,,1
8,164.132.230.244,193.231.20.40,TCP,38394.0,443.0,0x010,,1
9,164.132.230.244,193.231.20.40,TLSv1.2,38394.0,443.0,0x018,Alert,1


We are doing another left merge between the rejected packets and the accepted packets to see if any duplcates slipped into both dataframes unnoticed.

In [17]:
df_left_join_testing = pd.merge(df_rejected, df_accepted, 
                              on=['Source','Destination','Protocol','Source Port','Destination Port','Flags','Content Type','Accepted'], 
                              how='left', 
                              indicator='IsDuplicated')

In [18]:
df_left_join_testing[df_left_join_testing['IsDuplicated']=="both"]

Unnamed: 0,Source,Destination,Protocol,Source Port,Destination Port,Flags,Content Type,Accepted,IsDuplicated


In [20]:
dataset = pd.concat([df_rejected, df_accepted], axis=0)

In [21]:
dataset

Unnamed: 0,Source,Destination,Protocol,Source Port,Destination Port,Flags,Content Type,Accepted
21,104.238.118.103,193.231.20.40,TCP,48030.0,443.0,0x011,,0
89,95.163.255.67,193.231.20.40,TCP,44497.0,443.0,0x011,,0
117,192.0.78.33,193.231.20.40,TLSv1.2,443.0,36216.0,0x010,Handshake,0
126,54.146.176.100,193.231.20.40,TLSv1.2,443.0,60686.0,0x010,Handshake,0
132,95.163.255.65,193.231.20.40,TCP,59275.0,443.0,0x011,,0
349,178.138.99.219,193.231.20.40,TLSv1.2,38682.0,443.0,0x019,Alert,0
413,69.46.36.14,193.231.20.40,TLSv1.2,443.0,39188.0,0x010,Handshake,0
602,192.0.78.32,193.231.20.40,TLSv1.2,443.0,47474.0,0x010,Handshake,0
655,69.46.36.14,193.231.20.40,TLSv1.2,443.0,39196.0,0x010,Handshake,0
809,192.0.78.33,193.231.20.40,TLSv1.2,443.0,36254.0,0x010,Handshake,0


In [40]:
ds_final = shuffle(dataset)

In [41]:
ds_final

Unnamed: 0,Source,Destination,Protocol,Source Port,Destination Port,Flags,Content Type,Accepted
25666,104.31.70.141,193.231.20.40,HTTP,80.0,35764.0,0x011,,1
54806,174.138.30.236,193.231.20.40,TCP,60484.0,80.0,0x011,,1
44038,78.96.80.4,193.231.20.40,HTTP,57852.0,80.0,0x018,,1
9283,207.46.13.129,193.231.20.40,TCP,14217.0,80.0,0x0c2,,1
29964,27.19.170.131,193.231.20.40,TCP,60171.0,80.0,0x011,,1
33342,94.176.146.172,193.231.20.40,TLSv1.2,50335.0,443.0,0x018,Application Data,1
44571,213.233.84.17,193.231.20.40,TCP,13159.0,443.0,0x010,,1
62962,81.89.14.113,193.231.20.40,TCP,47880.0,80.0,0x011,,1
1451,86.34.180.6,193.231.20.40,TCP,58681.0,80.0,0x010,,1
19334,213.233.84.61,193.231.20.40,TLSv1.2,3130.0,443.0,0x018,"Change Cipher Spec,Handshake",0


In [45]:
ds_final.to_csv("initial_packets_dataset.csv")