# Feature Extraction More Math 4100 Project 
Main difference: 

### Data Features  
Features extracted and used for analysis/classification are below:
* Total Number of Packets 
* Total Number of Bytes
* Average inter-arrival time of packets
* Average Byte-Rate of Connection (bytes per second)
* Maximum packet size
* Minimum packet size
* Longest time between packet arrivals
* Shortest time between packet arrivals
* Direction of travel (Outgoing: 1, Incoming: 0)
* Number of [ARP](https://erg.abdn.ac.uk/users/gorry/course/inet-pages/arp.html) packets 
* Number of [DNS](https://www.networkworld.com/article/3268449/what-is-dns-and-how-does-it-work.html) packets
* Number of [TCP ACKs](http://packetlife.net/blog/2010/jun/7/understanding-tcp-sequence-acknowledgment-numbers/)
* Maximum advertised [receive window](http://packetbomb.com/understanding-throughput-and-tcp-windows/)
* Minimum advertised [receive window](http://packetbomb.com/understanding-throughput-and-tcp-windows/)  

All are taken directly from [this paper](https://www.flux.utah.edu/paper/283) (Baker et al.).  

Below is code that featurizes my data, returning the feature vectors for each file as a list of np arrays.  
These arrays are put into a dictionary of the form `data_set name : list of feature vectors` and saved as `.npy` files in the data directory.  

In [2]:
import csv
import datetime
import ipaddress
import math

import netaddr
import numpy as np

In [3]:
#
# Given a CSV file of packet captures and the IP address of the capturing host, 
# extracts feature vectors for all packets in the file. 
# Separates packets into their connections and extracts a feature for every 2 seconds of a connection.
#
# Feature vectors are of length 14 
#
def _new_window(base_time, arrival_time):
    """Return a fresh statistics record for one 2-second window of a flow."""
    return {'baseTime': base_time, 'lastArrival': arrival_time,
            'endTime': base_time + 2.0, 'longestInterArrival': 0,
            'shortestInterArrival': 2, 'packets': 0, 'bytes': 0,
            'largest_packet': 0, 'smallest_packet': 999999999999,
            'direction': 0, 'arp_count': 0, 'dns_count': 0, 'ack_count': 0,
            'min_cong_win': 999999999999, 'max_cong_win': 0}


def _window_to_vector(conn_data):
    """Convert a finished window record into its 14-element feature vector."""
    packets = conn_data['packets']
    window_length = conn_data['endTime'] - conn_data['baseTime']
    inter_arrival = (window_length / packets) if packets > 0 else 0
    # bytes per second over the fixed 2-second window
    bit_rate = conn_data['bytes'] / 2.0
    return np.array([
        packets, conn_data['bytes'], round(inter_arrival, 5),
        round(bit_rate, 5), conn_data['largest_packet'], conn_data['smallest_packet'],
        conn_data['longestInterArrival'], conn_data['shortestInterArrival'],
        conn_data['direction'], conn_data['arp_count'], conn_data['dns_count'],
        conn_data['ack_count'], conn_data['min_cong_win'], conn_data['max_cong_win'],
    ])


def featurize(filename, ip):
    """Extract per-window feature vectors from a CSV file of captured packets.

    Each CSV row is one packet with the columns
    [0: frame number, 1: Time, 2: Source IP, 3: Dest IP, 4: Protocol,
    5: (Frame) Length, 6: Info]. Packets are grouped into flows keyed by
    source IP, destination IP and protocol, and statistics are accumulated
    per flow in 2-second windows.

    Args:
        filename: path of the CSV capture file to read.
        ip: IP address string of the capturing host; a packet whose source
            column contains this string marks the window as outgoing.

    Returns:
        A list of numpy arrays of length 14, one per completed window, laid
        out as: packets, bytes, average inter-arrival, byte rate, largest
        packet, smallest packet, longest inter-arrival, shortest
        inter-arrival, direction, ARP count, DNS count, ACK count,
        minimum window, maximum window.

    Rows that are blank, have fewer than 7 columns, are the header row, or
    whose source/destination is not a valid IP address are skipped.

    NOTE(review): window start/end times come from a counter that advances by
    2.0 for every window opened in the file, not from the packet timestamps
    of the flow itself -- confirm this is the intended windowing.
    NOTE(review): a window is only emitted when a later packet of the same
    flow arrives after its end time, so windows still open at end of file
    are not returned.
    """
    # tracks connection features (every 2-second window is featurized)
    connections = {}
    # list of feature vectors that is returned
    return_data = []
    start_time = 0.0

    # reading in the CSV file of packet captures
    with open(filename, 'r') as data_file:
        reader = csv.reader(data_file, delimiter=',')
        # each packet is represented by each line
        for line in reader:
            # skip blank/truncated rows and the header row
            if len(line) < 7 or line[1] == 'Time':
                continue
            timestamp = float(line[1])
            protocol = line[4]
            try:
                ip_src = int(ipaddress.ip_address(line[2]))
                ip_dst = int(ipaddress.ip_address(line[3]))
            except ValueError:
                # source/destination is not an IP address (e.g. a MAC or host name)
                continue
            # each unique connection has the key: ip_source-ip_dest-protocol, this defines a "flow"
            key = str(ip_src) + '-' + str(ip_dst) + '-' + protocol
            if key not in connections:
                # make new connection entry for new connections
                connections[key] = _new_window(start_time, timestamp)
                start_time += 2.0
            conn_data = connections[key]
            length = int(line[5])
            extra_info = line[6]
            # if current packet's time is after the 2 second window, create
            # new feature vector and add previous one to returned feature vectors
            if timestamp > conn_data['endTime']:
                return_data.append(_window_to_vector(conn_data))
                conn_data = connections[key] = _new_window(start_time, timestamp)
                start_time += 2.0
            # updating connection statistics with the information in the current packet
            gap = timestamp - conn_data['lastArrival']
            if 0 < gap < conn_data['shortestInterArrival']:
                conn_data['shortestInterArrival'] = gap
            if gap > conn_data['longestInterArrival']:
                conn_data['longestInterArrival'] = gap
            if length > conn_data['largest_packet']:
                conn_data['largest_packet'] = length
            if length < conn_data['smallest_packet']:
                conn_data['smallest_packet'] = length
            # if packet's source is the capture ip then it's outgoing and direction feature is 1
            # (substring match, kept as-is so existing callers behave the same)
            if ip in line[2]:
                conn_data['direction'] = 1
            if protocol == 'DNS':
                conn_data['dns_count'] += 1
            if 'ARP' in protocol:
                conn_data['arp_count'] += 1
            if protocol == 'TCP':
                # extra info (last col of CSV) is used to get feature: Number of TCP ACKs.
                # Membership test: str.find returns -1 (truthy) when the text is absent,
                # which previously counted almost every TCP packet as an ACK.
                if '[ACK]' in extra_info:
                    conn_data['ack_count'] += 1
                # extra info is also used to get features: Max/Min Advertised Receive Window
                window_index = extra_info.find('Win=')
                if window_index >= 0:
                    # the value runs up to the next space, or to end of string
                    # when 'Win=' is the last token
                    token = extra_info[window_index + 4:].split(' ', 1)[0]
                    try:
                        window_size = int(token)
                    except ValueError:
                        window_size = None
                    if window_size is not None:
                        if window_size < conn_data['min_cong_win']:
                            conn_data['min_cong_win'] = window_size
                        if window_size > conn_data['max_cong_win']:
                            conn_data['max_cong_win'] = window_size
            conn_data['packets'] += 1
            conn_data['bytes'] += length
            conn_data['lastArrival'] = timestamp
    return return_data

# Directory all data is stored in; the capture CSV paths passed to featurize
# are built by prefixing this directory name.
data_dir = 'Data'

**Some notes on features:**  
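A feature vector is created for every 2 seconds of each connection or flow. A flow is defined by a TCP 5-Tuple that defines the connection through which packets are transmitted:  
`{source port, source IP address, destination port, destination IP address, networking protocol}`  
(Note: the `featurize` code keys each flow on source IP address, destination IP address, and protocol only; ports are not part of the key.)  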

This means that even if a connection has no packets transmitted over a period of 2 seconds, a feature vector is still created. (Default feature vector values such as 'Number of Packets' = 0 are valid, and aren't discarded in dataset for analysis)

In [4]:
# Mirror labelled traffic dataset: captures 1-5, all taken at the same
# capture IP, featurized in order and bound to mirror1..mirror5.
mirror1, mirror2, mirror3, mirror4, mirror5 = (
    featurize(data_dir + '/mirror' + str(capture) + '.csv', '204.99.128.20')
    for capture in range(1, 6)
)

In [5]:
# Data Transfer Node labelled traffic dataset: captures 1-5, all taken at
# the same capture IP, featurized in order and bound to dtn1..dtn5.
dtn1, dtn2, dtn3, dtn4, dtn5 = (
    featurize(data_dir + '/dtn' + str(capture) + '.csv', '204.99.128.81')
    for capture in range(1, 6)
)

In [6]:
# Youtube labelled traffic dataset: captures 1-5, all taken at the same
# capture IP, featurized in order and bound to y1..y5.
y1, y2, y3, y4, y5 = (
    featurize(data_dir + '/youtube' + str(capture) + '.csv', '155.101.8.11')
    for capture in range(1, 6)
)

In [7]:
# Airplane labelled traffic dataset: captures 1-5, all taken at the same
# capture IP, featurized in order and bound to a1..a5.
a1, a2, a3, a4, a5 = (
    featurize(data_dir + '/airplane' + str(capture) + '.csv', '204.99.128.82')
    for capture in range(1, 6)
)

In [2]:
# Creating dictionary of all data, saving it.
# Maps each capture name to the list of feature vectors featurize returned for it.
data_dict = {
    'mirror1': mirror1, 'mirror2': mirror2, 'mirror3': mirror3,
    'mirror4': mirror4, 'mirror5': mirror5,
    'dtn1': dtn1, 'dtn2': dtn2, 'dtn3': dtn3,
    'dtn4': dtn4, 'dtn5': dtn5,
    'youtube1': y1, 'youtube2': y2, 'youtube3': y3,
    'youtube4': y4, 'youtube5': y5,
    'airplane1': a1, 'airplane2': a2, 'airplane3': a3,
    'airplane4': a4, 'airplane5': a5,
}

# The output path is built from data_dir so it follows the directory the
# captures are read from instead of repeating the literal directory name.
# np.save stores a dict as a pickled object array, so reading it back
# requires np.load(path, allow_pickle=True).item().
np.save(data_dir + '/all_data.npy', data_dict)

NameError: name 'mirror1' is not defined