# Flow Feature Extraction
## TODO: MAKE CHANGES ##
    
These features are extracted per bi-directional flow:  
	0. Total Number of Packets
	1. Total Bytes
	2. Largest Packet Size
	3. Smallest Packet Size
	4. Number of ARP Packets
	5. Number of DNS Packets
	6. Number of TCP ACKs
	7. Minimum Advertised Receive Window
	8. Maximum Advertised Receive Window
	9. Direction of Outgoing Packets
	10. Std. Dev of packet size
	11. Average Packet Size
	12. Size of first 10 packets
	13. Number of TCP FIN: FIN in Info col. 
	14. Number of TCP SYN: SYN in Info col.
	15. Number of TCP RSTS: RST in Info col.
	16. Number of TCP PUSH: PSH in Info col. 
	17. Number of TCP URG: URG in Info col.
	18. Number of TCP CWR/CWE (Congestion Window Reduced)
	19. Number of TCP ECE (Explicit Congestion Notification Echo)
	20. Avg. Packet Inter-arrival time
	21. Max. Inter-arrival time
	22. Min. Inter-arrival time
	23. Avg. Packet Throughput (packets/second)
	24. Avg. Byte Throughput (bytes/second)
    25: Duration
  
Note: Ryan's "flows" were not featurized, his sessions are. (IP Source and IP Dest were only things used to create features for, no ports, so multiple flows (by TCP 5 Tuple) were featurized together).  

In [38]:
import numpy as np
import netaddr
import csv
import math
import datetime

#
# Extracts a feature vector for each flow in the input file.
#
# A flow is defined by a unique bi-directional tuple: 
# {IP Source Address, Source Port, IP Destination Address, Destination Port, Protocol}
# 
# The 26 features above are extracted for each feature in the given datafile. 
# The src_ip parameter is used to determine the Number of Incoming and Outgoing Packets features.
#
def extract_flow_features(filename, src_ip):
    # Dictionary holding values for feature calculations with keys of:
    # 'ip_source-source_port-ip_dest-dest_port-protocol'
    flows = {}
    
    # Returned feature values extracted from the given file
    feature_vecs = []
    
    # Reading in the CSV file of packet captures
    with open(filename, 'r') as data_csv:
        reader = csv.reader(data_csv, delimiter=',')
        # Loop through all lines (all packets), fill in statistics of each flow found
        for index, line in enumerate(reader):
            # Line/CSV row format (represents a packet): 
            # [0:Time (Packet Arrival) 1:Source IP 2:Dest IP 3:Protocol 
            #  4: Length 5:Info 6:Source Port 7: Dest Port]
            if line[0] == 'Time':
                continue
            # Parse out fields needed to construct key (if not the first CSV line)
            ip_src = line[1]
            src_port = line[6]
            ip_dest = line[2]
            dest_port = line[7]
            protocol = line[3]
            time = float(line[0])
            
            key = ip_src + '-' + src_port + '-' + ip_dest + '-' + dest_port + '-' + protocol
            
            # Create new flow and flow statistics if a new flow is encountered
            if key not in flows: 
                flows[key] = {'tot_bytes': 0, 'direction': 0, 'ARP': 0, 'DNS': 0,  
                              'ACK': 0, 'min_arw': 1000000, 'max_arw': 0, 'FIN': 0,
                              'SYN': 0, 'RST': 0, 'PUSH': 0, 'URG': 0, 'CWE': 0, 
                              'ECE': 0, 'start': time, 'end': 0,
                              'all_sizes': [], 'all_intervals': [], 'last_arrival': 0}
                # Direction of flow is consistent throughout all packets
                flows[key]['direction'] = 1 if ip_src == src_ip else 0
            
            # Update flow statistics based on current packet's information
            flow_stats = flows[key]
            length = float(line[4])
            info = line[5]
            
            # Total bytes update
            flow_stats['tot_bytes'] += length
            # THIS UPDATE WOULD BE FOR BI-DIRECTIONAL FLOWS, W COUNTS OF PACKETS 
            # Number of outgoing packets update
#             if ip_src == src_ip:
#                 flow_stats['num_outgoing'] += 1
            # Number of ARP Packets update
            if 'ARP' in protocol:
                flow_stats['ARP'] += 1
            # Number of DNS Packets update
            if 'DNS' in protocol:
                flow_stats['DNS'] += 1
            # Number of ACK Packets update
            if protocol == 'TCP' and 'ACK' in info:
                flow_stats['ACK'] += 1
            # Minimum or Maximum Advertised Receive Window update
            if protocol == 'TCP' and 'Win=' in info:
                win_ind = info.index('Win=')
                # Window size is immediately after the 'Win=' and goes until next space
                win_size = int(info[win_ind+4 : info.index(' ', win_ind)])
                if win_size < flow_stats['min_arw']:
                    flow_stats['min_arw'] = win_size
                elif win_size > flow_stats['max_arw']:
                    flow_stats['max_arw'] = win_size
            # Number of FIN Packets update
            if protocol == 'TCP' and 'FIN' in info:
                flow_stats['FIN'] += 1
            # Number of SYN Packets update
            if protocol == 'TCP' and 'SYN' in info:
                flow_stats['SYN'] += 1
            # Number of RST Packets update
            if protocol == 'TCP' and 'RST' in info:
                flow_stats['RST'] += 1
            # Number of PUSH Packets update
            if protocol == 'TCP' and 'PUSH' in info:
                flow_stats['PUSH'] += 1
            # Number of URG Packets update
            if protocol == 'TCP' and 'URG' in info:
                flow_stats['URG'] += 1
            # Number of CWE Packets update
            if protocol == 'TCP' and ('CWE' in info or 'CWR' in info):
                flow_stats['CWE'] += 1
            # Number of ECE Packets update
            if protocol == 'TCP' and 'ECE' in info:
                flow_stats['ECE'] += 1
            # End time update (every packet that's not first becomes most recent end)
            flow_stats['end'] = time
            # Adding this packet's size to array of all packet sizes
            flow_stats['all_sizes'].append(length)
            # Adding interval between this packet's arrival and last packet's arrival 
            # to array of all inter-packet arrival times
            flow_stats['all_intervals'].append(time - flow_stats['last_arrival'])
            # Update last arrival time
            flow_stats['last_arrival'] = time
    
    # After all packets are processed, calculate feature values for each flow
    
    # TODO: HANDLE 1 PACKET SITUATION FOR DURATION/RATES
    
    for flow_key in flows.keys():
        stats = flows[flow_key]
        all_pckt_sizes = np.array(stats['all_sizes'])    
        all_pckt_intervals = np.array(stats['all_intervals'])
        first_sizes = np.sum(all_pckt_sizes[:10]) if all_pckt_sizes.size >= 10 else np.sum(all_pckt_sizes)
        duration = stats['end'] - stats['start'] if all_pckt_sizes.size >= 1 else 0
        byte_rate = stats['tot_bytes']/duration
        pckt_rate = all_pckt_sizes.size / duration
        flow_features = np.array([len(all_pckt_sizes), stats['tot_bytes'], np.max(all_pckt_sizes),
                                  np.min(all_pckt_sizes), stats['ACK'], stats['DNS'], stats['ACK'],
                                  stats['min_arw'], stats['max_arw'], stats['direction'], 
                                  np.std(all_pckt_sizes), np.average(all_pckt_sizes), first_sizes,
                                  stats['FIN'], stats['SYN'], stats['RST'], stats['PUSH'],
                                  stats['URG'], stats['CWE'], stats['ECE'], np.average(all_pckt_intervals),
                                  np.max(all_pckt_intervals), np.min(all_pckt_intervals), 
                                  pckt_rate, byte_rate, duration]) #np.std(all_pckt_intervals)
        feature_vecs.append(flow_features)
    
    # Return a list of feature vectors, one per flow 
    # ALSO RETURNS DICTIONARY OF FLOW STATISTICS FOR TESTING
    return np.array(feature_vecs), flows

In [18]:
data_dir = "../DT-Data/"
dtn1_ip = '204.99.128.105'
clustereddtn_ip = '204.99.128.81'
kchow_ip = '155.101.8.11'
airplane2_ip = '204.99.128.82'
gdrive_ip = '172.217.11.170'
gdrive_ip2 = '172.217.4.138'
gdrive_ip3 = '172.217.5.74'

In [39]:
# Globus Flows feature extractions
globus_dtn1_src1 = data_dir + 'globus-dtn1-src-iso.csv'
globus_dtn1_src1, test_flows = extract_flow_features(globus_dtn1_src1, dtn1_ip)
print(f'Number of flow feature vectors from globus-dtn1-src-iso: {len(globus_dtn1_src1)}')

globus_dtn1_dest1 = data_dir + 'globus-dtn1-dest-iso.csv'
globus_dtn1_dest1, test_flows = extract_flow_features(globus_dtn1_dest1, kchow_ip)
print(f'Number of flow feature vectors from globus-dtn1-dest-iso: {len(globus_dtn1_dest1)}')

globus_dtn1_src2 = data_dir + 'globus-dtn1-src-iso2.csv'
globus_dtn1_src2, test_flows = extract_flow_features(globus_dtn1_src2, dtn1_ip)
print(f'Number of flow feature vectors from globus-dtn1-src-iso2.csv: {len(globus_dtn1_src2)}')

globus_dtn1_dest2 = data_dir + 'globus-dtn1-dest-iso2.csv'
globus_dtn1_dest2, test_flows = extract_flow_features(globus_dtn1_dest2, kchow_ip)
print(f'Number of flow feature vectors from globus-dtn1-dest-iso: {len(globus_dtn1_dest2)}')

globus_clusterdtn_src = data_dir + 'globus-clusterdtn-src-iso.csv'
globus_clusterdtn_src, test_flows = extract_flow_features(globus_clusterdtn_src, clustereddtn_ip)
print(f'Number of flow feature vectors from globus-dtn1-src-iso2.csv: {len(globus_clusterdtn_src)}')

globus_clusterdtn_dest = data_dir + 'globus-clusterdtn-dest-iso.csv'
globus_clusterdtn_dest, test_flows = extract_flow_features(globus_clusterdtn_dest, kchow_ip)
print(f'Number of flow feature vectors from globus-dtn1-dest-iso: {len(globus_clusterdtn_dest)}')

# Verify the duration calculation
for i, flow in enumerate(test_flows.keys()):
#         print('start: ', test_flows[flow]['start'], '   end: ', test_flows[flow]['end'])
        print(globus_clusterdtn_dest[i][25])

Number of flow feature vectors from globus-dtn1-src-iso: 5


ZeroDivisionError: float division by zero

In [20]:
# FDT Flows feature extractions
fdt_a2_src = data_dir + 'fdt-airplane2-src-iso.csv'
fdt_a2_src, test_flows = extract_flow_features(fdt_a2_src, airplane2_ip)
print(f'Number of flow feature vectors from fdt-airplane2-src-iso: {len(fdt_a2_src)}')

fdt_a2_dest = data_dir + 'fdt-airplane2-dest-iso.csv'
fdt_a2_dest, test_flows = extract_flow_features(fdt_a2_dest, kchow_ip)
print(f'Number of flow feature vectors from fdt-airplane2-dest-iso: {len(fdt_a2_dest)}')

# 1 stream configured transfer
fdt_a2_dest_1str = data_dir + 'fdt-airplane2-dest-iso-1stream.csv'
fdt_a2_dest_1str, test_flows = extract_flow_features(fdt_a2_dest_1str, kchow_ip)
print(f'Number of flow feature vectors from fdt-airplane2-dest-iso-1stream: {len(fdt_a2_dest_1str)}')

# 2 stream configured transfer
fdt_a2_dest_2str = data_dir + 'fdt-airplane2-dest-iso-2stream.csv'
fdt_a2_dest_2str, test_flows = extract_flow_features(fdt_a2_dest_2str, kchow_ip)
print(f'Number of flow feature vectors from fdt-airplane2-dest-iso-2stream: {len(fdt_a2_dest_2str)}')

fdt_dtn1_dest = data_dir + 'fdt-dtn1-dest-iso.csv'
fdt_dtn1_dest, test_flows = extract_flow_features(fdt_dtn1_dest, kchow_ip)
print(f'Number of flow feature vectors from fdt-dtn1-dest-iso: {len(fdt_dtn1_dest)}')

fdt_dtn1_src = data_dir + 'fdt-dtn1-src-iso.csv'
fdt_dtn1_src, test_flows = extract_flow_features(fdt_dtn1_src, dtn1_ip)
print(f'Number of flow feature vectors from fdt-dtn1-src-iso: {len(fdt_dtn1_src)}')

Number of flow feature vectors from fdt-airplane2-src-iso: 14
Number of flow feature vectors from fdt-airplane2-dest-iso: 8
Number of flow feature vectors from fdt-airplane2-dest-iso-1stream: 3
Number of flow feature vectors from fdt-airplane2-dest-iso-2stream: 4
Number of flow feature vectors from fdt-dtn1-dest-iso: 11
Number of flow feature vectors from fdt-dtn1-src-iso: 11


In [21]:
# RClone Flows feature extractions
rclone_src = data_dir + 'rclone-gdrive-src-iso.csv'
rclone_src, test_flows = extract_flow_features(rclone_src, gdrive_ip)
print(f'Number of flow feature vectors from rclone-gdrive-src-iso: {len(rclone_src)}')

rclone_dest = data_dir + 'rclone-gdrive-dest-iso.csv'
rclone_dest, test_flows = extract_flow_features(rclone_dest, kchow_ip)
print(f'Number of flow feature vectors from rclone-gdrive-src-iso: {len(rclone_dest)}')
    
rclone_src2 = data_dir + 'rclone-gdrive-src-iso2.csv'
rclone_src2, test_flows = extract_flow_features(rclone_src2, gdrive_ip2)
print(f'Number of flow feature vectors from rclone-gdrive-src-iso2: {len(rclone_src2)}')

rclone_dest2 = data_dir + 'rclone-gdrive-dest-iso2.csv'
rclone_dest2, test_flows = extract_flow_features(rclone_dest2, kchow_ip)
print(f'Number of flow feature vectors from rclone-gdrive-dest-iso2: {len(rclone_dest2)}')

rclone_src3 = data_dir + 'rclone-gdrive-src-iso3.csv'
rclone_src3, test_flows = extract_flow_features(rclone_src3, gdrive_ip3)
print(f'Number of flow feature vectors from rclone-gdrive-src-iso2: {len(rclone_src3)}')

rclone_dest3 = data_dir + 'rclone-gdrive-dest-iso3.csv'
rclone_dest3, test_flows = extract_flow_features(rclone_dest3, kchow_ip)
print(f'Number of flow feature vectors from rclone-gdrive-dest-iso3: {len(rclone_dest3)}')

# # Printing flows extracted
# for key in test_flows.keys():
#     print(key)
    
# # Sanity check on direction feature
# for key in test_flows.keys():
#     direction = test_flows[key]['direction']
#     print(direction)

Number of flow feature vectors from rclone-gdrive-src-iso: 5
Number of flow feature vectors from rclone-gdrive-src-iso: 4
Number of flow feature vectors from rclone-gdrive-src-iso2: 4
Number of flow feature vectors from rclone-gdrive-dest-iso2: 4
Number of flow feature vectors from rclone-gdrive-src-iso2: 4
Number of flow feature vectors from rclone-gdrive-dest-iso3: 4


In [22]:
# Creating dictionary of all feature vectors mapping by capture file, saving it
data_dict = {'globus_dtn1_src1' : globus_dtn1_src1, 'globus_dtn1_dest1' : globus_dtn1_dest1, 
             'globus_dtn1_src2' : globus_dtn1_src2, 'globus_dtn1_dest2' : globus_dtn1_dest2,
             'globus_clusterdtn_src' : globus_clusterdtn_src, 'globus_clusterdtn_dest' : globus_clusterdtn_dest,
             'fdt_a2_src' : fdt_a2_src, 'fdt_a2_dest' : fdt_a2_src, 'fdt_dtn1_dest' : fdt_dtn1_dest, 
             'fdt_dtn1_src' : fdt_dtn1_src, 'fdt_a2_dest_1str' : fdt_a2_dest_1str, 'fdt_a2_dest_2str' : fdt_a2_dest_2str,
             'rclone_src' : rclone_src, 'rclone_dest' : rclone_dest, 'rclone_src2': rclone_src2, 
             'rclone_dest2': rclone_dest2, 'rclone_src3' : rclone_src3, 'rclone_dest3' : rclone_dest3}

np.save('../Feature-Vectors/flow_features.npy', data_dict)