# N-Packet Windows/Subflows Basic Feature Extraction (For Comparison/Baseline Experiments)

These features are extracted for each set of N-packets for each flow:  

    Maximum Inter-Packet Arrival Time
    Minimum Inter-Packet Arrival Time
    Average Inter-Packet Arrival Time
    Standard Deviation of Inter-Packet Arrival Time
    Maximum Packet Size
    Minimum Packet Size
    Average Packet Size
    Standard Deviation of Packet Size

    
Extracted for N = 25, 100, 1000

In [3]:
import numpy as np
import netaddr
import csv
import math
import datetime

#
# Extracts a feature vector for each N Packets for each flow in the input file.
#
# A flow is identified by the ordered (direction-specific) 5-tuple below, so the two
# directions of a connection are tracked as separate flows:
# {IP Source Address, Source Port, IP Destination Address, Destination Port, Protocol}
# 
# The 8 features listed above are extracted for each N-packet subflow of every flow in the given data file.
#
def _new_subflow_state():
    """Return a fresh accumulator for one N-packet window of a single flow."""
    return {'all_sizes': [], 'all_intervals': [], 'subflow_start': None,
            'last_arrival': None, 'packet_num': 0}


def _parse_arrival_time(raw, abs_time):
    """Convert the CSV 'Time' field to seconds (float).

    With ``abs_time`` the field is read as colon-separated ``H:M:S[.fraction]``;
    otherwise it is passed straight to ``float``.
    """
    if not abs_time:
        return float(raw)
    a_time = raw.split(':')
    # partition() tolerates a seconds field without a fractional part
    # ('0.' + '' parses as 0.0), where indexing the split result would fail.
    whole_secs, _, fraction = a_time[-1].partition('.')
    return (float(a_time[0])*3600 + float(a_time[1])*60
            + float(whole_secs) + float('0.' + fraction))


def extract_packet_win_features(filename, N, abs_time=False):
    """Extract one 8-value feature vector per window of N packets of each flow.

    Rows of the CSV are read in file order. Each row is a packet with columns
    [0: Time, 1: Source IP, 2: Dest IP, 3: Protocol, 4: Length, 5: Info,
    6: Source Port, 7: Dest Port]. Packets are grouped by the key
    'ip_source-source_port-ip_dest-dest_port-protocol' (direction-specific).
    Every time a flow has accumulated N packets, a feature vector is emitted
    and the flow's accumulator is reset.

    Feature order in each vector:
        [max size, min size, std size, mean size,
         max interval, min interval, std interval, mean interval]

    Args:
        filename: Path of the packet-capture CSV file.
        N: Number of packets per subflow/window.
        abs_time: If True, the Time column is parsed as ``H:M:S[.fraction]``;
            if False it is parsed as a plain float.

    Returns:
        A tuple ``(feature_vecs, feature_vecs_flows)``:
        ``feature_vecs`` is a list of numpy arrays (one per completed window,
        in the order the windows completed); ``feature_vecs_flows`` maps each
        flow key to the list of that flow's vectors, in the same order. Flows
        that never reach N packets do not appear in the dict, and a trailing
        window with fewer than N packets is dropped.

    Notes:
        * The interval recorded for the first packet of every window is
          measured against the window's own start time, i.e. it is always 0,
          and the gap between consecutive windows is not recorded.
          NOTE(review): this makes the minimum-interval feature 0 for any
          window whose timestamps are non-decreasing; it is kept as-is so
          previously saved feature files stay reproducible.
        * NOTE(review): with ``abs_time`` only the time of day is used, so a
          capture that crosses midnight would presumably yield a negative
          interval -- confirm against the data.
    """
    # Per-flow accumulators for the window currently being filled
    flows = {}
    # Every feature vector produced from this file, in completion order
    feature_vecs = []
    # flow key -> list of that flow's feature vectors (chronological)
    feature_vecs_flows = {}

    # newline='' is the csv module's recommended way to open its input
    with open(filename, 'r', newline='') as data_csv:
        for line in csv.reader(data_csv, delimiter=','):
            # Skip blank rows (csv yields []) and the header row
            if not line or 'Time' in line[0]:
                continue

            time = _parse_arrival_time(line[0], abs_time)
            length = float(line[4])
            # ip_src - src_port - ip_dest - dest_port - protocol
            key = '-'.join((line[1], line[6], line[2], line[7], line[3]))

            flow_stats = flows.get(key)
            if flow_stats is None:
                flow_stats = flows[key] = _new_subflow_state()

            flow_stats['packet_num'] += 1
            # The first packet of a window defines the window's start time
            if flow_stats['subflow_start'] is None:
                flow_stats['subflow_start'] = time
            flow_stats['all_sizes'].append(length)

            # Interval to the previous packet of this window; for the first
            # packet the reference is the window start (so the value is 0).
            reference = flow_stats['last_arrival']
            if reference is None:
                reference = flow_stats['subflow_start']
            flow_stats['all_intervals'].append(time - reference)

            if flow_stats['packet_num'] != N:
                # Window still filling: remember this arrival for the next packet
                flow_stats['last_arrival'] = time
                continue

            # Window complete: featurize it, then start a new window for this flow
            all_pckt_sizes = np.array(flow_stats['all_sizes'])
            all_pckt_intervals = np.array(flow_stats['all_intervals'])
            subflow_features = np.array([
                np.max(all_pckt_sizes), np.min(all_pckt_sizes),
                np.std(all_pckt_sizes), np.average(all_pckt_sizes),
                np.max(all_pckt_intervals), np.min(all_pckt_intervals),
                np.std(all_pckt_intervals), np.average(all_pckt_intervals),
            ])
            feature_vecs.append(subflow_features)
            feature_vecs_flows.setdefault(key, []).append(subflow_features)
            flows[key] = _new_subflow_state()

    return feature_vecs, feature_vecs_flows
# Any window that doesn't fit (last window: there aren't N packets left to create full subflow) is just dropped

### 25-Packet Subflows

In [4]:
# WIDE sample point F: featurize every 25-packet subflow
N = 25
data_dir = "../../Unknown-Data/wide-f/"

widef_vecs, widef_flow_vecs = extract_packet_win_features(
    f"{data_dir}f-jan3-all.csv", N, abs_time=True
)

print(f"Total number of WIDE-f flows: {len(widef_flow_vecs)}")
print(f"Total number of WIDE-f subflows: {len(widef_vecs)}")
print(widef_vecs[0].shape)

# Per-flow view: dict of flow key -> that flow's subflow vectors (chronological)
np.save(
    f'../../Feature-Vectors/Basic-Features/wide-f_{N}_packet_subflow_flow_features.npy',
    widef_flow_vecs,
)

# Flat view: every WIDE-f subflow vector in a single array
np.save(
    f'../../Feature-Vectors/Basic-Features/wide-f_{N}_packet_subflow_features.npy',
    widef_vecs,
)

Total number of WIDE-f flows: 22481
Total number of WIDE-f subflows: 388000
(8,)


In [5]:
# WIDE sample point G: featurize every 25-packet subflow
N = 25
data_dir = "../../Unknown-Data/wide-g/"

wideg_vecs, wideg_flow_vecs = extract_packet_win_features(
    f"{data_dir}g-jan8-all.csv", N, abs_time=True
)

print(f"Total number of WIDE-g flows: {len(wideg_flow_vecs)}")
print(f"Total number of WIDE-g subflows: {len(wideg_vecs)}")

# Per-flow view: dict of flow key -> that flow's subflow vectors (chronological)
np.save(
    f'../../Feature-Vectors/Basic-Features/wide-g_{N}_packet_subflow_flow_features.npy',
    wideg_flow_vecs,
)

# Flat view: every WIDE-g subflow vector in a single array
np.save(
    f'../../Feature-Vectors/Basic-Features/wide-g_{N}_packet_subflow_features.npy',
    wideg_vecs,
)

Total number of WIDE-g flows: 24265
Total number of WIDE-g subflows: 724554


In [7]:
# FDT: featurize every 25-packet subflow of each capture file
# (the directory variable keeps its original name although it points at Known-Data)
N = 25
uknown_data_dir = "../../Known-Data/new-fdt/"

fdt8gb_vecs, fdt8gb_flow_vecs = extract_packet_win_features(
    f"{uknown_data_dir}fdt-8gb.csv", N, abs_time=True)
fdt16gb_vecs, fdt16gb_flow_vecs = extract_packet_win_features(
    f"{uknown_data_dir}fdt-16gb.csv", N, abs_time=True)
fdt18gb_vecs, fdt18gb_flow_vecs = extract_packet_win_features(
    f"{uknown_data_dir}fdt-18gb.csv", N, abs_time=True)
fdt40gb_vecs, fdt40gb_flow_vecs = extract_packet_win_features(
    f"{uknown_data_dir}fdt-40gb.csv", N, abs_time=True)
fdt47gb_vecs, fdt47gb_flow_vecs = extract_packet_win_features(
    f"{uknown_data_dir}fdt-47gb.csv", N, abs_time=True)

# One flow -> subflow-vectors dict per capture file
fdt_flow_vecs = [
    fdt8gb_flow_vecs, fdt16gb_flow_vecs, fdt18gb_flow_vecs,
    fdt40gb_flow_vecs, fdt47gb_flow_vecs,
]

# Flow and subflow totals across all FDT captures
flow_tot = sum(len(per_flow) for per_flow in fdt_flow_vecs)
subflow_tot = sum(len(subflows) for per_flow in fdt_flow_vecs for subflows in per_flow.values())
print(f"Total number of FDT flows: {flow_tot}")
print(f"Total number of FDT subflows: {subflow_tot}")

# Per-flow view: list of dicts, each mapping a flow to its chronological subflow vectors
np.save(f'../../Feature-Vectors/Basic-Features/new_fdt_{N}_packet_subflow_flow_features.npy', fdt_flow_vecs)

# Flat view: all FDT subflow vectors stacked into one array
all_fdt = np.array(fdt8gb_vecs + fdt16gb_vecs + fdt18gb_vecs + fdt40gb_vecs + fdt47gb_vecs)
print(all_fdt.shape)

np.save(f'../../Feature-Vectors/Basic-Features/new_fdt_{N}_packet_subflow_features.npy', all_fdt)

Total number of FDT flows: 40
Total number of FDT subflows: 172480
(172480, 8)


In [8]:
# RClone: featurize every 25-packet subflow of each capture file
N = 25
data_dir = "../../Known-Data/new-rclone/"

rclone1_vecs, rclone1_flow_vecs = extract_packet_win_features(f"{data_dir}rclone1.csv", N)
rclone2_vecs, rclone2_flow_vecs = extract_packet_win_features(f"{data_dir}rclone2.csv", N)
rclone3_vecs, rclone3_flow_vecs = extract_packet_win_features(f"{data_dir}rclone3.csv", N)
rclone4_vecs, rclone4_flow_vecs = extract_packet_win_features(f"{data_dir}rclone4.csv", N)
rclone5_vecs, rclone5_flow_vecs = extract_packet_win_features(f"{data_dir}rclone5.csv", N)
rclone6_vecs, rclone6_flow_vecs = extract_packet_win_features(f"{data_dir}rclone6.csv", N)

# One flow -> subflow-vectors dict per capture file
rclone_flow_vecs = [
    rclone1_flow_vecs, rclone2_flow_vecs, rclone3_flow_vecs,
    rclone4_flow_vecs, rclone5_flow_vecs, rclone6_flow_vecs,
]

# Flow and subflow totals across all RClone captures
flow_tot = sum(len(per_flow) for per_flow in rclone_flow_vecs)
subflow_tot = sum(len(subflows) for per_flow in rclone_flow_vecs for subflows in per_flow.values())
print(f"Total number of RClone flows: {flow_tot}")
print(f"Total number of RClone subflows: {subflow_tot}")

# Per-flow view: list of dicts, each mapping a flow to its chronological subflow vectors
np.save(f'../../Feature-Vectors/Basic-Features/new_rclone_{N}_packet_subflow_flow_features.npy', rclone_flow_vecs)

# Flat view: all RClone subflow vectors stacked into one array
all_rclone = np.array(
    rclone1_vecs + rclone2_vecs + rclone3_vecs + rclone4_vecs + rclone5_vecs + rclone6_vecs
)
print(all_rclone.shape)

np.save(f'../../Feature-Vectors/Basic-Features/new_rclone_{N}_packet_subflow_features.npy', all_rclone)

Total number of RClone flows: 27
Total number of RClone subflows: 537801
(537801, 8)


In [9]:
# Globus: featurize every 25-packet subflow of each capture file
N = 25
data_dir = "../../Known-Data/new-globus/"

globus_10_min_vecs, globus_10_min_flow_vecs = extract_packet_win_features(
    f"{data_dir}globus-10-min.csv", N)
globus_10_min1_vecs, globus_10_min1_flow_vecs = extract_packet_win_features(
    f"{data_dir}globus-10-min1.csv", N)
globus_10_min2_vecs, globus_10_min2_flow_vecs = extract_packet_win_features(
    f"{data_dir}globus-10-min2.csv", N)
# Only this capture is read with abs_time=True
globus_20_min, globus_20_min_flow_vecs = extract_packet_win_features(
    f"{data_dir}globus-20-min-all.csv", N, abs_time=True)
globus_5_min, globus_5_min_flow_vecs = extract_packet_win_features(
    f"{data_dir}globus-5-min.csv", N)

# One flow -> subflow-vectors dict per capture file
globus_flow_vecs = [
    globus_10_min_flow_vecs, globus_10_min1_flow_vecs, globus_10_min2_flow_vecs,
    globus_20_min_flow_vecs, globus_5_min_flow_vecs,
]

# Flow and subflow totals across all Globus captures
flow_tot = sum(len(per_flow) for per_flow in globus_flow_vecs)
subflow_tot = sum(len(subflows) for per_flow in globus_flow_vecs for subflows in per_flow.values())
print(f"Total number of Globus flows: {flow_tot}")
print(f"Total number of Globus subflows: {subflow_tot}")

# Per-flow view: list of dicts, each mapping a flow to its chronological subflow vectors
np.save(f'../../Feature-Vectors/Basic-Features/new_globus_{N}_packet_subflow_flow_features.npy', globus_flow_vecs)

# Flat view: all Globus subflow vectors stacked into one array
all_globus = np.array(
    globus_10_min_vecs + globus_10_min1_vecs + globus_10_min2_vecs + globus_20_min + globus_5_min
)
print(all_globus.shape)

np.save(f'../../Feature-Vectors/Basic-Features/new_globus_{N}_packet_subflow_features.npy', all_globus)

Total number of Globus flows: 115
Total number of Globus subflows: 1572839
(1572839, 8)


In [10]:
# Mirror-unknown: featurize every 25-packet subflow of each capture file
N = 25
uknown_data_dir = "../../Unknown-Data/mirror-unknown/"

unknown1_vecs, unknown1_flow_vecs = extract_packet_win_features(f"{uknown_data_dir}unknown1.csv", N)
unknown2_vecs, unknown2_flow_vecs = extract_packet_win_features(f"{uknown_data_dir}unknown2.csv", N)
# Captures 3 and 4 are read with abs_time=True
unknown3_vecs, unknown3_flow_vecs = extract_packet_win_features(
    f"{uknown_data_dir}unknown3-all.csv", N, abs_time=True)
unknown4_vecs, unknown4_flow_vecs = extract_packet_win_features(
    f"{uknown_data_dir}unknown4-all.csv", N, abs_time=True)
unknown5_vecs, unknown5_flow_vecs = extract_packet_win_features(f"{uknown_data_dir}unknown5.csv", N)
unknown6_vecs, unknown6_flow_vecs = extract_packet_win_features(f"{uknown_data_dir}unknown6.csv", N)
unknown7_vecs, unknown7_flow_vecs = extract_packet_win_features(f"{uknown_data_dir}unknown7.csv", N)

# One flow -> subflow-vectors dict per capture file
unknown_flow_vecs = [
    unknown1_flow_vecs, unknown2_flow_vecs, unknown3_flow_vecs, unknown4_flow_vecs,
    unknown5_flow_vecs, unknown6_flow_vecs, unknown7_flow_vecs,
]

# Flow count of the unknown data (matters due to experimental setup for ensembling unknowns)
flow_tot = sum(len(per_flow) for per_flow in unknown_flow_vecs)
print(f"Total Unknown Mirror Flows: {flow_tot}")

# Flat view: all unknown subflow vectors stacked into one array
all_unknown = np.array(
    unknown1_vecs + unknown2_vecs + unknown3_vecs + unknown4_vecs
    + unknown5_vecs + unknown6_vecs + unknown7_vecs
)
print(f"Total Unknown Mirror Subflows: {len(all_unknown)}")

np.save(f'../../Feature-Vectors/Basic-Features/mirror_unknown_{N}_packet_subflow_features.npy', all_unknown)

# Per-flow view: list of dicts, each mapping a flow to its chronological subflow vectors
np.save(f'../../Feature-Vectors/Basic-Features/mirror_unknown_{N}_packet_subflow_flow_features.npy', unknown_flow_vecs)

Total Unknown Mirror Flows: 1551
Total Unknown Mirror Subflows: 657368


### 1000 Packet Subflows

In [11]:
# WIDE sample point F: featurize every 1000-packet subflow
N = 1000
data_dir = "../../Unknown-Data/wide-f/"

widef_vecs, widef_flow_vecs = extract_packet_win_features(
    f"{data_dir}f-jan3-all.csv", N, abs_time=True
)

print(f"Total number of WIDE-f flows: {len(widef_flow_vecs)}")
print(f"Total number of WIDE-f subflows: {len(widef_vecs)}")

# Per-flow view: dict of flow key -> that flow's subflow vectors (chronological)
np.save(
    f'../../Feature-Vectors/Basic-Features/wide-f_{N}_packet_subflow_flow_features.npy',
    widef_flow_vecs,
)

# Flat view: every WIDE-f subflow vector in a single array
np.save(
    f'../../Feature-Vectors/Basic-Features/wide-f_{N}_packet_subflow_features.npy',
    widef_vecs,
)

Total number of WIDE-f flows: 804
Total number of WIDE-f subflows: 7611


In [12]:
# WIDE sample point G: featurize every 1000-packet subflow
N = 1000
data_dir = "../../Unknown-Data/wide-g/"

wideg_vecs, wideg_flow_vecs = extract_packet_win_features(
    f"{data_dir}g-jan8-all.csv", N, abs_time=True
)

print(f"Total number of WIDE-g flows: {len(wideg_flow_vecs)}")
print(f"Total number of WIDE-g subflows: {len(wideg_vecs)}")

# Per-flow view: dict of flow key -> that flow's subflow vectors (chronological)
np.save(
    f'../../Feature-Vectors/Basic-Features/wide-g_{N}_packet_subflow_flow_features.npy',
    wideg_flow_vecs,
)

# Flat view: every WIDE-g subflow vector in a single array
np.save(
    f'../../Feature-Vectors/Basic-Features/wide-g_{N}_packet_subflow_features.npy',
    wideg_vecs,
)

Total number of WIDE-g flows: 1409
Total number of WIDE-g subflows: 15454


In [13]:
# Mirror-unknown: featurize every 1000-packet subflow of each capture file
N = 1000
uknown_data_dir = "../../Unknown-Data/mirror-unknown/"

unknown1_vecs, unknown1_flow_vecs = extract_packet_win_features(f"{uknown_data_dir}unknown1.csv", N)
unknown2_vecs, unknown2_flow_vecs = extract_packet_win_features(f"{uknown_data_dir}unknown2.csv", N)
# Captures 3 and 4 are read with abs_time=True
unknown3_vecs, unknown3_flow_vecs = extract_packet_win_features(
    f"{uknown_data_dir}unknown3-all.csv", N, abs_time=True)
unknown4_vecs, unknown4_flow_vecs = extract_packet_win_features(
    f"{uknown_data_dir}unknown4-all.csv", N, abs_time=True)
unknown5_vecs, unknown5_flow_vecs = extract_packet_win_features(f"{uknown_data_dir}unknown5.csv", N)
unknown6_vecs, unknown6_flow_vecs = extract_packet_win_features(f"{uknown_data_dir}unknown6.csv", N)
unknown7_vecs, unknown7_flow_vecs = extract_packet_win_features(f"{uknown_data_dir}unknown7.csv", N)

# One flow -> subflow-vectors dict per capture file
unknown_flow_vecs = [
    unknown1_flow_vecs, unknown2_flow_vecs, unknown3_flow_vecs, unknown4_flow_vecs,
    unknown5_flow_vecs, unknown6_flow_vecs, unknown7_flow_vecs,
]

# Flow count of the unknown data (matters due to experimental setup for ensembling unknowns)
flow_tot = sum(len(per_flow) for per_flow in unknown_flow_vecs)
print(f"Total Unknown Mirror Flows: {flow_tot}")

# Flat view: all unknown subflow vectors stacked into one array
all_unknown = np.array(
    unknown1_vecs + unknown2_vecs + unknown3_vecs + unknown4_vecs
    + unknown5_vecs + unknown6_vecs + unknown7_vecs
)
print(f"Total Unknown Mirror Subflows: {len(all_unknown)}")

np.save(f'../../Feature-Vectors/Basic-Features/mirror_unknown_{N}_packet_subflow_features.npy', all_unknown)

# Per-flow view: list of dicts, each mapping a flow to its chronological subflow vectors
np.save(f'../../Feature-Vectors/Basic-Features/mirror_unknown_{N}_packet_subflow_flow_features.npy', unknown_flow_vecs)

Total Unknown Mirror Flows: 1188
Total Unknown Mirror Subflows: 15725


In [14]:
# FDT: featurize every 1000-packet subflow of each capture file
# (the directory variable keeps its original name although it points at Known-Data)
N = 1000
uknown_data_dir = "../../Known-Data/new-fdt/"

fdt8gb_vecs, fdt8gb_flow_vecs = extract_packet_win_features(
    f"{uknown_data_dir}fdt-8gb.csv", N, abs_time=True)
fdt16gb_vecs, fdt16gb_flow_vecs = extract_packet_win_features(
    f"{uknown_data_dir}fdt-16gb.csv", N, abs_time=True)
fdt18gb_vecs, fdt18gb_flow_vecs = extract_packet_win_features(
    f"{uknown_data_dir}fdt-18gb.csv", N, abs_time=True)
fdt40gb_vecs, fdt40gb_flow_vecs = extract_packet_win_features(
    f"{uknown_data_dir}fdt-40gb.csv", N, abs_time=True)
fdt47gb_vecs, fdt47gb_flow_vecs = extract_packet_win_features(
    f"{uknown_data_dir}fdt-47gb.csv", N, abs_time=True)

# One flow -> subflow-vectors dict per capture file
fdt_flow_vecs = [
    fdt8gb_flow_vecs, fdt16gb_flow_vecs, fdt18gb_flow_vecs,
    fdt40gb_flow_vecs, fdt47gb_flow_vecs,
]

# Flow and subflow totals across all FDT captures
flow_tot = sum(len(per_flow) for per_flow in fdt_flow_vecs)
subflow_tot = sum(len(subflows) for per_flow in fdt_flow_vecs for subflows in per_flow.values())
print(f"Total number of FDT flows: {flow_tot}")
print(f"Total number of FDT subflows: {subflow_tot}")

# Per-flow view: list of dicts, each mapping a flow to its chronological subflow vectors
np.save(f'../../Feature-Vectors/Basic-Features/new_fdt_{N}_packet_subflow_flow_features.npy', fdt_flow_vecs)

# Flat view: all FDT subflow vectors stacked into one array
all_fdt = np.array(fdt8gb_vecs + fdt16gb_vecs + fdt18gb_vecs + fdt40gb_vecs + fdt47gb_vecs)
print(all_fdt.shape)

np.save(f'../../Feature-Vectors/Basic-Features/new_fdt_{N}_packet_subflow_features.npy', all_fdt)

Total number of FDT flows: 40
Total number of FDT subflows: 4292
(4292, 8)


In [15]:
# RClone: featurize every 1000-packet subflow of each capture file
N = 1000
data_dir = "../../Known-Data/new-rclone/"

rclone1_vecs, rclone1_flow_vecs = extract_packet_win_features(f"{data_dir}rclone1.csv", N)
rclone2_vecs, rclone2_flow_vecs = extract_packet_win_features(f"{data_dir}rclone2.csv", N)
rclone3_vecs, rclone3_flow_vecs = extract_packet_win_features(f"{data_dir}rclone3.csv", N)
rclone4_vecs, rclone4_flow_vecs = extract_packet_win_features(f"{data_dir}rclone4.csv", N)
rclone5_vecs, rclone5_flow_vecs = extract_packet_win_features(f"{data_dir}rclone5.csv", N)
rclone6_vecs, rclone6_flow_vecs = extract_packet_win_features(f"{data_dir}rclone6.csv", N)

# One flow -> subflow-vectors dict per capture file
rclone_flow_vecs = [
    rclone1_flow_vecs, rclone2_flow_vecs, rclone3_flow_vecs,
    rclone4_flow_vecs, rclone5_flow_vecs, rclone6_flow_vecs,
]

# Flow and subflow totals across all RClone captures
flow_tot = sum(len(per_flow) for per_flow in rclone_flow_vecs)
subflow_tot = sum(len(subflows) for per_flow in rclone_flow_vecs for subflows in per_flow.values())
print(f"Total number of RClone flows: {flow_tot}")
print(f"Total number of RClone subflows: {subflow_tot}")

# Per-flow view: list of dicts, each mapping a flow to its chronological subflow vectors
np.save(f'../../Feature-Vectors/Basic-Features/new_rclone_{N}_packet_subflow_flow_features.npy', rclone_flow_vecs)

# Flat view: all RClone subflow vectors stacked into one array
all_rclone = np.array(
    rclone1_vecs + rclone2_vecs + rclone3_vecs + rclone4_vecs + rclone5_vecs + rclone6_vecs
)
print(all_rclone.shape)

np.save(f'../../Feature-Vectors/Basic-Features/new_rclone_{N}_packet_subflow_features.npy', all_rclone)

Total number of RClone flows: 25
Total number of RClone subflows: 13434
(13434, 8)


In [16]:
# Globus: featurize every 1000-packet subflow of each capture file
N = 1000
data_dir = "../../Known-Data/new-globus/"

globus_10_min_vecs, globus_10_min_flow_vecs = extract_packet_win_features(
    f"{data_dir}globus-10-min.csv", N)
globus_10_min1_vecs, globus_10_min1_flow_vecs = extract_packet_win_features(
    f"{data_dir}globus-10-min1.csv", N)
globus_10_min2_vecs, globus_10_min2_flow_vecs = extract_packet_win_features(
    f"{data_dir}globus-10-min2.csv", N)
# Only this capture is read with abs_time=True
globus_20_min, globus_20_min_flow_vecs = extract_packet_win_features(
    f"{data_dir}globus-20-min-all.csv", N, abs_time=True)
globus_5_min, globus_5_min_flow_vecs = extract_packet_win_features(
    f"{data_dir}globus-5-min.csv", N)

# One flow -> subflow-vectors dict per capture file
globus_flow_vecs = [
    globus_10_min_flow_vecs, globus_10_min1_flow_vecs, globus_10_min2_flow_vecs,
    globus_20_min_flow_vecs, globus_5_min_flow_vecs,
]

# Flow and subflow totals across all Globus captures
flow_tot = sum(len(per_flow) for per_flow in globus_flow_vecs)
subflow_tot = sum(len(subflows) for per_flow in globus_flow_vecs for subflows in per_flow.values())
print(f"Total number of Globus flows: {flow_tot}")
print(f"Total number of Globus subflows: {subflow_tot}")

# Per-flow view: list of dicts, each mapping a flow to its chronological subflow vectors
np.save(f'../../Feature-Vectors/Basic-Features/new_globus_{N}_packet_subflow_flow_features.npy', globus_flow_vecs)

# Flat view: all Globus subflow vectors stacked into one array
all_globus = np.array(
    globus_10_min_vecs + globus_10_min1_vecs + globus_10_min2_vecs + globus_20_min + globus_5_min
)
print(all_globus.shape)

np.save(f'../../Feature-Vectors/Basic-Features/new_globus_{N}_packet_subflow_features.npy', all_globus)

Total number of Globus flows: 107
Total number of Globus subflows: 39270
(39270, 8)


### 100-Packet Subflows

In [17]:
# WIDE F-POINT 100-SUBFLOW FEATURIZATION
data_dir = "../../Unknown-Data/wide-f/"
N = 100

widef_vecs, widef_flow_vecs = extract_packet_win_features(data_dir + 'f-jan3-all.csv', N, abs_time=True)

print(f"Total number of WIDE-f flows: {len(widef_flow_vecs)}")
print(f"Total number of WIDE-f subflows: {len(widef_vecs)}")

# Lots of flows (almost 5k) but fewer subflows (~90k)

# Output names are derived from N (as in the 25- and 1000-packet cells) so that
# changing N cannot silently overwrite the 100-packet files.

# Saving dict of WIDE-f feature vectors, dicts map flows to feature vectors
# Subflow feature vectors are chronologically ordered in their lists, and each list mapped to its flow
np.save(f'../../Feature-Vectors/Basic-Features/wide-f_{N}_packet_subflow_flow_features.npy', widef_flow_vecs)

# Save all WIDE-f feature vecs in one np array
np.save(f'../../Feature-Vectors/Basic-Features/wide-f_{N}_packet_subflow_features.npy', widef_vecs)

Total number of WIDE-f flows: 4937
Total number of WIDE-f subflows: 89528


In [18]:
# WIDE G-POINT 100-SUBFLOW FEATURIZATION
data_dir = "../../Unknown-Data/wide-g/"
N = 100

wideg_vecs, wideg_flow_vecs = extract_packet_win_features(data_dir + 'g-jan8-all.csv', N, abs_time=True)

print(f"Total number of WIDE-g flows: {len(wideg_flow_vecs)}")
print(f"Total number of WIDE-g subflows: {len(wideg_vecs)}")

# Output names are derived from N (as in the 25- and 1000-packet cells) so that
# changing N cannot silently overwrite the 100-packet files.

# Saving dict of WIDE-g feature vectors, dicts map flows to feature vectors
# Subflow feature vectors are chronologically ordered in their lists, and each list mapped to its flow
np.save(f'../../Feature-Vectors/Basic-Features/wide-g_{N}_packet_subflow_flow_features.npy', wideg_flow_vecs)

# Save all WIDE-g feature vecs in one np array
np.save(f'../../Feature-Vectors/Basic-Features/wide-g_{N}_packet_subflow_features.npy', wideg_vecs)

Total number of WIDE-g flows: 6295
Total number of WIDE-g subflows: 173093


In [19]:
# FDT 100-SUBFLOW FEATURIZATION
# NOTE: the directory variable is named "uknown" but points at the Known-Data FDT captures.
uknown_data_dir = "../../Known-Data/new-fdt/"
N = 100

fdt8gb_vecs, fdt8gb_flow_vecs = extract_packet_win_features(uknown_data_dir + 'fdt-8gb.csv', N, abs_time=True)

fdt16gb_vecs, fdt16gb_flow_vecs = extract_packet_win_features(uknown_data_dir + 'fdt-16gb.csv', N, abs_time=True)

fdt18gb_vecs, fdt18gb_flow_vecs = extract_packet_win_features(uknown_data_dir + 'fdt-18gb.csv', N, abs_time=True)

fdt40gb_vecs, fdt40gb_flow_vecs = extract_packet_win_features(uknown_data_dir + 'fdt-40gb.csv', N, abs_time=True)

fdt47gb_vecs, fdt47gb_flow_vecs = extract_packet_win_features(uknown_data_dir + 'fdt-47gb.csv', N, abs_time=True)

# Creating & saving list of all subflow feature vector lists, each mapped to their flow
fdt_flow_vecs = [fdt8gb_flow_vecs, fdt16gb_flow_vecs, fdt18gb_flow_vecs,
                 fdt40gb_flow_vecs, fdt47gb_flow_vecs]

# Getting flow and subflow counts for FDT
flow_tot = 0
subflow_tot = 0
for flow_dict in fdt_flow_vecs:
    flow_tot += len(flow_dict)
    for f in flow_dict:
        subflow_tot += len(flow_dict[f])
print(f"Total number of FDT flows: {flow_tot}")
print(f"Total number of FDT subflows: {subflow_tot}")

# Output names are derived from N (as in the 25- and 1000-packet cells) so that
# changing N cannot silently overwrite the 100-packet files.

# Saving a list of dicts of FDT feature vectors, dicts map flows to feature vectors
# Subflow feature vectors are chronologically ordered in their lists, and each list mapped to its flow
np.save(f'../../Feature-Vectors/Basic-Features/new_fdt_{N}_packet_subflow_flow_features.npy', fdt_flow_vecs)

# Creating & saving list of all FDT data
all_fdt = fdt8gb_vecs + fdt16gb_vecs + fdt18gb_vecs + fdt40gb_vecs + fdt47gb_vecs
all_fdt = np.array(all_fdt)
print(all_fdt.shape)

# Save all FDT feature vecs in one np array
np.save(f'../../Feature-Vectors/Basic-Features/new_fdt_{N}_packet_subflow_features.npy', all_fdt)

Total number of FDT flows: 40
Total number of FDT subflows: 43105
(43105, 8)


In [20]:
# RCLONE 100-SUBFLOW FEATURIZATION
data_dir = "../../Known-Data/new-rclone/"
N = 100

rclone1_vecs, rclone1_flow_vecs = extract_packet_win_features(data_dir + 'rclone1.csv', N)

rclone2_vecs, rclone2_flow_vecs = extract_packet_win_features(data_dir + 'rclone2.csv', N)

rclone3_vecs, rclone3_flow_vecs = extract_packet_win_features(data_dir + 'rclone3.csv', N)

rclone4_vecs, rclone4_flow_vecs = extract_packet_win_features(data_dir + 'rclone4.csv', N)

rclone5_vecs, rclone5_flow_vecs = extract_packet_win_features(data_dir + 'rclone5.csv', N)

rclone6_vecs, rclone6_flow_vecs = extract_packet_win_features(data_dir + 'rclone6.csv', N)

# Creating & saving list of all subflow feature vector lists, each mapped to their flow
rclone_flow_vecs = [rclone1_flow_vecs, rclone2_flow_vecs, rclone3_flow_vecs,
                    rclone4_flow_vecs, rclone5_flow_vecs, rclone6_flow_vecs]

# Getting flow and subflow counts for RClone
flow_tot = 0
subflow_tot = 0
for flow_dict in rclone_flow_vecs:
    flow_tot += len(flow_dict)
    for f in flow_dict:
        subflow_tot += len(flow_dict[f])
print(f"Total number of RClone flows: {flow_tot}")
print(f"Total number of RClone subflows: {subflow_tot}")

# Very few RClone flows (27, whereas 107 Globus & 1441 unknown),
# so these are really long flows...
# NOTE(review): the Globus/unknown counts quoted above were recorded by hand;
# compare them against the totals printed by those cells.

# Output names are derived from N (as in the 25- and 1000-packet cells) so that
# changing N cannot silently overwrite the 100-packet files.

# Saving a list of dicts of Rclone feature vectors, dicts map flows to feature vectors
# Subflow feature vectors are chronologically ordered in their lists, and each list mapped to its flow
np.save(f'../../Feature-Vectors/Basic-Features/new_rclone_{N}_packet_subflow_flow_features.npy', rclone_flow_vecs)

# Creating & saving list of all rclone data
all_rclone = rclone1_vecs + rclone2_vecs + rclone3_vecs + rclone4_vecs + rclone5_vecs + rclone6_vecs
all_rclone = np.array(all_rclone)
print(all_rclone.shape)

# Save all new RClone feature vecs in one np array
np.save(f'../../Feature-Vectors/Basic-Features/new_rclone_{N}_packet_subflow_features.npy', all_rclone)

Total number of RClone flows: 27
Total number of RClone subflows: 134441
(134441, 8)


In [21]:
# GLOBUS 100-SUBFLOW FEATURIZATION
data_dir = "../../Known-Data/new-globus/"
N = 100

globus_10_min_vecs, globus_10_min_flow_vecs = extract_packet_win_features(data_dir + 'globus-10-min.csv', N)

globus_10_min1_vecs, globus_10_min1_flow_vecs = extract_packet_win_features(data_dir + 'globus-10-min1.csv', N)

globus_10_min2_vecs, globus_10_min2_flow_vecs = extract_packet_win_features(data_dir + 'globus-10-min2.csv', N)

# Only this capture is read with abs_time=True
globus_20_min, globus_20_min_flow_vecs = extract_packet_win_features(data_dir + 'globus-20-min-all.csv', N, abs_time=True)

globus_5_min, globus_5_min_flow_vecs = extract_packet_win_features(data_dir + 'globus-5-min.csv', N)

# Creating & saving list of all subflow feature vector lists, each mapped to their flow
globus_flow_vecs = [globus_10_min_flow_vecs, globus_10_min1_flow_vecs, globus_10_min2_flow_vecs,
                    globus_20_min_flow_vecs, globus_5_min_flow_vecs]

# Getting flow and subflow counts for Globus
flow_tot = 0
subflow_tot = 0
for flow_dict in globus_flow_vecs:
    flow_tot += len(flow_dict)
    for f in flow_dict:
        subflow_tot += len(flow_dict[f])
print(f"Total number of Globus flows: {flow_tot}")
print(f"Total number of Globus subflows: {subflow_tot}")

# Far fewer flows for Globus (107 vs. 1441 for unknown),
# so these are really long flows...
# NOTE(review): the counts quoted above were recorded by hand; compare them
# against the totals this cell prints.

# Output names are derived from N (as in the 25- and 1000-packet cells) so that
# changing N cannot silently overwrite the 100-packet files.

# Saving a list of dicts of Globus feature vectors, dicts map flows to feature vectors
# Subflow feature vectors are chronologically ordered in their lists, and each list mapped to its flow
np.save(f'../../Feature-Vectors/Basic-Features/new_globus_{N}_packet_subflow_flow_features.npy', globus_flow_vecs)

# Creating & saving list of all globus data
all_globus = globus_10_min_vecs + globus_10_min1_vecs + globus_10_min2_vecs + globus_20_min + globus_5_min
all_globus = np.array(all_globus)
print(all_globus.shape)

# Save all new globus feature vecs in one np array
np.save(f'../../Feature-Vectors/Basic-Features/new_globus_{N}_packet_subflow_features.npy', all_globus)

Total number of Globus flows: 115
Total number of Globus subflows: 393166
(393166, 8)


In [22]:
# MIRROR 100-SUBFLOW FEATURIZATION
# Featurizes every unknown mirror capture into 100-packet subflow feature vectors, reports
# the flow/subflow totals, then saves the flattened vectors and the per-flow grouping.
uknown_data_dir = "../../Unknown-Data/mirror-unknown/"
N = 100

# Each call is unpacked into (all subflow feature vectors, flow -> that flow's subflow vectors)
unknown1_vecs, unknown1_flow_vecs = extract_packet_win_features(
    uknown_data_dir + 'unknown1.csv', N)
unknown2_vecs, unknown2_flow_vecs = extract_packet_win_features(
    uknown_data_dir + 'unknown2.csv', N)
# NOTE(review): the two "-all" captures are read with abs_time=True -- presumably their Time
# column is formatted differently from the other captures; confirm against the extractor.
unknown3_vecs, unknown3_flow_vecs = extract_packet_win_features(
    uknown_data_dir + 'unknown3-all.csv', N, abs_time=True)
unknown4_vecs, unknown4_flow_vecs = extract_packet_win_features(
    uknown_data_dir + 'unknown4-all.csv', N, abs_time=True)
unknown5_vecs, unknown5_flow_vecs = extract_packet_win_features(
    uknown_data_dir + 'unknown5.csv', N)
unknown6_vecs, unknown6_flow_vecs = extract_packet_win_features(
    uknown_data_dir + 'unknown6.csv', N)
unknown7_vecs, unknown7_flow_vecs = extract_packet_win_features(
    uknown_data_dir + 'unknown7.csv', N)

# One flow -> subflow-vector mapping per capture file, kept in capture order
unknown_flow_vecs = [
    unknown1_flow_vecs,
    unknown2_flow_vecs,
    unknown3_flow_vecs,
    unknown4_flow_vecs,
    unknown5_flow_vecs,
    unknown6_flow_vecs,
    unknown7_flow_vecs,
]

# The flow count matters for the experimental setup (ensembling of unknowns)
flow_tot = sum(len(flow_dict) for flow_dict in unknown_flow_vecs)
print(f"Total Unknown Mirror Flows: {flow_tot}")

# Flattened output: every unknown subflow feature vector stacked into a single array
all_unknown = np.array(
    unknown1_vecs + unknown2_vecs + unknown3_vecs + unknown4_vecs
    + unknown5_vecs + unknown6_vecs + unknown7_vecs
)
print(f"Total Unknown Mirror Subflows: {len(all_unknown)}")

np.save('../../Feature-Vectors/Basic-Features/mirror_unknown_100_packet_subflow_features.npy', all_unknown)

# Per-flow output: a list of dicts (one per capture), each mapping a flow to its subflow
# feature vectors. np.save pickles this as an object array, so reading it back needs
# np.load(..., allow_pickle=True).
np.save('../../Feature-Vectors/Basic-Features/mirror_unknown_100_packet_subflow_flow_features.npy', unknown_flow_vecs)

Total Unknown Mirror Flows: 1490
Total Unknown Mirror Subflows: 163772


### Verifying Chronological Ordering / Order of Packets in CSV is Preserved in Featurization

In [10]:
# Chronological ordering is verified :)
# Within each flow, every subflow starts after all earlier subflows of that flow have
# started - though sometimes only by tiny margins.
#
# NOTE(review): this cell unpacks THREE values from extract_packet_win_features (the third
# being per-flow subflow times), whereas the featurization cells unpack two. It presumably
# ran against a temporary debugging revision of the extractor - confirm before re-running.
data_dir = "../../DT-Data/new-globus/"
N = 100

globus_10_min_vecs, globus_10_min_flow_vecs, times = extract_packet_win_features(data_dir + 'globus-10-min.csv', N)
# Both per-flow collections should contain the same number of flows
print(len(globus_10_min_flow_vecs), len(times))
for flow in globus_10_min_flow_vecs:
#     print(len(globus_10_min_flow_vecs[flow]), len(times[flow])) # These are the same :)
#     print(globus_10_min_flow_vecs[flow][:10])
    # First ten time entries of this flow; judging by the recorded output, each entry is a
    # (start, end) pair for one subflow.
    print(times[flow][:10])
    print('\n')

48 48
[(27.257798377, 27.973901661), (27.973926918, 28.120795314), (28.120842313, 28.12531537), (28.125343187, 28.322537352), (28.322538533, 28.331980117), (28.331996418, 28.37444237), (28.374461247, 28.375774029), (28.375775632, 28.383764782), (28.383779927, 28.387428529), (28.387482157, 28.390666635)]


[(27.323444366, 28.01450188), (28.014513628, 28.122722325), (28.122805275, 28.326296094), (28.326310126, 28.380773065), (28.380788228, 28.383948747), (28.384001942, 28.799189635), (28.799217777, 28.805848382), (28.805876488, 28.813834709), (28.813870127, 28.855042837), (28.855172411, 28.862257084)]


[(27.49484936, 28.025134936), (28.025532065, 28.125593841), (28.12565499, 28.33215705), (28.332216172, 28.383373023), (28.383434527, 28.42495364), (28.424985874, 28.805421637), (28.805481564, 28.811565872), (28.811622517, 28.819563815), (28.819628469, 28.858871189), (28.858933799, 28.865318809)]


[(27.323443787, 28.10619484), (28.106212868, 28.157527011), (28.157556197, 28.376562637), (2

## Below is older data (first experiments, with ~3600 unknown and more known - known had FDT, RClone, and Globus)

### 100 Packet Subflow Feature Vectors

In [15]:
# 100-packet subflow featurization of the older known-tool captures (Globus, FDT, RClone).
# Prints the size of each capture's result and saves all results in one dict keyed by capture.
#
# NOTE(review): len(...) below is the number of subflow feature vectors only if
# extract_packet_win_features returns a single list (as the recorded output suggests). The
# newer cells in this notebook unpack a tuple from it - confirm which revision of the
# extractor this cell is run against.
data_dir = "../../DT-Data/"
N = 100

# Globus
globus_dtn1_src1 = extract_packet_win_features(data_dir + 'globus-dtn1-src-iso.csv', N)
print("Globus:")
print(len(globus_dtn1_src1))

globus_dtn1_dest1 = extract_packet_win_features(data_dir + 'globus-dtn1-dest-iso.csv', N)
print(len(globus_dtn1_dest1))

globus_dtn1_src2 = extract_packet_win_features(data_dir + 'globus-dtn1-src-iso2.csv', N)
print(len(globus_dtn1_src2))

globus_dtn1_dest2 = extract_packet_win_features(data_dir + 'globus-dtn1-dest-iso2.csv', N)
print(len(globus_dtn1_dest2))

globus_clusterdtn_src = extract_packet_win_features(data_dir + 'globus-clusterdtn-src-iso.csv', N)
print(len(globus_clusterdtn_src))

globus_clusterdtn_dest = extract_packet_win_features(data_dir + 'globus-clusterdtn-dest-iso.csv', N)
print(len(globus_clusterdtn_dest))


# FDT
fdt_a2_src = extract_packet_win_features(data_dir + 'fdt-airplane2-src-iso.csv', N)
print("\nFDT:")
print(len(fdt_a2_src))

fdt_a2_dest = extract_packet_win_features(data_dir + 'fdt-airplane2-dest-iso.csv', N)
print(len(fdt_a2_dest))

fdt_a2_dest_1str = extract_packet_win_features(data_dir + 'fdt-airplane2-dest-iso-1stream.csv', N)
print(len(fdt_a2_dest_1str))

fdt_a2_dest_2str = extract_packet_win_features(data_dir + 'fdt-airplane2-dest-iso-2stream.csv', N)
print(len(fdt_a2_dest_2str))

fdt_dtn1_dest = extract_packet_win_features(data_dir + 'fdt-dtn1-dest-iso.csv', N)
print(len(fdt_dtn1_dest))

fdt_dtn1_src = extract_packet_win_features(data_dir + 'fdt-dtn1-src-iso.csv', N)
print(len(fdt_dtn1_src))


# RClone
rclone_src = extract_packet_win_features(data_dir + 'rclone-gdrive-src-iso.csv', N)
print("\nRClone:")
print(len(rclone_src))

rclone_dest = extract_packet_win_features(data_dir + 'rclone-gdrive-dest-iso.csv', N)
print(len(rclone_dest))

rclone_src2 = extract_packet_win_features(data_dir + 'rclone-gdrive-src-iso2.csv', N)
print(len(rclone_src2))

rclone_dest2 = extract_packet_win_features(data_dir + 'rclone-gdrive-dest-iso2.csv', N)
print(len(rclone_dest2))

rclone_src3 = extract_packet_win_features(data_dir + 'rclone-gdrive-src-iso3.csv', N)
print(len(rclone_src3))

rclone_dest3 = extract_packet_win_features(data_dir + 'rclone-gdrive-dest-iso3.csv', N)
print(len(rclone_dest3))

# Dictionary of all 100-packet subflow feature vectors, keyed by capture file.
# Every key maps to the result extracted from its own capture; in particular 'fdt_a2_dest'
# holds the dest-side capture (it used to be mapped to the src-side result by mistake).
hundred_N_dict = {
    'globus_dtn1_src1': globus_dtn1_src1,
    'globus_dtn1_dest1': globus_dtn1_dest1,
    'globus_dtn1_src2': globus_dtn1_src2,
    'globus_dtn1_dest2': globus_dtn1_dest2,
    'globus_clusterdtn_src': globus_clusterdtn_src,
    'globus_clusterdtn_dest': globus_clusterdtn_dest,
    'fdt_a2_src': fdt_a2_src,
    'fdt_a2_dest': fdt_a2_dest,
    'fdt_dtn1_dest': fdt_dtn1_dest,
    'fdt_dtn1_src': fdt_dtn1_src,
    'fdt_a2_dest_1str': fdt_a2_dest_1str,
    'fdt_a2_dest_2str': fdt_a2_dest_2str,
    'rclone_src': rclone_src,
    'rclone_dest': rclone_dest,
    'rclone_src2': rclone_src2,
    'rclone_dest2': rclone_dest2,
    'rclone_src3': rclone_src3,
    'rclone_dest3': rclone_dest3,
}

# np.save pickles the dict into a 0-d object array; read it back with
# np.load(path, allow_pickle=True).item()
np.save('../../Feature-Vectors/100_packet_subflow_features.npy', hundred_N_dict)

Globus:
3571
5289
2141
4990
2649
6061

FDT:
3126
5792
6078
6025
6578
3580

RClone:
3615
597
4159
616
5833
898


In [4]:
# UNKNOWN/MIRROR CAPTURED DATA 100 packet subflow FEATURIZATION
# Runs the N-packet subflow extractor over each older unknown capture, prints the size of
# every result, and saves all results in one dict keyed 'unknown1'..'unknown6'.
uknown_data_dir = "../../Unknown-Data/"
N = 100

unknown1 = extract_packet_win_features(uknown_data_dir + 'mirror_unknown1.csv', N)
print(f'Number of flow feature vectors from mirror_unknown1.csv: {len(unknown1)}')

unknown2 = extract_packet_win_features(uknown_data_dir + 'mirror_unknown2.csv', N)
print(f'Number of flow feature vectors from mirror_unknown2.csv: {len(unknown2)}')

unknown3 = extract_packet_win_features(uknown_data_dir + 'dtn1_unknown1.csv', N)
print(f'Number of flow feature vectors from dtn1_unknown1.csv: {len(unknown3)}')

unknown4 = extract_packet_win_features(uknown_data_dir + 'dtn1_unknown2.csv', N)
print(f'Number of flow feature vectors from dtn1_unknown2.csv: {len(unknown4)}')

unknown5 = extract_packet_win_features(uknown_data_dir + 'dtn1_unknown3.csv', N)
print(f'Number of flow feature vectors from dtn1_unknown3.csv: {len(unknown5)}')

unknown6 = extract_packet_win_features(uknown_data_dir + 'mirror_unknown3.csv', N)
print(f'Number of flow feature vectors from mirror_unknown3.csv: {len(unknown6)}')

# All unknown results, keyed by capture
hundred_unknown_dict = {
    'unknown1': unknown1,
    'unknown2': unknown2,
    'unknown3': unknown3,
    'unknown4': unknown4,
    'unknown5': unknown5,
    'unknown6': unknown6,
}

np.save('../../Feature-Vectors/unknown_100_packet_subflow_features.npy', hundred_unknown_dict)

Number of flow feature vectors from mirror_unknown1.csv: 828
Number of flow feature vectors from mirror_unknown2.csv: 914
Number of flow feature vectors from dtn1_unknown1.csv: 363
Number of flow feature vectors from dtn1_unknown2.csv: 387
Number of flow feature vectors from dtn1_unknown3.csv: 655
Number of flow feature vectors from mirror_unknown3.csv: 556


### 25 Packet Subflow Feature Vectors

In [16]:
# 25-packet subflow featurization of the older known-tool captures (Globus, FDT, RClone).
# Prints the size of each capture's result and saves all results in one dict keyed by capture.
#
# NOTE(review): len(...) below is the number of subflow feature vectors only if
# extract_packet_win_features returns a single list (as the recorded output suggests). The
# newer cells in this notebook unpack a tuple from it - confirm which revision of the
# extractor this cell is run against.

# Set explicitly so the cell does not depend on whichever cell last assigned data_dir
# (other cells in this notebook point it at different directories).
data_dir = "../../DT-Data/"
N = 25

# Globus
globus_dtn1_src1 = extract_packet_win_features(data_dir + 'globus-dtn1-src-iso.csv', N)
print("Globus:")
print(len(globus_dtn1_src1))

globus_dtn1_dest1 = extract_packet_win_features(data_dir + 'globus-dtn1-dest-iso.csv', N)
print(len(globus_dtn1_dest1))

globus_dtn1_src2 = extract_packet_win_features(data_dir + 'globus-dtn1-src-iso2.csv', N)
print(len(globus_dtn1_src2))

globus_dtn1_dest2 = extract_packet_win_features(data_dir + 'globus-dtn1-dest-iso2.csv', N)
print(len(globus_dtn1_dest2))

globus_clusterdtn_src = extract_packet_win_features(data_dir + 'globus-clusterdtn-src-iso.csv', N)
print(len(globus_clusterdtn_src))

globus_clusterdtn_dest = extract_packet_win_features(data_dir + 'globus-clusterdtn-dest-iso.csv', N)
print(len(globus_clusterdtn_dest))


# FDT
fdt_a2_src = extract_packet_win_features(data_dir + 'fdt-airplane2-src-iso.csv', N)
print("\nFDT:")
print(len(fdt_a2_src))

fdt_a2_dest = extract_packet_win_features(data_dir + 'fdt-airplane2-dest-iso.csv', N)
print(len(fdt_a2_dest))

fdt_a2_dest_1str = extract_packet_win_features(data_dir + 'fdt-airplane2-dest-iso-1stream.csv', N)
print(len(fdt_a2_dest_1str))

fdt_a2_dest_2str = extract_packet_win_features(data_dir + 'fdt-airplane2-dest-iso-2stream.csv', N)
print(len(fdt_a2_dest_2str))

fdt_dtn1_dest = extract_packet_win_features(data_dir + 'fdt-dtn1-dest-iso.csv', N)
print(len(fdt_dtn1_dest))

fdt_dtn1_src = extract_packet_win_features(data_dir + 'fdt-dtn1-src-iso.csv', N)
print(len(fdt_dtn1_src))


# RClone
rclone_src = extract_packet_win_features(data_dir + 'rclone-gdrive-src-iso.csv', N)
print("\nRClone:")
print(len(rclone_src))

rclone_dest = extract_packet_win_features(data_dir + 'rclone-gdrive-dest-iso.csv', N)
print(len(rclone_dest))

rclone_src2 = extract_packet_win_features(data_dir + 'rclone-gdrive-src-iso2.csv', N)
print(len(rclone_src2))

rclone_dest2 = extract_packet_win_features(data_dir + 'rclone-gdrive-dest-iso2.csv', N)
print(len(rclone_dest2))

rclone_src3 = extract_packet_win_features(data_dir + 'rclone-gdrive-src-iso3.csv', N)
print(len(rclone_src3))

rclone_dest3 = extract_packet_win_features(data_dir + 'rclone-gdrive-dest-iso3.csv', N)
print(len(rclone_dest3))

# Dictionary of all 25-packet subflow feature vectors, keyed by capture file.
# Every key maps to the result extracted from its own capture; in particular 'fdt_a2_dest'
# holds the dest-side capture (it used to be mapped to the src-side result by mistake).
twentyfive_N_dict = {
    'globus_dtn1_src1': globus_dtn1_src1,
    'globus_dtn1_dest1': globus_dtn1_dest1,
    'globus_dtn1_src2': globus_dtn1_src2,
    'globus_dtn1_dest2': globus_dtn1_dest2,
    'globus_clusterdtn_src': globus_clusterdtn_src,
    'globus_clusterdtn_dest': globus_clusterdtn_dest,
    'fdt_a2_src': fdt_a2_src,
    'fdt_a2_dest': fdt_a2_dest,
    'fdt_dtn1_dest': fdt_dtn1_dest,
    'fdt_dtn1_src': fdt_dtn1_src,
    'fdt_a2_dest_1str': fdt_a2_dest_1str,
    'fdt_a2_dest_2str': fdt_a2_dest_2str,
    'rclone_src': rclone_src,
    'rclone_dest': rclone_dest,
    'rclone_src2': rclone_src2,
    'rclone_dest2': rclone_dest2,
    'rclone_src3': rclone_src3,
    'rclone_dest3': rclone_dest3,
}

# np.save pickles the dict into a 0-d object array; read it back with
# np.load(path, allow_pickle=True).item()
np.save('../../Feature-Vectors/25_packet_subflow_features.npy', twentyfive_N_dict)

Globus:
14286
21177
8567
19976
10603
24259

FDT:
12517
23177
24315
24104
26321
14332

RClone:
14464
2392
16644
2468
23335
3599


In [5]:
# UNKNOWN/MIRROR CAPTURED DATA 25 packet subflow FEATURIZATION
# Runs the N-packet subflow extractor over each older unknown capture, prints the size of
# every result, and saves all results in one dict keyed 'unknown1'..'unknown6'.
uknown_data_dir = "../../Unknown-Data/"
N = 25

unknown1 = extract_packet_win_features(uknown_data_dir + 'mirror_unknown1.csv', N)
print(f'Number of flow feature vectors from mirror_unknown1.csv: {len(unknown1)}')

unknown2 = extract_packet_win_features(uknown_data_dir + 'mirror_unknown2.csv', N)
print(f'Number of flow feature vectors from mirror_unknown2.csv: {len(unknown2)}')

unknown3 = extract_packet_win_features(uknown_data_dir + 'dtn1_unknown1.csv', N)
print(f'Number of flow feature vectors from dtn1_unknown1.csv: {len(unknown3)}')

unknown4 = extract_packet_win_features(uknown_data_dir + 'dtn1_unknown2.csv', N)
print(f'Number of flow feature vectors from dtn1_unknown2.csv: {len(unknown4)}')

unknown5 = extract_packet_win_features(uknown_data_dir + 'dtn1_unknown3.csv', N)
print(f'Number of flow feature vectors from dtn1_unknown3.csv: {len(unknown5)}')

unknown6 = extract_packet_win_features(uknown_data_dir + 'mirror_unknown3.csv', N)
print(f'Number of flow feature vectors from mirror_unknown3.csv: {len(unknown6)}')

# All unknown results, keyed by capture
twenty_five_unknown_dict = {
    'unknown1': unknown1,
    'unknown2': unknown2,
    'unknown3': unknown3,
    'unknown4': unknown4,
    'unknown5': unknown5,
    'unknown6': unknown6,
}

np.save('../../Feature-Vectors/unknown_25_packet_subflow_features.npy', twenty_five_unknown_dict)

Number of flow feature vectors from mirror_unknown1.csv: 3315
Number of flow feature vectors from mirror_unknown2.csv: 3656
Number of flow feature vectors from dtn1_unknown1.csv: 1462
Number of flow feature vectors from dtn1_unknown2.csv: 1555
Number of flow feature vectors from dtn1_unknown3.csv: 2637
Number of flow feature vectors from mirror_unknown3.csv: 2239
