In [3]:
import numpy as np
import pandas as pd
from scapy.all import *
import random, glob, os, sys
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from tqdm import tqdm

import tensorflow as tf
import keras

In [4]:
folder = "/Users/michaelwrana/Documents/Courses/Asokan_858/Project/data_large/"

# top 10 features according to https://www.usenix.org/system/files/conference/usenixsecurity16/sec16_paper_hayes.pdf
feature_names = ['incoming_packets', 'outgoing_proportion', 'incoming_proportion', 
                 'packet_ordering_stdev', 'outgoing_packets', 'alternative_concentration_sum'
                 'outgoing_ordering_sum','sum_all','alternative_persecond_sum',
                 'total_packets']
# class label names
label_names = ['file1','file2']

x = []
y = []
for filename in tqdm(glob.iglob(folder + '**', recursive=True), total=5000):
    if os.path.isfile(filename):
        
        # figure out what the label should be based on directory location
        # e.g. data/file1/website1 means file 1 downloaded from website 1
        file_path = filename.split('/')
        file_id = int(file_path[-2][-1])
        
        features = dict()
        
        # incoming/outgoing packet counts
        incoming_count = 0
        outgoing_count = 0
        packet_count = 0
        # count of previous outgoing packets.  reset when incoming packet arrives
        order_count = 0
        outgoing_order = []
        # packet timestamps
        timestamps = []
        
        for (pkt_data, pkt_metadata) in RawPcapReader(filename):
            ether_pkt = Ether(pkt_data)
            
            # if it is a non-ip packet (i.e. ARP background noise) do not process
            try:
                ip_pkt = ether_pkt[IP]
            except IndexError:
                pass
            
            # outgoing packet (192.168.*.())
            if(ip_pkt.src[0:3] == '192'):
                outgoing_count += 1
                
                # if the previous packet was also outgoing, append to the list
                if(order_count >= 0):
                    outgoing_order.append(order_count)
                
                # this packet is outgoing, increment counter
                order_count += 1
            else:
                incoming_count += 1
                
                # this packet is incoming
                order_count = 0
            packet_count +=1
        
        # get timestamps
        # cant do in above loop because it extracts current UNIX timestamp (not sure why)
        for packet in rdpcap(filename):
            timestamps.append(packet.time)
        
        features['incoming_packets'] = incoming_count
        features['outgoing_proportion'] = outgoing_count / packet_count
        features['incoming_proportion'] = incoming_count / packet_count
        features['outgoing_ordering_stdev'] = np.std(outgoing_order)
        features['outgoing_packets'] = outgoing_count
        # not possible with small size of transmission
        #features['alternative_concentration_sum'] = 0 # TODO
        features['outgoing_ordering_average'] = np.average(outgoing_order)
        features["sum_all"] = incoming_count + outgoing_count + packet_count
        # alternative persecond sum not possible - used packets per second instead
        features["packet_speed"] = float(len(timestamps) / (timestamps[-1] - timestamps[0]))
        features['total_packets'] = packet_count
        
        feature_val = list(features.values())
        
        # append to final dataset
        x = x + [feature_val]
        y = y + [file_id-1]
        


5003it [00:47, 104.38it/s]


In [None]:
# convert to numpy array and split into training/testing
x = np.asarray(x)
y = np.asarray(y)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.10, random_state=42)

In [5]:
model = RandomForestClassifier(max_depth = 2)
model.fit(x_train,y_train)

RandomForestClassifier(max_depth=2)

In [6]:
y_pred = model.predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.51      0.73      0.60       237
           1       0.60      0.36      0.45       263

    accuracy                           0.54       500
   macro avg       0.56      0.55      0.53       500
weighted avg       0.56      0.54      0.52       500

