In [1]:
import glob
import os
from os.path import join
from numpy import genfromtxt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
%matplotlib inline

In [2]:
def get_pkt_count(dirs,ending):
    """Sum the packet counts stored in the files matching *ending* in each directory.

    Parameters
    ----------
    dirs : iterable of str
        Directories to scan.
    ending : str
        Glob pattern (e.g. ``'*.spc'``) joined onto each directory.

    Returns
    -------
    list of int
        One total per directory, in the order of *dirs*. A directory with no
        matching file contributes 0.

    Raises
    ------
    ValueError
        If the first line of a matching file is not an integer.
    """
    counts = []
    for d in dirs:
        num_sampled_pkts = 0
        for f in glob.glob(join(d,ending)):
            # Only the first line is read; the context manager closes the
            # handle straight away instead of leaving it to the garbage collector.
            with open(f) as count_file:
                num_sampled_pkts += int(count_file.readline())
        counts.append(num_sampled_pkts)
    return counts

In [3]:
def get_immediate_subdirs(a_dir, only=''):
    """List the direct sub-directories of *a_dir* whose name ends with ``'_l'``.

    When *only* is a non-empty string, a sub-directory is kept only if its
    name contains *only* or contains ``'whole'``. Each returned entry is the
    sub-directory name joined onto *a_dir*; the order is whatever
    ``os.listdir`` yields.
    """
    selected = []
    for entry in os.listdir(a_dir):
        candidate = os.path.join(a_dir, entry)
        if not os.path.isdir(candidate):
            continue
        if not entry.endswith('_l'):
            continue
        # The name filter applies only when a sampler tag was requested.
        if only != '' and not (only in entry or 'whole' in entry):
            continue
        selected.append(candidate)
    return selected

In [4]:
def get_flow_dists(dirs):
    """Read ``label_dist.csv`` from every directory in *dirs*.

    Each file is parsed without a header row and its two columns are named
    ``'Label'`` and ``'Count'``. Returns the DataFrames as a list in the same
    order as *dirs*.
    """
    column_names = ['Label','Count']
    return [
        pd.read_csv(join(folder,'label_dist.csv'),header=None,names=column_names)
        for folder in dirs
    ]
        
        

In [5]:
def get_count_dir_names(dirs,ending):
    """Collect per-directory flow distributions, packet counts and names, sorted by name.

    The name of a directory is its base name with the last two characters
    removed. All three results are reordered by the ascending sort of those
    names and returned as ``(flow_dists, pkt_counts, dir_names)``:
    a list of DataFrames, a numpy array of counts and a numpy array of names.
    """
    raw_counts = np.array(get_pkt_count(dirs,ending))
    raw_dists = get_flow_dists(dirs)
    short_names = np.array([os.path.basename(path)[:-2] for path in dirs])

    # One permutation drives the ordering of all three outputs.
    order = np.argsort(short_names)
    ordered_dists = [raw_dists[position] for position in order]

    return ordered_dists,raw_counts[order],short_names[order]

In [6]:
def get_count4whole(ending, whole_dir='/home/juma/data/net_intrusion/CIC-IDS-2018/whole_l'):
    """Sum the packet counts stored in the files matching *ending* inside *whole_dir*.

    Parameters
    ----------
    ending : str
        Glob pattern (e.g. ``'*.spc'``) joined onto *whole_dir*.
    whole_dir : str, optional
        Directory to scan. Defaults to the location this notebook has always
        used, so existing calls behave as before.

    Returns
    -------
    int
        Sum of the integers found on the first line of every matching file;
        0 when nothing matches.

    Raises
    ------
    ValueError
        If the first line of a matching file is not an integer.
    """
    num_sampled_pkts = 0
    for f in glob.glob(join(whole_dir,ending)):
        # Close each handle deterministically rather than leaking it.
        with open(f) as count_file:
            num_sampled_pkts += int(count_file.readline())
    return num_sampled_pkts

def get_flow_dist4whole(whole_dir='/home/juma/data/net_intrusion/CIC-IDS-2018/whole_l'):
    """Load ``label_dist.csv`` from *whole_dir* as a DataFrame.

    Parameters
    ----------
    whole_dir : str, optional
        Directory containing ``label_dist.csv``. Defaults to the location this
        notebook has always used, so existing calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The file parsed without a header row, with columns ``'Label'`` and
        ``'Count'``.
    """
    return pd.read_csv(join(whole_dir,'label_dist.csv'),header=None,names=['Label','Count'])
        
        

In [7]:
import math
def autolabel(rects, xpos='center', ax=None):
    """
    Attach a text label above each bar in *rects*, displaying its height as a
    percentage of the tallest bar.

    *xpos* indicates which side to place the text w.r.t. the center of
    the bar. It can be one of the following {'center', 'right', 'left'}.

    *ax* is the axes to annotate. When omitted, each bar is annotated on the
    axes it belongs to (``rect.axes``), so the labels always land next to
    their bars instead of on whatever module-level ``ax`` happens to exist.

    If no bar is taller than 0 every label reads ``0.00%`` rather than
    dividing by zero. An unknown *xpos* raises ``KeyError``.
    """

    ha = {'center': 'center', 'right': 'left', 'left': 'right'}
    offset = {'center': 0, 'right': 1, 'left': -1}

    # Materialise once: the bars are walked twice (max, then annotate).
    rects = list(rects)

    # The reference height never drops below 0, as before.
    max_height = 0
    for rect in rects:
        height = rect.get_height()
        if height > max_height:
            max_height = height

    for rect in rects:
        height = rect.get_height()
        target_ax = ax if ax is not None else rect.axes
        percent = 100*height/max_height if max_height > 0 else 0.0
        target_ax.annotate('{0:.2f}%'.format(percent),
                           xy=(rect.get_x() + rect.get_width() / 2, height),
                           xytext=(offset[xpos]*3, 1),  # use 3 points offset
                           textcoords="offset points",  # in both directions
                           ha=ha[xpos], va='bottom')


In [8]:

def plot_comparison(dir_names,counts,ax,dataset_name,y_label):
    """Draw one colour-coded bar per directory on *ax* and label the bars.

    Parameters
    ----------
    dir_names : sequence of str
        Directory names; used as x tick labels and to pick each bar's colour
        from the sampler marker found in the name.
    counts : sequence of numbers
        Bar heights, aligned with *dir_names*.
    ax : matplotlib axes
        Axes to draw on.
    dataset_name : str
        Legend label for the bar series.
    y_label : str
        Label for the y axis.

    A name matching no known marker is reported on stdout and drawn in black,
    instead of raising or silently reusing the previous bar's colour.
    """
    ind = np.arange(len(dir_names))*20
    width=15

    # Ordered rules: the first rule with a marker contained in the name wins.
    color_rules = (
        (('sk_sr',), 'orange'),
        (('sgs_e',), 'green'),
        (('sf_sr',), 'blue'),
        (('ffs_(',), 'red'),
        (('sel',), 'purple'),
        (('whole', 'cicflow'), 'gray'),
    )

    colors = []
    for dir_name in dir_names:
        for markers, color in color_rules:
            if any(marker in dir_name for marker in markers):
                colors.append(color)
                break
        else:
            print('Investigate plot_comparison',dir_name)
            colors.append('black')

    rects = ax.bar(ind,counts,label=dataset_name,width=width,color=colors)

    ax.set_xticks(ind)
    ax.set_xticklabels(dir_names,rotation=45)
    ax.set_ylabel(y_label)
    ax.legend()
    ax.grid()
    # Annotate every bar with its share of the tallest bar.
    autolabel(rects)




In [9]:
def extract_sampler_names(dir_names):
    """Map directory names to human-readable sampler names.

    Parameters
    ----------
    dir_names : iterable of str
        Directory names containing a sampler marker such as ``'sk_sr'`` or
        ``'sf_sr'``.

    Returns
    -------
    list of str
        One display name per input, in input order. A name matching no known
        marker is reported on stdout and returned unchanged, instead of
        raising or silently inheriting the previous entry's sampler name.
    """
    # Ordered rules: the first rule with a marker contained in the name wins.
    rules = (
        (('sk_sr',), 'SketchFlow Sampling'),
        (('sgs_e',), 'Sketch Guided Sampling'),
        (('sf_sr', 'SF_NO_LIMIT'), 'Random Packet Sampling'),
        (('ffs_(',), 'Fast Filtered Sampling'),
        (('sel',), 'Selective Flow Sampling'),
        (('whole', 'cicflow'), 'Without Sampling'),
    )

    sampler_names = []
    for dir_name in dir_names:
        for markers, display_name in rules:
            if any(marker in dir_name for marker in markers):
                sampler_names.append(display_name)
                break
        else:
            print('Investigate extract_sampler_names',dir_name)
            sampler_names.append(dir_name)
    return sampler_names

In [13]:
# Base directory; the analysis joins a sampling-rate folder name onto it and
# writes its CSV summary here.
root = '/home/juma/data/net_intrusion/CIC-IDS-2018/NO_LIMIT/'
# Name of the dataset being analysed.
label = 'CIC-IDS-2018'
# Glob pattern of the packet-count files summed by the count helpers.
ending = '*.spc'

# Baseline figures read from the 'whole_l' directory
# (presumably the unsampled capture -- confirm).
total_pkts = get_count4whole(ending)
whole_df = get_flow_dist4whole()


In [14]:
#attack_labels = whole_df['Label'].values
#for label in attack_labels:
#    num_flows = whole_df[whole_df['Label']==label]['Count'].values[0]
#    print("{0:30} - > {1}".format(label,num_flows))

In [22]:
# Build a table of the percentage of flows retained per attack label, relative
# to the baseline counts in whole_df, and write it to a CSV under root.
attack_labels = whole_df['Label'].values
# header/df are not used further in this cell; kept in case another cell reads them.
header = np.concatenate((['Sampling technique'],attack_labels))
df = pd.DataFrame(columns=header)

# Column name -> list of cell values; every row appends exactly one value to
# every column so the lists stay the same length.
flow_counts = defaultdict(list)
flow_counts['Sampling Rate'].append('Without sampling')
for attack_label in attack_labels:
    flow_counts[attack_label].append('100')
flow_counts['Total Malicious Flows'].append('100')

only_sampling_method = 'SF'
for sr in ['SR_10','SR_1','SR_0.1']:
    print("-----------------SR of {} --------------------".format(sr[3:]))

    dirs = get_immediate_subdirs(join(root,sr),only=only_sampling_method)
    flow_dists,pkt_counts,dir_names = get_count_dir_names(dirs,ending)

    # One table row per sampler directory found for this sampling rate. The row
    # label and the malicious-flow totals are handled per sampler so that every
    # column receives one value per row even when several directories match.
    sampler_names = extract_sampler_names(dir_names)
    for i,sampler_name in enumerate(sampler_names):
        flow_dist_df = flow_dists[i]
        print('******************{0:20}*********************'.format(sampler_name))

        flow_counts['Sampling Rate'].append(sr[3:])
        original_flows_sum = 0
        sampled_flows_sum = 0

        # attack_label (not `label`) so the dataset name held in `label` survives.
        for attack_label in attack_labels:
            sampled_counts = flow_dist_df.loc[flow_dist_df['Label']==attack_label,'Count']
            # A label absent from the sampled distribution counts as 0 flows.
            num_flows = sampled_counts.values[0] if len(sampled_counts)>0 else 0
            original_num_flows = whole_df.loc[whole_df['Label']==attack_label,'Count'].values[0]
            flow_counts[attack_label].append('{:.2f}'.format(100*num_flows/original_num_flows))
            print('{:30}'.format(attack_label),num_flows)

            # Benign traffic is excluded from the malicious-flow total.
            if attack_label=="Benign":
                continue
            original_flows_sum += original_num_flows
            sampled_flows_sum += num_flows

        flow_counts['Total Malicious Flows'].append('{:.2f}'.format(100*sampled_flows_sum/original_flows_sum))


pd.DataFrame.from_dict(flow_counts).to_csv(join(root,'retained_attacks_{}.csv'.format(only_sampling_method)),index=False)
           
    

-----------------SR of 10 --------------------
-----------------SR of 10 --------------------
******************Random Packet Sampling*********************
Benign                         75489
Brute Force-Web                225
Brute Force-XSS                117
DDoS attacks-LOIC-HTTP         185817
DDoS-HOIC                      747562
DDoS-LOIC-UDP                  2619
DoS-GoldenEye                  17721
DoS-Hulk                       327445
DoS-SlowHTTPTest               20478
DoS-Slowloris                  5594
FTP-BruteForce                 37006
Infiltration                   85
SQL Injection                  45
SSH-BruteForce                 93768
-----------------SR of 1 --------------------
-----------------SR of 1 --------------------
******************Random Packet Sampling*********************
Benign                         11052
Brute Force-Web                141
Brute Force-XSS                109
DDoS attacks-LOIC-HTTP         25342
DDoS-HOIC                      104551

In [21]:
# Display the accumulated retention table (column name -> list of percentage strings).
flow_counts

defaultdict(list,
            {'Sampling Rate': ['Without sampling', '10', '1', '0.1'],
             'Benign': ['100', '15.98', '2.34', '0.28'],
             'Brute Force-Web': ['100', '31.96', '20.03', '4.69'],
             'Brute Force-XSS': ['100', '34.51', '32.15', '8.26'],
             'DDoS attacks-LOIC-HTTP': ['100', '32.09', '4.38', '0.44'],
             'DDoS-HOIC': ['100', '23.19', '3.24', '0.33'],
             'DDoS-LOIC-UDP': ['100', '68.20', '64.58', '63.46'],
             'DoS-GoldenEye': ['100', '38.61', '5.45', '0.56'],
             'DoS-Hulk': ['100', '9.29', '2.01', '0.44'],
             'DoS-SlowHTTPTest': ['100', '19.40', '2.03', '0.20'],
             'DoS-Slowloris': ['100', '54.73', '9.26', '0.92'],
             'FTP-BruteForce': ['100', '19.44', '2.01', '0.19'],
             'Infiltration': ['100', '82.52', '27.18', '0.97'],
             'SQL Injection': ['100', '35.71', '1.59', '1.59'],
             'SSH-BruteForce': ['100', '50.52', '19.13', '2.23'],
          

In [26]:
# NOTE(review): the retention table lists 2.03 (not 2.3) for DoS-SlowHTTPTest
# at the 1% rate -- confirm the divisor.
19.4/2.3  # dosslowhttptest

8.434782608695652

In [24]:
# Scratch ratio. NOTE(review): 18.28 and 3.15 do not appear in the retention
# table shown in this notebook -- confirm which attack and rates they refer to.
18.28/3.15

5.803174603174604

In [25]:
# Ratio of the DDoS-HOIC retention percentages at the 10% and 1% sampling rates
# (23.19 and 3.24 in the retention table).
23.19/3.24

7.157407407407407