In [69]:
import glob
import os
from os.path import join
from numpy import genfromtxt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
%matplotlib inline

In [70]:
def get_pkt_count(dirs,ending):
    """Sum the packet counts stored in each directory's count files.

    For every directory in *dirs*, all files matching the glob pattern
    *ending* are opened and the integer on their first line is added up.

    Parameters
    ----------
    dirs : iterable of str
        Directories to scan.
    ending : str
        Glob pattern (relative to each directory) selecting the count files,
        e.g. ``'*.spc'``.

    Returns
    -------
    list of int
        One total per directory, in the same order as *dirs*; a directory
        without matching files contributes ``0``.

    Raises
    ------
    ValueError
        If the first line of a matching file is not an integer.
    """
    counts = []
    for d in dirs:
        num_sampled_pkts = 0
        for f in glob.glob(join(d,ending)):
            # Close every handle deterministically instead of leaving it to
            # the garbage collector.
            with open(f) as count_file:
                num_sampled_pkts += int(count_file.readline())
        counts.append(num_sampled_pkts)
    return counts

In [71]:
def get_immediate_subdirs(a_dir, only=''):
    """List the direct sub-directories of *a_dir* whose name ends in ``'_l'``.

    Parameters
    ----------
    a_dir : str
        Directory whose entries are inspected (not recursive).
    only : str, optional
        When non-empty, keep only directories whose name contains *only* or
        the substring ``'whole'``. The default ``''`` keeps every ``'_l'``
        directory.

    Returns
    -------
    list of str
        Paths (``a_dir`` joined with the entry name) in ``os.listdir`` order.
    """
    selected = []
    for entry in os.listdir(a_dir):
        full_path = os.path.join(a_dir, entry)
        if not (os.path.isdir(full_path) and entry.endswith('_l')):
            continue
        # An empty filter accepts everything; otherwise match the filter or 'whole'.
        if only == '' or only in entry or 'whole' in entry:
            selected.append(full_path)
    return selected

In [72]:
def get_flow_dists(dirs):
    """Load the ``label_dist.csv`` file of every directory in *dirs*.

    Each file is read without a header row and its two columns are named
    ``'Label'`` and ``'Count'``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per directory, in the same order as *dirs*.
    """
    return [
        pd.read_csv(join(folder, 'label_dist.csv'), header=None, names=['Label', 'Count'])
        for folder in dirs
    ]
        
        

In [73]:
def get_count4whole(ending, whole_dir='/home/juma/data/net_intrusion/CIC-IDS-2018/whole_l'):
    """Sum the packet counts stored in the un-sampled ("whole") data directory.

    Parameters
    ----------
    ending : str
        Glob pattern (relative to *whole_dir*) selecting the count files,
        e.g. ``'*.spc'``.
    whole_dir : str, optional
        Directory to scan. Defaults to the path this notebook was written
        against; pass another path to run on a different machine or dataset.

    Returns
    -------
    int
        Sum of the integers found on the first line of every matching file
        (``0`` when nothing matches).

    Raises
    ------
    ValueError
        If the first line of a matching file is not an integer.
    """
    num_sampled_pkts = 0
    for f in glob.glob(join(whole_dir,ending)):
        # Close every handle deterministically instead of leaking it.
        with open(f) as count_file:
            num_sampled_pkts += int(count_file.readline())
    return num_sampled_pkts

def get_flow_dist4whole(whole_dir='/home/juma/data/net_intrusion/CIC-IDS-2018/whole_l'):
    """Load ``label_dist.csv`` from the un-sampled ("whole") data directory.

    Parameters
    ----------
    whole_dir : str, optional
        Directory containing ``label_dist.csv``. Defaults to the path this
        notebook was written against; pass another path to run on a different
        machine or dataset.

    Returns
    -------
    pandas.DataFrame
        The file read without a header row, with columns ``'Label'`` and
        ``'Count'``.
    """
    return pd.read_csv(join(whole_dir,'label_dist.csv'),header=None,names=['Label','Count'])

In [74]:
def get_count_dir_names(dirs,ending):
    """Collect per-directory results and return them sorted by directory name.

    Parameters
    ----------
    dirs : sequence of str
        Directories to process.
    ending : str
        Glob pattern forwarded to ``get_pkt_count``.

    Returns
    -------
    tuple
        ``(flow_dists, pkt_counts, dir_names)`` where ``flow_dists`` is a list
        of the frames returned by ``get_flow_dists``, ``pkt_counts`` is a
        NumPy array of the totals returned by ``get_pkt_count`` and
        ``dir_names`` is a NumPy array of the directory base names with their
        last two characters removed. All three follow the ascending order of
        ``dir_names``.
    """
    totals = np.array(get_pkt_count(dirs, ending))
    frames = get_flow_dists(dirs)
    # Drop the trailing two characters of each base name
    # (presumably the '_l' suffix -- confirm against the caller).
    names = np.array([os.path.basename(path)[:-2] for path in dirs])

    order = np.argsort(names)
    ordered_frames = [frames[position] for position in order]
    return ordered_frames, totals[order], names[order]

In [75]:
import math
def autolabel(rects, xpos='center', ax=None):
    """
    Attach a text label above each bar in *rects*, displaying its height as a
    percentage of the tallest bar (the tallest bar reads ``100.00%``).

    *xpos* indicates which side to place the text w.r.t. the center of
    the bar. It can be one of the following {'center', 'right', 'left'}.

    *ax* is the axes to annotate. When omitted, each bar is annotated on the
    axes it belongs to (``rect.axes``), so the function no longer depends on
    a global variable named ``ax``.

    If no bar has a positive height, every label reads ``0.00%`` instead of
    dividing by zero.
    """

    ha = {'center': 'center', 'right': 'left', 'left': 'right'}
    offset = {'center': 0, 'right': 1, 'left': -1}

    heights = [rect.get_height() for rect in rects]
    max_height = max(heights, default=0)

    for rect, height in zip(rects, heights):
        # Guard the reference height: empty, all-zero, or all-negative bars
        # would otherwise divide by zero.
        percent = 100*height/max_height if max_height > 0 else 0.0
        target_ax = ax if ax is not None else rect.axes
        target_ax.annotate('{0:.2f}%'.format(percent),
                           xy=(rect.get_x() + rect.get_width() / 2, height),
                           xytext=(offset[xpos]*3, 1),  # use 3 points offset
                           textcoords="offset points",  # in both directions
                           ha=ha[xpos], va='bottom')


In [76]:

def plot_comparison(dir_names,counts,ax,dataset_name,y_label):
    """Draw one bar per directory on *ax*, coloured by sampling technique.

    Parameters
    ----------
    dir_names : sequence of str
        Directory names; used as x tick labels and to pick each bar's colour
        from the substring markers below.
    counts : sequence of numbers
        Bar heights, aligned with *dir_names*.
    ax : matplotlib axes
        Axes to draw on.
    dataset_name : str
        Legend label for the bar container.
    y_label : str
        Y-axis label.

    Notes
    -----
    A name that matches no known marker is reported with ``print`` and drawn
    in black, rather than raising ``NameError`` or silently reusing the
    previous bar's colour.
    """
    # Bars are 15 units wide and spaced 20 units apart.
    ind = np.arange(len(dir_names))*20
    width=15

    # Ordered (markers, colour) pairs; the first matching entry wins.
    sampler_colors = (
        (('sk_sr',), 'orange'),           # sketchflow
        (('sgs_e',), 'green'),            # sgs
        (('sf_sr',), 'blue'),             # sFlow
        (('ffs_(',), 'red'),              # Fast Filtered Sampling
        (('sel',), 'purple'),             # selective flow samp.
        (('whole', 'cicflow'), 'gray'),   # whole data
    )

    colors = []
    for dir_name in dir_names:
        for markers, color in sampler_colors:
            if any(marker in dir_name for marker in markers):
                colors.append(color)
                break
        else:
            print('Investigate plot_comparison',dir_name)
            colors.append('black')

    rects = ax.bar(ind,counts,label=dataset_name,width=width,color=colors)

    ax.set_xticks(ind)
    ax.set_xticklabels(dir_names,rotation=45)
    ax.set_ylabel(y_label)
    ax.legend()
    ax.grid()
    autolabel(rects)




In [77]:
def extract_sampler_names(dir_names):
    """Map directory names to human-readable sampling-technique names.

    Each name is matched against an ordered list of substring markers and the
    first match decides the result.

    Parameters
    ----------
    dir_names : iterable of str
        Directory names to classify.

    Returns
    -------
    list of str
        One display name per input, in input order. A name that matches no
        marker is reported with ``print`` and returned unchanged, so the
        result always has the same length as the input (the previous
        behaviour was a ``NameError`` or a silently repeated label).
    """
    # Ordered (markers, display name) pairs; the first matching entry wins.
    known_samplers = (
        (('sk_sr',), 'SketchFlow Sampling'),
        (('sgs_e',), 'Sketch Guided Sampling'),
        (('sf_sr',), 'Random Packet Sampling'),
        (('ffs_(',), 'Fast Filtered Sampling'),
        (('sel',), 'Selective Flow Sampling'),
        (('whole', 'cicflow'), 'Without Sampling'),
    )

    sampler_names = []
    for dir_name in dir_names:
        for markers, display_name in known_samplers:
            if any(marker in dir_name for marker in markers):
                sampler_names.append(display_name)
                break
        else:
            print('Investigate extract_sampler_names',dir_name)
            sampler_names.append(dir_name)
    return sampler_names

In [78]:
# Run configuration: `root` is the directory whose '_l' sub-folders are
# compared (and where later cells write their output).
root = '/home/juma/data/net_intrusion/CIC-IDS-2018/CSVs_mem_SF_r_100/SR_1'
# Presumably the dataset name used as a plot/legend label -- not used in this cell.
label = 'CIC-IDS-2018'
# Glob pattern of the files whose first line holds a packet count.
ending = '*.spc'

# Reference values from the un-sampled ("whole") directory.
whole_pkt_count = get_count4whole(ending)
whole_df = get_flow_dist4whole()
# Per-sampler results, sorted by directory name so the three sequences stay aligned.
dirs = get_immediate_subdirs(root)
flow_dists,pkt_counts,dir_names = get_count_dir_names(dirs,ending)
sampler_names = extract_sampler_names(dir_names)


In [79]:
    # Build the "retained ratio" table: one row per sampling technique, one
    # column per attack label, each cell holding the percentage of flows kept
    # relative to the un-sampled data. The table is written to
    # <root>/retained_ratio.csv.
    attack_labels = whole_df['Label'].values
    header = np.concatenate((['Sampling technique'],attack_labels))
    df = pd.DataFrame(columns=header)
    flow_counts = defaultdict(list)
    flow_counts['Sampling technique'].append('Without sampling')

    # Baseline row: the un-sampled data keeps 100% of every label.
    # Iterating the label array avoids positional `row[0]` access on a
    # label-indexed Series, which pandas has deprecated.
    for attack_label in attack_labels:
        flow_counts[attack_label].append('100')

    # The loop variable is deliberately not called `label`, so the notebook-level
    # `label` variable is not overwritten by this cell.
    for sampler_name, flow_dist_df in zip(sampler_names, flow_dists):
        flow_counts['Sampling technique'].append(sampler_name)
        print('******************{0:20}*********************'.format(sampler_name))

        for attack_label in attack_labels:
            # Flows of this label retained by the sampler (0 when the label is absent).
            matching = flow_dist_df.loc[flow_dist_df['Label']==attack_label, 'Count']
            num_flows = matching.values[0] if len(matching)>0 else 0
            original_num_flows = whole_df.loc[whole_df['Label']==attack_label, 'Count'].values[0]

            flow_counts[attack_label].append('{:.2f}'.format(100*num_flows/original_num_flows))
            print('{:30}'.format(attack_label),num_flows)

    pd.DataFrame.from_dict(flow_counts).to_csv(join(root,'retained_ratio.csv'),index=False)
            
    

******************Fast Filtered Sampling*********************
Benign                         17805
Brute Force-Web                143
Brute Force-XSS                110
DDoS attacks-LOIC-HTTP         26702
DDoS-HOIC                      113382
DDoS-LOIC-UDP                  2467
DoS-GoldenEye                  2903
DoS-Hulk                       72451
DoS-SlowHTTPTest               2632
DoS-Slowloris                  1151
FTP-BruteForce                 4201
Infiltration                   31
SQL Injection                  8
SSH-BruteForce                 37055
******************Selective Flow Sampling*********************
Benign                         364115
Brute Force-Web                261
Brute Force-XSS                112
DDoS attacks-LOIC-HTTP         175444
DDoS-HOIC                      249853
DDoS-LOIC-UDP                  320
DoS-GoldenEye                  15385
DoS-Hulk                       27729
DoS-SlowHTTPTest               57733
DoS-Slowloris                  6109
FTP-Br