In [1]:
import glob
import os
from os.path import join
from numpy import genfromtxt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
%matplotlib inline

In [2]:
def get_pkt_count(dirs,ending):
    counts = []
    for d in dirs:
        num_sampled_pkts = 0
        for f in glob.glob(join(d,ending)):
            num_sampled_pkts += int(open(f).readline())
        counts.append(num_sampled_pkts)
    return counts

In [3]:
def get_immediate_subdirs(a_dir, only=''):
    if only=='':
        return [os.path.join(a_dir, name) for name in os.listdir(a_dir) 
                if os.path.isdir(os.path.join(a_dir, name)) and name.endswith('_l')]
    else:
        return [os.path.join(a_dir, name) for name in os.listdir(a_dir)
                if os.path.isdir(os.path.join(a_dir, name)) and name.endswith('_l') and (only in name or 'whole' in name)]

In [4]:
def get_flow_dists(dirs):
    results = []
    for d in dirs:
        df = pd.read_csv(join(d,'label_dist.csv'),header=None,names=['Label','Count'])
        results.append(df)
            
    return results
        
        

In [5]:
def get_count_dir_names(dirs,ending):
    pkt_counts = np.array(get_pkt_count(dirs,ending))
    flow_dists = get_flow_dists(dirs)
    dir_names = np.array([os.path.basename(dir)[:-2] for dir in dirs])
        
    sorted_index = np.argsort(dir_names)
    pkt_counts = pkt_counts[sorted_index]
    flow_dists = [flow_dists[i] for i in sorted_index]
    dir_names = dir_names[sorted_index]
    
    return flow_dists,pkt_counts,dir_names

In [6]:
import math
def autolabel(rects, xpos='center'):
    """
    Attach a text label above each bar in *rects*, displaying its height.

    *xpos* indicates which side to place the text w.r.t. the center of
    the bar. It can be one of the following {'center', 'right', 'left'}.
    """

    ha = {'center': 'center', 'right': 'left', 'left': 'right'}
    offset = {'center': 0, 'right': 1, 'left': -1}
    max_height = 0
    for rect in rects:
        height = rect.get_height()
        if height > max_height:
            max_height = height
            
    for rect in rects:
        height = rect.get_height()
        ax.annotate('{0:.2f}%'.format(100*height/max_height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(offset[xpos]*3, 1),  # use 3 points offset
                    textcoords="offset points",  # in both directions
                    ha=ha[xpos], va='bottom')


In [7]:

def plot_comparison(dir_names,counts,ax,dataset_name,y_label):
    ind = np.arange(len(dir_names))*20
    width=15
    N = len(dirs)
    colors = []
    labels = []
    for dir_name in dir_names:
        if 'sk_sr' in dir_name:
            color='orange'
            label = 'sketchflow'
        elif 'sgs_e' in dir_name:
            color = 'green'
            label = 'sgs'
        elif 'sf_sr' in dir_name:
            color = 'blue'
            label = 'sFlow'
        elif 'ffs_(' in dir_name:
            color = 'red'
            label = 'Fast Filtered Sampling'
        elif 'sel' in dir_name:
            color = 'purple'
            label = 'selective flow samp.'
        elif 'whole' in dir_name or 'cicflow' in dir_name:
            color = 'gray'
            label = 'whole data'
        else:
            print('Investigate plot_comparison',dir_name)
            
        colors.append(color)
        labels.append(label)
        
    rects = ax.bar(ind,counts,label=dataset_name,width=width,color=colors)

    #Add some text for labels, title and custom x-axis tick labels, etc.    
    #ax.set_title('Finding Equivalent Sampling Rate for comparision')
    ax.set_xticks(ind)
    ax.set_xticklabels(dir_names,rotation=45)
    ax.set_ylabel(y_label)
    ax.legend()
    ax.grid()
    #ax.margins(0.15)
    autolabel(rects)




In [8]:
def extract_sampler_names(dir_names):
    colors = []
    sampler_names = []
    for dir_name in dir_names:
        if 'sk_sr' in dir_name:
            color='orange'
            label = 'SketchFlow Sampling'
        elif 'sgs_e' in dir_name:
            color = 'green'
            label = 'Sketch Guided Sampling'
        elif 'sf_sr' in dir_name:
            color = 'blue'
            label = 'Random Packet Sampling'
        elif 'ffs_(' in dir_name:
            color = 'red'
            label = 'Fast Filtered Sampling'
        elif 'sel' in dir_name:
            color = 'purple'
            label = 'Selective Flow Sampling'
        elif 'whole' in dir_name or 'cicflow' in dir_name:
            color = 'gray'
            label = 'Without Sampling'
        else:
            print('Investigate plot_comparison',dir_name)
            
        colors.append(color)
        sampler_names.append(label)
    return sampler_names

In [9]:
root = '/home/juma/data/net_intrusion/CIC-IDS-2018/CSVs'
label = 'CIC-IDS-2018'
ending = '*.spc'

dirs = get_immediate_subdirs(root)
flow_dists,pkt_counts,dir_names = get_count_dir_names(dirs,ending)

whole_index = np.where(dir_names=='whole')[0][0]
total_pkts = pkt_counts[whole_index] 
sampling_rates = [100*10**round(np.log10(pkt_count/total_pkts)) for pkt_count in pkt_counts]
sampler_names = extract_sampler_names(dir_names)


In [10]:
whole_df = flow_dists[whole_index]

In [14]:
#attack_labels = whole_df['Label'].values
#for label in attack_labels:
#    num_flows = whole_df[whole_df['Label']==label]['Count'].values[0]
#    print("{0:30} - > {1}".format(label,num_flows))

In [13]:
attack_labels = whole_df['Label'].values
header = np.concatenate((['Sampling technique'],attack_labels))
for sr in [10,1,.1]:
    print("-----------------SR of {} --------------------".format(sr))
    df = pd.DataFrame(columns=header)
    flow_counts = defaultdict(list)
    flow_counts['Sampling technique'].append('No sampling')
    for i,row in whole_df.iterrows():
        flow_counts[row[0]].append('100')    
    
    for i,sampling_rate in enumerate(sampling_rates):
        if sampling_rate==sr:
            # below here would form a single table for Sampling_method x Attacks for specific SR
            sampler_name = sampler_names[i]
            flow_counts['Sampling technique'].append(sampler_name)
            flow_dist_df = flow_dists[i]
            print('******************{0:20}*********************'.format(sampler_name))
            
            for label in attack_labels:
                #make list dictionary for each attack type on sampler names
                if len(flow_dist_df[flow_dist_df['Label']==label]['Count'])>0:
                    num_flows = flow_dist_df[flow_dist_df['Label']==label]['Count'].values[0]
                else:
                    num_flows = 0
                original_num_flows = whole_df[whole_df['Label']==label]['Count'].values[0]
                
                flow_counts[label].append('{:.2f}'.format(100*num_flows/original_num_flows))
                print('{:30}'.format(label),num_flows)
            
    pd.DataFrame.from_dict(flow_counts).to_csv(join(root,'retained_attacks','{}.csv'.format(sr)),index=False)
            
    

-----------------SR of 10 --------------------


FileNotFoundError: [Errno 2] No such file or directory: '/home/juma/data/net_intrusion/CIC-IDS-2018/CSVs/retained_attacks/10.csv'