In [1]:
import glob
import os
from os.path import join
from numpy import genfromtxt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def get_immediate_subdirs(a_dir, only=''):
    """List the direct sub-directories of ``a_dir`` whose name ends in ``'_l'``.

    Parameters
    ----------
    a_dir : str
        Directory to scan (not recursive).
    only : str, optional
        When non-empty, keep only directories whose name contains ``only``
        or contains ``'whole'``. The default ``''`` disables this filter.

    Returns
    -------
    list of str
        ``a_dir`` joined with each selected name, in ``os.listdir`` order.
    """
    selected = []
    for entry in os.listdir(a_dir):
        path = os.path.join(a_dir, entry)
        if not os.path.isdir(path):
            continue
        if not entry.endswith('_l'):
            continue
        # Optional name filter; 'whole' directories always pass it.
        if only != '' and not (only in entry or 'whole' in entry):
            continue
        selected.append(path)
    return selected

In [3]:
def get_pkt_count(dirs,ending):
    """Sum the packet counts stored in the files of each directory.

    For every directory in ``dirs`` the files matching the glob pattern
    ``ending`` (e.g. ``'*.spc'``) are opened and the integer on their first
    line is added up.

    Parameters
    ----------
    dirs : iterable of str
        Directories to inspect.
    ending : str
        Glob pattern, relative to each directory, selecting the count files.

    Returns
    -------
    list of int
        One total per directory, in the order of ``dirs``. A directory with
        no matching file contributes ``0``.

    Raises
    ------
    ValueError
        If the first line of a matching file is not an integer.
    """
    counts = []
    for d in dirs:
        num_sampled_pkts = 0
        for path in glob.glob(join(d,ending)):
            # Use a context manager so the handle is closed deterministically
            # instead of being left to the garbage collector.
            with open(path) as fh:
                num_sampled_pkts += int(fh.readline())
        counts.append(num_sampled_pkts)
    return counts

In [4]:
def get_cms(dirs,local_path):
    """Read one CSV per directory into a pandas DataFrame.

    Parameters
    ----------
    dirs : iterable of str
        Directories to read from.
    local_path : str
        Path of the CSV file relative to each directory.

    Returns
    -------
    list of pandas.DataFrame
        The parsed files, in the order of ``dirs``.
    """
    return [pd.read_csv(join(directory, local_path)) for directory in dirs]

In [5]:
def get_count_dir_names(dirs,ending,local_path):
    """Load packet counts, CSV tables and short names for ``dirs``, sorted by count.

    Parameters
    ----------
    dirs : sequence of str
        Directories to process.
    ending : str
        Glob pattern forwarded to ``get_pkt_count``.
    local_path : str
        Relative CSV path forwarded to ``get_cms``.

    Returns
    -------
    tuple
        ``(tables, counts, names)``, all ordered by ascending packet count:
        a list of DataFrames, a numpy array of counts and a numpy array of
        directory base names with their last two characters removed.
    """
    counts = np.array(get_pkt_count(dirs, ending))
    tables = get_cms(dirs, local_path)
    # Drop the two-character suffix of each base name (e.g. 'whole_l' -> 'whole').
    names = np.array([os.path.basename(path)[:-2] for path in dirs])

    # A single permutation keeps the three sequences aligned.
    order = np.argsort(counts)
    return [tables[pos] for pos in order], counts[order], names[order]

In [6]:
# Display names of the known sampler abbreviations (directory-name prefixes).
_SAMPLER_DISPLAY_NAMES = {
    'ffs': 'Fast Filtered Sampling',
    'sel': 'Selective Flow Sampling',
    'sf': 'Random Packet Sampling',
    'sgs': 'Sketch Guided Sampling',
    'sk': 'SketchFlow',
}


def extract_sampler_name(d):
    """Return the human-readable sampler name encoded in a directory name.

    The sampler abbreviation is the part of the base name of ``d`` that
    precedes the first underscore (the whole base name when there is no
    underscore). Known abbreviations are mapped to their display name;
    unknown ones are returned unchanged.

    Parameters
    ----------
    d : str
        Directory path or bare directory name, e.g. ``'ffs_10'``.

    Returns
    -------
    str
        Display name of the sampler, or the raw abbreviation if unknown.
    """
    dir_name = os.path.basename(d)
    # partition() keeps the full name when no '_' is present; slicing with
    # str.find() would use -1 as the end index and drop the last character.
    abbreviation = dir_name.partition('_')[0]
    return _SAMPLER_DISPLAY_NAMES.get(abbreviation, abbreviation)

def extract_nd_nfa(cm,attack_name):
    """Read the detection and false-alarm counts of one attack from a table.

    ``cm`` is a DataFrame whose ``'Unnamed: 0'`` column holds the row labels
    and which has one column per class label.

    Parameters
    ----------
    cm : pandas.DataFrame
        Confusion-matrix table as loaded by ``pd.read_csv``.
    attack_name : str
        Class label to look up.

    Returns
    -------
    tuple
        ``(nd, nfa)`` where ``nd`` is the cell at row ``attack_name`` /
        column ``attack_name`` and ``nfa`` is the cell at row ``'Benign'`` /
        column ``attack_name``. ``(-1, -1)`` is returned when the table has
        no ``attack_name`` column; an individual value is ``-1`` when its row
        is absent from the table.
    """
    if attack_name not in cm:
        return -1,-1

    row_labels = cm['Unnamed: 0']

    def cell(row_label):
        # Single .loc call (no chained indexing); a missing row yields the
        # same -1 sentinel as a missing column instead of an IndexError.
        values = cm.loc[row_labels==row_label, attack_name].values
        return values[0] if len(values) else -1

    return cell(attack_name), cell('Benign')

def get_count4label(cm,attack_name):
    """Return the total of the table row labelled ``attack_name``.

    The row is located through the ``'Unnamed: 0'`` label column; every value
    after the first column of that row is summed.

    Parameters
    ----------
    cm : pandas.DataFrame
        Confusion-matrix table whose first column is ``'Unnamed: 0'``.
    attack_name : str
        Row label to total.

    Returns
    -------
    number
        Sum of the row, or ``-1`` when ``attack_name`` is not a column of
        ``cm`` or no row carries that label.
    """
    if attack_name not in cm:
        return -1

    matching_rows = cm.loc[cm['Unnamed: 0']==attack_name].values
    if len(matching_rows) == 0:
        # Column present but no such row: report "not available" with the
        # same sentinel instead of raising IndexError.
        return -1

    # Skip the leading label cell and add up the remaining cells.
    return np.sum(matching_rows[0][1:])

import math
def round_up(n,decimals=0):
    """Round ``n`` up (towards +infinity) to ``decimals`` decimal places.

    Parameters
    ----------
    n : float or int
        Value to round.
    decimals : int, optional
        Number of decimal places to keep (default ``0``).

    Returns
    -------
    float
        The smallest multiple of ``10**-decimals`` that is >= ``n``, where a
        scaled value within a relative tolerance of 1e-9 of an integer is
        treated as that integer.

    Notes
    -----
    Scaling in binary floating point can overshoot an exact result by one
    ulp (``0.07*100 == 7.000000000000001``); a bare ``math.ceil`` would then
    return 0.08 for 0.07. Snapping to the nearest integer first avoids that.
    """
    multiplier = 10**decimals
    scaled = n*multiplier
    nearest = round(scaled)
    if math.isclose(scaled, nearest, rel_tol=1e-9):
        return nearest/multiplier
    return math.ceil(scaled)/multiplier

In [27]:
# --- Run configuration -------------------------------------------------------
# Directory that holds one sub-directory per sampling experiment.
# Alternative data set: '/home/juma/data/net_intrusion/ISCX-Bot-2014/CSVs'
root = '/home/juma/data/net_intrusion/CIC-IDS-2018/CSVs'

# Glob pattern of the per-directory files whose first line is a packet count.
ending = '*.spc'

# Confusion-matrix CSV to load, relative to each experiment directory.
local_path = 'forest_norm_k_10/cm_nonnorm_fold_avg.csv'

# True: rates are computed against the flow counts of the 'whole' data set;
# False: against the flow counts of each directory's own table.
AGAINST_WHOLE=True

# Name of the report written into `root`; the 'abs_' prefix marks the
# AGAINST_WHOLE variant.
results_file = 'abs_forest_norm_k_10.csv' if AGAINST_WHOLE else 'forest_norm_k_10.csv'

In [28]:
# Load every experiment directory under `root`, locate the unsampled 'whole'
# data set and start the results file with its confusion matrix.
import math
# NOTE(review): the alias `df` is not used in the visible cells and is easy to
# mistake for a DataFrame variable; pandas is already imported as `pd`.
import pandas as df
from collections import defaultdict
# NOTE(review): num_samplers is not referenced in the visible cells.
num_samplers = 5
# Number of sampling rates per sampler; sets the width of the result tables
# (two columns per rate plus one label column).
num_sampling_rate = 3

dirs = get_immediate_subdirs(root)
# All three sequences are ordered by ascending packet count.
cms, pkt_counts,dir_names = get_count_dir_names(dirs,ending,local_path)
ds = cms[0].columns
# Every column of the first table except the label column and 'Benign'.
attack_types = ds.drop(labels=['Unnamed: 0', 'Benign']).values
print('Attack types',attack_types)

# The Benign row total of the 'whole' table is the denominator used for the
# false-alarm rate when AGAINST_WHOLE is set.
# Raises IndexError if no directory is named 'whole'.
index = np.where(dir_names=='whole')[0][0]
print('index',index)
whole_cm = cms[index]
whole_benign_count = get_count4label(whole_cm,'Benign')
print('benign_flow_count',whole_benign_count)

# Mode 'w' truncates the file, so re-running this cell starts a fresh report
# that the per-attack cell then appends to.
with open(join(root,results_file), 'w') as f:
    # savetxt writes a 1-D array one element per line, so this emits the
    # title with blank lines before and after it.
    np.savetxt(f,np.array(['','','Confusion Matrix','','']),delimiter=',',fmt='%s')        
    # Column names as a single comma-separated header row, then the table.
    np.savetxt(f,whole_cm.columns.values.reshape((1,-1)),delimiter=',',fmt='%s')
    np.savetxt(f,whole_cm,delimiter=',',fmt='%s')


Attack types ['Brute Force-Web' 'Brute Force-XSS' 'DDoS attacks-LOIC-HTTP' 'DDoS-HOIC'
 'DDoS-LOIC-UDP' 'DoS-GoldenEye' 'DoS-Hulk' 'DoS-SlowHTTPTest'
 'DoS-Slowloris' 'FTP-BruteForce' 'SSH-BruteForce']
index 15
benign_flow_count 47233.2


In [29]:
def _percent_of(count, total):
    """Return 100*count/total rounded up to 2 decimals, or -1 when undefined.

    -1 is the "not available" sentinel returned by extract_nd_nfa and
    get_count4label. It must never enter the arithmetic: -1/-1 would be
    reported as a 100 % rate. A zero total is handled the same way instead
    of dividing by zero.
    """
    if count < 0 or total <= 0:
        return -1
    return round_up(100*count/total, 2)


def _mark_missing(values):
    """Replace negative (sentinel) entries with the string 'NA'."""
    return [v if v >= 0 else 'NA' for v in values]


# Report columns, in output order: a directory belongs to column k when
# round(log10(largest_pkt_count / its_pkt_count)) == k, i.e. 10%, 1%, .1%.
SR_EXPONENTS = (1, 2, 3)

# pkt_counts is sorted ascending, so pkt_counts[-1] is the largest count.
# The ratio does not depend on the attack, so it is computed once. A
# directory without packets cannot be assigned a rate and is left out.
log_srs = [round(math.log10(pkt_counts[-1]/pkt_count)) if pkt_count > 0 else None
           for pkt_count in pkt_counts]

# Raw string: the backslash is literal (a plain '\ ' is an invalid escape).
t1 = np.array([r'method \ SR','10%','10%','1%','1%','.1%','.1%']).reshape((1,-1))
count_header = np.array(['','ND','NFA','ND','NFA','ND','NFA']).reshape((1,-1))
rate_header = np.array(['','DR','FAR','DR','FAR','DR','FAR']).reshape((1,-1))

for attack_name in attack_types:
    whole_flow_count = get_count4label(whole_cm,attack_name)

    rows = defaultdict(list)   # sampler name -> [ND, NFA] per sampling rate
    prows = defaultdict(list)  # sampler name -> [DR, FAR] per sampling rate

    # One pass per sampling rate keeps each sampler's values in column order.
    for exponent in SR_EXPONENTS:
        for cm, dir_name, log_sr in zip(cms, dir_names, log_srs):
            if log_sr != exponent:
                continue

            # (detection count, false-alarm count) of this attack in this table
            nd,nfa = extract_nd_nfa(cm,attack_name)

            if AGAINST_WHOLE:
                flow_count = whole_flow_count
                benign_flow_count = whole_benign_count
            else:
                flow_count = get_count4label(cm,attack_name)
                benign_flow_count = get_count4label(cm,'Benign')

            sampler = extract_sampler_name(dir_name)
            rows[sampler] += [nd, nfa]
            prows[sampler] += [_percent_of(nd, flow_count),
                               _percent_of(nfa, benign_flow_count)]

    # Row for the unsampled data. Numerators and denominators both come from
    # the table of the directory named 'whole', so they cannot disagree; the
    # same values are repeated under every sampling-rate column.
    nd,nfa = extract_nd_nfa(whole_cm,attack_name)
    dr = _percent_of(nd, whole_flow_count)
    far = _percent_of(nfa, whole_benign_count)
    count_rows = [t1, count_header,
                  np.array(['without sampling'] + _mark_missing([nd,nfa]*3)).reshape((1,-1))]
    rate_rows = [t1, rate_header,
                 np.array(['without sampling'] + _mark_missing([dr,far]*3)).reshape((1,-1))]

    # One row per sampling method, alphabetically by display name.
    for sampling_method in sorted(rows):
        k1 = _mark_missing(rows[sampling_method])
        count_rows.append(np.array([sampling_method] + k1,dtype=str).reshape((1,-1)))

        k2 = _mark_missing(prows[sampling_method])
        rate_rows.append(np.array([sampling_method] + k2).reshape((1,-1)))

    # Every row must have the same number of columns; concatenate raises
    # otherwise (e.g. a sampler that lacks one of the sampling rates).
    results = np.concatenate(count_rows,axis=0)    # raw counts, kept for inspection only
    presults = np.concatenate(rate_rows,axis=0)    # percentages, written below

    # Append this attack's section to the report started by the previous cell.
    with open(join(root,results_file), 'a') as f:
        # 1-D arrays are written one element per line: two blank lines, then the name.
        np.savetxt(f,np.array(['','',attack_name]),delimiter=',',fmt='%s')
        np.savetxt(f,np.array(['Detection Rate, False Alarm Rate in %']),delimiter=',',fmt='%s')
        np.savetxt(f,presults,delimiter=',',fmt='%s')
    

In [30]:
# Inspect the first loaded table; `cms` is ordered by ascending packet count,
# so this is the directory with the fewest counted packets.
cms[0]

Unnamed: 0.1,Unnamed: 0,Benign,Brute Force-Web,Brute Force-XSS,DDoS attacks-LOIC-HTTP,DDoS-HOIC,DDoS-LOIC-UDP,DoS-GoldenEye,DoS-Hulk,DoS-SlowHTTPTest,DoS-Slowloris,FTP-BruteForce,SSH-BruteForce
0,Benign,155.5,0.3,0.1,2.9,16.4,2.6,0.2,4.3,3.5,0.1,3.2,8.9
1,Brute Force-Web,0.4,1.9,0.0,0.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Brute Force-XSS,0.1,0.3,1.4,0.2,0.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,DDoS attacks-LOIC-HTTP,2.1,0.0,0.0,74.0,171.2,0.0,0.0,0.1,0.0,0.0,0.0,0.0
4,DDoS-HOIC,6.4,0.1,0.2,138.4,911.1,0.0,0.0,0.3,0.0,0.0,0.0,0.0
5,DDoS-LOIC-UDP,1.3,0.0,0.0,0.0,0.0,242.5,0.0,0.0,0.0,0.0,0.0,0.0
6,DoS-GoldenEye,0.2,0.0,0.0,0.0,0.0,0.0,5.6,19.7,0.0,0.0,0.0,0.3
7,DoS-Hulk,1.3,0.0,0.0,0.0,0.2,0.0,9.6,1473.2,0.0,1.6,0.0,19.5
8,DoS-SlowHTTPTest,3.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.9,0.0,15.2,0.0
9,DoS-Slowloris,0.0,0.0,0.0,0.0,0.0,0.0,0.1,5.5,0.0,4.4,0.0,0.0
