In [1]:
author="Jumabek Alikhanov"
date = 'Nov 17,2019'

This is Final Project material for the "Deep Learning" class I took  

## Download the dataset from https://www.unb.ca/cic/datasets/ids-2017.html
We only need CSV files that is preprocessed and labeled for ML 

In [2]:
#imports
import os
from os.path import join
import glob
import pandas as pd
import numpy as np
import time
%load_ext autoreload

In [3]:
# load data
dataroot = 'MachineLearningCVE/'
SEED =2 
np.random.seed(SEED)

In [4]:
%autoreload 2
from preprocessing import load_data
X,y = load_data(dataroot) # reads csv file and returns np array of X,y -> of shape (N,D) and (N,1)

MachineLearningCVE/*.pcap_ISCX.csv
there are 2830743 flow records with 79 feature dimension
stripped column names
dropped bad columns
There are 0 nan entries
converted to numeric


## Data Imbalance
1. It is curucial to adress this issue in order to get decent performance
2. It also affects evaluation, we should calculate  `balanced accuracy`

In [5]:
from preprocessing import balance_data, normalize
X = normalize(X)

In [6]:

%autoreload 2
from cnn import CNNClassifier

def ensure_dir(dir_path):
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

def getClassifier(args=None,runs_dir=None):
    if args is not None:
        (_,optim,lr,reg,batch_size,input_dim,num_class,num_epochs) = args
    if runs_dir is not None:
        ensure_dir(runs_dir)
    
    clf = CNNClassifier(input_dim,num_class,lr=lr,reg=reg,num_epochs=num_epochs,
                        batch_size=batch_size,runs_dir=runs_dir,use_batchnorm=True)
    return clf


In [7]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import metrics
K=5
results = {}

#hyper-params
batch_size = 5*1024 # increasing batch size with more gpu added
optim = 'Adam'
num_epochs = 60

N,input_dim = X.shape
num_class = len(np.unique(y))

num_layers = 'conv_2_fc_1'
run='Kfold'
lr = 1e-3 #best_lr
reg = 1e-6 #best_reg

skf = StratifiedKFold(n_splits=K,random_state=SEED)
for fold_index, (dev_index,test_index) in enumerate(skf.split(X,y)): 
    print('---------------------------------------------')
    print('Fold #{}'.format(fold_index))    
    X_dev = X[dev_index]
    y_dev = y[dev_index]
    X_test = X[test_index]
    y_test = y[test_index]

    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)
    for train_index, val_index in sss.split(X_dev, y_dev): # runs only once
        X_train = X_dev[train_index]
        y_train = y_dev[train_index]
        X_val = X_dev[val_index]
        y_val = y_dev[val_index]    
        
        
        X_train,y_train = balance_data(X_train,y_train,seed = SEED)

        classifier_args = ('softmax',optim,lr,reg,batch_size,input_dim,num_class,num_epochs)
        config =  'CNN_{}_layer_relu_nn/{}-{}_run/optim_{}_lr_{}_reg_{}_bs_{}'.\
        format(num_layers,run,fold_index,optim,lr,reg,batch_size)
        
        runs_dir = join(dataroot,'runs',config)
    
        clf = getClassifier(classifier_args,runs_dir)
        
        #clf.fit(X_train,y_train,X_val,y_val,verbose=False)
    
        row_pred,pred = clf.predict(X_val,eval_mode=True)
        imbalanced_val_acc = metrics.accuracy_score(y_val,pred)*100        
        val_acc = metrics.balanced_accuracy_score(y_val,pred)*100

        row_pred, pred = clf.predict(X_test,eval_mode=True)
        test_acc = metrics.balanced_accuracy_score(y_test,pred)*100
        print('val acc:',imbalanced_val_acc)
        print('balanced val acc: ',val_acc)
        print('balanced test acc: ',test_acc)
    
        results[fold_index]= (imbalanced_val_acc, val_acc, test_acc)
        break

---------------------------------------------
Fold #0
Loaded MachineLearningCVE/runs/CNN_conv_2_fc_1_layer_relu_nn/Kfold-0_run/optim_Adam_lr_0.001_reg_1e-06_bs_5120 model trained with batch_size = 5120, seen 57 epochs and 50 mini batches
Loaded MachineLearningCVE/runs/CNN_conv_2_fc_1_layer_relu_nn/Kfold-0_run/optim_Adam_lr_0.001_reg_1e-06_bs_5120 model trained with batch_size = 5120, seen 57 epochs and 50 mini batches
val acc: 95.99463920621393
balanced val acc:  93.17087864851128
balanced test acc:  84.22443801611917
---------------------------------------------
Fold #1
Loaded MachineLearningCVE/runs/CNN_conv_2_fc_1_layer_relu_nn/Kfold-1_run/optim_Adam_lr_0.001_reg_1e-06_bs_5120 model trained with batch_size = 5120, seen 55 epochs and 350 mini batches
Loaded MachineLearningCVE/runs/CNN_conv_2_fc_1_layer_relu_nn/Kfold-1_run/optim_Adam_lr_0.001_reg_1e-06_bs_5120 model trained with batch_size = 5120, seen 55 epochs and 350 mini batches
val acc: 95.60804470556545
balanced val acc:  93.737

In [8]:
sum_imbalanced_val_acc = 0
sum_val_acc = 0
sum_test_acc = 0
for fold_index,res in results.items():
    (acc,balanced_acc,test_acc) = res
    sum_imbalanced_val_acc+=acc
    sum_val_acc +=balanced_acc
    sum_test_acc+= test_acc
print("{}-fold evaluation:".format(K))
print('imbalanced val acc:',sum_imbalanced_val_acc/K)
print('val acc: ',sum_val_acc/K)
print('test acc: ',sum_test_acc/K)
print('{0:.2f} {1:.2f} {2:.2f}'.format(sum_imbalanced_val_acc/K,sum_val_acc/K,sum_test_acc/K))

5-fold evaluation:
imbalanced val acc: 95.84698736965606
val acc:  92.53391703138325
test acc:  87.10614795958128
95.85 92.53 87.11


In [9]:
# Visualize the cross-validation results
import math
import matplotlib.pyplot as plt
x_scatter = [math.log10(x[0]) for x in results]
y_scatter = [math.log10(x[1]) for x in results]


# plot validation accuracy
marker_size=100
colors = [results[x] for x in results] # default size of markers is 20

plt.scatter(x_scatter, y_scatter, marker_size, c=colors, cmap=plt.cm.coolwarm)
plt.colorbar()
plt.xlabel('log learning rate')
plt.ylabel('log regularization strength')
plt.title('Net intrusion(CIC-IDS-2017) validation accuracy')
plt.savefig('MachineLearningCVE/5_layers_relu_{}th_run.png'.format(run))
plt.show()

TypeError: 'int' object is not subscriptable