In [1]:
author="Jumabek Alikhanov"
date = 'Nov 17,2019'

This is Final Project material for the "Deep Learning" class I took  

## Download the dataset from https://www.unb.ca/cic/datasets/ids-2017.html
We only need CSV files that is preprocessed and labeled for ML 

In [2]:
#imports
import os
from os.path import join
import glob
import pandas as pd
import numpy as np
import time
%load_ext autoreload

In [3]:
# load data
dataroot = 'MachineLearningCVE/'
SEED =2 
np.random.seed(SEED)

In [24]:
%autoreload 2
from preprocessing import load_data
X,y = load_data(dataroot) # reads csv file and returns np array of X,y -> of shape (N,D) and (N,1)

MachineLearningCVE/*.pcap_ISCX.csv


  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until


2830743 flow records read which has 79 feature dimension


## Data Imbalance
1. It is curucial to adress this issue in order to get decent performance
2. It also affects evaluation, we should calculate  `balanced accuracy`

In [15]:
from sklearn import preprocessing
from preprocessing import balance_data, normalize

X = normalize(X)
#X = preprocessing.scale(X)

(2830743, 76) (2830743,)


In [17]:

%autoreload 2
from neural_network import NetClassifier

def ensure_dir(dir_path):
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

def getClassifier(args=None,runs_dir=None):
    if args is not None:
        (_,optim,lr,reg,batch_size,input_dim,num_class) = args
    if runs_dir is not None:
        ensure_dir(runs_dir)
    
    num_epochs = 60
    
    clf = NetClassifier(input_dim,num_class,lr=lr,reg=reg,num_epochs=num_epochs,
                        batch_size=batch_size,runs_dir=runs_dir,use_batchnorm=True)
    return clf


In [18]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED)



In [20]:
from sklearn.model_selection import StratifiedKFold
K=5
X = X_train
Y = y_train
results = {}
skf = StratifiedKFold(n_splits=K,random_state=SEED)

#hyper-params
batch_size = 5*1024 # increasing batch size with more gpu added
optim = 'Adam'

N,input_dim = X.shape
num_class = len(np.unique(y_train))

num_layers = 3
run='3_2'
lr = 1e-3 #best_lr
reg = 1e-5 #best_reg

for fold_index, (train_index,val_index) in enumerate(skf.split(X,Y)): 
    print('---------------------------------------------')
    print('Fold #{}'.format(fold_index))    
    X_train = X[train_index]
    y_train = Y[train_index]
    X_val = X[val_index]
    y_val = Y[val_index]

    classifier_args = ('softmax',optim,lr,reg,batch_size,input_dim,num_class)
    config =  '{}_layer_relu_nn/{}th_run/optim_{}_lr_{}_reg_{}_bs_{}'.format(num_layers,run,optim,lr,reg,batch_size)
    runs_dir = join(dataroot,'runs',config)
        
    X_train,y_train = balance_data(X_train,y_train)

    tick = time.time()
    clf = getClassifier(classifier_args,runs_dir)
        
    clf.fit(X_train,y_train,X_val,y_val,verbose=False)
    raw_pred,pred = clf.predict(X_val,eval_mode=True)
    acc = metrics.accuracy_score(y_val,pred)*100        
    balanced_acc = metrics.balanced_accuracy_score(y_val,pred)*100
    
    raw_pred,pred = clf.predict(X_test,eval_mode=True)    
    test_acc = metrics.balanced_accuracy_score(y_test,pred)*100
    print('val acc:',acc)
    print('balanced val acc: ',balanced_acc)
    print('balanced test acc: ',test_acc)
    
    results[fold_index]= (acc,balanced_acc,test_acc)

Loaded MachineLearningCVE/runs/5_layer_relu_nn/5th_run/optim_Adam_lr_1e-05_reg_1e-06_bs_5120 model trained with batch_size = 5120, seen 59 epochs and 300 mini batches
Model is trained in 1.256474494934082 sec
Loaded MachineLearningCVE/runs/5_layer_relu_nn/5th_run/optim_Adam_lr_1e-05_reg_1e-05_bs_5120 model trained with batch_size = 5120, seen 59 epochs and 150 mini batches
Model is trained in 1.2294058799743652 sec
Loaded MachineLearningCVE/runs/5_layer_relu_nn/5th_run/optim_Adam_lr_1e-05_reg_0.0001_bs_5120 model trained with batch_size = 5120, seen 59 epochs and 100 mini batches
Model is trained in 1.2898533344268799 sec
Loaded MachineLearningCVE/runs/5_layer_relu_nn/5th_run/optim_Adam_lr_1e-05_reg_0.001_bs_5120 model trained with batch_size = 5120, seen 59 epochs and 350 mini batches
Model is trained in 1.2325313091278076 sec
Loaded MachineLearningCVE/runs/5_layer_relu_nn/5th_run/optim_Adam_lr_0.0001_reg_1e-06_bs_5120 model trained with batch_size = 5120, seen 59 epochs and 200 mini 

In [22]:
raw_pred,pred = best_model.predict(X_test,eval_mode=True)        
test_acc = metrics.balanced_accuracy_score(y_test,pred)*100

raw_pred,pred = best_model.predict(X_val,eval_mode=True)        
val_acc = metrics.balanced_accuracy_score(y_val,pred)*100
print()
print("val acc of best model ",val_acc)
print("test acc of best model ",test_acc)

Loaded MachineLearningCVE/runs/5_layer_relu_nn/5th_run/optim_Adam_lr_0.0001_reg_1e-05_bs_5120 model trained with batch_size = 5120, seen 57 epochs and 0 mini batches
Loaded MachineLearningCVE/runs/5_layer_relu_nn/5th_run/optim_Adam_lr_0.0001_reg_1e-05_bs_5120 model trained with batch_size = 5120, seen 57 epochs and 0 mini batches

val acc of best model  91.1445743762908
test acc of best model  89.12953471998712


In [None]:
# Visualize the cross-validation results
import math
import matplotlib.pyplot as plt
x_scatter = [math.log10(x[0]) for x in results]
y_scatter = [math.log10(x[1]) for x in results]


# plot validation accuracy
marker_size=100
colors = [results[x] for x in results] # default size of markers is 20

plt.scatter(x_scatter, y_scatter, marker_size, c=colors, cmap=plt.cm.coolwarm)
plt.colorbar()
plt.xlabel('log learning rate')
plt.ylabel('log regularization strength')
plt.title('Net intrusion(CIC-IDS-2017) validation accuracy')
plt.savefig('MachineLearningCVE/5_layers_relu_{}th_run.png'.format(run))
plt.show()