In [1]:
author="Jumabek Alikhanov"
date = 'Nov 17,2019'

This is Final Project material for the "Deep Learning" class I took  

## Download the dataset from https://www.unb.ca/cic/datasets/ids-2017.html
We only need CSV files that is preprocessed and labeled for ML 

In [2]:
#imports
import os
from os.path import join
import glob
import pandas as pd
import numpy as np
import time
%load_ext autoreload

In [3]:
# load data
dataroot = 'MachineLearningCVE/'
SEED =2 
np.random.seed(SEED)

In [4]:
%autoreload 2
from preprocessing import load_data
X,y = load_data(dataroot) # reads csv file and returns np array of X,y -> of shape (N,D) and (N,1)

MachineLearningCVE/*.pcap_ISCX.csv
there are 2830743 flow records with 79 feature dimension
stripped column names
dropped bad columns
There are 0 nan entries
converted to numeric


## Data Imbalance
1. It is curucial to adress this issue in order to get decent performance
2. It also affects evaluation, we should calculate  `balanced accuracy`

In [5]:
from preprocessing import balance_data, normalize
X = normalize(X)

In [None]:

%autoreload 2
from models import Classifier

def ensure_dir(dir_path):
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

def getClassifier(args=None,runs_dir=None):
    if args is not None:
        (method,optim,lr,reg,batch_size,input_dim,num_class,num_iters) = args
    if runs_dir is not None:
        ensure_dir(runs_dir)
    
    clf = Classifier(method,input_dim,num_class,lr=lr,reg=reg,num_epochs=num_iters,
                        batch_size=batch_size,runs_dir=runs_dir)
    return clf


In [None]:
def KFoldEvaluation(method='cnn2')

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import metrics
K=5
results = {}

#hyper-params
batch_size = 5*1024 # increasing batch size with more gpu added
optim = 'Adam'

N,input_dim = X.shape
num_class = len(np.unique(y))
num_epochs = 20
num_layers = '3'
run='Kfold'
lr = 1e-3 #best_lr
reg = 1e-5 #best_reg

skf = StratifiedKFold(n_splits=K,random_state=SEED)
for fold_index, (train_index,test_index) in enumerate(skf.split(X,y)): 
        print('---------------------------------------------')
        print('Fold #{}'.format(fold_index))    
        X_train = X[train_index]
        y_train = y[train_index]
        X_test = X[test_index]
        y_test = y[test_index]
        
        X_train,y_train = balance_data(X_train,y_train,seed = SEED)
        num_iters = X_train.shape[0]*10
        classifier_args = ('nn3',optim,lr,reg,batch_size,input_dim,num_class,num_epochs)
        config =  '{}_layer_relu_nn/{}-{}th_run/optim_{}_lr_{}_reg_{}_bs_{}'.format(num_layers,run,fold_index,optim,lr,reg,batch_size)
        runs_dir = join(dataroot,'runs',config)
    
        clf = getClassifier(classifier_args,runs_dir)
        
        clf.fit(X_train,y_train)
    
        pred = clf.predict(X_test,eval_mode=True)
        test_acc = metrics.balanced_accuracy_score(y_test,pred)*100
        print('balanced test acc: ',test_acc)
        results[fold_index]= (test_acc)


---------------------------------------------
Fold #0
building NN3
Loaded MachineLearningCVE/runs/3_layer_relu_nn/Kfold-0th_run/optim_Adam_lr_0.001_reg_1e-05_bs_5120 model trained with batch_size = 5120, seen 4 epochs and 353 mini batches
best epoch 5, best batch 354
bst acc  92.05375114785531
Epoch [6/20], Step [355/398], Loss: 0.2492
Epoch [7/20], Step [6/398], Loss: 0.2394
Epoch [7/20], Step [56/398], Loss: 0.2552
Epoch [7/20], Step [106/398], Loss: 0.2480
Epoch [7/20], Step [156/398], Loss: 0.2475
Epoch [7/20], Step [206/398], Loss: 0.2628
Epoch [7/20], Step [256/398], Loss: 0.2467
Epoch [7/20], Step [306/398], Loss: 0.2453
Epoch [7/20], Step [356/398], Loss: 0.2540
no improvement in accuracy for 10 iterations
Loaded MachineLearningCVE/runs/3_layer_relu_nn/Kfold-0th_run/optim_Adam_lr_0.001_reg_1e-05_bs_5120 model trained with batch_size = 5120, seen 4 epochs and 353 mini batches
balanced test acc:  83.18824804791724
---------------------------------------------
Fold #1
building NN3

Epoch [1/20], Step [150/398], Loss: 0.5796
Epoch [1/20], Step [200/398], Loss: 0.5210
Epoch [1/20], Step [250/398], Loss: 0.4882
Epoch [1/20], Step [300/398], Loss: 0.4501
Epoch [1/20], Step [350/398], Loss: 0.4156
Epoch [2/20], Step [1/398], Loss: 0.4263


In [None]:
sum_test_acc = 0
for fold_index,res in results.items():
    (test_acc) = res
    sum_test_acc+= test_acc
print("{}-fold evaluation:".format(K))
print('test acc: ',sum_test_acc/K)
print('{0:.2f}'.format(sum_test_acc/K))