In [1]:
author="Jumabek Alikhanov"
date = 'Nov 17,2019'

This is Final Project material for the "Deep Learning" class I took  

## Download the dataset from https://www.unb.ca/cic/datasets/ids-2017.html
We only need CSV files that is preprocessed and labeled for ML 

In [2]:
#imports
import os
from os.path import join
import glob
import pandas as pd
import numpy as np
import time
%load_ext autoreload

In [3]:
# load data
dataroot = 'MachineLearningCVE/'
SEED =2 
np.random.seed(SEED)

In [4]:
%autoreload 2
from preprocessing import load_data
X,y = load_data(dataroot) # reads csv file and returns np array of X,y -> of shape (N,D) and (N,1)

MachineLearningCVE/*.pcap_ISCX.csv
there are 2830743 flow records with 79 feature dimension
stripped column names
dropped bad columns
There are 0 nan entries
converted to numeric


## Data Imbalance
1. It is curucial to adress this issue in order to get decent performance
2. It also affects evaluation, we should calculate  `balanced accuracy`

In [5]:
from preprocessing import balance_data, normalize
X = normalize(X)

In [6]:

%autoreload 2
from models import Classifier

def ensure_dir(dir_path):
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

def getClassifier(args,runs_dir=None):
    
    (method,optim,lr,reg,batch_size,input_dim,num_class,num_iters) = args
    if runs_dir is not None:
        ensure_dir(runs_dir)
    
    clf = Classifier(method,input_dim,num_class,lr=lr,reg=reg,num_epochs=num_iters,
                        batch_size=batch_size,runs_dir=runs_dir)
    return clf


In [7]:

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import metrics
def Kfold_evaluation(X,y,method='nn3',K=5,lr=1e-3,reg=1e-5,num_epochs=20):
    results = {}

    #hyper-params
    batch_size = 5120 # increasing batch size with more gpu added
    optim = 'Adam'

    N,input_dim = X.shape
    num_class = len(np.unique(y))
 
    skf = StratifiedKFold(n_splits=K,random_state=SEED)
    for fold_index, (train_index,test_index) in enumerate(skf.split(X,y)): 
        print('---------------------------------------------')
        print('Fold #{}'.format(fold_index))    
        X_train = X[train_index]
        y_train = y[train_index]
        X_test = X[test_index]
        y_test = y[test_index]
        
        X_train,y_train = balance_data(X_train,y_train,seed = SEED)
        
        classifier_args = (method,optim,lr,reg,batch_size,input_dim,num_class,num_epochs)
        config =  '{}/Kfold-{}th_run/optim_{}_lr_{}_reg_{}_bs_{}'.format(method,fold_index,optim,lr,reg,batch_size)
        runs_dir = join(dataroot,'runs',config)
    
        clf = getClassifier(classifier_args,runs_dir)
        
        clf.fit(X_train,y_train)
    
        pred = clf.predict(X_test,eval_mode=True)
        test_acc = metrics.balanced_accuracy_score(y_test,pred)*100
        print('balanced test acc: ',test_acc)
        results[fold_index]= (test_acc)
    return results

In [11]:
#neural network with 3 layer
method = 'nn3'
K=5
#results = Kfold_evaluate(X,y,method=method,K=K,lr=1e-3,reg=1e-5,num_epochs=20)

sum_test_acc = 0
for fold_index,res in results.items():
    (test_acc) = res
    sum_test_acc+= test_acc
print('{0:.2f}'.format(sum_test_acc/K))

84.87


In [12]:
#1D Convolutional neural network with 2 conv layer, 1 dense layer
method = 'cnn2'
K=5
results = Kfold_evaluate(X,y,method=method,K=K,lr=1e-3,reg=1e-6,num_epochs=20)

sum_test_acc = 0
for fold_index,res in results.items():
    (test_acc) = res
    sum_test_acc+= test_acc
print('{0:.2f}'.format(sum_test_acc/K))

---------------------------------------------
Fold #0
best epoch 0, best batch 0
bst acc  -1
Epoch [1/20], Step [50/398], Loss: 0.3436
Epoch [1/20], Step [100/398], Loss: 0.2961
Epoch [1/20], Step [150/398], Loss: 0.2689
Epoch [1/20], Step [200/398], Loss: 0.2282
Epoch [1/20], Step [250/398], Loss: 0.2105
Epoch [1/20], Step [300/398], Loss: 0.1961
Epoch [1/20], Step [350/398], Loss: 0.1910
Epoch [2/20], Step [1/398], Loss: 0.2380
Epoch [2/20], Step [51/398], Loss: 0.1830
Epoch [2/20], Step [101/398], Loss: 0.1824
Epoch [2/20], Step [151/398], Loss: 0.1743
Epoch [2/20], Step [201/398], Loss: 0.1796
Epoch [2/20], Step [251/398], Loss: 0.1695
Epoch [2/20], Step [301/398], Loss: 0.1748
Epoch [2/20], Step [351/398], Loss: 0.1670
Epoch [3/20], Step [2/398], Loss: 0.1799
Epoch [3/20], Step [52/398], Loss: 0.1753
Epoch [3/20], Step [102/398], Loss: 0.1726
Epoch [3/20], Step [152/398], Loss: 0.1647
Epoch [3/20], Step [202/398], Loss: 0.1824
Epoch [3/20], Step [252/398], Loss: 0.1712
Epoch [3/20

Epoch [8/20], Step [157/398], Loss: 0.1197
Epoch [8/20], Step [207/398], Loss: 0.1311
Epoch [8/20], Step [257/398], Loss: 0.1200
Epoch [8/20], Step [307/398], Loss: 0.1295
Epoch [8/20], Step [357/398], Loss: 0.1295
Epoch [9/20], Step [8/398], Loss: 0.1247
Epoch [9/20], Step [58/398], Loss: 0.1225
Epoch [9/20], Step [108/398], Loss: 0.1210
Epoch [9/20], Step [158/398], Loss: 0.1246
Epoch [9/20], Step [208/398], Loss: 0.1223
Epoch [9/20], Step [258/398], Loss: 0.1230
Epoch [9/20], Step [308/398], Loss: 0.1109
Epoch [9/20], Step [358/398], Loss: 0.1208
no improvement in accuracy for 10 iterations
Loaded MachineLearningCVE/runs/cnn2/Kfold-2th_run/optim_Adam_lr_0.001_reg_1e-06_bs_5120 model trained with batch_size = 5120, seen 7 epochs and 306 mini batches
balanced test acc:  88.39551724046086
---------------------------------------------
Fold #3
best epoch 0, best batch 0
bst acc  -1
Epoch [1/20], Step [50/398], Loss: 0.3568
Epoch [1/20], Step [100/398], Loss: 0.3144
Epoch [1/20], Step [15