In [None]:
# Notebook provenance: the author's name and the date the notebook was written.
author, date = "Jumabek Alikhanov", 'Nov 17,2019'

### Download data directly from the link below and unzip it in the same directory

In [5]:
# Download the dataset archive from Google Drive by file id; per the captured
# output it is saved as MachineLearningCSV.zip (about 235 MB).
!gdown --id 1-t3RdDpmqMs4ABt9oobSapeNYTZJ9tpu
# Extract the archive into the current working directory.
# NOTE(review): presumably this creates the MachineLearningCVE/ folder that is
# used as `dataroot` further down — confirm against the archive contents.
!unzip MachineLearningCSV.zip

Downloading...
From: https://drive.google.com/uc?id=1-t3RdDpmqMs4ABt9oobSapeNYTZJ9tpu
To: /content/net_intrusion_detection/MachineLearningCSV.zip
100% 235M/235M [00:00<00:00, 241MB/s]


In [7]:
#imports
import os
from os.path import join
import glob
import pandas as pd
import numpy as np
import time
from preprocessing import load_data
%load_ext autoreload

In [8]:
# Root directory of the dataset: load_data reads the CSV files from here, and
# Kfold_evaluation writes each fold's run directory under <dataroot>/runs/.
dataroot = 'MachineLearningCVE/'

In [9]:
# Load every CSV under dataroot and run the project's preprocessing.
# The printed log reports 79 columns when loading, while the classifier is later
# built with input_dim 76, so some columns are dropped along the way.
X,y = load_data(dataroot) # reads csv file and returns np array of X,y -> of shape (N,D) and (N,1)

MachineLearningCVE/*.csv
['MachineLearningCVE/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv', 'MachineLearningCVE/Friday-WorkingHours-Morning.pcap_ISCX.csv', 'MachineLearningCVE/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv', 'MachineLearningCVE/Monday-WorkingHours.pcap_ISCX.csv', 'MachineLearningCVE/Wednesday-workingHours.pcap_ISCX.csv', 'MachineLearningCVE/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv', 'MachineLearningCVE/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv', 'MachineLearningCVE/Tuesday-WorkingHours.pcap_ISCX.csv']
There are 2830743 flow records with 79 feature dimension
Data loaded.
Data preprocessing started...
Stripped column names
Dropped bad columns
There are 0 nan entries
Converted to numeric


## Data Imbalance
1. It is crucial to address this issue in order to get decent performance.
2. It also affects evaluation, so we should report `balanced accuracy` rather than plain accuracy.

In [10]:
from preprocessing import balance_data, normalize
# Scale the feature matrix with the project's normalize helper.
# NOTE(review): this rebinds X to its own normalized version, so re-running this
# cell normalizes already-normalized data; re-run the load_data cell first.
# NOTE(review): normalization is applied to the full dataset before the K-fold
# split; if normalize uses dataset-wide statistics, the test folds influence the
# scaling of the training data — confirm against preprocessing.normalize.
X = normalize(X)

In [11]:
%autoreload 2
from models import Classifier

def ensure_dir(dir_path):
    """Create ``dir_path`` (including missing parents) if it does not exist yet.

    Args:
        dir_path: Path of the directory to create.

    Raises:
        FileExistsError: If ``dir_path`` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of the previous
    # `if not exists: makedirs` pattern, and surfaces the case where the path
    # is an existing regular file instead of silently accepting it.
    os.makedirs(dir_path, exist_ok=True)

def getClassifier(args,runs_dir=None):
    """Build a ``Classifier`` from a packed hyper-parameter tuple.

    Args:
        args: 8-tuple ``(method, optim, lr, reg, batch_size, input_dim,
            num_class, num_iters)``. ``num_iters`` is forwarded to the
            classifier as ``num_epochs``. ``optim`` is unpacked but not passed
            to ``Classifier``.
        runs_dir: Optional run directory; created if it does not exist and
            forwarded to the classifier.

    Returns:
        The constructed ``Classifier`` instance.
    """
    method, _optim, lr, reg, batch_size, input_dim, num_class, num_iters = args

    # Make sure the run directory exists before the classifier is handed its path.
    if runs_dir is not None:
        ensure_dir(runs_dir)

    return Classifier(
        method,
        input_dim,
        num_class,
        lr=lr,
        reg=reg,
        num_epochs=num_iters,
        batch_size=batch_size,
        runs_dir=runs_dir,
    )

In [12]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import metrics
def Kfold_evaluation(X,y,method='nn3',K=5,lr=1e-3,reg=1e-5,num_epochs=20,seed=None):
    """Run stratified K-fold cross-validation and collect balanced test accuracy.

    For every fold the training split is rebalanced with ``balance_data``, a
    fresh classifier is built and fitted, and the held-out split is scored with
    ``sklearn.metrics.balanced_accuracy_score`` (scaled to percent).

    Args:
        X: Feature matrix, indexed by row with the fold index arrays; must be 2-D.
        y: Labels aligned with the rows of ``X``.
        method: Model name forwarded to the classifier and used in the run path.
        K: Number of folds.
        lr: Learning rate forwarded to the classifier.
        reg: Regularization strength forwarded to the classifier.
        num_epochs: Number of training epochs forwarded to the classifier.
        seed: Seed passed to ``balance_data``. When ``None``, a notebook-level
            ``SEED`` is used if one is defined, otherwise 0.

    Returns:
        dict mapping fold index to the balanced test accuracy in percent.
    """
    if seed is None:
        # This function used to read a global SEED that no visible cell defines,
        # which raises NameError on a fresh kernel. Honour such a global if it
        # exists, otherwise fall back to a fixed default.
        seed = globals().get('SEED', 0)

    results = {}

    #hyper-params
    batch_size = 5120 # increasing batch size with more gpu added
    optim = 'Adam'

    _, input_dim = X.shape
    num_class = len(np.unique(y))

    # Folds are taken without shuffling (StratifiedKFold default).
    skf = StratifiedKFold(n_splits=K,)
    for fold_index, (train_index,test_index) in enumerate(skf.split(X,y)):
        print('---------------------------------------------')
        print('Fold #{}'.format(fold_index))
        X_train = X[train_index]
        y_train = y[train_index]
        X_test = X[test_index]
        y_test = y[test_index]

        # Only the training split is rebalanced; the test split keeps its
        # original class distribution.
        X_train,y_train = balance_data(X_train,y_train,seed = seed)

        classifier_args = (method,optim,lr,reg,batch_size,input_dim,num_class,num_epochs)
        config =  '{}/Kfold-{}th_run/optim_{}_lr_{}_reg_{}_bs_{}'.format(method,fold_index,optim,lr,reg,batch_size)
        runs_dir = join(dataroot,'runs',config)

        clf = getClassifier(classifier_args,runs_dir)

        clf.fit(X_train,y_train)

        pred = clf.predict(X_test,eval_mode=True)
        test_acc = metrics.balanced_accuracy_score(y_test,pred)*100
        print('balanced test acc: ',test_acc)
        results[fold_index]= (test_acc)
    return results

In [13]:
# Cross-validate the 5-layer neural network.
method = 'nn5'
K=5
results = Kfold_evaluation(X,y,method=method,K=K,lr=1e-3,reg=1e-5,num_epochs=20)

# Average the per-fold balanced test accuracies and print the mean with two decimals.
sum_test_acc = 0
for test_acc in results.values():
    sum_test_acc += test_acc
print('{0:.2f}'.format(sum_test_acc/K))

---------------------------------------------
Fold #0


INFO:models:Classifier initialized with method nn5, input_dim 76, num_classes 15, num_epochs 20, batch_size 5120, lr 0.001000, reg 0.000010, runs_dir MachineLearningCVE/runs/nn5/Kfold-0th_run/optim_Adam_lr_0.001_reg_1e-05_bs_5120
INFO:models:Starting training process...
DEBUG:models:Epoch [1/20], Step [50/398], Loss: 0.6280
DEBUG:models:Epoch [1/20], Step [100/398], Loss: 0.2898
DEBUG:models:Epoch [1/20], Step [150/398], Loss: 0.2421
DEBUG:models:Epoch [1/20], Step [200/398], Loss: 0.2240
DEBUG:models:Epoch [1/20], Step [250/398], Loss: 0.2037
DEBUG:models:Epoch [1/20], Step [300/398], Loss: 0.1845
DEBUG:models:Epoch [1/20], Step [350/398], Loss: 0.2058
DEBUG:models:Epoch [2/20], Step [1/398], Loss: 0.2019
DEBUG:models:Epoch [2/20], Step [51/398], Loss: 0.1918
DEBUG:models:Epoch [2/20], Step [101/398], Loss: 0.1816
DEBUG:models:Epoch [2/20], Step [151/398], Loss: 0.1787
DEBUG:models:Epoch [2/20], Step [201/398], Loss: 0.1842
DEBUG:models:Epoch [2/20], Step [251/398], Loss: 0.1796
DEBUG

Loaded MachineLearningCVE/runs/nn5/Kfold-0th_run/optim_Adam_lr_0.001_reg_1e-05_bs_5120 model trained with batch_size = 5120, seen 3 epochs and 102 mini batches
balanced test acc:  84.86105583548303
---------------------------------------------
Fold #1


INFO:models:Classifier initialized with method nn5, input_dim 76, num_classes 15, num_epochs 20, batch_size 5120, lr 0.001000, reg 0.000010, runs_dir MachineLearningCVE/runs/nn5/Kfold-1th_run/optim_Adam_lr_0.001_reg_1e-05_bs_5120
INFO:models:Starting training process...
DEBUG:models:Epoch [1/20], Step [50/398], Loss: 0.6766
DEBUG:models:Epoch [1/20], Step [100/398], Loss: 0.3034
DEBUG:models:Epoch [1/20], Step [150/398], Loss: 0.2639
DEBUG:models:Epoch [1/20], Step [200/398], Loss: 0.2409
DEBUG:models:Epoch [1/20], Step [250/398], Loss: 0.2205
DEBUG:models:Epoch [1/20], Step [300/398], Loss: 0.2228
DEBUG:models:Epoch [1/20], Step [350/398], Loss: 0.1952
DEBUG:models:Epoch [2/20], Step [1/398], Loss: 0.2264
DEBUG:models:Epoch [2/20], Step [51/398], Loss: 0.1885
DEBUG:models:Epoch [2/20], Step [101/398], Loss: 0.2125
DEBUG:models:Epoch [2/20], Step [151/398], Loss: 0.1800
DEBUG:models:Epoch [2/20], Step [201/398], Loss: 0.1883
DEBUG:models:Epoch [2/20], Step [251/398], Loss: 0.1884
DEBUG

Loaded MachineLearningCVE/runs/nn5/Kfold-1th_run/optim_Adam_lr_0.001_reg_1e-05_bs_5120 model trained with batch_size = 5120, seen 2 epochs and 151 mini batches
balanced test acc:  87.38730502709997
---------------------------------------------
Fold #2


INFO:models:Classifier initialized with method nn5, input_dim 76, num_classes 15, num_epochs 20, batch_size 5120, lr 0.001000, reg 0.000010, runs_dir MachineLearningCVE/runs/nn5/Kfold-2th_run/optim_Adam_lr_0.001_reg_1e-05_bs_5120
INFO:models:Starting training process...
DEBUG:models:Epoch [1/20], Step [50/398], Loss: 0.6385
DEBUG:models:Epoch [1/20], Step [100/398], Loss: 0.2960
DEBUG:models:Epoch [1/20], Step [150/398], Loss: 0.2503
DEBUG:models:Epoch [1/20], Step [200/398], Loss: 0.2386
DEBUG:models:Epoch [1/20], Step [250/398], Loss: 0.2275
DEBUG:models:Epoch [1/20], Step [300/398], Loss: 0.2239
DEBUG:models:Epoch [1/20], Step [350/398], Loss: 0.2425
DEBUG:models:Epoch [2/20], Step [1/398], Loss: 0.2483
DEBUG:models:Epoch [2/20], Step [51/398], Loss: 0.2189
DEBUG:models:Epoch [2/20], Step [101/398], Loss: 0.2147
DEBUG:models:Epoch [2/20], Step [151/398], Loss: 0.1963
DEBUG:models:Epoch [2/20], Step [201/398], Loss: 0.1879
DEBUG:models:Epoch [2/20], Step [251/398], Loss: 0.1932
DEBUG

Loaded MachineLearningCVE/runs/nn5/Kfold-2th_run/optim_Adam_lr_0.001_reg_1e-05_bs_5120 model trained with batch_size = 5120, seen 4 epochs and 203 mini batches
balanced test acc:  90.31020740930572
---------------------------------------------
Fold #3


INFO:models:Classifier initialized with method nn5, input_dim 76, num_classes 15, num_epochs 20, batch_size 5120, lr 0.001000, reg 0.000010, runs_dir MachineLearningCVE/runs/nn5/Kfold-3th_run/optim_Adam_lr_0.001_reg_1e-05_bs_5120
INFO:models:Starting training process...
DEBUG:models:Epoch [1/20], Step [50/398], Loss: 0.6800
DEBUG:models:Epoch [1/20], Step [100/398], Loss: 0.2963
DEBUG:models:Epoch [1/20], Step [150/398], Loss: 0.2565
DEBUG:models:Epoch [1/20], Step [200/398], Loss: 0.2671
DEBUG:models:Epoch [1/20], Step [250/398], Loss: 0.2236
DEBUG:models:Epoch [1/20], Step [300/398], Loss: 0.2730
DEBUG:models:Epoch [1/20], Step [350/398], Loss: 0.2088
DEBUG:models:Epoch [2/20], Step [1/398], Loss: 0.2289
DEBUG:models:Epoch [2/20], Step [51/398], Loss: 0.2048
DEBUG:models:Epoch [2/20], Step [101/398], Loss: 0.1802
DEBUG:models:Epoch [2/20], Step [151/398], Loss: 0.1936
DEBUG:models:Epoch [2/20], Step [201/398], Loss: 0.1825
DEBUG:models:Epoch [2/20], Step [251/398], Loss: 0.1817
DEBUG

Loaded MachineLearningCVE/runs/nn5/Kfold-3th_run/optim_Adam_lr_0.001_reg_1e-05_bs_5120 model trained with batch_size = 5120, seen 3 epochs and 102 mini batches
balanced test acc:  86.47327250186615
---------------------------------------------
Fold #4


INFO:models:Classifier initialized with method nn5, input_dim 76, num_classes 15, num_epochs 20, batch_size 5120, lr 0.001000, reg 0.000010, runs_dir MachineLearningCVE/runs/nn5/Kfold-4th_run/optim_Adam_lr_0.001_reg_1e-05_bs_5120
INFO:models:Starting training process...
DEBUG:models:Epoch [1/20], Step [50/398], Loss: 0.6346
DEBUG:models:Epoch [1/20], Step [100/398], Loss: 0.3089
DEBUG:models:Epoch [1/20], Step [150/398], Loss: 0.2410
DEBUG:models:Epoch [1/20], Step [200/398], Loss: 0.2249
DEBUG:models:Epoch [1/20], Step [250/398], Loss: 0.2162
DEBUG:models:Epoch [1/20], Step [300/398], Loss: 0.2074
DEBUG:models:Epoch [1/20], Step [350/398], Loss: 0.1922
DEBUG:models:Epoch [2/20], Step [1/398], Loss: 0.2424
DEBUG:models:Epoch [2/20], Step [51/398], Loss: 0.1923
DEBUG:models:Epoch [2/20], Step [101/398], Loss: 0.1846
DEBUG:models:Epoch [2/20], Step [151/398], Loss: 0.1954
DEBUG:models:Epoch [2/20], Step [201/398], Loss: 0.1946
DEBUG:models:Epoch [2/20], Step [251/398], Loss: 0.1746
DEBUG

Loaded MachineLearningCVE/runs/nn5/Kfold-4th_run/optim_Adam_lr_0.001_reg_1e-05_bs_5120 model trained with batch_size = 5120, seen 2 epochs and 351 mini batches
balanced test acc:  79.12800402504438
85.63
