In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import warnings; warnings.simplefilter('ignore')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler, RobustScaler, Normalizer
from sklearn.model_selection import KFold, StratifiedKFold
from tqdm import tqdm_notebook as tqdm
import time

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        if '.csv' in filename:
            print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/unsw-nb15-eda/train.csv
/kaggle/input/unsw-nb15-eda/features.csv
/kaggle/input/unsw-nb15-eda/test.csv
/kaggle/input/unsw-nb15/UNSW-NB15_2.csv
/kaggle/input/unsw-nb15/NUSW-NB15_features.csv
/kaggle/input/unsw-nb15/UNSW-NB15_1.csv
/kaggle/input/unsw-nb15/UNSW_NB15_testing-set.csv
/kaggle/input/unsw-nb15/UNSW-NB15_3.csv
/kaggle/input/unsw-nb15/UNSW_NB15_training-set.csv
/kaggle/input/unsw-nb15/UNSW-NB15_LIST_EVENTS.csv
/kaggle/input/unsw-nb15/UNSW-NB15_4.csv


In [2]:
train = pd.read_csv('/kaggle/input/unsw-nb15/UNSW_NB15_training-set.csv')
test = pd.read_csv('/kaggle/input/unsw-nb15/UNSW_NB15_testing-set.csv')
if train.shape[0]<100000:
    print("Fixing train test")
    train, test = test, train

drop_columns = ['attack_cat', 'id']
for df in [train, test]:
    for col in drop_columns:
        if col in df.columns:
            print('Dropping '+col)
            df.drop([col], axis=1, inplace=True)

Fixing train test
Dropping attack_cat
Dropping id
Dropping attack_cat
Dropping id


Util methods

In [3]:
def detection_rate(y_true, y_pred):
    CM = metrics.confusion_matrix(y_true, y_pred)
    TN = CM[0][0]
    FN = CM[1][0]
    TP = CM[1][1]
    FP = CM[0][1]
    return TP/(TP+FN)

def false_positive_rate(y_true, y_pred):
    CM = metrics.confusion_matrix(y_true, y_pred)
    TN = CM[0][0]
    FN = CM[1][0]
    TP = CM[1][1]
    FP = CM[0][1]
    return FP/(FP+TN)

def false_alarm_rate(y_true, y_pred):
    CM = metrics.confusion_matrix(y_true, y_pred)
    TN = CM[0][0]
    FN = CM[1][0]
    TP = CM[1][1]
    FP = CM[0][1]
    return (FP+FN)/(TP+TN+FP+FN)

def get_xy(df):
    return pd.get_dummies(df.drop(['label'], axis=1)), df['label']

def get_cat_columns(train):
    categorical = []
    for col in train.columns:
        if train[col].dtype == 'object':
            categorical.append(col)
    return categorical

def label_encode(train, test):
    for col in get_cat_columns(train):
        le = LabelEncoder()
        le.fit(list(train[col].astype(str).values) + list(test[col].astype(str).values))
        train[col] = le.transform(list(train[col].astype(str).values))
        test[col] = le.transform(list(test[col].astype(str).values))
    return train, test

def process_feature(df):
    df.loc[~df['state'].isin(['FIN', 'INT', 'CON', 'REQ', 'RST']), 'state'] = 'others'
    df.loc[~df['service'].isin(['-', 'dns', 'http', 'smtp', 'ftp-data', 'ftp', 'ssh', 'pop3']), 'service'] = 'others'
    df.loc[df['proto'].isin(['igmp', 'icmp', 'rtp']), 'proto'] = 'igmp_icmp_rtp'
    df.loc[~df['proto'].isin(['tcp', 'udp', 'arp', 'ospf', 'igmp_icmp_rtp']), 'proto'] = 'others'
    return df

def get_train_test(train, test, feature_engineer=True, label_encoding=False, scaler=StandardScaler()):
    x_train, y_train = train.drop(['label'], axis=1), train['label']
    x_test, y_test = test.drop(['label'], axis=1), test['label']
    
    if feature_engineer:
        x_train, x_test = process_feature(x_train), process_feature(x_test)
    
    categorical_columns = get_cat_columns(x_train)
    non_categorical_columns = [x for x in x_train.columns if x not in categorical_columns]
    if scaler is not None:
        x_train[non_categorical_columns] = scaler.fit_transform(x_train[non_categorical_columns])
        x_test[non_categorical_columns] = scaler.transform(x_test[non_categorical_columns])

    if label_encoding:
        x_train, x_test = label_encode(x_train, x_test)
        features = x_train.columns
    else:
        x_train = pd.get_dummies(x_train)
        x_test = pd.get_dummies(x_test)
        print("Column mismatch {0}, {1}".format(set(x_train.columns)- set(x_test.columns),  set(x_test.columns)- set(x_train.columns)))
        features = list(set(x_train.columns) & set(x_test.columns))
    print(f"Number of features {len(features)}")
    x_train = x_train[features]
    x_test = x_test[features]

    return x_train, y_train, x_test, y_test

def results(y_test, y_pred):
    acc = metrics.accuracy_score(y_test, y_pred)
    pre = metrics.precision_score(y_test, y_pred)
    rec = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    print(f"Acc {acc}, Precision {pre}, Recall {rec}, F1-score {f1}")
    
    CM = metrics.confusion_matrix(y_test, y_pred)
    TN = CM[0][0]
    FN = CM[1][0]
    TP = CM[1][1]
    FP = CM[0][1]
    
    # detection rate or true positive rate
    DR = TP*100/(TP+FN)
    # false positive rate
    FPR = FP*100/(FP+TN)
    # false alarm rate 
    FAR = (FP+FN)*100/(TP+TN+FP+FN)
    
    print("DR {0}, FPR {1}, FAR {2}".format(DR, FPR, FAR))
    print(metrics.classification_report(y_test, y_pred))

In [4]:
def cross_validation(params, X, Y):
    y_probs = []
    y_vals = []

    # for tr_idx, val_idx in tqdm(kf.split(X, Y), total=folds):
    for tr_idx, val_idx in kf.split(X, Y):
        x_train, y_train = X.iloc[tr_idx], Y[tr_idx]
        x_val, y_val = X.iloc[val_idx], Y[val_idx]
        clf = RandomForestClassifier(**params)
        clf.fit(x_train, y_train)
        y_prob = clf.predict_proba(x_val)[:, 1]
        
        y_probs.append(y_prob)
        y_vals.append(y_val)
        
    acc, pre, rec, f1, far, fpr, dr, auc = 0, 0, 0, 0, 0, 0, 0, 0
    folds = len(y_probs)
    for i in range(folds):
        y_prob, y_val = y_probs[i], y_vals[i]
        y_pred = np.where(y_prob>=0.5, 1, 0)

        acc += metrics.accuracy_score(y_val, y_pred)/folds
        f1 += metrics.f1_score(y_val, y_pred)/folds
        pre += metrics.precision_score(y_val, y_pred) /folds
        rec += metrics.recall_score(y_val, y_pred) /folds
        dr += detection_rate(y_val, y_pred) /folds
        fpr += false_positive_rate(y_val, y_pred) /folds
        far += false_alarm_rate(y_val, y_pred)/folds
        auc += metrics.roc_auc_score(y_val, y_prob) /folds 
    
    print(f"Acc {acc}, Precision {pre}, Recall {rec}, F1-score {f1} \nFAR {far}, FPR {fpr}, DR {dr} , AUC {auc}")
    
def test_run(params, X, Y):
    clf = RandomForestClassifier(**params)
    clf.fit(X, Y)
    y_pred = clf.predict(x_test)
    results(y_test, y_pred)
    
    y_prob = clf.predict_proba(x_test)[:, 1]
    print("Auc {0}".format(metrics.roc_auc_score(y_test, y_prob)))

In [5]:
folds = 10
seed = 1
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
params = {
    'n_estimators': 100,
    'random_state':1,
    'class_weight': {0:2, 1:1}
}

# Why overfitting will be bad
Evaluating the model on the same data, that was used for training gives wrong info about the actual performance of the model. Here we can see the model achieve nearly 100% accuracy of train data. However, when ten-fold cross validation is used on train data it reduces to 96%. And on test data the model's performance falls drastically to 87.16%. This indicates the model needs to be generalized and effective measures need to be taken to reduce overfit.

In [6]:
X, Y, x_test, y_test = get_train_test(train, test, feature_engineer=False, label_encoding=True, scaler=None)

clf = RandomForestClassifier()
clf.fit(X,Y)
y_pred = clf.predict(X)
results(Y, y_pred)

Number of features 42
Acc 0.9964012980421009, Precision 0.9970855735892669, Recall 0.9976286439698008, F1-score 0.9973570348527938
DR 99.76286439698008, FPR 0.6214285714285714, FAR 0.3598701957899179
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     56000
           1       1.00      1.00      1.00    119341

    accuracy                           1.00    175341
   macro avg       1.00      1.00      1.00    175341
weighted avg       1.00      1.00      1.00    175341



In [7]:
cross_validation(params, X, Y)

Acc 0.9608705325583738, Precision 0.9633644554079561, Recall 0.979772254591989, F1-score 0.9714973621567162 
FAR 0.039129467441626056, FPR 0.07941071428571428, DR 0.979772254591989 , AUC 0.9939725918435274


In [8]:
test_run(params, X, Y)

Acc 0.8722853811397755, Precision 0.818469531492966, Recall 0.9869407923762463, F1-score 0.8948447422371117
DR 98.69407923762463, FPR 26.818918918918918, FAR 12.771461886022445
              precision    recall  f1-score   support

           0       0.98      0.73      0.84     37000
           1       0.82      0.99      0.89     45332

    accuracy                           0.87     82332
   macro avg       0.90      0.86      0.87     82332
weighted avg       0.89      0.87      0.87     82332

Auc 0.9797695870228298


# Combinations of Feature Preprocessing

In [9]:
# Drop Features with low importance
drop_columns = ['response_body_len', 'is_sm_ips_ports', 'ct_flw_http_mthd', 'trans_depth', 'dwin', 'ct_ftp_cmd', 'is_ftp_login']
for df in [train, test]:
    df.drop(drop_columns, axis=1, inplace=True)
X, Y, x_test, y_test = get_train_test(train, test, feature_engineer=True, label_encoding=False, scaler=RobustScaler())

Column mismatch set(), set()
Number of features 53


| Preprocess | Train Acc | Test Acc|
|:-----:|:------:|:------:|
| LabelEncoded | 95.92 | 87.40 |
| OneHotEncoded | 95.65 | 87.87 |
| OneHotEncoded, FeatureEngineer| 95.66 | 88.03 |
| OneHotEncoded, FeatureEngineer, MinMaxScaler| 95.80 | 87.49 |
| OneHotEncoded, FeatureEngineer, RobustScaler| 95.67 | 87.92 |
| OneHotEncoded, FeatureEngineer, StandardScaler| 95.67 | 87.89 |
| OneHotEncoded, FeatureEngineer, FeatureSelection, StandardScaler| 95.72 | 87.81 |
| OneHotEncoded, FeatureEngineer, FeatureSelection, RobustScaler| 95.70 | 88.03 |
| OneHotEncoded, FeatureEngineer, FeatureSelection, MinMaxScaler| 95.77 | 87.73 |

In [10]:
params = {
    'random_state':1,
    'class_weight': {0:2, 1:1}
}
start_time = time.clock()
cross_validation(params, X, Y)
print("Time spent in 10-fold cross validation of train data ", time.clock()-start_time)

start_time = time.clock()
test_run(params, X, Y)
print("Time spent in test run ", time.clock()-start_time)

Acc 0.9561939218566241, Precision 0.9567817183292109, Recall 0.9799063154850629, F1-score 0.9682036274888738 
FAR 0.04380607814337594, FPR 0.09433928571428572, DR 0.9799063154850629 , AUC 0.989753764302109
Time spent in 10-fold cross validation of train data  36.799360999999976
Acc 0.8791356945051741, Precision 0.8328347538146037, Recall 0.9764846024883085, F1-score 0.8989571804270787
DR 97.64846024883084, FPR 24.013513513513512, FAR 12.086430549482582
              precision    recall  f1-score   support

           0       0.96      0.76      0.85     37000
           1       0.83      0.98      0.90     45332

    accuracy                           0.88     82332
   macro avg       0.90      0.87      0.87     82332
weighted avg       0.89      0.88      0.88     82332

Auc 0.9643720997755896
Time spent in test run  4.73065600000001


# Hypertuning

In [11]:
X, Y, x_test, y_test = get_train_test(train, test, feature_engineer=True, label_encoding=False, scaler=RobustScaler())

Column mismatch set(), set()
Number of features 53


<table>
    <tr>
    <th>PreProcessing</th> <th style="text-align:center">Parameters</th><th>Train Acc</th><th>Test Acc</th>
    </tr>
    <tr>
      <td rowspan="8">OneHot encoding, StandardScaler</td><td>n_estimators 10, max_depth 10, max_features 10, class_weight {0:2, 1: 1}</td><td>94.10</td><td>91.05</td>
    </tr>
    <tr>
        <td>n_estimators 10, max_depth 10, max_features 20, class_weight {0:2, 1: 1}</td> <td>94.16</td> <td>91.61</td>
    </tr>
    <tr>
        <td>n_estimators 10, max_depth 20, max_features 20, class_weight {0:2, 1: 1}</td> <td>95.64</td> <td>89.16</td>
    </tr>
    <tr>
        <td>n_estimators 30, max_depth 10, max_features 30, class_weight {0:2, 1: 1}</td> <td>94.19</td> <td>91.66</td>
    </tr>
     <tr>
        <td>n_estimators 10</td> <td>95.66</td> <td>87.90</td>
    </tr>
    <tr>
        <td>n_estimators 10, max_depth 20</td> <td>95.73</td> <td>86.91</td>
    </tr>
    <tr>
        <td>n_estimators 10, max_features 20</td> <td>95.76</td> <td>87.57</td>
    </tr>
    <tr>
        <td>n_estimators 50</td> <td>96.06</td> <td>87.31</td>
    </tr>
     <tr>
      <td rowspan="2">OneHot encoding, MinMaxScaler</td><td>n_estimators 10, max_depth 10, max_features 30, class_weight {0:2, 1: 1}</td><td>93.98</td><td>91.49</td>
    </tr>
    <tr>
      <td>n_estimators 10, max_depth 10, max_features 10, class_weight {0:2, 1: 1}</td><td>94.20</td><td>91.32</td>
    </tr>
    <tr>
      <td rowspan="3">OneHot encoding, RobustScaler</td><td>n_estimators 10, max_depth 10, max_features 30, class_weight {0:2, 1: 1}</td><td>94.08</td><td>91.90</td>
    </tr>
    <tr>
      <td>n_estimators 50, max_depth 10, max_features 30, class_weight {0:2, 1: 1}</td><td>94.21</td><td>91.68</td>
    </tr>
    <tr>
      <td>n_estimators 20, max_depth 10, max_features 10, class_weight {0:2, 1: 1}</td><td>94.21</td><td>91.36</td>
    </tr>
    <tr>
      <td rowspan="1">OneHot encoding</td><td>n_estimators 10, max_depth 10, max_features 10, class_weight {0:2, 1: 1}</td><td>94.20</td><td>91.02</td>
    </tr>
</table>

In [12]:
for n_estimators in [20, 50]:
    for max_features in [10, 30]:
        print("n_estimators {0} max_features {1}".format(n_estimators, max_features))
        params = {
           'n_estimators': n_estimators,
            'random_state':1,
            'max_depth':10,
            'max_features': max_features,
            'class_weight': {0:2, 1:1}
        }
        cross_validation(params, X, Y)
        test_run(params, X, Y)
        print()

n_estimators 20 max_features 10
Acc 0.9424207824472288, Precision 0.9756968538115552, Recall 0.9387888848799665, F1-score 0.9568838403022358 
FAR 0.057579217552771074, FPR 0.04983928571428572, DR 0.9387888848799665 , AUC 0.9912013708126307
Acc 0.9128649856677841, Precision 0.8870136719542375, Recall 0.9646166063707756, F1-score 0.9241889464229104
DR 96.46166063707756, FPR 15.054054054054054, FAR 8.713501433221591
              precision    recall  f1-score   support

           0       0.95      0.85      0.90     37000
           1       0.89      0.96      0.92     45332

    accuracy                           0.91     82332
   macro avg       0.92      0.91      0.91     82332
weighted avg       0.92      0.91      0.91     82332

Auc 0.9840462193641625

n_estimators 20 max_features 30
Acc 0.9414797613949374, Precision 0.9774694053117982, Recall 0.9355879821913979, F1-score 0.9560672117854316 
FAR 0.05852023860506269, FPR 0.04596428571428572, DR 0.9355879821913979 , AUC 0.9915677271