### First, import the necessary libraries

In [1]:
from sklearn.cluster import KMeans
import numpy as np
import seaborn as sns
import csv
import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout


In [2]:
from sklearn.preprocessing import MinMaxScaler

X_train = pd.read_csv('UNSW-NB15/a part of training and testing set/UNSW_NB15_testing-set.csv', header=0) # they flipped the names.
X_test = pd.read_csv('UNSW-NB15/a part of training and testing set/UNSW_NB15_training-set.csv', header=0)

X_train = X_train.dropna()
X_test=  X_test.dropna()

Separate labels from data

In [3]:
y_train = X_train.iloc[:,-1]
X_train = X_train.iloc[:,:-2]

y_test = X_test.iloc[:,-1]
X_test = X_test.iloc[:,:-2]

In [4]:
# categorical
for col in ['proto', 'service', 'state']:
    X_train[col] = pd.Categorical(X_train[col]).codes
    X_test[col] = pd.Categorical(X_test[col]).codes
#
# for col in ['proto', 'service', 'state']:
#     X_train[col] = pd.Categorical(X_train[col])
#     X_test[col] = pd.Categorical(X_test[col])
X_train.pop('id')
X_test.pop('id')

0            1
1            2
2            3
3            4
4            5
         ...  
82327    82328
82328    82329
82329    82330
82330    82331
82331    82332
Name: id, Length: 82332, dtype: int64

## Scale the data (BIG, BIG PERFORMANCE INCREASE!!!)

In [5]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)

X_test = scaler.transform(X_test)

## what if I just threw a NN at it, or "How I became a data scientist in one easy step"

In [6]:
# define the keras model
batch_size = len(X_train)

NUM_LAYERS = 20
model = tf.keras.Sequential()
model.add(Dense(64, input_dim=42, activation='relu'))
for x in range(NUM_LAYERS): 
    model.add(Dropout(0.4))
    model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [7]:
from tensorflow.keras.optimizers import Adam
optim = Adam(learning_rate=0.0005)
model.compile(loss = tf.keras.losses.BinaryCrossentropy(), optimizer = optim, metrics=['accuracy'])

In [8]:
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train.values))
train_dataset = train_dataset.shuffle(len(X_train)).batch(batch_size)

test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test.values))
test_dataset = test_dataset.shuffle(len(X_test)).batch(batch_size)

In [22]:
history = model.fit(train_dataset, epochs=120, batch_size=batch_size, validation_data=test_dataset)

Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120
Epoch 55/120
Epoch 56/120
Epoch 57/120
Epoch 58/120
Epoch 59/120
Epoch 60/120
Epoch 61/120
Epoch 62/120
Epoch 63/120
Epoch 64/120
Epoch 65/120
Epoch 66/120
Epoch 67/120
Epoch 68/120
Epoch 69/120
Epoch 70/120
Epoch 71/120
Epoch 72/120
Epoch 73/120
Epoch 74/120
Epoch 75/120
Epoch 76/120
Epoch 77/120
Epoch 78

In [10]:
model.save('correct_train_test.h5')

In [11]:
y_pred = model.predict(X_test)

In [12]:
precision = tf.keras.metrics.Precision()
precision.update_state(y_test, y_pred)
precision.result().numpy()

0.93394244

In [13]:
len(X_train)

175341

In [14]:
recall = tf.keras.metrics.Recall()
recall.update_state(y_test, y_pred)
recall.result().numpy()

0.63811433

In [15]:
accuracy = tf.keras.metrics.BinaryAccuracy()
accuracy.update_state(y_test, y_pred)
accuracy.result().numpy()

0.7758952

In [16]:
actual_prediction = np.round(y_pred)

# actually normal, predicted normal
normal_traffic_indices = np.argwhere(y_test.to_numpy()==0)
normal_traffic_indices = normal_traffic_indices.reshape((len(normal_traffic_indices),))
attack_traffic_indices = np.argwhere(y_test.to_numpy()==1)
attack_traffic_indices = attack_traffic_indices.reshape((len(attack_traffic_indices),))

In [17]:
tn = len(np.where(np.take(actual_prediction, normal_traffic_indices, axis=0)==0)[0])
norm = len(normal_traffic_indices)
print(f'classified {tn} out of {norm} normal traffic patterns as normal {tn/norm}%')

classified 34954 out of 37000 normal traffic patterns as normal 0.9447027027027027%


In [18]:
fn = len(np.where(np.take(actual_prediction, attack_traffic_indices, axis=0)==0)[0])
attack = len(attack_traffic_indices)
print(f'classified {fn} out of {attack} attack traffic patterns as normal {fn/attack}%')

classified 16405 out of 45332 attack traffic patterns as normal 0.3618856436954028%


In [19]:
fp = len(np.where(np.take(actual_prediction, normal_traffic_indices, axis=0)==1)[0])
print(f'classified {fp} out of {norm} normal traffic patterns as attack {fp/norm}%')

classified 2046 out of 37000 normal traffic patterns as attack 0.0552972972972973%


In [20]:
tp = len(np.where(np.take(actual_prediction, attack_traffic_indices, axis=0)==1)[0])
print(f'classified {tp} out of {attack} attack traffic patterns as attack {tp/attack}%')

classified 28927 out of 45332 attack traffic patterns as attack 0.6381143563045972%


In [21]:
tf.keras.utils.plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=False)

('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')
