# Data Set Bezerra - NN

In [67]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [68]:
import datetime
import functools
import math
import shutil
import time

import tensorflow as tf

In [69]:
# Clear any logs from previous runs.
# Pure-Python replacement for the `rm -rf ./logs/` shell escape so the cell also
# works where no POSIX shell is available; a missing directory is not an error.
shutil.rmtree('./logs', ignore_errors=True)

Defining constants

In [70]:
import pandas as pd
import numpy as np

# TensorFlow device string passed to tf.device() for training, written in the
# canonical "/<TYPE>:<index>" form (the previous value 'GPU/:0' had the slash
# in the wrong place).
DEVICE = '/GPU:0'

# Root of the dataset and the sub-directory holding the NetFlow CSV exports.
DATASET_DIR = '../datasets/Dataset-IoT/'
NETFLOW_DIR = DATASET_DIR + 'MC/NetFlow/'

# MC_I_FIRST: infected data from the Hajime, Aidra and BashLite botnets
MC_I_FIRST = 'MC_I1.csv'

# MC_I_SECOND: infected data from the Mirai botnet
MC_I_SECOND = 'MC_I2.csv'

# MC_I_THIRD: infected data from the Mirai, Doflo, Tsunami and Wroba botnets
MC_I_THIRD = 'MC_I3.csv'

# MC_L: legitimate data, no infection
MC_L = 'MC_L.csv'

# All capture files, legitimate traffic first.
data_set_files = [MC_L, MC_I_FIRST, MC_I_SECOND, MC_I_THIRD]

Loading the legitimate (MC_L) and first infected (MC_I1) NetFlow files into a single pandas DataFrame

In [71]:
legitimate_file_path = NETFLOW_DIR + MC_L
first_file_path = NETFLOW_DIR + MC_I_FIRST

# Name of the target column; it is separated from the features later on.
LABEL_COLUMN = 'Label'

# Fixed seed for the row shuffle. The shuffle used to be seeded from the wall
# clock, which produced a different row order (and therefore different splits
# and results) on every run.
SHUFFLE_SEED = 42

# Read the legitimate capture and the first infected capture.
df = pd.read_csv(legitimate_file_path)
first_df = pd.read_csv(first_file_path)

# Stack both captures, then shuffle all rows so the two sources are interleaved.
df = pd.concat([df, first_df], ignore_index=True)
df = df.sample(frac=1, random_state=SHUFFLE_SEED)

# Drop the first column by position.
# NOTE(review): presumably the row index that was saved into the CSV files;
# confirm against the raw files.
df = df.drop(columns=df.columns[0])
df

Unnamed: 0,Label,ts,te,td,sa,da,sp,dp,pr,flg,...,mpls8,mpls9,mpls10,cl,sl,al,ra,eng,exid,tr
277238,1,2018-11-11 03:58:47,2018-11-11 03:58:47,0.0,192.168.1.109,78109244106,15983,23.0,TCP,....S.,...,0-0-0,0-0-0,0-0-0,0.0,0.0,0.0,0.0.0.0,0/0,1.0,1969-12-31 21:00:00.000
311972,1,2018-11-11 04:03:01,2018-11-11 04:03:01,0.0,192.168.1.109,152.196.233.3,48381,23.0,TCP,....S.,...,0-0-0,0-0-0,0-0-0,0.0,0.0,0.0,0.0.0.0,0/0,1.0,1969-12-31 21:00:00.000
316862,1,2018-11-11 04:03:37,2018-11-11 04:03:37,0.0,192.168.1.109,196.22.250.72,2562,23.0,TCP,....S.,...,0-0-0,0-0-0,0-0-0,0.0,0.0,0.0,0.0.0.0,0/0,1.0,1969-12-31 21:00:00.000
107434,1,2018-11-11 03:38:14,2018-11-11 03:38:14,0.0,192.168.1.109,189.150.38.130,19394,81.0,TCP,....S.,...,0-0-0,0-0-0,0-0-0,0.0,0.0,0.0,0.0.0.0,0/0,1.0,1969-12-31 21:00:00.000
107070,1,2018-11-11 03:38:41,2018-11-11 03:39:03,21474.0,192.168.1.109,13.160.176.82,51394,23.0,TCP,.APRSF,...,0-0-0,0-0-0,0-0-0,0.0,0.0,0.0,0.0.0.0,0/0,1.0,1969-12-31 21:00:00.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181234,1,2018-11-11 03:47:12,2018-11-11 03:47:12,0.0,192.168.1.109,142.97.244.230,36347,23.0,TCP,....S.,...,0-0-0,0-0-0,0-0-0,0.0,0.0,0.0,0.0.0.0,0/0,1.0,1969-12-31 21:00:00.000
183348,1,2018-11-11 03:47:27,2018-11-11 03:47:27,0.0,192.168.1.109,108.149.183.58,24617,23.0,TCP,....S.,...,0-0-0,0-0-0,0-0-0,0.0,0.0,0.0,0.0.0.0,0/0,1.0,1969-12-31 21:00:00.000
54615,1,2018-11-11 03:32:02,2018-11-11 03:32:02,0.0,192.168.1.109,206.190.80.13,22192,81.0,TCP,....S.,...,0-0-0,0-0-0,0-0-0,0.0,0.0,0.0,0.0.0.0,0/0,1.0,1969-12-31 21:00:00.000
11151,1,2018-11-11 03:26:53,2018-11-11 03:26:53,0.0,192.168.1.109,163.206.155.61,18354,23.0,TCP,....S.,...,0-0-0,0-0-0,0-0-0,0.0,0.0,0.0,0.0.0.0,0/0,1.0,1969-12-31 21:00:00.000


Splitting the data set

In [72]:
from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

# Fixed seed so the train/validation/test partition is the same on every run;
# without it each execution evaluates on a different test set.
SPLIT_SEED = 42

# Hold out 20% of all rows for testing, then 20% of the remaining rows for
# validation (64% / 16% / 20% of the data overall).
train, test = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)
train, val = train_test_split(train, test_size=0.2, random_state=SPLIT_SEED)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

230794 train examples
57699 validation examples
72124 test examples


In [73]:
def df_to_dataset(dataFrame, shuffle=True, batch_size=32):
    """Convert a pandas DataFrame into a batched tf.data.Dataset.

    The LABEL_COLUMN column is split off as the target; every remaining column
    becomes an entry of the feature dictionary, keyed by column name.

    Args:
        dataFrame: frame containing the features and the LABEL_COLUMN column.
            It is copied first, so the caller's frame is left untouched.
        shuffle: when true, shuffle the examples using a buffer as large as
            the frame.
        batch_size: number of examples per batch.

    Returns:
        A tf.data.Dataset yielding (feature_dict, labels) batches.
    """
    features = dataFrame.copy()
    targets = features.pop(LABEL_COLUMN)
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), targets))
    if not shuffle:
        return dataset.batch(batch_size)
    return dataset.shuffle(buffer_size=len(features)).batch(batch_size)

In [74]:
# Number of examples per batch, shared by all three datasets.
BATCH_SIZE = 32

# Only the training split is shuffled; validation and test keep their row order.
train_ds = df_to_dataset(train, shuffle=True, batch_size=BATCH_SIZE)
val_ds, test_ds = (
    df_to_dataset(split, shuffle=False, batch_size=BATCH_SIZE)
    for split in (val, test)
)

# Building the Feature Layer

In [99]:
# 0-based positions of the `pr` and `flg` columns in df.columns, written as
# (1-based position - 1). The active code in this cell does not use them; only
# the commented-out variant at the end of the cell does.
PROTOCOL_COLUMN = 9 - 1
PROTOCOL_FLAGS_COLUMN = 10 - 1

feature_columns = []

# Object-dtype columns are treated as categorical, everything else as numeric.
cat_cols, num_cols = df.columns[df.dtypes == 'O'], df.columns[df.dtypes != 'O']

# The label is the training target, not a model input. Exclude it by name:
# slicing off "the first numeric column" only works while the label happens to
# sit in that position, and silently drops a real feature otherwise.
num_cols = num_cols[num_cols != LABEL_COLUMN]
cat_cols = cat_cols[cat_cols != LABEL_COLUMN]

# Numeric columns are passed through as plain numeric features.
feature_columns.extend(feature_column.numeric_column(key) for key in num_cols)

# Categorical columns: the vocabulary is the set of distinct values present in
# df, and each column is mapped to an 8-dimensional embedding.
all_categories = [df[column].unique() for column in cat_cols]
for item, categories in zip(cat_cols, all_categories):
    feature = feature_column.categorical_column_with_vocabulary_list(item, categories)
    mfeature = feature_column.embedding_column(feature, dimension=8)
    feature_columns.append(mfeature)


# ##finding out the different categories
# categorical_cols = df.columns[PROTOCOL_COLUMN]
# pr_list = df[categorical_cols].unique()

# categorical_cols = df.columns[PROTOCOL_FLAGS_COLUMN]
# flg_list = df[categorical_cols].unique()

# #one_hot_pr
# pr = feature_column.categorical_column_with_vocabulary_list('pr', pr_list)
# pr_one_hot = feature_column.indicator_column(pr)
# feature_columns.append(pr_one_hot)

# #embedding_flg
# flg = feature_column.categorical_column_with_vocabulary_list('flg', flg_list)
# flg_embedding = feature_column.embedding_column(flg, dimension=8)
# feature_columns.append(flg_embedding)

In [103]:
# Input layer that turns the feature dictionary into one dense tensor.
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

initializer = tf.initializers.VarianceScaling(scale=2.0)
hidden_layer_size, num_classes = 128, 1

# The list is called `model_layers`, not `layers`, so that it does not overwrite
# the `tensorflow.keras.layers` module imported under that name earlier in the
# notebook.
model_layers = [
    feature_layer,
    # Three identical ReLU hidden layers.
    *[
        tf.keras.layers.Dense(hidden_layer_size, use_bias=True, activation='relu',
                              kernel_initializer=initializer)
        for _ in range(3)
    ],
    # Output layer without activation: it emits a single raw logit.
    tf.keras.layers.Dense(num_classes, use_bias=True, kernel_initializer=initializer),
]

model = tf.keras.Sequential(model_layers)
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              # The model outputs logits, so the decision boundary is 0 (sigmoid(0) == 0.5).
              # The plain 'accuracy' metric thresholds predictions at 0.5, which is
              # only right for probabilities. The metric keeps the name 'accuracy'
              # so history/TensorBoard keys are unchanged.
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])

In [104]:
# Every run logs into its own timestamped sub-directory of logs/fit.
log_dir = f"logs/fit/{datetime.datetime.now():%Y%m%d-%H%M%S}"
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,  # record histograms every epoch
)

# Train for five epochs on the configured device, validating after each epoch.
with tf.device(DEVICE):
    model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=5,
        callbacks=[tensorboard_callback],
    )

Epoch 1/5


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [102]:
# Open TensorBoard inline on logs/fit, the parent directory of the timestamped
# per-run log folders written by the training cell.
%tensorboard --logdir logs/fit

Reusing TensorBoard on port 6006 (pid 14507), started 0:54:20 ago. (Use '!kill 14507' to kill it.)