In [45]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [70]:
from time import time
from tensorflow import keras
from tensorflow.python.keras.layers import Dense,Dropout,RNN, SimpleRNN
from tensorflow.python.keras import Sequential,callbacks
from tensorflow.python.keras import utils
from keras.utils import to_categorical
from sklearn.metrics import log_loss, hinge_loss, accuracy_score, confusion_matrix
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import (StandardScaler, OrdinalEncoder,LabelEncoder, MinMaxScaler)

In [52]:
proto_values = ['tcp', 'udp', 'arp', 'ospf', 'icmp', 'igmp', 'rtp', 'ddp',
                'ipv6-frag', 'cftp', 'wsn', 'pvp', 'wb-expak', 'mtp',
                'pri-enc', 'sat-mon', 'cphb', 'sun-nd', 'iso-ip', 'xtp', 'il',
                'unas', 'mfe-nsp', '3pc', 'ipv6-route', 'idrp', 'bna', 'swipe',
                'kryptolan', 'cpnx', 'rsvp', 'wb-mon', 'vmtp', 'ib', 'dgp',
                'eigrp', 'ax.25', 'gmtp', 'pnni', 'sep', 'pgm', 'idpr-cmtp',
                'zero', 'rvd', 'mobile', 'narp', 'fc', 'pipe', 'ipcomp',
                'ipv6-no', 'sat-expak', 'ipv6-opts', 'snp', 'ipcv',
                'br-sat-mon', 'ttp', 'tcf', 'nsfnet-igp', 'sprite-rpc',
                'aes-sp3-d', 'sccopmce', 'sctp', 'qnx', 'scps', 'etherip',
                'aris', 'pim', 'compaq-peer', 'vrrp', 'iatp', 'stp',
                'l2tp', 'srp', 'sm', 'isis', 'smp', 'fire', 'ptp', 'crtp',
                'sps', 'merit-inp', 'idpr', 'skip', 'any', 'larp', 'ipip',
                'micp', 'encap', 'ifmp', 'tp++', 'a/n', 'ipv6', 'i-nlsp',
                'ipx-n-ip', 'sdrp', 'tlsp', 'gre', 'mhrp', 'ddx', 'ippc',
                'visa', 'secure-vmtp', 'uti', 'vines', 'crudp', 'iplt',
                'ggp', 'ip', 'ipnip', 'st2', 'argus', 'bbn-rcc', 'egp',
                'emcon', 'igp', 'nvp', 'pup', 'xnet', 'chaos', 'mux', 'dcn',
                'hmp', 'prm', 'trunk-1', 'xns-idp', 'leaf-1', 'leaf-2', 'rdp',
                'irtp', 'iso-tp4', 'netblt', 'trunk-2', 'cbt']

In [53]:
state_values = ['FIN', 'INT', 'CON', 'ECO', 'REQ', 'RST', 'PAR', 'URN', 'no',
                'ACC', 'CLO']

service_values = ['-', 'ftp', 'smtp', 'snmp', 'http', 'ftp-data',
                  'dns', 'ssh', 'radius', 'pop3', 'dhcp', 'ssl', 'irc']

attack_cat_values = ['Normal', 'Backdoor', 'Analysis', 'Fuzzers', 'Shellcode',
                     'Reconnaissance', 'Exploits', 'DoS', 'Worms', 'Generic']

In [54]:
def unsw_encoding(params):
   # Load csv data into dataframes without 'id' and 'Label'
    train_df = pd.read_csv('UNSW_NB15_training-set.csv').drop(['id', 'label'], axis=1)
    test_df = pd.read_csv('UNSW_NB15_testing-set.csv').drop(['id', 'label'], axis=1)
    
    def process_dataframe(df):
        # Replace attack string with an int
        for i in range(len(attack_cat_values)):
            df['attack_cat'] = df['attack_cat'].replace(
                [attack_cat_values[i]], i)

        # Assign x (inputs) and y (outputs) of the network
        y = df['attack_cat']
        x = df.drop(columns='attack_cat')

      
        # Encode categorical features as an integer array
        if params['encoder'] == 'ordinalencoder':
            x = OrdinalEncoder().fit_transform(x)
        # Encode labels with value between 0 and n_classes-1.
        elif params['encoder'] == 'labelencoder':
            x = x.apply(LabelEncoder().fit_transform)
        else:
            # Replace String features with ints
            for i in range(len(proto_values)):
                x['proto'] = x['proto'].replace(proto_values[i], i)

            for i in range(len(state_values)):
                x['state'] = x['state'].replace(state_values[i], i)

            for i in range(len(service_values)):
                x['service'] = x['service'].replace(service_values[i], i)
            # Standardize by removing the mean and scaling to unit variance
            if params['encoder'] == "standardscaler":
                x = StandardScaler().fit_transform(x)
            # Transforms features by scaling each feature to range [0, 1]
            elif params['encoder'] == "minmaxscaler01":
                x = MinMaxScaler(feature_range=(0, 1)).fit_transform(x)
            # Transforms features by scaling each feature to range [-1, 1]
            elif params['encoder'] == "minmaxscaler11":
                x = MinMaxScaler(feature_range=(-1, 1)).fit_transform(x)

        return x, y

    x_train, Y_train = process_dataframe(train_df)
    x_test, Y_test = process_dataframe(test_df)

    # Apply one-hot encoding to outputs
    y_train = to_categorical(Y_train)
    y_test = to_categorical(Y_test)

    return x_train, x_test, y_train, y_test

In [55]:
# training model
csv_values = ['epochs', 'acc', 'loss', 'val_acc', 'val_loss', "train_data",
              "features_nb", 'loss_fct', 'optimizer', 'activation_fct',
              'layer_nb', 'unit_nb', 'batch_size', 'dropout', 'cell_type',
              'encoder']

In [56]:
csv_best_res = ['param', 'value', 'min_mean_val_loss']

In [57]:
# epochs: Number of iteration of the training dataset
# train_data: Number of rows in training dataset
# features_nb: Number of features kept as input
# loss fct: Loss function used in training
# optimizer: Optimizer used in training
# activation_fct: Activation function used in outputs layer
# layer_nb: Number of hidden layers in the network
# unit_nb: Number of cells for each layer
# batch_size: Number of elements observed before updating weights
# dropout: Fraction of inputs randomly discarded
# cell_type: Type of cell ['RNN']
# encoder: Encoding performed (see processing files)
# dataset: Processing file to be called ['unsw']
# training_nb: Number of model to be trained with the same params
# resultstocsv: Wether to save results to csv
# resultstologs: Wether to save models and tensorboard logs
# showresults: Wether to show detailled statistics about the trained model
# shuffle: Wether to shuffle the batches sequences during training

In [58]:
# reference parameters
params = {'epochs': 3, 'train_data': 494021, 'features_nb': 4,
          'loss_fct': 'mse', 'optimizer': 'rmsprop',
          'activation_fct': 'sigmoid', 'layer_nb': 1, 'unit_nb': 128,
          'batch_size': 1024, 'dropout': 0.2, 'cell_type': 'RNN',
          'encoder': 'labelencoder', 'dataset': 'unsw', 'training_nb': 1,
          'resultstocsv': False, 'resultstologs': False, 'showresults': True,
          'shuffle': True}

In [59]:
# variable parameters
params_var = {'encoder': ['standardscaler', 'labelencoder',
                          'minmaxscaler01', 'minmaxscaler11',
                          'ordinalencoder'],
              'optimizer': ['adam', 'sgd', 'rmsprop', 'nadam', 'adamax',
                            'adadelta'],
              'activation_fct': ['sigmoid', 'softmax', 'relu', 'tanh'],
              'layer_nb': [1, 2, 3, 4],
              'unit_nb': [4, 8, 32, 64, 128, 256],
              'dropout': [0.1, 0.2, 0.3, 0.4],
              'batch_size': [512, 1024, 2048],
              }

In [100]:
def load_data():
    if params['dataset'] == 'unsw':
        x_train, x_test, y_train, y_test = unsw_encoding(params)
    return x_train, x_test, y_train, y_test

In [61]:
# method 1
# def train_model(x_train, x_test, y_train, y_test):
        
#     # Create a Sequential layer, one layer after the other
#     model = Sequential()
#     # If there is more than 1 layer, the first must return sequences
#     for _ in range(params['layer_nb']-1):
#         model.add(RNN(units=params['unit_nb'],
#                        input_shape=(x_train.shape[1:]), return_sequences=True))
#         model.add(Dropout(rate=params['dropout']))

#     # If there is only 1 layer, it must not return sequences
#     if(params['layer_nb'] == 1):
#         model.add(RNN(units=params['unit_nb'], input_shape=x_train.shape[1:]))
#         model.add(Dropout(rate=params['dropout']))
#     else:  # If there is more than 1, the following must not return sequences
#         model.add(RNN(units=params['unit_nb']))
#         model.add(Dropout(rate=params['dropout']))
#     # Outputs layer
#     model.add(Dense(units=y_train.shape[1],
#                     activation=params['activation_fct']))

#     model.compile(loss=params['loss_fct'], optimizer=params['optimizer'],
#                   metrics=['accuracy'])

#     model.summary()

#     hist = model.fit(x_train, y_train, params['batch_size'], params['epochs'],
#                      verbose=1, shuffle=params['shuffle'],
#                      validation_data=(x_test, y_test), callbacks=callbacks)

#     if params['showresults'] is True:
#         print_results(params, model, x_train, x_test, y_train, y_test)

#     return hist

In [104]:
x_train, x_test, y_train, y_test = load_data()
# Reshape the inputs in the accepted model format
sample = x_train.shape[0]
features = x_train.shape[1]
x_train = np.array(x_train).reshape([-1, x_train.shape[1], 1])
x_test = np.array(x_test).reshape([-1, x_test.shape[1], 1])

In [93]:
    # method 2
    # initialize RNN
    model = Sequential()
    #  1st RNN layer and Dropout regularization
    model.add(SimpleRNN(units = 50, activation='relu', return_sequences=True, input_shape= (x_train.shape[1],1)))
    model.add(Dropout(0.2))
     # 2nd RNN layer and Dropout regularization
    model.add(SimpleRNN(units = 50, activation='relu', return_sequences=True))
    model.add(Dropout(0.2))
    # 3rd RNN layer and Dropout regularization
    model.add(SimpleRNN(units = 50, activation='relu', return_sequences=True))
    model.add(Dropout(0.2))
    # 4th RNN layer and Dropout regularization
    model.add(SimpleRNN(units = 50))
    model.add(Dropout(0.2))
    # output layer
    model.add(Dense(units = 1))
    # compile the RNN
    model.compile(optimizer='adam', loss='mse')
    
    model.summary()
    m = model.fit(x_train, y_train, epochs=10, batch_size=32)

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_15 (SimpleRNN)    (None, 42, 50)            2600      
_________________________________________________________________
dropout_15 (Dropout)         (None, 42, 50)            0         
_________________________________________________________________
simple_rnn_16 (SimpleRNN)    (None, 42, 50)            5050      
_________________________________________________________________
dropout_16 (Dropout)         (None, 42, 50)            0         
_________________________________________________________________
simple_rnn_17 (SimpleRNN)    (None, 42, 50)            5050      
_________________________________________________________________
dropout_17 (Dropout)         (None, 42, 50)            0         
_________________________________________________________________
simple_rnn_18 (SimpleRNN)    (None, 50)               

In [94]:
scores = model.evaluate(x_test, y_test, verbose=0)
scores

0.08999738842248917

In [1]:
#  method 3
# Model = Sequential([

#         RNN(50,input_shape=(features,x_train.shape[2]),
#                           activation='sigmoid',recurrent_activation='hard_sigmoid'),
#         keras.layers.Dense(1,activation="softmax")
#     ])

# Model.compile(optimizer='rmsprop',loss='mse', metrics=['accuracy'])

# #Training the model

# Model.fit(x_train, y_train, epochs=10, batch_size= 32) 
# Model.summary()

# # Final evaluation of the model
# scores = Model.evaluate(x_test, y_test, verbose=0)
# print('/n')
# print("Accuracy: %.2f%%" % (scores[1]*100))