In [None]:
# Load the Drive helper and mount Google Drive at /content/drive
from google.colab import drive
drive.mount('/content/drive')
# Dataset folder, given relative to the working directory
# (presumably /content, where the Drive was just mounted — TODO confirm).
# NOTE(review): the value already ends with '/', and the loading cells append
# '/<file name>' to it, so the joined paths contain a double slash.
path = 'drive/MyDrive/Colab Notebooks/TER_DATA/'

Mounted at /content/drive


# **Data Overview**

This dataset was created by the IXIA PerfectStorm tool in the Cyber Range Lab of the Australian Centre for Cyber Security (ACCS) to generate a hybrid of real modern normal activities and synthetic contemporary attack behaviours
(<a href='https://www.kaggle.com/mrwellsdavid/unsw-nb15'>more information</a>).

In [None]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv )
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from keras.optimizers import Adam
from keras.models import load_model
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import ModelCheckpoint, Callback
import matplotlib.pyplot as plt 
import seaborn as sns # Data vizualisation
from sklearn.preprocessing import MinMaxScaler # Data preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV # For splitting data
import pickle # save and load python objects

# ***Load Data From Disk***

In [None]:
# Load the feature description table; the file is decoded as cp1252.
# Its 'Name' column is used by the data-loading cell as the column names of the data.
features_df = pd.read_csv(path+'/NUSW-NB15_features.csv', encoding='cp1252')
features_df

In [None]:
# Load Data: the four CSV parts have no header row, so they are read with
# header=None and stacked into one frame with a fresh 0..n-1 index.
data = pd.concat(
    [pd.read_csv(f"{path}/UNSW-NB15_{part_no}.csv", header=None) for part_no in range(1, 5)],
    ignore_index=True,
)
# Column names come from the 'Name' column of the feature description table
features_name = features_df['Name'].to_numpy()
data.columns = features_name
data.head()

# **Plotting The History Of  Accuracy And Loss**

In [None]:
def plot_training_val_loss(H, N, plotPath): 
  """
    Plot the training and validation loss per epoch, save the figure to
    disk and display it.
      args:
        H: history object returned by model.fit; H.history must contain the
           "loss" and "val_loss" series
        N: number of epochs to plot; the x axis is np.arange(0, N), so N has
           to equal the length of the recorded series
        plotPath: The path of the image that will be saved
  """  
  train_loss = H.history["loss"]
  val_loss = H.history["val_loss"]
  epoch_axis = np.arange(0, N)

  plt.style.use("ggplot")
  plt.figure()
  plt.plot(epoch_axis, train_loss, label="train_loss")
  plt.plot(epoch_axis, val_loss, label="val_loss")
  # Mark the lowest training loss at the epoch where it occurs: the x
  # coordinate must be argmin so that it pairs with the min value on y.
  plt.plot(np.argmin(train_loss), np.min(train_loss), marker="x",color="r", label="meilleur train_loss")
  plt.title("Training & Val loss")
  plt.xlabel("Epoch #")
  plt.ylabel("loss")
  plt.legend(loc="lower left")
  plt.savefig(plotPath)
  plt.show()

def plot_training_val_accu(H, N, plotPath):   
  """
    Draw the training and validation accuracy curves, mark the best
    validation accuracy, save the figure and display it.
      args:
        H: history object whose H.history holds "accuracy" and "val_accuracy"
        N: number of epochs; the x axis is np.arange(0, N)
        plotPath: The path of the image that will be saved
  """  
  epoch_axis = np.arange(0, N)
  train_acc = H.history["accuracy"]
  val_acc = H.history["val_accuracy"]
  # Position (epoch index, value) of the highest validation accuracy
  best_epoch = np.argmax(val_acc)
  best_value = np.max(val_acc)

  plt.style.use("ggplot")
  plt.figure()
  plt.plot(epoch_axis, train_acc, label="train_acc")
  plt.plot(epoch_axis, val_acc, label="val_acc")
  plt.plot(best_epoch, best_value, marker="x", color="g", label="meilleur accuracy")
  plt.title("Training & Val Accuracy")
  plt.xlabel("Epoch #")
  plt.ylabel("Accuracy")
  plt.legend(loc="lower left")
  plt.savefig(plotPath)
  plt.show()

# ***Data Preprocessing***

In [None]:
def data_preprocessing(data):
  """
    Clean the raw frame and integer-encode its categorical columns so it
    is ready for normalisation and training. The caller's frame is not
    modified (the first step, drop(), already returns a new frame).
      args:
        data: The data to process. It must contain the columns 'srcip',
              'sport', 'dstip', 'dsport', 'ct_ftp_cmd', 'proto', 'state',
              'service', 'ct_dst_src_ltm' and 'attack_cat'.
      Return:
        unlabled_data: The data features, i.e. every remaining column up to
                       and including 'ct_dst_src_ltm'
        y: The data labels, 'attack_cat' encoded as integers
  """  
  print("Data preprocessing ...")
  # Delete 'srcip', 'sport', 'dstip', 'dsport' columns
  data = data.drop(['srcip', 'sport', 'dstip', 'dsport'], axis=1)
  # Blank entries in ct_ftp_cmd count as 0, then the column is cast to int64
  data['ct_ftp_cmd'] = data['ct_ftp_cmd'].replace(' ', 0)
  data = data.astype({'ct_ftp_cmd': 'int64'})

  ##### Handle missing values #####
  # A missing attack category means normal traffic
  data = data.fillna({'attack_cat': 'Normal'})
  # Strip surrounding whitespace from every category name (instead of listing
  # each padded spelling by hand) and merge the 'Backdoors' spelling into
  # 'Backdoor'. Non-string values are left untouched.
  data['attack_cat'] = data['attack_cat'].map(
      lambda label: label.strip() if isinstance(label, str) else label
  ).replace('Backdoors', 'Backdoor')
  # Fill the remaining nan values with 0
  data = data.fillna(0) 

  ##### Encode categorial variables and the class with --> Integer Encoding #####
  # factorize() numbers the distinct values in order of first appearance,
  # the same codes as looking each value up in unique().tolist(), but in a
  # single hashed pass instead of a list scan per row.
  for column in ('proto', 'state', 'service', 'attack_cat'):
    data[column] = data[column].factorize()[0]

  # Returned objects
  unlabled_data = data.loc[:, :'ct_dst_src_ltm']
  y = data['attack_cat']
  print("Done !!!")
  return (unlabled_data, y)

In [None]:
def data_normalisation_encode(data, y, num_classes=10):
  """
      Scale the features with a MinMaxScaler and one-hot encode the labels.
      args:
        data: The data features
        y: The data labels (integer class ids)
        num_classes: number of columns of the one-hot label matrix
                     (defaults to 10, the value previously hard-coded)
      Return:
        transformed_data: The normalized features, as a new DataFrame built
                          from the scaler output
        transformed_y: The one-hot encoded labels

      NOTE(review): the scaler is fitted on everything passed in, and the
      notebook calls this before the train/test split, so the test rows take
      part in computing the scaling. The fitted scaler is also not returned,
      so the same scaling cannot be re-applied to new data.
  """
  print("Starting normalisation ...")
  #Normalisation of data
  scaler = MinMaxScaler()
  # fit_transform fits the scaler and applies the scaling in one call, so a
  # separate scaler.fit(data) pass beforehand is redundant work
  transformed_data = pd.DataFrame(scaler.fit_transform(data))
  #Encode data to_categorical from keras
  transformed_y = tf.keras.utils.to_categorical(y, num_classes=num_classes)
  print("Normalisation done !!!")
  return (transformed_data, transformed_y)

In [None]:
# Clean and integer-encode the raw frame: x receives the feature columns, y the encoded attack category
x, y = data_preprocessing(data)


Data preprocessing ...
Done !!!


In [None]:
# NOTE(review): x and y are rebound to their scaled / one-hot versions, so this cell is not
# idempotent — running it again would pass the one-hot matrix back in as the labels.
# Re-run the preprocessing cell before re-running this one.
x, y = data_normalisation_encode(x, y)

Starting normalisation ...
Normalisation done !!!


## **Splitting Data**

*In this stage we split the data into 70% train and 30% test.*

In [None]:
# Fixed seed so the 70/30 split, and every score computed from it, is the same after a kernel restart
SPLIT_SEED = 42
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.3, random_state=SPLIT_SEED)

# **Modeling**

## **First model**

In [None]:
def define_model():
  """
    Build and compile the first model: a fully connected network with two
    hidden layers of 512 ReLU units, each followed by 20% dropout, and a
    10-way softmax output.
      Return:
        model: the compiled Sequential model (categorical cross-entropy loss,
               Adam optimizer, accuracy metric)
  """
  model = Sequential()
  # Each sample is a vector of 43 values (presumably the 43 feature columns
  # produced by the preprocessing step — TODO confirm)
  model.add(Dense(512, input_shape=(43,)))
  model.add(Activation('relu'))
  model.add(Dropout(0.2))
  model.add(Dense(512))
  model.add(Activation('relu'))
  model.add(Dropout(0.2))
  model.add(Dense(10))
  model.add(Activation('softmax'))
  # `learning_rate` is the supported argument name; `lr` is a deprecated alias
  model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.005), metrics=['accuracy'])
  return model

In [None]:
# Checkpoint file name pattern; the epoch number and the loss value are filled in per checkpoint
filepath="LPT-{epoch:02d}-{loss:.4f}.hdf5"
# Keep a checkpoint only when the monitored value improves (mode='min').
# NOTE(review): the monitored value is 'loss' (training loss), not 'val_loss' — confirm this is intended.
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

model = define_model()
# NOTE(review): the test split is passed as validation data, so there is no separate held-out set.
history  = model.fit(X_train, Y_train, epochs=50, batch_size=2048, validation_data=(X_test, Y_test), callbacks=callbacks_list)


In [None]:
# Plot exactly as many points as epochs were recorded: the plotting helpers build
# the x axis from this count, and plt.plot fails when it differs from the history length
n_epochs = len(history.history["loss"])
plot_training_val_loss(history, n_epochs, "losss")
plot_training_val_accu(history, n_epochs, 'accuracy')

# **Resume The Training Process**

*This cell is used to resume the training process from a saved `.h5` checkpoint file.*

In [None]:
# Util function to get the initial epoch number from the checkpoint name
def get_init_epoch(checkpoint_path):
    """
      Read the epoch number encoded in a checkpoint file name.
      The name is expected to look like '<prefix>-<epoch>-<loss>.<ext>';
      the second dash-separated field of the name (without directory and
      extension) is the epoch.
      args:
        checkpoint_path: A file that will be used to resume the training.
      return:
        epoch: integer, the epoch number
    """
    stem, _extension = os.path.splitext(os.path.basename(checkpoint_path))
    name_fields = stem.split("-")
    return int(name_fields[1])

In [None]:
# Resume the training process
# NOTE(review): the first training cell writes its checkpoints with an .hdf5 suffix,
# while this name ends in .h5 and has no folder — confirm the file exists in the working directory.
file_path = 'LPT-47-0.0087.h5'
model = load_model(file_path)
# From here on, checkpoints are written to the Drive dataset folder with an .h5 suffix
filepath="drive/MyDrive/Colab Notebooks/TER_DATA/LPT-{epoch:02d}-{loss:.4f}.h5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
# Epoch number parsed from the checkpoint file name (47 for the name above)
initial_epoch = get_init_epoch(file_path)
# Training restarts at initial_epoch with the same epochs=50 setting as the first run
history = model.fit(X_train, Y_train, epochs=50, batch_size=2048, validation_data=(X_test, Y_test), callbacks=callbacks_list, initial_epoch=initial_epoch)
model.save('drive/MyDrive/Colab Notebooks/TER_DATA/best_model')

## **Second model**

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dense, Flatten , Activation, SimpleRNN, LSTM, GRU, Dropout, TimeDistributed, Reshape, Input, Lambda, Add
from tensorflow.keras import Sequential
from keras.callbacks import ModelCheckpoint
def define_model():
  """
    Build and compile the second model: the 43-value input vector is
    reshaped to a sequence of 43 single-channel steps, passed through two
    causal Conv1D layers and a max-pooling layer, then an LSTM, and finally a
    10-way softmax output.

    NOTE(review): this definition reuses the name of the first model's
    builder, so once this cell has run, define_model() refers to this model.
      Return:
        model: the compiled Sequential model
  """
  model = Sequential()
  model.add(Reshape((-1,1), input_shape=(43,)))
  model.add(Conv1D(32, 3, activation='relu', padding='causal'))
  model.add(Conv1D(64, 3, activation='relu', padding='causal'))
  model.add(MaxPooling1D(pool_size=2))
  model.add(LSTM(70, recurrent_dropout=0.1))
  # Kept from the original architecture; presumably a no-op, since the LSTM
  # is not asked to return sequences — TODO confirm
  model.add(Flatten())
  model.add(Dense(10, activation='softmax'))
  # The output is a softmax over 10 mutually exclusive classes and the labels
  # are one-hot vectors, so the matching loss is categorical cross-entropy;
  # binary cross-entropy would score each of the 10 outputs independently.
  # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
  model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.005), loss='categorical_crossentropy', metrics=['accuracy'])
  return model

In [None]:
# Build the second model (define_model now refers to the definition in the cell just above)
model = define_model()
# Checkpoint file name pattern; a checkpoint is kept only when the training loss improves
filepath="LPT-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
history  = model.fit(X_train, Y_train, epochs=10, batch_size=2048, validation_data=(X_test, Y_test), callbacks=callbacks_list)
# NOTE(review): this stores the second model under the name 'first_model' — confirm the intended name
model.save('first_model')

## **Grid search**

In [None]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV, train_test_split


# Candidate values considered for the search (only part of them is used in param_grid below)
activation = [ "relu", "tanh"]
optimzer = ["SGD", "RMSprop", "Adam"]
epochs = [ 20 ,30 , 40, 50]
batch_size = [2048, 1000]


def build_grid_model(activation="relu", optimizer="Adam"):
  """
    Build the fully connected model with the hyper-parameters under search.
    KerasClassifier only accepts grid keys that are arguments of the build
    function (or of fit/predict), so the searched values must be parameters
    here; a builder without arguments makes the search fail.
      args:
        activation: activation of the two hidden layers
        optimizer: optimizer name passed to model.compile
      Return:
        model: the compiled Sequential model
  """
  grid_model = Sequential()
  grid_model.add(Dense(512, activation=activation, input_shape=(43,)))
  grid_model.add(Dropout(0.2))
  grid_model.add(Dense(512, activation=activation))
  grid_model.add(Dropout(0.2))
  grid_model.add(Dense(10, activation='softmax'))
  grid_model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
  return grid_model


model = KerasClassifier(build_fn=build_grid_model, verbose=1)

# The key must be spelled 'optimizer' to match the builder's argument name
param_grid = dict(optimizer=["Adam"], epochs=[20], batch_size=batch_size, activation=activation)

#use the grid search
# n_jobs=1: candidates are trained one after another in this process; running
# Keras fits in parallel worker processes is presumably unsafe here (shared
# accelerator / memory) — raise it only after confirming it works in this runtime
grid = GridSearchCV(estimator= model, param_grid=param_grid, cv=3, n_jobs=1, verbose=2)

#start the training
grid_result = grid.fit(X_train, Y_train)