In [None]:
## A pipeline for training and evaluating a stacked autoencoder with LSTM layers for anomaly detection in network traffic data.
# It preprocesses the data, trains the model, evaluates its performance, and saves the model for future use.

#Data Generator

In [None]:
# Importing the libraries
import numpy as np
import pandas as pd
import ipaddress

# Module-level settings
# Multiplier applied to the standard-normal noise that addNoise() adds to its input.
noise_factor = 0.5
# Sample count for the random sub-sampling idea; no active code in this section reads it.
sample_size = 65000
# Label encoding produced by getEncoderLabelCoulmn(): 0 => Normal, 1 => Attack

#Load the dataset
def loadDataset(datasetFilePath):
    """Read a flow CSV and return only the rows labelled 'NormalTraffic'.

    The identifier columns ('Flow ID', 'Src IP', 'Dst IP', 'Timestamp') are
    dropped, +/-infinity is replaced by -1 and NaN by 0 before filtering.

    Parameters
    ----------
    datasetFilePath : str or path-like
        Location of the CSV file; it must contain the four identifier columns
        and a 'Label' column.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows whose 'Label' equals 'NormalTraffic' (original row
        index is kept).
    """
    identifierColumns = ['Flow ID', 'Src IP', 'Dst IP', 'Timestamp']
    flows = pd.read_csv(datasetFilePath).drop(columns=identifierColumns)
    # Infinite values are mapped to -1 first; whatever is still NaN becomes 0.
    flows = flows.replace([np.inf, -np.inf], -1).replace([np.nan], 0)
    isNormal = flows['Label'] == 'NormalTraffic'
    return flows[isNormal]

#The DDoS features were found at column indices 22, 23, 25, 27, 30, 32, 58 and 79, so they are removed along with the flow ID and the timestamp.
def getEncoderInput(datasetType, dataset, start, nSamples, nColumns):
    """Slice the feature matrix for the autoencoder out of a DataFrame.

    Parameters
    ----------
    datasetType : str
        Dataset tag. If it contains 'unb15' or 'custom' the last two columns
        are excluded, otherwise only the last one. 'cicids2017' and
        'cicids2018' additionally drop a fixed list of column positions.
    dataset : pandas.DataFrame
        Source table.
    start, nSamples : int
        Positional row range ``start:nSamples``; note that ``nSamples`` is
        used as the END position of the slice, not as a row count.
    nColumns : int
        Total number of columns in ``dataset``.

    Returns
    -------
    numpy.ndarray
        2-D array of the selected rows and feature columns.
    """
    # Number of trailing non-feature columns to leave out of the slice.
    trailingColumns = 2 if ('unb15' in datasetType or 'custom' in datasetType) else 1
    features = dataset.iloc[start:nSamples, 0:nColumns - trailingColumns].values

    if 'cicids2017' in datasetType:
        features = np.delete(features, [0, 6, 22, 23, 25, 27, 30, 32, 58, 79], axis=1)
        # After the deletion, columns 0 and 2 hold IP address strings; store
        # them as their integer value instead.
        for row in features:
            row[0] = int(ipaddress.ip_address(row[0]))
            row[2] = int(ipaddress.ip_address(row[2]))
    elif 'cicids2018' in datasetType:
        features = np.delete(features, [2, 3, 18, 19, 21, 23, 26, 28, 54, 75], axis=1)

    # Random sub-sampling (disabled) - Sailik
    #X = dt[np.random.choice(dt.shape[0], sample_size , replace=False), :]
    return features


def getEncoderLabelCoulmn(datasetType, dataset, start, nSamples):
    """Return binary labels (0 = normal, 1 = attack) as a column vector.

    Parameters
    ----------
    datasetType : str
        Dataset tag. If it contains 'unb15' or 'custom' the label is read from
        the second-to-last column, otherwise from the last column.
    dataset : pandas.DataFrame
        Source table.
    start, nSamples : int
        Positional row range ``start:nSamples`` (``nSamples`` is the END
        position of the slice).

    Returns
    -------
    numpy.ndarray
        Array of shape (rows, 1). A row is 0 when its label, compared
        case-insensitively, is "benign", "normal" or "normaltraffic";
        every other label maps to 1.
    """
    labelInSecondToLast = 'unb15' in datasetType or 'custom' in datasetType
    labelIndex = -2 if labelInSecondToLast else -1
    rawLabels = dataset.iloc[start:nSamples, labelIndex].values

    normalNames = ("benign", "normal", "normaltraffic")
    attackFlags = [int(str(label).lower() not in normalNames) for label in rawLabels]
    return np.array(attackFlags).reshape(len(attackFlags), 1)

#Add noise if you want the model to learn to reconstruct the original input from a corrupted copy (a process known as denoising). The resulting model is a Denoising Stacked Autoencoder.
def addNoise(X):
    """Return X plus element-wise Gaussian noise.

    The noise is drawn from a standard normal distribution with the same
    shape as X and scaled by the module-level ``noise_factor``. X itself is
    not modified.
    """
    gaussian = np.random.normal(loc=0.0, scale=1.0, size=X.shape)
    return X + noise_factor * gaussian

#If you are using LSTM in your stacked encoder, you have to convert the input into sequences
def getEncoderInputSequence(X, nTimesteps, nColumns):
    """Turn a 2-D feature matrix into overlapping windows for an LSTM.

    Window ``k`` holds rows ``k .. k + nTimesteps - 1`` of ``X``. Exactly
    ``len(X) - nTimesteps`` windows are produced, so the window that would
    end on the final row is not generated; callers that pair windows with
    labels rely on this count.

    Parameters
    ----------
    X : array-like, shape (rows, nColumns)
        Feature matrix, one row per sample.
    nTimesteps : int
        Number of consecutive rows in each window.
    nColumns : int
        Number of feature columns in ``X``.

    Returns
    -------
    numpy.ndarray
        Array of shape (max(rows - nTimesteps, 0), nTimesteps, nColumns).
        When ``X`` has ``nTimesteps`` rows or fewer an empty array with that
        shape is returned instead of failing on the reshape.
    """
    X = np.asarray(X)
    nRows = X.shape[0]
    nWindows = nRows - nTimesteps
    if nWindows <= 0:
        # Too few rows for even one window: return a well-formed empty batch.
        return np.empty((0, nTimesteps, nColumns), dtype=X.dtype)

    windows = [X[end - nTimesteps:end, :] for end in range(nTimesteps, nRows)]
    return np.reshape(np.array(windows), (nWindows, nTimesteps, nColumns))


# Model Generator

In [None]:
from keras.layers import Input, LSTM, RepeatVector, Dense
#regularizers
from keras.models import Model

#Get the model 
def getSAE_LSTM(nTimesteps, nOperatingColumns):
    """Build the (uncompiled) stacked LSTM autoencoder.

    The network is a chain of LSTM layers that all return full sequences:
    an encoder narrowing to 60, 35 and 20 units, a decoder widening back to
    35 and 60 units, and a final LSTM with ``nOperatingColumns`` units that
    emits the reconstruction. Every layer except the last uses an input
    dropout of 0.2.

    Parameters
    ----------
    nTimesteps : int
        Length of each input window.
    nOperatingColumns : int
        Number of features per timestep (also the output width).

    Returns
    -------
    keras.models.Model
        Model mapping an input of shape (nTimesteps, nOperatingColumns) to a
        reconstruction of the same shape.
    """
    # Sizes of the encoded representations, then of the mirrored decoder.
    encoderUnits = (60, 35, 20)
    decoderUnits = (35, 60)
    dropoutRate = 0.2

    sequenceInput = Input(shape=(nTimesteps, nOperatingColumns))

    # return_sequences=True hands the whole sequence to the next LSTM layer,
    # which is required because every layer in the stack is an LSTM.
    hidden = sequenceInput
    for units in encoderUnits + decoderUnits:
        hidden = LSTM(units, return_sequences=True, dropout=dropoutRate)(hidden)

    reconstruction = LSTM(nOperatingColumns, return_sequences=True)(hidden)

    # This model maps an input to its reconstruction.
    return Model(sequenceInput, reconstruction)


# Trainer

In [None]:
import math
from keras.models import Model, model_from_json
from keras.callbacks import ModelCheckpoint, EarlyStopping
from numpy.testing import assert_allclose
from keras.models import load_model
from sklearn.preprocessing import MinMaxScaler
from keras.utils.vis_utils import plot_model
import matplotlib.pyplot as plt
import tensorflow as tf
import os

# Specify which GPU(s) to use: restrict CUDA to the device with index 1.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # Or 2, 3, etc. other than 0

# On CPU/GPU placement: allow falling back to another device when an op cannot
# run on the requested one, and log where each op is placed.
config = tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=True)
# Allocate GPU memory on demand instead of reserving it all up front.
config.gpu_options.allow_growth = True
# NOTE(review): the session is created but not stored or registered anywhere in
# this cell - confirm it has the intended effect on the Keras training below.
tf.compat.v1.Session(config=config)


# Load the training CSV. loadDataset() keeps only the 'NormalTraffic' rows, so
# the autoencoder is trained on normal traffic alone.
dataset_train = loadDataset("SCVIC_APT/Training.csv")
datasetType = 'other'
_nTotal = dataset_train.shape[0]
_nColumns = dataset_train.shape[1]
# Using 75% of the data for training and the remaining 25% for validation
_nSamplesTrain = math.ceil(_nTotal * 0.75)
_nSamplesValidation = _nTotal - _nSamplesTrain
_nTimesteps = 3

# The helpers take positional [start, end) row bounds: rows 0.._nSamplesTrain-1
# form the training split, the rest form the validation split.
X_train = getEncoderInput(datasetType, dataset_train, 0, _nSamplesTrain, _nColumns)
y = getEncoderLabelCoulmn(datasetType, dataset_train, 0, _nSamplesTrain)

Validation_X = getEncoderInput(datasetType, dataset_train, _nSamplesTrain, _nSamplesTrain + _nSamplesValidation, _nColumns)

# Feature scaling - normalization recommended for RNNs.
# The scaler is fitted on the training split ONLY and then merely applied to the
# validation split. Refitting it on the validation data would scale the two
# splits inconsistently and would leave `sc` (re-used later for the test set)
# fitted on validation statistics instead of training statistics.
sc = MinMaxScaler(feature_range=(0, 1))
X_train = sc.fit_transform(X_train)
Validation_X = sc.transform(Validation_X)

# Converting the scaled matrices into LSTM input windows
_nOperatingColumns = len(X_train[0])
X_train_sequence = getEncoderInputSequence(X_train, _nTimesteps, _nOperatingColumns)
Validation_X_sequence = getEncoderInputSequence(Validation_X, _nTimesteps, _nOperatingColumns)

# Build the stacked LSTM autoencoder and compile it with Adam and an MSE
# reconstruction loss.
sequence_autoencoder_semi = getSAE_LSTM(_nTimesteps, _nOperatingColumns)
sequence_autoencoder_semi.compile(optimizer='adam', loss='mean_squared_error')

modelName = "sae_lstm"

#Adding checkpoints
# The checkpoint is written whenever the monitored value improves.
# NOTE(review): it monitors the TRAINING loss ('loss'), not 'val_loss' - confirm
# that selecting the best epoch by training loss is intended.
checkpointFile = modelName + ".h5"
checkpoint = ModelCheckpoint(checkpointFile, monitor='loss', verbose=1, save_best_only=True, mode='min')
# Early stopping on the validation loss is currently disabled:
#earlyStopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
#callbacks_list = [checkpoint, earlyStopping]
callbacks_list = [checkpoint]

# Resuming from a previous checkpoint / printing the summary is currently disabled:
#sequence_autoencoder_semi = load_model(checkpointFile)
#sequence_autoencoder_semi.summary()
# Write a diagram of the architecture to 'sae_lstm_model_plot.png'.
plot_model(sequence_autoencoder_semi, to_file='sae_lstm_model_plot.png', show_shapes=True, show_layer_names=True)
#Training autoencoder
# Input and target are the same windows (reconstruction task). shuffle=False
# keeps the batches in their original window order.
sequence_autoencoder_semi_history = sequence_autoencoder_semi.fit(X_train_sequence, X_train_sequence,
                     epochs=10,
                     batch_size=32,
                     shuffle=False,
                     validation_data=(Validation_X_sequence, 
                     Validation_X_sequence), 
                     callbacks=callbacks_list)



# Save the final model to an .h5 file (no JSON file is written here).
# NOTE(review): this path is identical to checkpointFile, so the model as of the
# last epoch overwrites whatever best-only checkpoint ModelCheckpoint stored.
sequence_autoencoder_semi.save( modelName + ".h5")
print("Saved model to disk")

# loss = sequence_autoencoder_semi_history.history['loss']
# val_loss = sequence_autoencoder_semi_history.history['val_loss']
# epochs = range(3)
# plt.figure()
# plt.plot(epochs, loss, color='red', label='Training loss')
# plt.plot(epochs, val_loss, color='blue', label='Validation loss')
# plt.title('Training and Validation loss')
# plt.xlabel('epochs')
# plt.ylabel('loss')
# plt.legend()
# #plt.savefig('LossColored_' + modelName + '.png')
# plt.show()


Device mapping: no known devices.
Epoch 1/10
Epoch 1: loss improved from inf to 0.00944, saving model to sae_lstm.h5
Epoch 2/10
Epoch 2: loss improved from 0.00944 to 0.00478, saving model to sae_lstm.h5
Epoch 3/10
Epoch 3: loss improved from 0.00478 to 0.00398, saving model to sae_lstm.h5
Epoch 4/10
Epoch 4: loss improved from 0.00398 to 0.00347, saving model to sae_lstm.h5
Epoch 5/10
Epoch 5: loss improved from 0.00347 to 0.00282, saving model to sae_lstm.h5
Epoch 6/10
Epoch 6: loss improved from 0.00282 to 0.00257, saving model to sae_lstm.h5
Epoch 7/10
Epoch 7: loss improved from 0.00257 to 0.00237, saving model to sae_lstm.h5
Epoch 8/10
Epoch 8: loss improved from 0.00237 to 0.00225, saving model to sae_lstm.h5
Epoch 9/10
Epoch 9: loss improved from 0.00225 to 0.00216, saving model to sae_lstm.h5
Epoch 10/10
Epoch 10: loss improved from 0.00216 to 0.00209, saving model to sae_lstm.h5
Saved model to disk


# Eval

In [None]:
import joblib
# Write the MinMaxScaler `sc` to disk with joblib.
preprocess_filename = "preprocess_pipeline_SAE_LSTM.save"
# Bare last expression: the notebook displays joblib.dump's return value.
joblib.dump(sc, preprocess_filename) 

['preprocess_pipeline_SAE_LSTM.save']

In [None]:
import torch
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score

In [None]:
# Load the test CSV and clean it the same way loadDataset() cleans the training
# data (drop identifier columns, +/-inf -> -1, NaN -> 0), but keep every label.
test_df = (
    pd.read_csv("SCVIC_APT/Testing.csv")
    .drop(columns=['Flow ID', 'Src IP', 'Dst IP', 'Timestamp'])
    .replace([np.inf, -np.inf], -1)
    .replace([np.nan], 0)
)

# One evaluation subset per attack label: normal traffic plus that single label.
pivoting_test_df = test_df.loc[test_df['Label'].isin(['NormalTraffic', 'Pivoting'])]

reconnaissance_test_df = test_df.loc[test_df['Label'].isin(['NormalTraffic', 'Reconnaissance'])]

lateralmovement_test_df = test_df.loc[test_df['Label'].isin(['NormalTraffic', 'LateralMovement'])]

dataexfiltration_test_df = test_df.loc[test_df['Label'].isin(['NormalTraffic', 'DataExfiltration'])]

initialcompromise_test_df = test_df.loc[test_df['Label'].isin(['NormalTraffic', 'InitialCompromise'])]


datasetType = 'other'
# _nTotal is the row count of the FULL test set; it is used as the slice end
# for every subset, which simply selects all rows of the smaller subsets.
_nTotal = test_df.shape[0]
_nColumns = test_df.shape[1]

_nTimesteps = 3

# Evaluate on the full test set first, then on each "normal + one attack" subset.
for dataset_test in [test_df, pivoting_test_df, reconnaissance_test_df, lateralmovement_test_df, dataexfiltration_test_df, initialcompromise_test_df]:
    X_test = getEncoderInput(datasetType, dataset_test, 0, _nTotal, _nColumns)
    # getEncoderInputSequence() yields len(X) - _nTimesteps windows, so the last
    # _nTimesteps labels are dropped to match.
    # NOTE(review): this pairs window k (rows k .. k+_nTimesteps-1) with the label
    # of row k, i.e. the FIRST row of the window - confirm that is intended.
    y_test = getEncoderLabelCoulmn(datasetType, dataset_test, 0, _nTotal)[:-_nTimesteps]
    y_true = y_test.ravel()  # 1-D view for counting and for the sklearn metrics

    # Feature scaling with the scaler from the training stage (no refitting).
    X_test = sc.transform(X_test)

    # Converting the test inputs into LSTM input windows
    _nOperatingColumns = len(X_test[0])
    X_test_sequence = getEncoderInputSequence(X_test, _nTimesteps, _nOperatingColumns)

    # Pass the windows through the autoencoder to get their reconstructions.
    reconstructions = sequence_autoencoder_semi.predict(X_test_sequence)

    # Anomaly score = mean squared reconstruction error per window, computed in
    # float32 over the flattened (timesteps * features) values of each window.
    inputs_flat = torch.as_tensor(X_test_sequence, dtype=torch.float32).flatten(start_dim=1)
    reconstructions_flat = torch.as_tensor(reconstructions, dtype=torch.float32).flatten(start_dim=1)
    gap_loss = torch.mean(torch.nn.functional.mse_loss(inputs_flat, reconstructions_flat, reduction='none'), dim=1)
    scores = gap_loss.detach().numpy()

    n_attacks = int(y_true.sum())

    # Ranking metrics are undefined when only one class is present
    # (roc_auc_score raises ValueError in that case).
    if 0 < n_attacks < len(y_true):
        print(roc_auc_score(y_true, scores))
        print(average_precision_score(y_true, scores))
    else:
        print("Skipping ROC-AUC / average precision: y_test contains a single class")

    # Flag the n_attacks highest-scoring windows as attacks (top-k thresholding).
    # The n_attacks > 0 guard matters: a slice of [-0:] would select EVERY window.
    top_k = np.zeros(scores.shape)
    if n_attacks > 0:
        ind = np.argpartition(scores, -n_attacks)[-n_attacks:]
        top_k[ind] = 1

    print(classification_report(y_true, top_k))

0.8266275471535888
0.04491375707595505
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     55580
           1       0.05      0.05      0.05       904

    accuracy                           0.97     56484
   macro avg       0.52      0.52      0.52     56484
weighted avg       0.97      0.97      0.97     56484

0.8246257646635481
0.018174234875925614
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     55580
           1       0.01      0.01      0.01       360

    accuracy                           0.99     55940
   macro avg       0.50      0.50      0.50     55940
weighted avg       0.99      0.99      0.99     55940

0.8247972485731777
0.01275628599114658
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     55580
           1       0.00      0.00      0.00       251

    accuracy                           0.99     55831
   macro a