# Libraries & random seeds

In [None]:
# read & manipulate data
import pandas as pd 
import numpy as np
import tensorflow as tf

# visualisations
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid', context='notebook')
%matplotlib notebook

# misc
import random as rn

# manual parameters
RANDOM_SEED = 42
TRAINING_SAMPLE = 200_000
VALIDATE_SIZE = 0.2

# seed NumPy, Python's `random` and TensorFlow with the same value so that a
# fresh top-to-bottom run is repeatable
for _seed_rng in (np.random.seed, rn.seed, tf.random.set_seed):
    _seed_rng(RANDOM_SEED)

# Preprocess

## Load

In [None]:
# load the training split of the dataset (path is relative to the notebook's
# working directory)
train_df = pd.read_csv('SCVIC_APT/Training.csv')

In [None]:
train_df.shape

(259120, 84)

In [None]:
train_df.columns

Index(['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol',
       'Timestamp', 'Flow Duration', 'Total Fwd Packet', 'Total Bwd packets',
       'Total Length of Fwd Packet', 'Total Length of Bwd Packet',
       'Fwd Packet Length Max', 'Fwd Packet Length Min',
       'Fwd Packet Length Mean', 'Fwd Packet Length Std',
       'Bwd Packet Length Max', 'Bwd Packet Length Min',
       'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s',
       'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
       'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean',
       'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
       'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Packet Length Min', 'Packet Length Max', 'Packet Length Mean',
       'Packet Length Std', 'Packet Len

In [None]:
# load the held-out testing split; it is only used in the evaluation section
test_df = pd.read_csv('SCVIC_APT/Testing.csv')

In [None]:
# Remove the flow identifier, endpoint address and timestamp columns from both
# splits; they are not used as model features.
train_df, test_df = (
    frame.drop(columns=['Flow ID', 'Src IP', 'Dst IP', 'Timestamp'])
    for frame in (train_df, test_df)
)

In [None]:
train_df

Unnamed: 0,Src Port,Dst Port,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,49331,389,6,4975,11,32,337.0,3127.0,155.0,0.0,...,20,0.0,0.0,0.0,0.0,1.602165e+15,0.0,1.602165e+15,1.602165e+15,NormalTraffic
1,49332,389,6,4110,9,7,369.0,414.0,155.0,0.0,...,20,0.0,0.0,0.0,0.0,1.602165e+15,0.0,1.602165e+15,1.602165e+15,NormalTraffic
2,0,0,0,1671818,3,0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,1.602165e+15,0.0,1.602165e+15,1.602165e+15,NormalTraffic
3,0,0,0,1672538,3,0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,1.602165e+15,0.0,1.602165e+15,1.602165e+15,NormalTraffic
4,137,137,17,2281415,4,0,272.0,0.0,68.0,68.0,...,8,0.0,0.0,0.0,0.0,1.602165e+15,0.0,1.602165e+15,1.602165e+15,NormalTraffic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259115,949,53,17,4003201,3,3,93.0,93.0,31.0,31.0,...,8,0.0,0.0,0.0,0.0,1.450000e+15,0.0,1.450000e+15,1.450000e+15,NormalTraffic
259116,49156,102,6,4057020,20,10,990.0,500.0,99.0,0.0,...,20,0.0,0.0,0.0,0.0,1.450000e+15,0.0,1.450000e+15,1.450000e+15,NormalTraffic
259117,949,53,17,4003254,3,3,93.0,93.0,31.0,31.0,...,8,0.0,0.0,0.0,0.0,1.450000e+15,0.0,1.450000e+15,1.450000e+15,NormalTraffic
259118,49156,102,6,4043435,20,10,990.0,500.0,99.0,0.0,...,20,0.0,0.0,0.0,0.0,1.450000e+15,0.0,1.450000e+15,1.450000e+15,NormalTraffic


In [None]:
# Strip every whitespace character from the column names
# ('Flow Duration' -> 'FlowDuration') so they match the feature list used below.
# `a` is kept as a module-level name because the next cell reuses it for test_df.
a = [''.join(column.split()) for column in train_df.columns]

# Relabel the existing frame instead of rebuilding it from `.values`: going
# through `.values` collapses the mixed numeric/string columns into a single
# object-dtype array, which discards every column's numeric dtype.
train_df = train_df.set_axis(a, axis=1)

In [None]:
# Give the test frame the same whitespace-free column names as the training
# frame (`a`, built in the previous cell). Names are assigned by position, as
# before, so this assumes both CSVs share the same column order -- TODO confirm.
# Relabelling (rather than rebuilding from `.values`) keeps the numeric dtypes.
test_df = test_df.set_axis(a, axis=1)

In [None]:
# The 39 model features plus the target column, declared once and applied to
# both splits so the two selections cannot drift apart.
SELECTED_COLUMNS = [
    'FlowDuration',
    'BwdPacketLengthMax', 'BwdPacketLengthMin', 'BwdPacketLengthMean', 'BwdPacketLengthStd',
    'FlowIATMean', 'FlowIATStd', 'FlowIATMax', 'FlowIATMin',
    'FwdIATTotal', 'FwdIATMean', 'FwdIATStd', 'FwdIATMax', 'FwdIATMin',
    'BwdIATTotal', 'BwdIATMean', 'BwdIATStd', 'BwdIATMax', 'BwdIATMin',
    'FwdPSHFlags',
    'FwdPackets/s',
    'PacketLengthMax', 'PacketLengthMean', 'PacketLengthStd', 'PacketLengthVariance',
    'FINFlagCount', 'SYNFlagCount', 'PSHFlagCount', 'ACKFlagCount', 'URGFlagCount',
    'AveragePacketSize',
    'BwdSegmentSizeAvg',
    'FWDInitWinBytes', 'BwdInitWinBytes',
    'ActiveMin',
    'IdleMean', 'IdleStd', 'IdleMax', 'IdleMin',
    'Label',
]

train_df = train_df[SELECTED_COLUMNS]
test_df = test_df[SELECTED_COLUMNS]

## NaN & Inf

In [None]:
# Encode non-finite values with fixed sentinels in both splits:
# +/-inf becomes -1 first, then any NaN becomes 0.
train_df, test_df = (
    frame.replace([np.inf, -np.inf], -1).replace([np.nan], 0)
    for frame in (train_df, test_df)
)

In [None]:
# for col in train_df.columns:
#     m = train_df.loc[train_df[col] != np.inf, col].max()
#     train_df[col].replace(np.inf,m,inplace=True)
#     test_df[col].replace(np.inf,m,inplace=True)

In [None]:
# X_train_cl.replace([np.inf, -np.inf], -1, inplace=True)
# X_train_cl.replace([np.nan], 0, inplace=True)

In [None]:
train_df['Label'].value_counts()

NormalTraffic        254836
Pivoting               2122
Reconnaissance          833
LateralMovement         729
DataExfiltration        527
InitialCompromise        73
Name: Label, dtype: int64

In [None]:
train_df[train_df['Label']=='NormalTraffic'].shape

(254836, 40)

In [None]:
# The autoencoder is trained on benign flows only, so keep just the
# 'NormalTraffic' rows of the training split.
is_normal_traffic = train_df['Label'].eq('NormalTraffic')
ae_train_df = train_df[is_normal_traffic]

In [None]:
# Separate the feature matrix from the target column for the normal-only
# training data and for the full test split.
X_train = ae_train_df.drop(columns=['Label'])
y_train = ae_train_df['Label']
X_test = test_df.drop(columns=['Label'])
y_test = test_df['Label']

In [None]:
# X_train_cl, y_train_cl = train_df.drop(columns=['Label', 'Flow ID', 'Src IP', 'Dst IP', 'Timestamp']), train_df['Label']

## Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 25% of the normal-traffic rows for validation.
# - The split is taken from `ae_train_df` rather than from `X_train` itself, so
#   re-running this cell always yields the same partition instead of shrinking
#   an already-split X_train again.
# - `random_state` pins the shuffle explicitly; relying on the global NumPy seed
#   is only reproducible on a fresh top-to-bottom run.
# - The labels contain a single class here, so `stratify` has no practical effect.
normal_features = ae_train_df.drop(columns=['Label'])
normal_labels = ae_train_df['Label']
X_train, X_val, y_train, y_val = train_test_split(
    normal_features, normal_labels,
    stratify=normal_labels,
    train_size=0.75,
    random_state=RANDOM_SEED,
)

In [None]:
# X_train_cl, X_val_cl, y_train_cl, y_val_cl = train_test_split(X_train_cl, y_train_cl, stratify=y_train_cl, train_size=0.75)

In [None]:
y_train.unique()

array(['NormalTraffic'], dtype=object)

In [None]:
# y_train_cl.unique()

In [None]:
y_val.unique()

array(['NormalTraffic'], dtype=object)

In [None]:
# y_val_cl.unique()

In [None]:
# y_test.unique()

In [None]:
# Binary targets: 1 for any attack label, 0 for 'NormalTraffic'.
y_train = y_train.ne('NormalTraffic').astype(int)
y_val = y_val.ne('NormalTraffic').astype(int)
y_test = y_test.ne('NormalTraffic').astype(int)

In [None]:
# y_train_cl = (y_train_cl!='NormalTraffic').astype(int)
# y_val_cl = (y_val_cl!='NormalTraffic').astype(int)

## Scaling

In [None]:
X_train.shape

(191127, 39)

In [None]:
from sklearn.preprocessing import Normalizer, MinMaxScaler
from sklearn.pipeline import Pipeline

# preprocessing pipeline: per-sample normalisation followed by per-feature
# min-max scaling
pipeline = Pipeline(
    steps=[
        ('normalizer', Normalizer()),
        ('scaler', MinMaxScaler()),
    ]
)

In [None]:
# get normalization parameters by fitting to the training data only
# (X_train holds NormalTraffic rows), so the validation and test sets do not
# influence the learned scaling
pipeline.fit(X_train)

Pipeline(steps=[('normalizer', Normalizer()), ('scaler', MinMaxScaler())])

In [None]:
import joblib
# persist the fitted preprocessing pipeline next to the notebook so the same
# transformation can be re-applied later without refitting
preprocess_filename = "preprocess_pipeline_AE_39ft.save"
joblib.dump(pipeline, preprocess_filename) 

['preprocess_pipeline_AE_39ft.save']

In [None]:
# apply the fitted pipeline to the training and validation features
X_train_transformed, X_validate_transformed = map(pipeline.transform, (X_train, X_val))

In [None]:
# X_train_cl_transformed = pipeline.transform(X_train_cl)
# X_validate_cl_transformed = pipeline.transform(X_val_cl)

# AutoEncoder

In [None]:
# # Load the extension and start TensorBoard
# %load_ext tensorboard
# %tensorboard --logdir logs

## Model architecture

In [None]:
# hyperparameters // data dimensions
BATCH_SIZE = 256
EPOCHS = 100
input_dim = X_train_transformed.shape[1]

# Symmetric, fully connected autoencoder with ELU activations throughout:
#   input_dim -> 16 -> 8   (deconstruct / encode)
#   8 -> 16 -> input_dim   (reconstruction / decode)
# Deeper 4- and 2-unit bottlenecks and a sigmoid output were tried and left out.
# https://keras.io/layers/core/
autoencoder = tf.keras.models.Sequential()
autoencoder.add(
    tf.keras.layers.Dense(input_dim, activation='elu', input_shape=(input_dim, ))
)
for layer_units in (16, 8, 8, 16, input_dim):
    autoencoder.add(tf.keras.layers.Dense(layer_units, activation='elu'))

# Reconstruction quality is optimised and reported as mean squared error
# (binary cross-entropy was the discarded alternative).
# https://keras.io/api/models/model_training_apis/
autoencoder.compile(
    optimizer="adam",
    loss="mse",
    metrics=["mse"],
)

# print an overview of our model
autoencoder.summary();

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 39)                1560      
                                                                 
 dense_1 (Dense)             (None, 16)                640       
                                                                 
 dense_2 (Dense)             (None, 8)                 136       
                                                                 
 dense_3 (Dense)             (None, 8)                 72        
                                                                 
 dense_4 (Dense)             (None, 16)                144       
                                                                 
 dense_5 (Dense)             (None, 39)                663       
                                                                 
Total params: 3,215
Trainable params: 3,215
Non-trainabl

# Training

In [None]:
from datetime import datetime

# current date and time, used to give every run its own TensorBoard folder
yyyymmddHHMM = datetime.now().strftime('%Y%m%d%H%M')

# new folder for a new run
log_subdir = f'{yyyymmddHHMM}_batch{BATCH_SIZE}_layers{len(autoencoder.layers)}'

# stop once val_loss has not improved by at least min_delta for `patience`
# epochs, and roll the model back to the best weights seen
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.0001,
    patience=10,
    verbose=1,
    mode='min',
    restore_best_weights=True
)

# keep the best model (lowest val_loss) on disk
# NOTE(review): newer Keras releases restrict the accepted checkpoint file
# extensions -- confirm '.hdf5' is still accepted before upgrading.
save_model = tf.keras.callbacks.ModelCheckpoint(
    filepath='autoencoder_39ft.hdf5',
    save_best_only=True,
    monitor='val_loss',
    verbose=0,
    mode='min'
)

# `batch_size` is not a parameter of the TF2 TensorBoard callback (tf.keras 2.x
# ignores it with a warning; signatures without **kwargs reject it), so it is
# no longer passed. The batch size is still recorded in the log folder name.
tensorboard = tf.keras.callbacks.TensorBoard(
    f'logs/{log_subdir}',
    update_freq='batch'
)

# callbacks argument only takes a list
cb = [early_stop, save_model, tensorboard]



In [None]:
# history = autoencoder.fit(
#     X_train_transformed, y_train,
#     shuffle=True,
#     epochs=EPOCHS,
#     batch_size=BATCH_SIZE,
#     callbacks=cb,
#     validation_data=(X_validate_transformed, y_val)
# );

# An autoencoder is trained to reproduce its own input, so the transformed
# features serve as both inputs and targets (for training and validation alike).
history = autoencoder.fit(
    x=X_train_transformed,
    y=X_train_transformed,
    validation_data=(X_validate_transformed, X_validate_transformed),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    shuffle=True,
    callbacks=cb,
);

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 19: early stopping


# Evaluation

In [None]:
import torch
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score

In [None]:
def aucPerformance(scores, labels):
    """Print and return the ROC-AUC and average precision of anomaly scores.

    Parameters
    ----------
    scores : array-like
        Anomaly score per sample; higher means more anomalous.
    labels : array-like
        Binary ground truth per sample (1 = anomaly, 0 = normal).

    Returns
    -------
    tuple of (float, float)
        ``(roc_auc, average_precision)``. Returning the values (instead of
        None) lets callers reuse them and avoids a stray ``None`` when the
        call is wrapped in ``print``.
    """
    roc_auc = roc_auc_score(labels, scores)
    ap = average_precision_score(labels, scores)
    print("AUC-ROC: %.4f, AUC-PR: %.4f" % (roc_auc, ap))
    return roc_auc, ap

In [None]:
# One evaluation subset per attack stage: all normal traffic plus the flows of
# that single attack class.
(
    pivoting_test_df,
    reconnaissance_test_df,
    lateralmovement_test_df,
    dataexfiltration_test_df,
    initialcompromise_test_df,
) = (
    test_df[test_df['Label'].isin(['NormalTraffic', attack_label])]
    for attack_label in (
        'Pivoting',
        'Reconnaissance',
        'LateralMovement',
        'DataExfiltration',
        'InitialCompromise',
    )
)

# Evaluate the autoencoder on the full test set and on each per-attack subset.
for df in [test_df, pivoting_test_df, reconnaissance_test_df, lateralmovement_test_df, dataexfiltration_test_df, initialcompromise_test_df]:
    X_test, y_test = df.drop(columns=['Label']), df['Label']

    print(y_test.unique())

    # binary ground truth: 1 = any attack class, 0 = normal traffic
    y_test = (y_test!='NormalTraffic').astype(int)

    # transform the test set with the pipeline fitted to the training set
    X_test_transformed = pipeline.transform(X_test)

    # pass the transformed test set through the autoencoder to get the reconstructed result
    reconstructions = autoencoder.predict(X_test_transformed)

    # anomaly score per flow = mean squared reconstruction error over its features
    gap_loss = torch.mean(torch.nn.functional.mse_loss(torch.FloatTensor(X_test_transformed), torch.FloatTensor(reconstructions), reduction='none'), dim=1)

    # convert to NumPy once, so sklearn and NumPy never operate on a torch tensor
    scores = gap_loss.detach().numpy()

    print(roc_auc_score(y_test, scores))
    print(average_precision_score(y_test, scores))
    # aucPerformance prints its own summary line; wrapping the call in print()
    # only added a stray "None" to the output
    aucPerformance(scores, y_test)

    # flag the k highest-scoring flows as anomalies, with k = number of true
    # anomalies in this subset (vectorised count instead of Python-level sum)
    n_anomalies = int(y_test.sum())
    if n_anomalies > 0:
        ind = np.argpartition(scores, -n_anomalies)[-n_anomalies:]
    else:
        # a slice of [-0:] would select every row, so flag nothing explicitly
        ind = np.empty(0, dtype=int)

    top_k = np.zeros(scores.shape)
    top_k[ind] = 1

    print(classification_report(y_test,top_k))

['NormalTraffic' 'InitialCompromise' 'Reconnaissance' 'Pivoting'
 'LateralMovement' 'DataExfiltration']
0.9081297279409458
0.4243215845860784
AUC-ROC: 0.9081, AUC-PR: 0.4243
None
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     55583
           1       0.42      0.42      0.42       904

    accuracy                           0.98     56487
   macro avg       0.70      0.70      0.70     56487
weighted avg       0.98      0.98      0.98     56487

['NormalTraffic' 'Pivoting']
0.9717478065835478
0.4008164488837856
AUC-ROC: 0.9717, AUC-PR: 0.4008
None
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     55583
           1       0.40      0.40      0.40       360

    accuracy                           0.99     55943
   macro avg       0.70      0.70      0.70     55943
weighted avg       0.99      0.99      0.99     55943

['NormalTraffic' 'Reconnaissance']
0.9026672218346449
0.1459452