In [None]:
# read & manipulate data
import pandas as pd 
import numpy as np
import tensorflow as tf

# visualisations
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid', context='notebook')
%matplotlib notebook

# misc
import random as rn

# notebook-wide tunables
RANDOM_SEED = 42
TRAINING_SAMPLE = 200000
# VALIDATE_SIZE = 0.2

# seed NumPy, the stdlib `random` module and TensorFlow with the same value
# so that repeated runs draw the same random numbers
for seed_rng in (np.random.seed, rn.seed, tf.random.set_seed):
    seed_rng(RANDOM_SEED)

In [None]:
from sklearn.svm import OneClassSVM as ocSVM

### Preprocessing Data

### Loading the dataset

In [None]:
# load the training split of the dataset from a CSV file
# (the path is relative to the notebook's working directory)
train_df = pd.read_csv('SCVIC_APT/Training.csv')

In [None]:
# (rows, columns) of the training frame as loaded
train_df.shape

(259120, 84)

In [None]:
# list the column names available in the training frame
train_df.columns

Index(['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol',
       'Timestamp', 'Flow Duration', 'Total Fwd Packet', 'Total Bwd packets',
       'Total Length of Fwd Packet', 'Total Length of Bwd Packet',
       'Fwd Packet Length Max', 'Fwd Packet Length Min',
       'Fwd Packet Length Mean', 'Fwd Packet Length Std',
       'Bwd Packet Length Max', 'Bwd Packet Length Min',
       'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s',
       'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
       'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean',
       'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
       'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Packet Length Min', 'Packet Length Max', 'Packet Length Mean',
       'Packet Length Std', 'Packet Len

In [None]:
# load the test split from its own CSV file
test_df = pd.read_csv('SCVIC_APT/Testing.csv')

In [None]:
# flow identifier, endpoint addresses and timestamp are removed from both frames (in place)
dropped_columns = ['Flow ID', 'Src IP', 'Dst IP', 'Timestamp']
for frame in (train_df, test_df):
    frame.drop(columns=dropped_columns, inplace=True)

## NaN & Inf

In [None]:
# same clean-up for both frames, applied in place:
# +inf / -inf become -1, NaN becomes 0
for frame in (train_df, test_df):
    frame.replace([np.inf, -np.inf], -1, inplace=True)
    frame.replace([np.nan], 0, inplace=True)

In [None]:
# for col in train_df.columns:
#     m = train_df.loc[train_df[col] != np.inf, col].max()
#     train_df[col].replace(np.inf,m,inplace=True)
#     test_df[col].replace(np.inf,m,inplace=True)

In [None]:
# X_train_cl.replace([np.inf, -np.inf], -1, inplace=True)
# X_train_cl.replace([np.nan], 0, inplace=True)

In [None]:
# number of rows per class label in the training frame
train_df['Label'].value_counts()

NormalTraffic        254836
Pivoting               2122
Reconnaissance          833
LateralMovement         729
DataExfiltration        527
InitialCompromise        73
Name: Label, dtype: int64

In [None]:
# shape of the subset of training rows labelled 'NormalTraffic'
train_df[train_df['Label']=='NormalTraffic'].shape

(254836, 80)

In [None]:
# keep only the training rows labelled 'NormalTraffic'
is_normal = train_df['Label'] == 'NormalTraffic'
ae_train_df = train_df[is_normal]

In [None]:
# training features/labels come from the normal-only frame,
# test features/labels from the full test frame
X_train = ae_train_df.drop(columns=['Label'])
y_train = ae_train_df['Label']
X_test = test_df.drop(columns=['Label'])
y_test = test_df['Label']

In [None]:
# X_train_cl, y_train_cl = train_df.drop(columns=['Label', 'Flow ID', 'Src IP', 'Dst IP', 'Timestamp']), train_df['Label']

In [None]:
# Binary targets: 1 for any attack label, 0 for 'NormalTraffic'.
# They are built from the source frames rather than from y_train / y_test
# themselves so the cell can be re-run safely: re-applying the comparison to the
# already-converted integer series would mark every row as 1.
y_train = (ae_train_df['Label'] != 'NormalTraffic').astype(int)
y_test = (test_df['Label'] != 'NormalTraffic').astype(int)

## Scaling

In [None]:
from sklearn.preprocessing import Normalizer, MinMaxScaler
from sklearn.pipeline import Pipeline

# two-stage preprocessing: Normalizer first, MinMaxScaler on its output
preprocessing_steps = [
    ('normalizer', Normalizer()),
    ('scaler', MinMaxScaler()),
]
pipeline = Pipeline(steps=preprocessing_steps)

In [None]:
# fit the preprocessing steps on the training features only
# (X_train holds the 'NormalTraffic' rows), so nothing is learned from the test set
pipeline.fit(X_train)

Pipeline(steps=[('normalizer', Normalizer()), ('scaler', MinMaxScaler())])

In [None]:
import joblib
# write the fitted preprocessing pipeline to a file in the working directory
preprocess_filename = "preprocess_pipeline_ocSVM.save"
joblib.dump(pipeline, preprocess_filename) 

['preprocess_pipeline_ocSVM.save']

In [None]:
# transform the training data with the parameters fitted above
X_train_transformed = pipeline.transform(X_train)

### OneClassSVM

In [None]:
# # Load the extension and start TensorBoard
# %load_ext tensorboard
# %tensorboard --logdir logs

### Training

In [None]:
# Fit the one-class SVM on the preprocessed training features
svm_params = {'nu': 0.01, 'kernel': 'rbf', 'gamma': 'auto'}
one_class_svm = ocSVM(**svm_params).fit(X_train_transformed)

### Evaluation

In [None]:
import torch
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score

In [None]:
def aucPerformance(scores, labels):
    """Print the ROC AUC and the average precision of `scores` against `labels`.

    `labels` is passed to sklearn's `roc_auc_score` and
    `average_precision_score` as the ground truth and `scores` as the
    predicted scores. Nothing is returned; both metrics are printed with
    four decimal places.
    """
    metrics = (
        roc_auc_score(labels, scores),
        average_precision_score(labels, scores),
    )
    print("AUC-ROC: %.4f, AUC-PR: %.4f" % metrics)

In [None]:
# Per-attack evaluation subsets: every 'NormalTraffic' row plus the rows of one attack label.
pivoting_test_df = test_df[test_df['Label'].isin(['NormalTraffic', 'Pivoting'])]

reconnaissance_test_df = test_df[test_df['Label'].isin(['NormalTraffic', 'Reconnaissance'])]

lateralmovement_test_df = test_df[test_df['Label'].isin(['NormalTraffic', 'LateralMovement'])]

dataexfiltration_test_df = test_df[test_df['Label'].isin(['NormalTraffic', 'DataExfiltration'])]

initialcompromise_test_df = test_df[test_df['Label'].isin(['NormalTraffic', 'InitialCompromise'])]

print("OneClassSVM fundamentally doesn't support converting a decision into a probability score, so you cannot pass the necessary scores into functions that require varying a score threshold, such as for ROC or Precision-Recall curves and scores.")
for df in [test_df, pivoting_test_df, reconnaissance_test_df, lateralmovement_test_df, dataexfiltration_test_df, initialcompromise_test_df]:
    X_test, y_test = df.drop(columns=['Label']), df['Label']

    print(y_test.unique())

    # binary ground truth: 1 = any attack label, 0 = 'NormalTraffic'
    y_test = (y_test != 'NormalTraffic').astype(int)

    # transform the test set with the pipeline fitted to the training set
    X_test_transformed = pipeline.transform(X_test)

    # OneClassSVM decision values: positive for inliers, negative for outliers
    score = one_class_svm.decision_function(X_test_transformed)

    # Flag the lowest-scoring fraction of samples, where the fraction equals the
    # anomaly rate of this subset. The percentile must NOT be rounded to an
    # integer: the per-attack subsets have anomaly rates below 0.5 %, which would
    # round to percentile 0, put the threshold at the minimum score and leave
    # nothing flagged (precision/recall of 0 for the attack class).
    # NOTE: the threshold is derived from the test labels themselves, so this
    # report is an optimistic estimate, not a deployable operating point.
    anomaly_percentile = 100.0 * y_test.mean()
    score_threshold = np.percentile(score, anomaly_percentile)

    # 1 = predicted anomaly (score strictly below the threshold), 0 = predicted normal
    customized_prediction = (score < score_threshold).astype(int).tolist()

    # flip the sign convention so that a larger value means "more anomalous",
    # matching the positive class (label 1) expected by the metric
    score_2 = score.max() - score

    print("AUC-PR:")
    print(average_precision_score(y_test, score_2))

    # Check the prediction performance at the chosen threshold
    print(classification_report(y_test, customized_prediction))

OneClassSVM fundamentally doesn't support converting a decision into a probability score, so you cannot pass the necessary scores into functions that require varying a score threshold, such as for ROC or Precision-Recall curves and scores.
['NormalTraffic' 'InitialCompromise' 'Reconnaissance' 'Pivoting'
 'LateralMovement' 'DataExfiltration']
AUC-PR:
0.247824619500908
              precision    recall  f1-score   support

           0       0.99      0.98      0.99     55583
           1       0.22      0.27      0.24       904

    accuracy                           0.97     56487
   macro avg       0.60      0.63      0.61     56487
weighted avg       0.98      0.97      0.97     56487

['NormalTraffic' 'Pivoting']
AUC-PR:
0.23124262398697415
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     55583
           1       0.20      0.31      0.24       360

    accuracy                           0.99     55943
   macro avg       0.60     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


AUC-PR:
0.18626852309540323
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     55583
           1       0.00      0.00      0.00       142

    accuracy                           1.00     55725
   macro avg       0.50      0.50      0.50     55725
weighted avg       0.99      1.00      1.00     55725

['NormalTraffic' 'DataExfiltration']


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


AUC-PR:
0.11513419113547582
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     55583
           1       0.00      0.00      0.00        74

    accuracy                           1.00     55657
   macro avg       0.50      0.50      0.50     55657
weighted avg       1.00      1.00      1.00     55657

['NormalTraffic' 'InitialCompromise']


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


AUC-PR:
0.2134580304758805
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     55583
           1       0.00      0.00      0.00        77

    accuracy                           1.00     55660
   macro avg       0.50      0.50      0.50     55660
weighted avg       1.00      1.00      1.00     55660



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Threshold at the percentile equal to the anomaly rate of the labels.
# Relies on `score` and `y_test` as left behind by the last iteration of the
# evaluation loop in the previous cell.
# The percentile is kept unrounded: rounding turns any rate below 0.5 % into
# percentile 0, i.e. a threshold equal to the minimum score.
score_threshold = np.percentile(score, 100.0 * y_test.mean())

In [None]:
from sklearn.metrics import roc_curve, auc

# Per-attack evaluation subsets: every 'NormalTraffic' row plus the rows of one attack label.
pivoting_test_df = test_df[test_df['Label'].isin(['NormalTraffic', 'Pivoting'])]

reconnaissance_test_df = test_df[test_df['Label'].isin(['NormalTraffic', 'Reconnaissance'])]

lateralmovement_test_df = test_df[test_df['Label'].isin(['NormalTraffic', 'LateralMovement'])]

dataexfiltration_test_df = test_df[test_df['Label'].isin(['NormalTraffic', 'DataExfiltration'])]

initialcompromise_test_df = test_df[test_df['Label'].isin(['NormalTraffic', 'InitialCompromise'])]

for df in [test_df, pivoting_test_df, reconnaissance_test_df, lateralmovement_test_df, dataexfiltration_test_df, initialcompromise_test_df]:
    X_test, y_test = df.drop(columns=['Label']), df['Label']

    print(y_test.unique())

    # binary ground truth: 1 = any attack label, 0 = 'NormalTraffic'
    y_test = (y_test != 'NormalTraffic').astype(int)

    # transform the test set with the pipeline fitted to the training set
    X_test_transformed = pipeline.transform(X_test)

    # OneClassSVM decision values: positive for inliers, negative for outliers
    score = one_class_svm.decision_function(X_test_transformed)

    # Flip the sign convention so that a larger value means "more anomalous",
    # which is what the metric functions expect for the positive class (label 1).
    score_2 = score.max() - score

    # Both metrics are computed on the same flipped score. Using |score| for the
    # ROC curve would rank samples deep inside the learned region as highly
    # anomalous and make AUC-ROC inconsistent with AUC-PR.
    fpr, tpr, threshold = roc_curve(y_test, score_2)
    roc_auc = auc(fpr, tpr)
    print("AUC-ROC:", roc_auc)

    print("AUC-PR:", average_precision_score(y_test, score_2))

['NormalTraffic' 'InitialCompromise' 'Reconnaissance' 'Pivoting'
 'LateralMovement' 'DataExfiltration']
AUC-ROC: 0.6927283187592055
AUC-PR: 0.247824619500908
['NormalTraffic' 'Pivoting']
AUC-ROC: 0.7298849868165126
AUC-PR: 0.23124262398697415
['NormalTraffic' 'Reconnaissance']
AUC-ROC: 0.6840252469065142
AUC-PR: 0.1024332081956156
['NormalTraffic' 'LateralMovement']
AUC-ROC: 0.629406777277377
AUC-PR: 0.18626852309540323
['NormalTraffic' 'DataExfiltration']
AUC-ROC: 0.7121470642151426
AUC-PR: 0.11513419113547582
['NormalTraffic' 'InitialCompromise']
AUC-ROC: 0.6454912052666761
AUC-PR: 0.2134580304758805
