In [1]:
import torch

import copy
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from torch import nn, optim

import torch.nn.functional as F
#from arff2pandas import a2p
from tqdm import tqdm


%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams['figure.figsize'] = 12, 8

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x2caaa6ad5b0>

In [2]:
df = pd.read_csv('../swat_dataset_preprocessed', parse_dates=[" Timestamp"]).iloc[:,1:]
df

Unnamed: 0,Timestamp,FIT101,AIT201,AIT203,DPIT301,LIT301,AIT402,AIT503,AIT504,Normal/Attack
0,2015-12-28 10:00:00,2.427057,262.0161,328.6337,19.74838,956.1651,156.0882,264.5475,12.03538,Normal
1,2015-12-28 10:00:01,2.446274,262.0161,328.6337,19.74838,956.1651,156.0882,264.5475,12.03538,Normal
2,2015-12-28 10:00:02,2.489191,262.0161,328.6337,19.69076,956.4855,156.0882,264.5475,12.03538,Normal
3,2015-12-28 10:00:03,2.534350,262.0161,328.6337,19.69076,956.8060,156.0882,264.5475,12.03538,Normal
4,2015-12-28 10:00:04,2.569260,262.0161,328.6337,19.69076,957.0864,156.0882,264.5475,12.03538,Normal
...,...,...,...,...,...,...,...,...,...,...
449914,2016-02-01 14:59:55,2.559972,168.0979,301.9226,20.39823,974.5498,145.6037,257.1136,14.80390,Normal
449915,2016-02-01 14:59:56,2.549082,168.0979,301.9226,20.39823,974.5898,145.6037,257.1136,14.80390,Normal
449916,2016-02-01 14:59:57,2.531467,168.0979,301.9226,20.33101,974.2695,145.5524,257.1136,14.80390,Normal
449917,2016-02-01 14:59:58,2.521218,168.0979,301.9226,20.29579,974.2294,145.5524,257.1136,14.80390,Normal


In [3]:
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['Normal/Attack'])

In [4]:
label_encoder.classes_

array(['Attack', 'Normal'], dtype=object)

In [5]:
df['label'] = encoded_labels

In [6]:
df = df.drop(['Normal/Attack'],axis=1)
df.head()

Unnamed: 0,Timestamp,FIT101,AIT201,AIT203,DPIT301,LIT301,AIT402,AIT503,AIT504,label
0,2015-12-28 10:00:00,2.427057,262.0161,328.6337,19.74838,956.1651,156.0882,264.5475,12.03538,1
1,2015-12-28 10:00:01,2.446274,262.0161,328.6337,19.74838,956.1651,156.0882,264.5475,12.03538,1
2,2015-12-28 10:00:02,2.489191,262.0161,328.6337,19.69076,956.4855,156.0882,264.5475,12.03538,1
3,2015-12-28 10:00:03,2.53435,262.0161,328.6337,19.69076,956.806,156.0882,264.5475,12.03538,1
4,2015-12-28 10:00:04,2.56926,262.0161,328.6337,19.69076,957.0864,156.0882,264.5475,12.03538,1


In [7]:
rows = []

for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    
    row_data = dict(
        day_of_week=row[" Timestamp"].dayofweek,
        day_of_month=row[" Timestamp"].day,
        week_of_year=row[" Timestamp"].week,
        month=row[" Timestamp"].month,
        label = row['label'],
        FIT101 = row['FIT101'],
        AIT201 = row[' AIT201'],
        AIT203 = row['AIT203'],
        DPIT301 = row['DPIT301'],
        LIT301 = row['LIT301'],
        AIT402 = row['AIT402'],
        AIT503 = row['AIT503'],
        AIT504 = row['AIT504'],
    )
    
    rows.append(row_data)
    
features_df = pd.DataFrame(rows)   

100%|████████████████████████████████████████████████████████████████████████| 449919/449919 [01:29<00:00, 5003.85it/s]


In [8]:
features_df.shape

(449919, 13)

In [9]:
features_df.head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,label,FIT101,AIT201,AIT203,DPIT301,LIT301,AIT402,AIT503,AIT504
0,0,28,53,12,1,2.427057,262.0161,328.6337,19.74838,956.1651,156.0882,264.5475,12.03538
1,0,28,53,12,1,2.446274,262.0161,328.6337,19.74838,956.1651,156.0882,264.5475,12.03538
2,0,28,53,12,1,2.489191,262.0161,328.6337,19.69076,956.4855,156.0882,264.5475,12.03538
3,0,28,53,12,1,2.53435,262.0161,328.6337,19.69076,956.806,156.0882,264.5475,12.03538
4,0,28,53,12,1,2.56926,262.0161,328.6337,19.69076,957.0864,156.0882,264.5475,12.03538


In [10]:
train_size = int(len(features_df) * 0.8)
test_size = len(features_df) - train_size
train, test = features_df.iloc[0:train_size], features_df.iloc[train_size:len(features_df)]

In [11]:
train.shape

(359935, 13)

In [12]:
def create_dataset(X, y, time_steps=1):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        v = X.iloc[i:(i + time_steps)].values
        Xs.append(v)
        ys.append(y.iloc[i + time_steps])
    return np.array(Xs), np.array(ys)
    

In [13]:
time_steps = 25

X_train, y_train = create_dataset(train, train.label, time_steps)
X_test, y_test = create_dataset(test, test.label, time_steps)
print(X_train.shape, y_train.shape)

(359910, 25, 13) (359910,)


In [14]:
# We have to redimension the arrays for the Conv2D digestion benefit

data_train_wide = X_train.reshape((X_train.shape[0], X_train.shape[1], X_train.shape[2], 1))
data_test_wide = X_test.reshape((X_test.shape[0], X_test.shape[1], X_test.shape[2], 1))

print(data_train_wide.shape)
print(data_test_wide.shape)

(359910, 25, 13, 1)
(89959, 25, 13, 1)


In [15]:
from tensorflow import keras 
from tensorflow.keras.models import Sequential

def get_f1(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [21]:
# NN model
from tensorflow.keras import layers 
from tensorflow.keras import optimizers 
from tensorflow.keras import backend as K

n_filters = 64
fsize = 5  # Note that kernel size (1, fsize) = it is not a square kernel...it is rectangular
window_size = 25   # Number of time steps in one period
n_features = 13 # Number of cols in one sample (one table)


MyModel = Sequential()
MyModel.add(layers.Conv2D(n_filters, fsize, activation='relu', input_shape=(window_size, n_features, 1)))
MyModel.add(layers.Flatten())
MyModel.add(layers.Dense(256, activation='relu'))
#MyModel.add(layers.Dropout(0.2))

MyModel.add(layers.Dense(1, activation='sigmoid'))

MyModel.compile(optimizer=optimizers.Adam(lr=1e-4), 
              loss='binary_crossentropy', 
              metrics=[get_f1])
              #metrics=['binary_accuracy'])

print(MyModel.summary())

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_3 (Conv2D)            (None, 21, 9, 64)         1664      
_________________________________________________________________
flatten_3 (Flatten)          (None, 12096)             0         
_________________________________________________________________
dense_6 (Dense)              (None, 256)               3096832   
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 257       
Total params: 3,098,753
Trainable params: 3,098,753
Non-trainable params: 0
_________________________________________________________________
None


In [23]:
history = MyModel.fit(data_train_wide, y_train, 
                      validation_split=0.2, 
                      epochs = 5, 
                      batch_size = 16)

Train on 287928 samples, validate on 71982 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [24]:
# Final Predict
# NOTE final_predictions is a list of probabilities

final_predictions = MyModel.predict(data_test_wide)
final_predictions.shape

(89959, 1)

In [25]:
Preds = final_predictions.copy()
#print(len(Preds))
#print(Preds)
Preds[ np.where( Preds >= 0.5 ) ] = 1
Preds[ np.where( Preds < 0.5 ) ] = 0

Preds.shape

(89959, 1)

In [26]:
# Confusion matrix

from sklearn import metrics
conf_mx = metrics.confusion_matrix(y_test, Preds)

TN = conf_mx[0,0]
FP = conf_mx[0,1]
FN = conf_mx[1,0]
TP = conf_mx[1,1]

print ('TN: ', TN)
print ('FP: ', FP)
print ('FN: ', FN)
print ('TP: ', TP)

recall = TP/(TP+FN)
precision = TP/(TP+FP)

print (recall, precision)

TN:  1566
FP:  3138
FN:  34412
TP:  50843
0.5963638496275878 0.9418684351901595
