In [None]:
!pip install jsonpickle elasticsearch elasticsearch_dsl opencv-python
from odf import *
from pathlib import Path
from tqdm.notebook import tqdm
import numpy as np

In [None]:
# Load every JSON session dump found in the 'data' folder into a Python list.
# The final version uses the mega-dump with 501 files.
folder = 'data'
# Sort the listing so that the sample order is the same on every run and platform:
# the training cell further down uses a positional validation_split, so an arbitrary
# directory order would change which sessions end up in the training part.
# Sub-directories (e.g. .ipynb_checkpoints created by Jupyter) are skipped because
# they cannot be read as text.
session_paths = sorted(p for p in Path(folder).iterdir() if p.is_file())
# NOTE(security): jsonpickle.decode can instantiate arbitrary Python objects while
# decoding, so only load dumps that come from a trusted source.
data = [jsonpickle.decode(p.read_text()) for p in tqdm(session_paths)]

In [None]:
def word2seq(t: int, w: Word):
    '''
    Converts a word sent over the bus into its component parts.

    Parameters:
        t: first element of a session entry (the timestamp); it is accepted so that
           an entry can be unpacked with word2seq(*entry) but is not used here
        w: the word to convert; must be a Data, Command or Status instance
    Returns:
        a tuple (fields, label) where fields is a list containing one integer for
        each piece of the message and label is w.fake (the class label of the word)
    Raises:
        TypeError: if w is not a Data, Command or Status word
    '''
    if isinstance(w, Data):
        fields = list(w.data)
    elif isinstance(w, Command):
        fields = [
            w.address,
            w.tr,
            w.sub_address,
            w.dword_count,
        ]
    elif isinstance(w, Status):
        fields = [
            w.address,
            w.message_error_bit,
            w.instrumentation_bit,
            w.service_request_bit,
            w.reserved_bits,
            w.brdcst_received_bit,
            w.busy_bit,
            w.subsystem_flag_bit,
            w.dynamic_bus_control_accpt_bit,
            w.terminal_flag_bit,
            w.parity_bit,
        ]
    else:
        # fail with a clear message instead of the opaque
        # "'NoneType' object is not iterable" an implicit fall-through would cause
        raise TypeError(f'unsupported word type: {type(w).__name__}')
    # every field is normalised to a plain int before being handed to the network
    return [int(i) for i in fields], w.fake

In [None]:
# all_attacks is defined in the odf package
total_attacks = 11


def attk2index(attk):
    '''
    Convert an attack type name into an integer class index (for classification).

    Returns the 1-based position of the first entry of all_attacks whose __name__
    equals attk, or 0 when no entry matches.
    '''
    matching_positions = (
        position
        for position, attack in enumerate(all_attacks, start=1)
        if attack.__name__ == attk
    )
    # index 0 is reserved for "no matching attack"
    return next(matching_positions, 0)

def file2sample(session, size=5):
    '''
    Converts one decoded JSON session into the data format used by the network.

    Inputs
        session: mapping whose 'data' entry holds one [timestamp, word] pair per message
                 and whose 'attack_types' entry lists attack names (only the first is used)
        size: required sequence length in number of words
    Output:
        list of tuples (sequence, attack label (1/0), attack type index), one for every
        window of exactly `size` consecutive words
    '''
    # translate every word sent over the bus into (integer fields, fake flag)
    encoded = [word2seq(*entry) for entry in session['data']]

    samples = []
    # slide a window that ends on each word in turn
    for last in range(len(encoded)):
        window = encoded[max(0, last - size + 1):last + 1]
        # windows that are not exactly `size` words long (start of the session) are dropped
        if len(window) != size:
            continue

        # flatten the fields of all words of the window into one sequence
        # and collect the per-word labels alongside
        sequence = []
        flags = []
        for fields, fake in window:
            sequence.extend(fields)
            flags.append(fake)

        # a single malicious word makes the whole window malicious; the attack type
        # is then taken from the first attack type listed for the session
        if any(flags):
            samples.append((sequence, 1, attk2index(session['attack_types'][0])))
        else:
            samples.append((sequence, 0, attk2index('NA')))
    return samples

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# number of words per window
max_win = 10

# flatten the windows of all sessions into one list of (sequence, label, attack type)
ds = [
    sample
    for session in tqdm(data)
    for sample in file2sample(session, size=max_win)
]

# The flattened sequences differ in length because each word type (data, status,
# command) contributes a different number of integers, so they are padded at the
# end up to the length of the longest one.
x = pad_sequences([sequence for sequence, _, _ in ds], padding='post')
# anomaly label of every window as a numpy array
y = np.array([label for _, label, _ in ds])
# attack type index of every window as a numpy array
z = np.array([attack_type for _, _, attack_type in ds])

In [None]:
# Preview the first samples as (padded sequence, anomaly label, attack class).
# Important note: a word contributes a different number of integers depending on its
# type, so the amount of trailing padding differs from one sequence to the next.
# Iterating over slices (instead of indexing 0..2) keeps this cell from raising
# IndexError when fewer than three windows were generated.
for sequence, label, attack_class in zip(x[:3], y[:3], z[:3]):
    print(sequence, label, attack_class)

In [None]:
import tensorflow as tf
import keras
from tensorflow.keras.layers import (Conv1D, Flatten, Dense, 
                                     Dropout, Input, Multiply, 
                                     Embedding, GlobalMaxPooling1D,
                                     LSTM, RepeatVector,GRU, SimpleRNN, Bidirectional)

In [None]:
from tensorflow.keras.layers import Layer, GRUCell, GRU, RNN, Flatten, LSTMCell
from tensorflow.python.keras.layers.recurrent import _generate_zero_filled_state_for_cell

In [None]:
# GCNN
def build(n_misuse_classes=11):
    '''
    Build and compile the gated convolutional network (GCNN) with two output heads.

    Architecture: integer sequence -> Embedding -> two parallel Conv1D layers (a ReLU
    "filter" branch and a sigmoid "gate" branch) multiplied element-wise ->
    global max pooling -> Dense(128) -> two heads:
        'anomaly': one sigmoid unit (binary attack / no attack)
        'misuse':  softmax over n_misuse_classes attack type indices

    Parameters:
        n_misuse_classes: width of the 'misuse' softmax head (default 11, the value
            that used to be hard-coded). It must be larger than the biggest label
            produced by attk2index, i.e. len(all_attacks) + 1
            -- TODO confirm that 11 matches the odf package.
    Returns:
        the compiled tf.keras.Model
    '''
    # size of the embedding vocabulary; assumes every input integer (and the
    # padding value) lies in 0..255 -- TODO confirm against the word fields
    input_dim = 256
    embedding_size = 8

    inp = Input(shape=(None,), dtype='int64')
    emb = Embedding(input_dim, embedding_size)(inp)
    filt = Conv1D(filters=128, kernel_size=3, strides=1, use_bias=True, activation='relu', padding='valid')(emb)
    attn = Conv1D(filters=128, kernel_size=3, strides=1, use_bias=True, activation='sigmoid', padding='valid')(emb)
    # the sigmoid branch gates the ReLU features position by position
    gated = Multiply()([filt, attn])
    feat = GlobalMaxPooling1D()(gated)
    dense = Dense(128, activation='relu')(feat)
    out_anomaly = Dense(1, activation='sigmoid', name='anomaly')(dense)
    out_misuse = Dense(n_misuse_classes, activation='softmax', name='misuse')(dense)

    model = tf.keras.Model(inp, (out_anomaly, out_misuse))
    model.compile(
        loss=['binary_crossentropy', 'sparse_categorical_crossentropy'],
        optimizer='adam',
        # Metric objects get explicit names: metrics created from the strings
        # 'AUC' / 'Precision' / 'Recall' receive auto-numbered names (precision_1, ...)
        # when this cell is run again in the same kernel, which breaks the
        # history.history['val_anomaly_precision'] style lookups of the results cell.
        metrics=[
            [
                'binary_accuracy',
                tf.keras.metrics.AUC(name='auc'),
                tf.keras.metrics.Precision(name='precision'),
                tf.keras.metrics.Recall(name='recall'),
            ],
            [tf.keras.metrics.SparseCategoricalAccuracy(name='sparse_categorical_accuracy')],
        ],
    )
    return model

model = build()

model.summary()

In [None]:
# 10% training data, set verbose=1 to show training progress
# NOTE: Keras takes the validation samples from the END of x / y (before shuffling),
# so with validation_split=0.90 the model is trained on the first 10% of the windows.

# imported here because no other `import time` is visible in this notebook
import time

# perf_counter is a monotonic clock, so the measured duration (in seconds) cannot be
# distorted by system clock adjustments during training
start = time.perf_counter()
history = model.fit(x, (y, z), validation_split=0.90, epochs=15, batch_size=1024, verbose=1)
end = time.perf_counter()

In [None]:
# final performance: validation metrics of the last epoch

precision = history.history['val_anomaly_precision'][-1]
recall = history.history['val_anomaly_recall'][-1]

# F1 is undefined when precision and recall are both 0 (e.g. the model predicted no
# positive at all); report 0.0 in that case instead of raising ZeroDivisionError
if precision + recall > 0:
    f1 = 2 * ((precision*recall)/(precision+recall))
else:
    f1 = 0.0

print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F1: " + str(f1))
print("AUC: " + str(history.history['val_anomaly_auc'][-1]))
print("SCA: " +  str(history.history['val_misuse_sparse_categorical_accuracy'][-1]))
print("Runtime: " + str(end - start) + "s")