In [None]:
## Import dependencies
import numpy as np
import pickle
import pandas
import re
import glob
import datetime
import tensorflow as tf
import itertools
import math
import random
#!{sys.executable} -m pip install gensim
#from gensim.models.word2vec import Word2Vec
from collections import Counter
from sklearn.metrics import log_loss, auc, roc_curve
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from keras import backend as K
from keras.layers import *
from keras.engine.topology import Input
from keras.models import Model, Sequential
from keras.utils import np_utils, to_categorical
from keras.optimizers import TFOptimizer, RMSprop

## Fix the seeds of the stdlib and NumPy global generators for reproducibility.
## NOTE(review): TensorFlow's own generator is not seeded in this cell.
random.seed(a=123)
np.random.seed(seed=123)

In [None]:
##
## Modeling parameters (everything a reader might tune lives in this cell)
##

# Sliding-window geometry: tokens per input window and the stride between windows.
seq_len, seq_skip = 10, 1

# word2vec settings (presumably for the gensim import that is commented out above).
w2v_size, w2v_min_count, w2v_window, w2v_workers = 25, 3, 10, 4

# Layer widths for the sequence model.
embedding_a_size, lstm_a_size, lstm_b_size, dense_size = 100, 25, 25, 100

# Settings handed to model.fit().
validation_split, batch_size, epochs = 0.1, 2048, 10

# Midnight on 2017-07-04 as a naive datetime (presumably a training cut-off; confirm).
cicids_training = datetime.datetime(2017, 7, 4, 0, 0, 0)

# Number of models trained per aggregation level.
num_models = 3

In [None]:
print("Load global stuff...")

# Load the four pickled token dictionaries used by the model cells
# (len(port_fwd_dict) / len(protobytes_fwd_dict) size the Embedding layers).
# Each file is opened in a `with` block so the handle is closed right after
# loading instead of being left for the garbage collector.
# NOTE: pickle.load can execute arbitrary code; only point these at trusted files.
with open("data/port_fwd_dict.pickle","rb") as dict_file:
    port_fwd_dict = pickle.load(dict_file)

with open("data/port_rev_dict.pickle","rb") as dict_file:
    port_rev_dict = pickle.load(dict_file)

with open("data/protobytes_fwd_dict.pickle","rb") as dict_file:
    protobytes_fwd_dict = pickle.load(dict_file)

with open("data/protobytes_rev_dict.pickle","rb") as dict_file:
    protobytes_rev_dict = pickle.load(dict_file)


In [None]:
def get_label_mode(X):
    """Collapse a collection of labels into one representative label.

    Entries equal to "X" are ignored. If every remaining label is "BENIGN"
    the result is "BENIGN"; otherwise the most frequent non-"BENIGN" label
    is returned. Ties go to the label that appears first in X.

    Args:
        X: iterable of hashable labels (strings in this notebook).

    Returns:
        The selected label.

    Raises:
        ValueError: if nothing is left once the "X" entries are dropped.
    """
    # Build a real list: on Python 3 filter() yields a one-shot iterator, which
    # the first len(set(...)) call would exhaust before the labels are inspected.
    labels = [label for label in X if label != "X"]
    if not labels:
        raise ValueError("get_label_mode() needs at least one label other than 'X'")

    non_benign = [label for label in labels if label != "BENIGN"]
    if not non_benign:
        return("BENIGN")

    # Walk the list (not a set) so ties resolve to the earliest label
    # instead of depending on hash ordering.
    counts = Counter(non_benign)
    return(max(non_benign, key=counts.__getitem__))
    
def cicids_processing(sequences, labels, dict_size, seq_len, seq_skip, resample=False):
    """Cut token sequences into fixed-length windows with next-token targets.

    For every start position jj in range(0, len(seq) - seq_len, seq_skip) one
    sample is produced: the window seq[jj:jj+seq_len], the one-hot encoding of
    the token that follows it, and the label found at that same position.

    Args:
        sequences: list of token sequences (sliceable; tokens convertible by int()).
        labels: per-sequence label sequences, indexed in parallel with `sequences`.
        dict_size: width of the one-hot target vectors (number of classes).
        seq_len: number of tokens in each input window.
        seq_skip: stride between consecutive window starts.
        resample: if true, draw len(samples) samples with replacement
            (bootstrap) using NumPy's global generator.

    Returns:
        Tuple (X, Y, L) of NumPy arrays: windows, float32 one-hot targets of
        shape (n, dict_size), and the label of each target position.
    """
    windows = []
    targets = []
    window_labels = []
    for ii, token_seq in enumerate(sequences):
        label_seq = labels[ii]
        for start in range(0, len(token_seq)-seq_len, seq_skip):
            stop = start + seq_len
            windows.append(token_seq[start:stop])
            # Token t is mapped to class t-1.
            # NOTE(review): a token of 0 gives class -1, which selects the LAST
            # class through negative indexing; confirm 0 never occurs as a target.
            targets.append(int(token_seq[stop])-1)
            window_labels.append(label_seq[stop])

    n_windows = len(windows)
    X = np.array(windows)
    L = np.array(window_labels)
    targets = np.array(targets, dtype=int)

    if resample and n_windows:
        indices = np.random.choice(np.arange(n_windows), size=n_windows, replace=True)
        # Resample the integer targets, not the dense one-hot matrix, so the
        # large Y array is only materialised once.
        X, L, targets = X[indices], L[indices], targets[indices]

    # One-hot encode all targets in a single vectorised step. The result matches
    # calling to_categorical(target, dict_size) per window (float32 rows), without
    # the per-sample call overhead and the intermediate list of arrays.
    Y = np.zeros((n_windows, dict_size), dtype=np.float32)
    Y[np.arange(n_windows), targets] = 1
    return(X, Y, L)

In [None]:
# Port model: for every aggregation level, fit `num_models` networks on
# bootstrap resamples of the training windows and pickle, for each test window,
# the softmax probability every network assigns to the port token that came next.
# NOTE(review): this cell is a near copy of the protobytes cell below; a shared
# function taking the column name and dictionary would remove the duplication.
aggregations = ["source","destination","dyad","internal","external"]
# aggregations=["external"]

# Window strides for the port model: test windows start at every 3rd position,
# training windows at every position.
# NOTE(review): the protobytes cell uses `seq_skip` for both; confirm 3 is intended here.
port_test_skip = 3
port_train_skip = 1

for agg in aggregations:

    # pickle.load can execute arbitrary code; only read trusted files here.
    # The `with` blocks close each handle as soon as the object is loaded.
    with open("data/cicids_"+agg+"_hour_testing.pickle","rb") as testing_file:
        cicids_testing = pickle.load(testing_file)
    # NOTE(review): this rebinds `cicids_training`, which the parameter cell set to a datetime.
    with open("data/cicids_"+agg+"_hour_training.pickle","rb") as training_file:
        cicids_training = pickle.load(training_file)

    X_test, Y_test, L_test = cicids_processing(cicids_testing["port_sequence"].tolist(),
                             cicids_testing["label_sequence"].tolist(),
                             len(port_fwd_dict)-1, seq_len, port_test_skip, False)

#     pickle.dump(X_test, open("results/"+agg+"_port_truth_X.pickle","wb"))
#     pickle.dump(Y_test, open("results/"+agg+"_port_truth_Y.pickle","wb"))
    # Closing the handle guarantees the labels are flushed to disk before training starts.
    with open("results/"+agg+"_port_truth_L.pickle","wb") as truth_file:
        pickle.dump(L_test, truth_file)

    for ii in range(num_models):

        # Drop the graph/session state of the previously built model; without this
        # every model created in these loops stays in memory. Done at the top of the
        # iteration so the most recently trained model remains usable afterwards.
        K.clear_session()

        # resample=True draws a fresh bootstrap sample for each ensemble member.
        # The training labels go to L_train so L_test keeps holding the test labels.
        X_train, Y_train, L_train = cicids_processing(cicids_training["port_sequence"].tolist(),
                             cicids_training["label_sequence"].tolist(),
                             len(port_fwd_dict)-1, seq_len, port_train_skip, True)

        # Embedding -> two stacked bidirectional GRUs -> dense -> softmax over the
        # len(port_fwd_dict)-1 classes produced by cicids_processing.
        # NOTE(review): layer sizes are hard-coded; the parameter cell's
        # embedding_a_size (100) differs from the 50 used here. Confirm which is intended.
        model_input = Input(shape=(seq_len, ))
        embedding_a = Embedding(len(port_fwd_dict), 50, input_length=seq_len, mask_zero=True)(model_input)
        lstm_a = Bidirectional(GRU(25, return_sequences=True,implementation=2, reset_after=True, recurrent_activation='sigmoid'), merge_mode="concat")(embedding_a)
        dropout_a = Dropout(0.2)(lstm_a)
        lstm_b = Bidirectional(GRU(25, return_sequences=False, activation="relu", implementation=2, reset_after=True, recurrent_activation='sigmoid'), merge_mode="concat")(dropout_a)
        dropout_b = Dropout(0.2)(lstm_b)
        dense_layer = Dense(100, activation="linear")(dropout_b)
        dropout_c = Dropout(0.2)(dense_layer)
        model_output = Dense(len(port_fwd_dict)-1, activation="softmax")(dropout_c)

        model = Model(inputs=model_input, outputs=model_output)

        model.compile(optimizer=TFOptimizer(tf.contrib.opt.LazyAdamOptimizer()), loss='categorical_crossentropy', metrics = ['accuracy', 'categorical_accuracy'])

        # NOTE(review): Keras documents class_weight as a dict of class index -> weight;
        # the string 'auto' is presumably ignored. Confirm, or pass real weights.
        history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split, class_weight = 'auto')

        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title('model train vs validation loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'validation'], loc='upper right')
        plt.show()

        plt.plot(history.history['acc'])
        plt.plot(history.history['val_acc'])
        plt.title('model accuracy')
        plt.ylabel('accuracy')
        plt.xlabel('epoch')
        # The second curve is val_acc (the validation split), not the test set.
        plt.legend(['train', 'validation'], loc='upper left')
        plt.show()

        # Scoring model: the dot product of the softmax output with the one-hot
        # true next token is the probability the network gave to what really happened.
        label_input = Input(shape=(len(port_fwd_dict)-1,))
        score_output = Dot(axes=(1,1))([model_output, label_input])
        pred_model = Model(inputs=[model_input,label_input], outputs=score_output)
        preds = pred_model.predict([X_test,Y_test], batch_size=batch_size)

        with open("results/"+agg+"_"+str(ii)+"_port_preds.pickle","wb") as preds_file:
            pickle.dump(preds, preds_file)

        print(agg + " " + str(ii) + " complete.")

In [None]:
# Protobytes model: for every aggregation level, fit `num_models` networks on
# bootstrap resamples of the training windows and pickle, for each test window,
# the softmax probability every network assigns to the protobytes token that came next.
# NOTE(review): this cell is a near copy of the port cell above; a shared
# function taking the column name and dictionary would remove the duplication.
aggregations = ["source","destination","dyad","internal","external"]

for agg in aggregations:

    # Read from the relative data/ directory, like the dictionary loads and the
    # port cell do; this cell previously pointed at the absolute path /data/.
    # pickle.load can execute arbitrary code; only read trusted files here.
    # The `with` blocks close each handle as soon as the object is loaded.
    with open("data/cicids_"+agg+"_hour_testing.pickle","rb") as testing_file:
        cicids_testing = pickle.load(testing_file)
    # NOTE(review): this rebinds `cicids_training`, which the parameter cell set to a datetime.
    with open("data/cicids_"+agg+"_hour_training.pickle","rb") as training_file:
        cicids_training = pickle.load(training_file)

    X_test, Y_test, L_test = cicids_processing(cicids_testing["protobytes_sequence"].tolist(),
                             cicids_testing["label_sequence"].tolist(),
                             len(protobytes_fwd_dict)-1, seq_len, seq_skip, False)

#     pickle.dump(X_test, open("results/"+agg+"_protobytes_truth_X.pickle","wb"))
#     pickle.dump(Y_test, open("results/"+agg+"_protobytes_truth_Y.pickle","wb"))
    # Closing the handle guarantees the labels are flushed to disk before training starts.
    with open("results/"+agg+"_protobytes_truth_L.pickle","wb") as truth_file:
        pickle.dump(L_test, truth_file)

    for ii in range(num_models):

        # Drop the graph/session state of the previously built model; without this
        # every model created in these loops stays in memory. Done at the top of the
        # iteration so the most recently trained model remains usable afterwards.
        K.clear_session()

        # resample=True draws a fresh bootstrap sample for each ensemble member.
        # The training labels go to L_train so L_test keeps holding the test labels.
        X_train, Y_train, L_train = cicids_processing(cicids_training["protobytes_sequence"].tolist(),
                             cicids_training["label_sequence"].tolist(),
                             len(protobytes_fwd_dict)-1, seq_len, seq_skip, True)

        # Embedding -> two stacked bidirectional GRUs -> dense -> softmax over the
        # len(protobytes_fwd_dict)-1 classes produced by cicids_processing.
        # NOTE(review): layer sizes are hard-coded; the parameter cell's
        # embedding_a_size (100) differs from the 50 used here. Confirm which is intended.
        model_input = Input(shape=(seq_len, ))
        embedding_a = Embedding(len(protobytes_fwd_dict), 50, input_length=seq_len, mask_zero=True)(model_input)
        lstm_a = Bidirectional(GRU(25, return_sequences=True,implementation=2, reset_after=True, recurrent_activation='sigmoid'), merge_mode="concat")(embedding_a)
        dropout_a = Dropout(0.2)(lstm_a)
        lstm_b = Bidirectional(GRU(25, return_sequences=False, activation="relu", implementation=2, reset_after=True, recurrent_activation='sigmoid'), merge_mode="concat")(dropout_a)
        dropout_b = Dropout(0.2)(lstm_b)
        dense_layer = Dense(100, activation="linear")(dropout_b)
        dropout_c = Dropout(0.2)(dense_layer)
        model_output = Dense(len(protobytes_fwd_dict)-1, activation="softmax")(dropout_c)

        model = Model(inputs=model_input, outputs=model_output)
        model.compile(optimizer=TFOptimizer(tf.contrib.opt.LazyAdamOptimizer()), loss='categorical_crossentropy', metrics = ['accuracy', 'categorical_accuracy'])

        # NOTE(review): Keras documents class_weight as a dict of class index -> weight;
        # the string 'auto' is presumably ignored. Confirm, or pass real weights.
        history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split, class_weight = 'auto')

        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title('model train vs validation loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'validation'], loc='upper right')
        plt.show()

        plt.plot(history.history['acc'])
        plt.plot(history.history['val_acc'])
        plt.title('model accuracy')
        plt.ylabel('accuracy')
        plt.xlabel('epoch')
        # The second curve is val_acc (the validation split), not the test set.
        plt.legend(['train', 'validation'], loc='upper left')
        plt.show()

        # Scoring model: the dot product of the softmax output with the one-hot
        # true next token is the probability the network gave to what really happened.
        label_input = Input(shape=(len(protobytes_fwd_dict)-1,))
        score_output = Dot(axes=(1,1))([model_output, label_input])
        pred_model = Model(inputs=[model_input,label_input], outputs=score_output)
        preds = pred_model.predict([X_test,Y_test], batch_size=batch_size)

        with open("results/"+agg+"_"+str(ii)+"_protobytes_preds.pickle","wb") as preds_file:
            pickle.dump(preds, preds_file)

        print(agg + " " + str(ii) + " complete.")