### Use kernel conda_tensorflow2_p310

In [None]:
import os
import boto3
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow import feature_column
from sklearn.utils import resample
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix

In [None]:
print(tf.__version__)

In [None]:
%run ./read_file.ipynb

In [None]:
# Workaround for the "Shape must be at least rank 3 but is rank 2 for node BiasAdd"
# error seen with LSTM layers; see
# https://stackoverflow.com/questions/68036975/valueerror-shape-must-be-at-least-rank-3-but-is-rank-2-for-node-biasadd
# Cap the number of threads TensorFlow uses to run independent ops in parallel.
tf.config.threading.set_inter_op_parallelism_threads(16)
# Show every column when a pandas DataFrame is displayed in the notebook.
pd.set_option('display.max_columns', None)
# Force the Keras backend to the channels-last data layout (the fix suggested in the link above).
tf.keras.backend.set_image_data_format("channels_last")

In [None]:
# Notebook-wide settings for tokenisation, padding and model artefact locations.
embedding_dim = 64
# Every tokenised event sequence is padded/truncated to this many tokens.
max_length = 6
sequence_length = 6
max_features = 10000
# 'post' pads at the end of a sequence and truncates from the end.
padding_type = 'post'
trunc_type = 'post'
# NOTE(review): presumably the training share of the train/validation split — confirm.
training_portion = 0.8

# Locations of saved artefacts. The tokenizer is read from ckt_path_tokenizer;
# ckt_path_lstm_kf_gen is the path the ModelCheckpoint callback writes to.
ckt_path_generator = 'saved_model/generator'
ckt_path_tokenizer = 'saved_model/tokenizer'
ckt_path_nlp = 'saved_model/nlp'
ckt_path_lstm = 'saved_model/lstm'
ckt_path_lstm_kf_gen = 'saved_model/lstm_kf_gen'

In [None]:
ckt_path = 'lstm_kf_gen.h5'

In [None]:
# Binary-classification metrics: confusion-matrix counts, accuracy, cross
# entropy, precision/recall and the areas under the ROC and PR curves.
METRICS = [
    metric_cls(name=label)
    for metric_cls, label in (
        (keras.metrics.TruePositives, 'tp'),
        (keras.metrics.FalsePositives, 'fp'),
        (keras.metrics.TrueNegatives, 'tn'),
        (keras.metrics.FalseNegatives, 'fn'),
        (keras.metrics.BinaryAccuracy, 'accuracy'),
        (keras.metrics.BinaryCrossentropy, 'binary cross entropy'),
        (keras.metrics.Precision, 'precision'),
        (keras.metrics.Recall, 'recall'),
        (keras.metrics.AUC, 'auc'),
    )
] + [
    # Area under the precision-recall curve rather than the default ROC curve.
    keras.metrics.AUC(name='prc', curve='PR'),
]

In [None]:
# Read the generated malicious motifs from S3 and cast the two label columns
# (`mal_trace`, `malicious`) to integer so their types match the real data.
# NOTE(review): `spark` and `col` are not defined in this notebook's import
# cell; presumably they come from the `%run ./read_file.ipynb` cell — confirm.
dsm = spark.read.parquet(*["s3a://sapient-bucket-trusted/prod/graph/motifs/gen_malicious/*"]) \
                .withColumn("mal_trace", col("mal_trace").cast("integer")) \
                .withColumn("malicious", col("malicious").cast("integer")) \
                .cache()

In [None]:
# Keep at most 500 generated rows.
# NOTE(review): the original rationale reads "converting to the ratio of unique
# to total in the data since they are all the same", i.e. the generated rows are
# presumably duplicates of one another, so only a small number is kept — confirm.
# `limit` has no ordering here, so which 500 rows are kept is not specified.
dsm = dsm.limit(500).cache()

In [None]:
# read real data
ds = spark.read.parquet(*["s3a://sapient-bucket-trusted/prod/graph/encoded/real/23Sep3/*"]).cache()

In [None]:
ds = ds.union(dsm).cache()

In [None]:
tot = ds.count()

In [None]:
# Show the class distribution: row count per `mal_trace` value and that count
# as a percentage of `tot` (the total row count computed in the previous cell),
# sorted by ascending percentage.
ds.groupBy("mal_trace") \
    .count() \
    .withColumnRenamed('count', 'cnt_per_group') \
    .withColumn('perc_of_count_total', (col('cnt_per_group') / tot) * 100 ) \
    .sort("perc_of_count_total").show()

In [None]:
def balance_dataframe(ds, ratio):
    """Down-sample non-malicious rows to about ``ratio`` per malicious row.

    Every row with ``mal_trace == 1`` is kept. Rows with ``mal_trace == 0``
    are randomly sampled without replacement, so the resulting ratio is only
    approximate.

    Args:
        ds: Spark DataFrame with a ``mal_trace`` column holding 0 or 1.
        ratio: Desired number of non-malicious rows per malicious row.

    Returns:
        A DataFrame containing all malicious rows followed by the sampled
        non-malicious rows.
    """
    # One aggregation job gives both class sizes instead of two full counts.
    class_counts = {
        row["mal_trace"]: row["count"]
        for row in ds.groupBy("mal_trace").count().collect()
    }
    malicious_count = class_counts.get(1, 0)
    non_malicious_count = class_counts.get(0, 0)

    malicious_ds = ds.filter(col("mal_trace") == 1)
    non_malicious_ds = ds.filter(col("mal_trace") == 0)

    # Nothing to down-sample; also avoids dividing by zero below.
    if non_malicious_count == 0:
        return malicious_ds.union(non_malicious_ds)

    # Sampling without replacement only accepts fractions in [0, 1]; when there
    # are fewer non-malicious rows than requested, keep all of them.
    fraction_non_malicious = min(
        1.0, (malicious_count * ratio) / non_malicious_count
    )

    return malicious_ds.union(non_malicious_ds.sample(fraction_non_malicious))

In [None]:
# Number of non-malicious records to keep per malicious record when balancing,
# e.g. 2 keeps roughly 2 non-malicious records for every malicious one.
ratio = 5

In [None]:
balanced_ds = balance_dataframe(ds, ratio)

In [None]:
balanced_ds.count()

In [None]:
ds.unpersist()

In [None]:
balanced_ds.limit(5).toPandas()

In [None]:
ds_events = balanced_ds.select('event_sequence').rdd.flatMap(lambda x: x).collect()

In [None]:
ds_labels = balanced_ds.select('mal_trace').rdd.flatMap(lambda x: x).collect()

In [None]:
# # Data conversion - https://towardsdatascience.com/multi-class-text-classification-with-lstm-using-tensorflow-2-0-d88627c10a35
# tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=1000)
# tokenizer.fit_on_texts(ds_events)

In [None]:
# Load the previously fitted tokenizer from disk.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artefacts written by this pipeline from a trusted location.
with open(ckt_path_tokenizer, 'rb') as f:
    tokenizer = pickle.load(f)

In [None]:
# Token -> integer-id mapping held by the loaded tokenizer.
word_index = tokenizer.word_index
# Number of distinct tokens; the model's embedding table is sized vocab_count + 1.
vocab_count = len(word_index)

In [None]:
# Stratified train/validation split so both sides keep the class ratio.
# The training share comes from `training_portion` instead of a duplicated
# literal, and the split is seeded (same seed as the K-fold split) so reruns
# of the notebook produce the same partitions.
train_set, validation_set, train_labels, validation_labels = train_test_split(
    ds_events, ds_labels,
    stratify=ds_labels,
    train_size=training_portion,
    random_state=42)

In [None]:
Counter(train_labels)

In [None]:
Counter(validation_labels)

In [None]:
# Convert each training event sequence to a list of integer token ids, then
# pad/truncate every list to `max_length` ids using the configured policies.
train_sequences = tokenizer.texts_to_sequences(train_set)
train_padded = tf.keras.utils.pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
# Same tokenisation and padding as the training set, applied to the validation set.
validation_sequences = tokenizer.texts_to_sequences(validation_set)
validation_padded = tf.keras.utils.pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
validation_sequences[0]

In [None]:
validation_padded[0]

In [None]:
train_padded.shape

In [None]:
validation_padded.shape

In [None]:
# Add a trailing feature axis: (samples, max_length) -> (samples, max_length, 1).
# Uses the configured `max_length` rather than repeating the literal 6.
train_ds = tf.reshape(train_padded, (len(train_padded), max_length, 1))

In [None]:
# Add a trailing feature axis: (samples, max_length) -> (samples, max_length, 1).
# Uses the configured `max_length` rather than repeating the literal 6.
validation_ds = tf.reshape(validation_padded, (len(validation_padded), max_length, 1))

In [None]:
# The labels are flat lists with one integer per sample, so a dense tensor is
# the right container; nothing about them is ragged.
train_labels_ds = tf.constant(train_labels)
validation_labels_ds = tf.constant(validation_labels)

In [None]:
train_ds.shape

In [None]:
train_labels_ds.shape

In [None]:
validation_ds.shape

In [None]:
validation_labels_ds.shape

In [None]:
validation_ds[0]

In [None]:
def create_lstm():
    """Build and compile the stacked-LSTM binary classifier.

    The model embeds integer token ids, passes them through three LSTM layers
    (500, 300 and 200 units) and ends in a single sigmoid unit that outputs
    the probability of the positive class.

    Reads the notebook globals ``train_ds`` (for the sequence length) and
    ``vocab_count`` (for the embedding table size).

    Returns:
        A compiled ``tf.keras`` Sequential model reporting accuracy,
        precision and recall.
    """
    timesteps = train_ds.shape[1]

    model = tf.keras.models.Sequential()
    # train_ds is (samples, timesteps, 1). Embedding adds an axis of its own,
    # so feeding it rank-3 ids would hand the LSTM a rank-4 tensor. Drop the
    # trailing singleton axis so the LSTM receives (samples, timesteps, 16).
    model.add(layers.Reshape((timesteps,)))
    # +1 because token ids start at 1; id 0 is the padding value.
    model.add(layers.Embedding(vocab_count + 1, 16))
    model.add(keras.layers.LSTM(500, return_sequences=True))
    model.add(keras.layers.LSTM(300, return_sequences=True))
    model.add(keras.layers.LSTM(200))
    # binary_crossentropy expects a probability in [0, 1]; tanh yields values
    # in [-1, 1], which makes the loss undefined for negative outputs.
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        # Referenced through `keras.metrics` so the function does not depend
        # on an import that only happens in a later cell.
        metrics=['accuracy',
                 keras.metrics.Precision(name='precision'),
                 keras.metrics.Recall(name='recall')])
    return model

In [None]:
import tensorflow as tf
import numpy as np
from sklearn.model_selection import KFold
from tensorflow.keras.metrics import Precision, Recall

import numpy as np


# K-fold cross-validation of the LSTM on the training split.

# define the number of folds
k = 5

# create the KFold object (shuffled, seeded for reproducibility)
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# define the ModelCheckpoint callback: keep only the weights of the epoch with
# the highest validation accuracy.
# NOTE(review): one callback instance is shared by all folds, so its notion of
# "best" appears to carry over from fold to fold — confirm this is intended.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=ckt_path_lstm_kf_gen,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1)

# full training set as a tf.data pipeline (the folds below are built from the
# underlying tensors, because fold indices are shuffled and non-contiguous)
train_ds_tf = tf.data.Dataset.from_tensor_slices((train_ds, train_labels_ds))

accuracies = []
precisions = []
recalls = []

num_samples = int(train_ds.shape[0])

# loop over the folds
for fold, (train_idx, val_idx) in enumerate(kf.split(np.arange(num_samples))):
    print(f"Fold {fold+1}")
    # Select exactly the rows named by the fold indices. skip()/take() only
    # yields a contiguous window, which neither matches the shuffled indices
    # nor keeps the training and validation rows disjoint.
    train_ds_fold = tf.data.Dataset.from_tensor_slices(
        (tf.gather(train_ds, train_idx), tf.gather(train_labels_ds, train_idx)))
    val_ds_fold = tf.data.Dataset.from_tensor_slices(
        (tf.gather(train_ds, val_idx), tf.gather(train_labels_ds, val_idx)))
    val_batches = val_ds_fold.batch(128)

    # create a fresh model for every fold
    model = create_lstm()

    # train the model
    history = model.fit(train_ds_fold.batch(128), epochs=11, verbose=0,
                        validation_data=val_batches, callbacks=[cp_callback])

    # evaluate the model; the order matches the metrics passed to compile()
    loss, acc, precision, recall = model.evaluate(val_batches)
    print(f"Validation Loss: {loss:.4f}, Validation Accuracy: {acc:.4f}, "
          f"Precision: {precision:.4f}, Recall: {recall:.4f}")

    # Add the metrics values to their respective lists
    accuracies.append(acc)
    precisions.append(precision)
    recalls.append(recall)

# Calculate the average metrics
average_accuracy = np.mean(accuracies)
average_precision = np.mean(precisions)
average_recall = np.mean(recalls)
print(f"Average Validation Accuracy: {average_accuracy:.4f}, "
      f"Average Precision: {average_precision:.4f}, "
      f"Average Recall: {average_recall:.4f}")

In [None]:
# Save to the path configured in `ckt_path` instead of repeating the literal.
# NOTE: `model` is the one trained in the last cross-validation fold; the
# best-validation-accuracy checkpoint is written separately by the
# ModelCheckpoint callback to `ckt_path_lstm_kf_gen`.
model.save(ckt_path)