### Use kernel conda_tensorflow2_p310

In [1]:
import os
import boto3
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow import feature_column
from sklearn.utils import resample
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix

In [2]:
print(tf.__version__)

2.11.0


In [3]:
%run ./read_file.ipynb



:: loading settings :: url = jar:file:/home/ec2-user/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ec2-user/.ivy2/cache
The jars for the packages stored in: /home/ec2-user/.ivy2/jars
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-3a41a160-e891-48c4-998a-1328f3cc5a97;1.0
	confs: [default]
	found graphframes#graphframes;0.8.2-spark3.2-s_2.12 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
:: resolution report :: resolve 135ms :: artifacts dl 5ms
	:: modules in use:
	graphframes#graphframes;0.8.2-spark3.2-s_2.12 from spark-packages in [default]
	org.slf4j#slf4j-api;1.7.16 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	-----------------------------

23/04/16 17:38:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/16 17:38:58 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [4]:
# https://stackoverflow.com/questions/68036975/valueerror-shape-must-be-at-least-rank-3-but-is-rank-2-for-node-biasadd
# config for rank error in lstm
tf.config.threading.set_inter_op_parallelism_threads(16)
pd.set_option('display.max_columns', None)
tf.keras.backend.set_image_data_format("channels_last")

In [5]:
# Set Config
embedding_dim = 64
max_length = 6
sequence_length = 6
max_features = 10000
padding_type = 'post'
trunc_type = 'post'
training_portion = 0.8

# place to load models from
ckt_path_generator = 'saved_model/generator'
ckt_path_tokenizer = 'saved_model/tokenizer'
ckt_path_nlp = 'saved_model/nlp'
ckt_path_lstm = 'saved_model/lstm'
ckt_path_lstm_kf = 'saved_model/lstm_kf'
ckt_path_lstm_kf_gen = 'saved_model/lstm_kf_gen'

In [6]:
METRICS = [
        keras.metrics.TruePositives(name='tp'),
        keras.metrics.FalsePositives(name='fp'),
        keras.metrics.TrueNegatives(name='tn'),
        keras.metrics.FalseNegatives(name='fn'), 
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.BinaryCrossentropy(name='binary cross entropy'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc'),
        keras.metrics.AUC(name='prc', curve='PR'), # precision-recall curve
]

In [7]:
ds = spark.read.parquet(*["s3a://sapient-bucket-trusted/prod/graph/encoded/real/23Sep3/*"]).cache()

                                                                                

In [8]:
tot = ds.count()

                                                                                

In [9]:
ds.groupBy("mal_trace") \
    .count() \
    .withColumnRenamed('count', 'cnt_per_group') \
    .withColumn('perc_of_count_total', (col('cnt_per_group') / tot) * 100 ) \
    .sort("perc_of_count_total").show()

+---------+-------------+-------------------+
|mal_trace|cnt_per_group|perc_of_count_total|
+---------+-------------+-------------------+
|        1|       118763| 0.6835843967111419|
|        0|     17254805|  99.31641560328886|
+---------+-------------+-------------------+



In [10]:
def balance_dataframe(ds, ratio):
    # Calculate the number of malicious and non-malicious records
    malicious_count = ds.filter(col("mal_trace") == 1).count()
    non_malicious_count = ds.filter(col("mal_trace") == 0).count()

    target_non_malicious_count = malicious_count * ratio

    fraction_non_malicious = target_non_malicious_count / non_malicious_count
    
    # Sample the records based on the calculated fractions
    malicious_ds = ds.filter(col("mal_trace") == 1)
    non_malicious_ds = ds.filter(col("mal_trace") == 0).sample(fraction_non_malicious)

    # Combine the datasets
    balanced_ds = malicious_ds.union(non_malicious_ds)

    return balanced_ds

In [11]:
# Replace this with ratio of distribution, e.g. ratio of 2 non malicious records to 1 malicious record (1, 2, or 3)
ratio = 3

In [12]:
balanced_ds = balance_dataframe(ds, ratio)

In [13]:
balanced_ds.count()

                                                                                

474726

In [14]:
ds.unpersist()

DataFrame[Trace: bigint, mal_trace: int, malicious: int, event_sequence: array<string>]

In [15]:
balanced_ds.limit(5).toPandas()

Unnamed: 0,Trace,mal_trace,malicious,event_sequence
0,86473,1,1,"[01000000000100000100010000100000000000001, 10..."
1,98458,1,1,"[00010000000100000100010000100000000000001, 10..."
2,132328,1,1,"[00010000000100000100010000100000000000001, 10..."
3,8589966649,1,1,"[00010000000100000100010000100000000000001, 10..."
4,8589966809,1,1,"[00100000000100000100010000100000000000001, 10..."


In [16]:
ds_events = balanced_ds.select('event_sequence').rdd.flatMap(lambda x: x).collect()

                                                                                

In [17]:
ds_labels = balanced_ds.select('mal_trace').rdd.flatMap(lambda x: x).collect()

                                                                                

In [18]:
# # Data conversion - https://towardsdatascience.com/multi-class-text-classification-with-lstm-using-tensorflow-2-0-d88627c10a35
# tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=1000)
# tokenizer.fit_on_texts(ds_events)

In [None]:
# load saved tokenizer
tokenizer = tf.keras.preprocessing.text.Tokenizer()
with open(ckt_path_tokenizer, 'rb') as f:
    tokenizer = pickle.load(f)

In [20]:
# Get our training data word index
word_index = tokenizer.word_index
vocab_count = len(word_index)

In [21]:
# train test split data
train_set, validation_set, train_labels, validation_labels = train_test_split(ds_events, ds_labels,
                                                    stratify=ds_labels, 
                                                    test_size=0.2)

In [22]:
Counter(train_labels)

Counter({0: 284770, 1: 95010})

In [23]:
Counter(validation_labels)

Counter({0: 71193, 1: 23753})

In [24]:
train_sequences = tokenizer.texts_to_sequences(train_set)
train_padded = tf.keras.utils.pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [25]:
validation_sequences = tokenizer.texts_to_sequences(validation_set)
validation_padded = tf.keras.utils.pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [26]:
validation_sequences[0]

[10, 3, 4]

In [27]:
validation_padded[0]

array([10,  3,  4,  0,  0,  0], dtype=int32)

In [28]:
train_padded.shape

(379780, 6)

In [29]:
validation_padded.shape

(94946, 6)

In [30]:
train_ds = tf.reshape(train_padded, (len(train_padded),6,1))

In [31]:
validation_ds = tf.reshape(validation_padded, (len(validation_padded),6,1))

In [32]:
train_labels_ds = tf.ragged.constant(train_labels)
validation_labels_ds = tf.ragged.constant(validation_labels)

In [33]:
train_ds.shape

TensorShape([379780, 6, 1])

In [34]:
train_labels_ds.shape

TensorShape([379780])

In [35]:
validation_ds.shape

TensorShape([94946, 6, 1])

In [36]:
validation_labels_ds.shape

TensorShape([94946])

In [37]:
validation_ds[0]

<tf.Tensor: shape=(6, 1), dtype=int32, numpy=
array([[10],
       [ 3],
       [ 4],
       [ 0],
       [ 0],
       [ 0]], dtype=int32)>

In [38]:
def create_lstm():
    model = tf.keras.models.Sequential()
    model.add(layers.Embedding(vocab_count + 1, 16))
    model.add(keras.layers.LSTM(500, input_shape=(train_ds.shape[1], train_ds.shape[2]), return_sequences=True))
    model.add(keras.layers.LSTM(300, return_sequences=True))
    model.add(keras.layers.LSTM(200))
    model.add(keras.layers.Dense(train_ds.shape[2], activation='tanh'))
    model.compile(optimizer='adam', loss='binary_crossentropy', 
                  metrics=['accuracy', Precision(name='precision'), Recall(name='recall')])
    return model

In [39]:
import tensorflow as tf
import numpy as np
from sklearn.model_selection import KFold
from tensorflow.keras.metrics import Precision, Recall

import numpy as np


# define the number of folds
k = 5

# create the KFold object
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# define the ModelCheckpoint callback
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=ckt_path_lstm_kf,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1)

# convert the data to TensorFlow datasets
train_ds_tf = tf.data.Dataset.from_tensor_slices((train_ds, train_labels_ds))

accuracies = []
precisions = []
recalls = []

# loop over the folds
for fold, (train_idx, val_idx) in enumerate(kf.split(train_ds, train_labels_ds)):
    print(f"Fold {fold+1}")
    # split the data into train and validation sets using TensorFlow datasets
    train_ds_fold = train_ds_tf.skip(val_idx[0]).take(len(train_idx))
    val_ds_fold = train_ds_tf.skip(train_idx[0]).take(len(val_idx))
    
    # create the model
    model = create_lstm()
    
    # train the model
    history = model.fit(train_ds_fold.batch(128), epochs=11, verbose=0,
                        validation_data=val_ds_fold.batch(128), callbacks=[cp_callback])
    
    # evaluate the model
    loss, acc, precision, recall = model.evaluate(val_ds_fold.batch(128))
    print(f"Validation Loss: {loss:.4f}, Validation Accuracy: {acc:.4f}, "
          f"Precision: {precision:.4f}, Recall: {recall:.4f}")

    # Add the metrics values to their respective lists
    accuracies.append(acc)
    precisions.append(precision)
    recalls.append(recall)

# Calculate the average metrics
average_accuracy = np.mean(accuracies)
average_precision = np.mean(precisions)
average_recall = np.mean(recalls)
print(f"Average Validation Accuracy: {average_accuracy:.4f}, "
      f"Average Precision: {average_precision:.4f}, "
      f"Average Recall: {average_recall:.4f}")

Fold 1

Epoch 1: val_accuracy improved from -inf to 0.83040, saving model to my_checkpoint.ckpt

Epoch 2: val_accuracy improved from 0.83040 to 0.83872, saving model to my_checkpoint.ckpt

Epoch 3: val_accuracy improved from 0.83872 to 0.84061, saving model to my_checkpoint.ckpt

Epoch 4: val_accuracy did not improve from 0.84061

Epoch 5: val_accuracy did not improve from 0.84061

Epoch 6: val_accuracy did not improve from 0.84061

Epoch 7: val_accuracy improved from 0.84061 to 0.84093, saving model to my_checkpoint.ckpt

Epoch 8: val_accuracy did not improve from 0.84093

Epoch 9: val_accuracy did not improve from 0.84093

Epoch 10: val_accuracy improved from 0.84093 to 0.84158, saving model to my_checkpoint.ckpt

Epoch 11: val_accuracy improved from 0.84158 to 0.84192, saving model to my_checkpoint.ckpt
Validation Loss: 0.3527, Validation Accuracy: 0.8419, Precision: 0.7142, Recall: 0.6169
Fold 2

Epoch 1: val_accuracy did not improve from 0.84192

Epoch 2: val_accuracy did not impr

In [None]:
# Create a basic model instance
model = create_lstm()
model.load_weights(ckt_path_lstm_gen)

In [None]:
# Re-evaluate the model
loss, tp, fp, tn, fn, accuracy, bin_ce, precision, recall, auc, prc = model.evaluate(validation_ds, validation_labels_ds, verbose=1)