### Use kernel conda_tensorflow2_p310

In [1]:
import os
import boto3
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow import feature_column
from sklearn.utils import resample
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix

In [2]:
print(tf.__version__)

2.11.0


In [3]:
%run ./read_file.ipynb



:: loading settings :: url = jar:file:/home/ec2-user/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ec2-user/.ivy2/cache
The jars for the packages stored in: /home/ec2-user/.ivy2/jars
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-1db067b4-3764-4aaa-ae27-53a2dfc9dc9a;1.0
	confs: [default]
	found graphframes#graphframes;0.8.2-spark3.2-s_2.12 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
:: resolution report :: resolve 158ms :: artifacts dl 5ms
	:: modules in use:
	graphframes#graphframes;0.8.2-spark3.2-s_2.12 from spark-packages in [default]
	org.slf4j#slf4j-api;1.7.16 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	-----------------------------

23/04/17 19:48:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/17 19:48:38 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [4]:
# https://stackoverflow.com/questions/68036975/valueerror-shape-must-be-at-least-rank-3-but-is-rank-2-for-node-biasadd
# config for rank error in lstm
tf.config.threading.set_inter_op_parallelism_threads(16)
pd.set_option('display.max_columns', None)
tf.keras.backend.set_image_data_format("channels_last")

In [5]:
# Set Config
embedding_dim = 64
max_length = 6
sequence_length = 6
max_features = 10000
padding_type = 'post'
trunc_type = 'post'
training_portion = 0.8

# place to load models from
ckt_path_generator = 'saved_model/generator'
ckt_path_tokenizer = 'saved_model/tokenizer'
ckt_path_lstm = 'saved_model/lstm'
ckt_path_lstm_gen = 'saved_model/lstm_gen'
ckt_path_lstm_kf = 'saved_model/lstm_kf'
ckt_path_lstm_kf_gen = 'saved_model/lstm_kf_gen'

In [6]:
# UPDATE MODEL FOR EVALUATION HERE:
ckt_path = ckt_path_lstm_gen

In [7]:
METRICS = [
        keras.metrics.TruePositives(name='tp'),
        keras.metrics.FalsePositives(name='fp'),
        keras.metrics.TrueNegatives(name='tn'),
        keras.metrics.FalseNegatives(name='fn'), 
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.BinaryCrossentropy(name='binary cross entropy'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc'),
        keras.metrics.AUC(name='prc', curve='PR'), # precision-recall curve
]

In [8]:
ds = spark.read.parquet(*["s3a://sapient-bucket-trusted/prod/graph/encoded/real/23Sep3/*"]).cache()

                                                                                

In [15]:
ds.count()

17373568

In [20]:
def sample_df(data, n=10000):
    # now randomly sample from the malicious data
    total_rows = data.count()
    fraction = float(n) / float(total_rows)
    df_sample, _ = data.randomSplit([fraction, 1.0 - fraction], seed=42)
    df_sample = data.limit(n)
    return df_sample

In [21]:
ds_eval = sample_df(ds).cache()

In [22]:
ds_eval.count()

10000

In [23]:
tot = ds_eval.count()

In [24]:
ds_eval.groupBy("mal_trace") \
    .count() \
    .withColumnRenamed('count', 'cnt_per_group') \
    .withColumn('perc_of_count_total', (col('cnt_per_group') / tot) * 100 ) \
    .sort("perc_of_count_total").show()

+---------+-------------+-------------------+
|mal_trace|cnt_per_group|perc_of_count_total|
+---------+-------------+-------------------+
|        1|           94| 0.9400000000000001|
|        0|         9906|              99.06|
+---------+-------------+-------------------+



In [25]:
ds_events = ds_eval.select('event_sequence').rdd.flatMap(lambda x: x).collect()

                                                                                

In [26]:
ds_labels = ds_eval.select('mal_trace').rdd.flatMap(lambda x: x).collect()

In [27]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=1000)
tokenizer.fit_on_texts(ds_events)

In [28]:
# # load saved tokenizer
# tokenizer = tf.keras.preprocessing.text.Tokenizer()
# with open(ckt_path_tokenizer, 'rb') as f:
#     tokenizer = pickle.load(f)

In [29]:
# Get our training data word index
word_index = tokenizer.word_index
vocab_count = len(word_index)

In [34]:
train_sequences = tokenizer.texts_to_sequences(ds_events)
train_padded = tf.keras.utils.pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [35]:
train_padded.shape

(10000, 6)

In [36]:
train_ds = tf.reshape(train_padded, (len(train_padded),6,1))

In [38]:
train_labels_ds = tf.ragged.constant(ds_labels)

In [39]:
train_ds.shape

TensorShape([10000, 6, 1])

In [40]:
train_labels_ds.shape

TensorShape([10000])

In [46]:
def create_lstm():
    model = tf.keras.models.Sequential()
    model.add(layers.Embedding(vocab_count + 1, 16))
    model.add(keras.layers.LSTM(500, input_shape=(train_ds.shape[1], train_ds.shape[2]), return_sequences=True))
    model.add(keras.layers.LSTM(300, return_sequences=True))
    model.add(keras.layers.LSTM(200))
    model.add(keras.layers.Dense(train_ds.shape[2], activation='tanh'))
    model.compile(loss='binary_crossentropy', 
              optimizer='adam',
              metrics=METRICS)
    return model

In [47]:
model = create_lstm()

In [48]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 16)          1184      
                                                                 
 lstm_6 (LSTM)               (None, None, 500)         1034000   
                                                                 
 lstm_7 (LSTM)               (None, None, 300)         961200    
                                                                 
 lstm_8 (LSTM)               (None, 200)               400800    
                                                                 
 dense_2 (Dense)             (None, 1)                 201       
                                                                 
Total params: 2,397,385
Trainable params: 2,397,385
Non-trainable params: 0
_________________________________________________________________


In [49]:
# Create a basic model instance to test
model = create_lstm()
model.load(ckt_path)

ValueError: Received incompatible tensor with shape (112, 16) when attempting to restore variable with shape (74, 16) and name layer_with_weights-0/embeddings/.ATTRIBUTES/VARIABLE_VALUE.

In [None]:
# Re-evaluate the model
loss, tp, fp, tn, fn, accuracy, bin_ce, precision, recall, auc, prc = model.evaluate(validation_ds, validation_labels_ds, verbose=1)

In [None]:
print("Restored model, accuracy: {:5.2f}%".format(100 * accuracy))

In [None]:
pred = model.predict(validation_ds)

In [None]:
pred_class = np.argmax(pred, axis=-1)

In [None]:
pred_class = list(pred_class)

In [None]:
correct_len = len([i for i, j in zip(pred_class, validation_labels) if i == j])
total_len = len(pred_class) 
incorrect_len = total_len - correct_len

In [None]:
print(f'From a total of {total_len} traces, we correctly predicted {correct_len} and incorrectly predicted {incorrect_len}')