### Use kernel conda_tensorflow2_p310

In [1]:
import boto3
import numpy as np
import pandas as pd
from collections import Counter
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow import feature_column
from sklearn.utils import resample
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder

In [2]:
print(tf.__version__)

2.11.0


In [3]:
%run ./read_file.ipynb



:: loading settings :: url = jar:file:/home/ec2-user/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ec2-user/.ivy2/cache
The jars for the packages stored in: /home/ec2-user/.ivy2/jars
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c720fdc8-e742-424d-ba73-ecf73bdba17b;1.0
	confs: [default]
	found graphframes#graphframes;0.8.2-spark3.2-s_2.12 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
:: resolution report :: resolve 134ms :: artifacts dl 4ms
	:: modules in use:
	graphframes#graphframes;0.8.2-spark3.2-s_2.12 from spark-packages in [default]
	org.slf4j#slf4j-api;1.7.16 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	-----------------------------

23/04/08 17:13:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/08 17:13:36 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
23/04/08 17:13:37 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# https://stackoverflow.com/questions/68036975/valueerror-shape-must-be-at-least-rank-3-but-is-rank-2-for-node-biasadd
# config for rank error in lstm
tf.config.threading.set_inter_op_parallelism_threads(16)
pd.set_option('display.max_columns', None)
tf.keras.backend.set_image_data_format("channels_last")

In [5]:
# Set Config
embedding_dim = 64
max_length = 6
sequence_length = 6
max_features = 10000
padding_type = 'post'
trunc_type = 'post'
training_portion = 0.8

hparams = {
    "batch_size": 128,
    "cnn_filter_sizes": [128, 128, 128],
    "cnn_kernel_sizes": [5, 5, 5],
    "cnn_pooling_sizes": [5, 5, 40],
    "constraint_learning_rate": 0.01,
    "embedding_dim": 100,
    "embedding_trainable": False,
    "learning_rate": 0.005,
    "max_num_words": 10000,
    "max_sequence_length": 250
}

In [6]:
METRICS = [
        keras.metrics.TruePositives(name='tp'),
        keras.metrics.FalsePositives(name='fp'),
        keras.metrics.TrueNegatives(name='tn'),
        keras.metrics.FalseNegatives(name='fn'), 
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc'),
        keras.metrics.AUC(name='prc', curve='PR'), # precision-recall curve
]

2023-04-08 17:13:38.306188: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


In [7]:
# ds = spark.read.parquet("s3a://sapient-bucket-trusted/prod/tensor_sample_data/test_holdout/*").cache()

In [8]:
# tot = ds.count()

In [9]:
ds = spark.read.parquet(*["s3a://sapient-bucket-trusted/prod/graph/encoded/real/23Sep3/*",
                          "s3a://sapient-bucket-trusted/prod/graph/encoded/real/23Sep6/*"]).cache()

                                                                                

In [10]:
tot = ds.count()

                                                                                

In [11]:
ds.groupBy("mal_trace") \
    .count() \
    .withColumnRenamed('count', 'cnt_per_group') \
    .withColumn('perc_of_count_total', (col('cnt_per_group') / tot) * 100 ) \
    .sort("perc_of_count_total").show()

+---------+-------------+-------------------+
|mal_trace|cnt_per_group|perc_of_count_total|
+---------+-------------+-------------------+
|        1|       247127| 0.7781506573127496|
|        0|     31511119|  99.22184934268725|
+---------+-------------+-------------------+



In [12]:
mal_count = ds.where( col("mal_trace") == 1).count()

In [13]:
# df = spark.read.parquet("s3a://sapient-bucket-trusted/prod/tensor_sample_data/test_holdout/*").cache().toPandas()

In [14]:
# can we filter and convert to lists without using pandas?
df = ds.toPandas()

                                                                                

In [15]:
len(df)

31758246

In [16]:
df.dtypes

Trace               int64
mal_trace           int32
malicious         float64
event_sequence     object
dtype: object

In [17]:
df_b = df[df['mal_trace'] == 0]
df_m = df[df['mal_trace'] == 1]

In [18]:
df_b_downsampl = resample(df_b, 
                        replace = False, 
                        n_samples = len(df_m),
                        random_state = 42)

In [19]:
df_b_downsampl.shape

(247127, 4)

In [20]:
df_m.shape

(247127, 4)

In [21]:
df_down = pd.concat([df_m, df_b_downsampl])

In [22]:
# Data conversion - https://towardsdatascience.com/multi-class-text-classification-with-lstm-using-tensorflow-2-0-d88627c10a35
tokenizer = tf.keras.preprocessing.text.Tokenizer()

In [23]:
# https://stackoverflow.com/questions/39748660/how-to-perform-k-fold-cross-validation-with-tensorflow
def make_dataset(X_data,y_data,n_splits):

    def gen():        
        for train_index, test_index in KFold(n_splits).split(X_data):
            X_train, X_test = X_data[train_index], X_data[test_index] # input
            y_train, y_test = y_data[train_index], y_data[test_index] # labels
            
            X_train_seq = tokenizer.texts_to_sequences(X_train)
            X_train_ds = tf.reshape(X_train_seq, (len(X_train_seq),6,1))
            
            X_test_seq = tokenizer.texts_to_sequences(X_test)
            X_test_ds = tf.reshape(X_test_seq, (len(X_test_seq),6,1))
            
            y_test_ds = tf.ragged.constant(y_train)
            y_test_ds = tf.ragged.constant(y_test)
            
            
            yield X_train_ds,y_train_ds,X_test_ds,y_test_ds

    return tf.data.Dataset.from_generator(gen, (tf.float64,tf.float64,tf.float64,tf.float64))

In [24]:
# dataset=make_dataset(df_events,df_labels,10)

In [25]:
df_events = df_down['event_sequence'].tolist()
df_labels = df_down['malicious'].tolist()

In [26]:
tokenizer.fit_on_texts(df_events)

In [27]:
# Get our training data word index
word_index = tokenizer.word_index
vocab_count = len(word_index)

In [28]:
# for X_train,y_train,X_test,y_test in iter(dataset):
#     print(1)

In [29]:
train_set, validation_set, train_labels, validation_labels = train_test_split(df_events, df_labels,
                                                    stratify=df_labels, 
                                                    test_size=0.2)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [34]:
Counter(train_labels)

NameError: name 'train_labels' is not defined

In [None]:
Counter(validation_labels)

In [35]:
train_sequences = tokenizer.texts_to_sequences(train_set)
train_padded = tf.keras.utils.pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

NameError: name 'train_set' is not defined

In [None]:
validation_sequences = tokenizer.texts_to_sequences(validation_set)
validation_padded = tf.keras.utils.pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
validation_sequences[0]

In [None]:
validation_padded[0]

In [None]:
train_padded.shape

In [None]:
validation_padded.shape

In [None]:
train_ds = tf.reshape(train_padded, (len(train_padded),6,1))

In [None]:
validation_ds = tf.reshape(validation_padded, (len(validation_padded),6,1))

In [None]:
train_labels_ds = tf.ragged.constant(train_labels)
validation_labels_ds = tf.ragged.constant(validation_labels)

In [None]:
train_ds.shape

In [None]:
train_labels_ds.shape

In [None]:
validation_ds.shape

In [None]:
validation_labels_ds.shape

In [None]:
model = tf.keras.models.Sequential()
model.add(layers.Embedding(vocab_count + 1, 16))
model.add(keras.layers.LSTM(500, input_shape=(train_ds.shape[1], train_ds.shape[2]), return_sequences=True))
model.add(keras.layers.LSTM(300, return_sequences=True))
model.add(keras.layers.LSTM(200))
model.add(keras.layers.Dense(train_ds.shape[2], activation='tanh'))

In [None]:
model.summary()

In [None]:
model.compile(loss='binary_crossentropy', 
              optimizer='adam',
              metrics=METRICS)

In [None]:
history = model.fit(
                x=train_ds,
                y=train_labels_ds,
                batch_size=128,
                epochs=100,
                verbose=0,
                callbacks=None,
                validation_split=0.0,
                validation_data=(validation_ds, validation_labels_ds),
                shuffle=True,
                class_weight=None,
                sample_weight=None,
                initial_epoch=0,
                steps_per_epoch=None,
                validation_steps=None,
                validation_batch_size=None,
                validation_freq=1,
                max_queue_size=10,
                workers=1,
                use_multiprocessing=False
                )

In [None]:
def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.show()

In [None]:
plot_graphs(history, "accuracy")
plot_graphs(history, "loss")
plot_graphs(history, "prc")
plot_graphs(history, "recall")
plot_graphs(history, "auc")

In [None]:
# plt.plot(history.history["accuracy"])
# plt.plot(history.history['val_'+"accuracy"])
# plt.xlabel("Epochs")
# plt.ylabel("accuracy")
# plt.legend(["accuracy", 'val_'+"accuracy"])
# plt.show()

In [None]:
model = tf.keras.Sequential([
    # Add an Embedding layer expecting input vocab of size 5000, and output embedding dimension of size 64 we set at the top
    tf.keras.layers.Embedding(100 + 1, 128, input_length=6),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    # use ReLU in place of tanh function since they are very good alternatives of each other.
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    # Add a Dense layer with 6 units and softmax activation.
    # When we have multiple outputs, softmax convert outputs layers into a probability distribution.
    tf.keras.layers.Dense(1, activation='softmax')
])

In [None]:
model.summary()

In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
num_epochs = 2
history = model.fit(train_padded, train_ds, epochs=num_epochs, 
                    validation_data=(validation_padded, validation_ds), 
                    verbose=1)

In [None]:
def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()
  
plot_graphs(history, "accuracy")
plot_graphs(history, "loss")

In [None]:
# ds = tf.data.Dataset.from_tensor_slices(([i for i in df['event_sequence']], df['Trace'], df['malicious']))

In [None]:
# Input data based on - https://www.tensorflow.org/tutorials/structured_data/feature_columns
train, test = train_test_split(df, test_size=0.2)

In [None]:
train, val = train_test_split(train, test_size=0.2)

In [None]:
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

In [None]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  dataframe = dataframe.copy()
  labels = dataframe.pop('malicious')
  ds = tf.data.Dataset.from_tensor_slices(([i for i in df['event_sequence']], df['malicious']))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  return ds

In [None]:
# A utility method to create a feature column
# and to transform a batch of data
def make_features(feature_column):  
    feature_layer = layers.DenseFeatures(feature_column)
    print(feature_layer(example_batch).numpy())

In [None]:
example_batch = next(iter(train))

In [None]:
example_batch

In [None]:
vocab_size = len(set([x[0] for x in df['event_sequence']] ))

In [None]:
vocab_size

In [None]:
# Vocabulary size and number of words in a sequence.
vocab_size = vocab_size
sequence_length = 6

# Use the text vectorization layer to normalize, split, and map strings to
# integers. Note that the layer uses the custom standardization defined above.
# Set maximum_sequence length as all samples are not of the same length.
vectorize_layer = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

In [None]:
batch_size = 5 # A small batch sized is used for demonstration purposes
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [None]:
text_ds = train_ds.map(lambda x, y: x)

In [None]:
vectorize_layer.adapt(tf.reshape(text_ds, (len(text_ds),6,1)))

In [None]:
for sequence_batch, label_batch in train_ds.take(1):
  print('A batch of sequences:', sequence_batch ) # list(feature_batch.keys()))
  print('A batch of targets:', label_batch )

In [None]:
model = tf.keras.Sequential([
  #feature_layer,
  layers.Dense(128, activation='relu'),
  layers.Dense(128, activation='relu'),
  layers.Dropout(.1),
  layers.Dense(1)
])

In [None]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [None]:
model.fit(train_ds,
          validation_data=val_ds,
          epochs=10)

#### Shap

In [None]:
import shap
import numpy as np


# Select a background dataset to estimate expected values
background_data = train_ds[np.random.choice(train_ds.shape[0], 100, replace=False)]

# Create an explainer object for the LSTM model
explainer = shap.DeepExplainer(model, background_data)

# Choose a dataset for which you want to calculate SHAP values
sample_data = validation_ds[:10]  # Adjust the sample size as needed

# Compute SHAP values for the selected dataset
shap_values = explainer.shap_values(sample_data)

# Plot the SHAP values
shap.summary_plot(shap_values, sample_data)
