In [1]:
# https://towardsdatascience.com/hands-on-generative-adversarial-networks-gan-for-signal-processing-with-python-ff5b8d78bd28

In [6]:
from numpy import hstack
from numpy import zeros
from numpy import ones
from numpy.random import rand
from numpy.random import randn
import numpy as np
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Input
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM
from matplotlib import pyplot
import matplotlib.pyplot as plt

In [7]:
# Pin the Keras image data format to "channels_last". Workaround taken from
# https://stackoverflow.com/questions/68036975/valueerror-shape-must-be-at-least-rank-3-but-is-rank-2-for-node-biasadd
# config for rank error in lstm
# NOTE(review): "channels_last" is normally the Keras default already — confirm this call is still needed.
tf.keras.backend.set_image_data_format("channels_last")

# Disabled workaround (forces tf.function code to run eagerly), kept for reference:
# https://stackoverflow.com/questions/58352326/running-the-tensorflow-2-0-code-gives-valueerror-tf-function-decorated-functio
# tf.config.run_functions_eagerly(True)

In [8]:
# Set Config
# Settings for tokenising and padding the event sequences.
embedding_dim = 64  # NOTE(review): reassigned to 256 in a later config cell before any visible use
max_length = 6  # passed to pad_sequences as maxlen
sequence_length = 6  # NOTE(review): not referenced in the visible cells
max_features = 10000  # NOTE(review): not referenced in the visible cells
padding_type = 'post'  # passed to pad_sequences as padding
trunc_type = 'post'  # passed to pad_sequences as truncating
training_portion = 0.8  # NOTE(review): not referenced in the visible cells

In [9]:
# prepare real data
# Execute the companion notebook read_file.ipynb.
# NOTE(review): `spark` and `col` are used by later cells but are never defined or imported
# in this notebook, so they presumably come from read_file.ipynb — confirm.
%run ./read_file.ipynb



:: loading settings :: url = jar:file:/home/ec2-user/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ec2-user/.ivy2/cache
The jars for the packages stored in: /home/ec2-user/.ivy2/jars
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-fabc9086-6d3e-4151-9529-eb4d128bf686;1.0
	confs: [default]
	found graphframes#graphframes;0.8.2-spark3.2-s_2.12 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
:: resolution report :: resolve 136ms :: artifacts dl 5ms
	:: modules in use:
	graphframes#graphframes;0.8.2-spark3.2-s_2.12 from spark-packages in [default]
	org.slf4j#slf4j-api;1.7.16 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	-----------------------------

23/04/12 17:27:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/12 17:27:51 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [10]:
# Read the parquet dataset from S3 and cache it, since later cells query it repeatedly
data = spark.read.parquet("s3a://sapient-bucket-trusted/prod/graph/encoded/real/23Sep3/*").cache()

                                                                                

In [11]:
def prepare_real_samples(ds, lc=300):
    """Build one batch of real (malicious) event sequences with "real" labels.

    Args:
        ds: DataFrame with a ``malicious`` flag column and an
            ``event_sequence`` column whose values are fed to the Keras
            ``Tokenizer``.
        lc: Maximum number of malicious rows to take from ``ds``.

    Returns:
        Tuple ``(train_ds, train_labels_ds)``: the padded token ids reshaped to
        ``(1, n, max_length, 1, 1)`` and an all-ones array of the same shape,
        where ``n`` is the number of rows actually collected.
    """
    # Keep only malicious rows (capped at lc) and pull their event sequences to the driver
    malicious_ds = ds.filter(col("malicious") == 1).limit(lc)
    ds_events = malicious_ds.select('event_sequence').rdd.flatMap(lambda x: x).collect()

    # Fit the tokenizer on the collected events, then encode those same events
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(ds_events)
    train_sequences = tokenizer.texts_to_sequences(ds_events)
    train_padded = tf.keras.utils.pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

    # Size samples and labels from the data itself. Using the global input_len
    # for the labels gave mismatched shapes whenever lc (default 300) or the
    # number of available rows differed from input_len.
    n_samples = len(train_padded)
    batch_shape = (1, n_samples, max_length, 1, 1)
    train_ds = tf.reshape(train_padded, batch_shape)
    train_labels_ds = ones(batch_shape)
    return train_ds, train_labels_ds

In [12]:
# Metric objects passed to compile() for the generator and for the combined GAN model.
# NOTE(review): the same metric instances are handed to both models — confirm that sharing is intended.
METRICS = [
        # Confusion-matrix counts
        keras.metrics.TruePositives(name='tp'),
        keras.metrics.FalsePositives(name='fp'),
        keras.metrics.TrueNegatives(name='tn'),
        keras.metrics.FalseNegatives(name='fn'), 
        # Summary scores
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc'),
        keras.metrics.AUC(name='prc', curve='PR'), # precision-recall curve
]

In [13]:
# NOTE(review): batch_size is not referenced by any visible cell (train() takes its own n_batch).
batch_size = 32

In [14]:
# number of samples to make
# (row count of each latent batch, and the row limit passed to prepare_real_samples)
input_len = 500

# vocab size as defined in the lstm tokenizer
# (Embedding input size and width of the generator's softmax output)
# NOTE(review): prepare_real_samples fits its own Tokenizer without a num_words cap —
# confirm 1000 actually covers the real vocabulary.
num_words = 1000 

# The embedding dimension
# NOTE(review): overrides the embedding_dim = 64 set in the earlier config cell.
embedding_dim = 256

# Number of RNN units
# (units of the generator's GRU layer)
rnn_units = 1024

In [15]:
def define_generator():
    """Build and compile the sequence generator.

    The model takes latent batches of shape ``(batch, input_len, max_length, 1)``,
    flattens each sample into one sequence of ``input_len * max_length`` ids and
    runs it through Embedding -> GRU -> Dense(softmax over ``num_words``).

    Returns:
        A compiled ``tf.keras.Model`` whose output has shape
        ``(batch, 1, input_len * max_length, num_words)``.
    """
    # Input shape follows the notebook config (input_len rows of max_length
    # tokens) rather than the literals 500 and 6, so a config change cannot
    # silently desynchronise the model from its inputs.
    inputs = tf.keras.Input(shape=(input_len, max_length, 1))

    # Collapse each sample to a single flat sequence of ids
    flat_inputs = tf.keras.layers.Flatten()(inputs)

    # Look up a dense vector for every id in the sequence
    embedding = tf.keras.layers.Embedding(num_words, embedding_dim)(flat_inputs)

    # Only the per-step outputs are consumed, so the final GRU state is not requested
    gru_out = tf.keras.layers.GRU(rnn_units, return_sequences=True)(embedding)

    # Per-position distribution over the vocabulary
    outputs = tf.keras.layers.Dense(num_words, activation='softmax')(gru_out)
    expanded_outputs = tf.expand_dims(outputs, axis=1)

    model = tf.keras.Model(inputs, expanded_outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=METRICS)

    return model

In [16]:
# define the standalone discriminator model
def define_discriminator():
    """Build and compile the real-vs-fake classifier.

    The input is flattened and passed through two Dense + LeakyReLU stages
    (512 and 256 units) and a single sigmoid unit.

    Returns:
        A ``tf.keras.Model`` compiled with binary cross-entropy, Adam and
        accuracy, producing one score per sample.
    """
    # NOTE(review): in this notebook the generator output (after the extra axis
    # added in generate_fake_samples) is (1, 1, 3000, 1000, 1) and
    # prepare_real_samples returns (1, 500, 6, 1, 1); neither matches this
    # declared shape — reconcile the three before training.
    inputs = tf.keras.Input(shape=([1, 3000, 500, 1]))

    # Flatten the input data
    flat_inputs = tf.keras.layers.Flatten()(inputs)

    # No activation on the Dense layers themselves: a ReLU there clamps every
    # negative value to zero, which turns the LeakyReLU that follows into a no-op.
    dense = tf.keras.layers.Dense(512)(flat_inputs)
    leaky_relu = tf.keras.layers.LeakyReLU(alpha=0.2)(dense)

    dense = tf.keras.layers.Dense(256)(leaky_relu)
    leaky_relu = tf.keras.layers.LeakyReLU(alpha=0.2)(dense)

    # Single sigmoid unit: the real/fake score
    outputs = tf.keras.layers.Dense(1, activation='sigmoid')(leaky_relu)

    # Create the model
    model = tf.keras.Model(inputs, outputs)

    # Compile the model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [17]:
# define the combined generator and discriminator model, for updating the generator
def define_gan(generator, discriminator):
    """Stack generator and discriminator into the model used to train the generator.

    Args:
        generator: Model mapping latent batches to fake samples.
        discriminator: Already-compiled model scoring samples as real or fake.

    Returns:
        A compiled ``Sequential`` model (generator followed by discriminator)
        in which the discriminator weights are frozen.
    """
    # Freeze the discriminator for the combined model so that a GAN update only
    # moves the generator's weights; otherwise training on inverted labels
    # would also push the discriminator towards calling fakes real.
    # define_discriminator() compiled the discriminator while it was still
    # trainable, and Keras keeps the trainable state captured at compile time,
    # so its own train_on_batch calls continue to update it.
    discriminator.trainable = False
    # connect them
    model = Sequential()
    model.add(generator)
    model.add(discriminator)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=METRICS)
    return model

In [18]:
def generate_latent_space():
    """Sample one latent batch of random integer ids for the generator.

    Returns:
        int32 tensor of shape ``(1, input_len, 6, 1)`` with values drawn
        uniformly from ``[1, 100)``.
    """
    latent_ids = tf.random.uniform(shape=[input_len, 6, 1], minval=1, maxval=100, dtype=tf.int32)
    # Prepend the batch axis
    return tf.expand_dims(latent_ids, axis=0)

In [35]:
def generate_fake_samples(generator, latent_dim=3, n=0):
    """Run the generator on a fresh latent batch and pair it with "fake" labels.

    Args:
        generator: Model whose ``predict`` is called on the latent batch.
        latent_dim: Not read by this function.
        n: Not read by this function.

    Returns:
        Tuple ``(X, y)``: the generator prediction with a trailing axis added,
        and an all-zeros array of shape ``(1, 1, input_len*6, input_len, 1)``.
    """
    # Fresh latent batch -> generator prediction
    latent_batch = generate_latent_space()
    prediction = generator.predict(latent_batch, verbose=0)
    # Trailing axis appended to the prediction
    fake_samples = tf.expand_dims(prediction, axis=-1)
    # Class label 0 marks the batch as fake
    fake_labels = zeros((1, 1, input_len*6, input_len, 1))
    return fake_samples, fake_labels

In [20]:
# train the discriminator and, through the combined model, the generator
def train(g_model, d_model, gan_model, latent_dim=3, n_epochs=5, n_batch=128, n_eval=200):
    """Alternate discriminator and generator updates for ``n_epochs`` rounds.

    Each round trains the discriminator on one real and one fake batch, then
    trains the combined model on a latent batch labelled as real.

    Args:
        g_model: Generator model.
        d_model: Standalone discriminator model.
        gan_model: Combined generator + discriminator model.
        latent_dim: Accepted for compatibility; not used.
        n_epochs: Number of training rounds.
        n_batch: Accepted for compatibility; not used.
        n_eval: Plot a fake-vs-real comparison every ``n_eval`` rounds.
    """
    # prepare_real_samples(data, ...) takes no per-epoch input, so the Spark
    # collect and tokenizer fit are done once instead of on every round.
    x_real, y_real = prepare_real_samples(data, lc=input_len)
    # manually enumerate epochs
    for i in range(n_epochs):
        # prepare fake examples using the generator
        x_fake, y_fake = generate_fake_samples(g_model, input_len)
        # update discriminator
        d_model.train_on_batch(x_real, y_real)
        d_model.train_on_batch(x_fake, y_fake)
        # prepare points in latent space as input for the generator
        x_gan = generate_latent_space()
        # create inverted labels for the fake samples
        y_gan = ones((1, input_len, 6, 1))
        # update the generator via the discriminator's error
        gan_model.train_on_batch(x_gan, y_gan)
        # evaluate the model every n_eval epochs
        if (i+1) % n_eval == 0:
            # Use the generator that was passed in (not the notebook-level
            # global). Its output is a distribution over the vocabulary per
            # position, too high-rank for plt.plot, so reduce it to the most
            # likely token id at each position.
            fake_probs = np.asarray(generate_fake_samples(g_model, input_len)[0])
            fake_tokens = np.argmax(fake_probs[..., 0], axis=-1).ravel()
            # The "real" series is the actual real batch, flattened to one
            # token id per position.
            real_tokens = np.asarray(x_real).ravel()
            plt.title('Number of epochs = %i'%(i+1))
            plt.plot(fake_tokens,'.',label='Random Fake Sample',color='firebrick')
            plt.plot(real_tokens,'.',label = 'Random Real Sample',color='navy')
            plt.legend(fontsize=10)
            plt.show()

In [21]:
# Test combination of generator and discriminator
# This cell only builds the two standalone models; they are combined later by define_gan().
generator = define_generator()
discriminator = define_discriminator()

In [22]:
# use generator

In [23]:
# Draw one latent batch on its own so its shape can be inspected
x_fake_pre = generate_latent_space()

In [24]:
x_fake_pre.shape

TensorShape([1, 500, 6, 1])

In [36]:
# Generate one fake batch together with its all-zero labels.
# NOTE(review): input_len binds positionally to latent_dim, which generate_fake_samples never reads.
x_fake, y_fake = generate_fake_samples(generator, input_len)

In [37]:
x_fake.shape

TensorShape([1, 1, 3000, 1000, 1])

In [38]:
y_fake.shape

(1, 1, 3000, 500, 1)

In [28]:
# Collect up to input_len malicious sequences as the real batch, with their all-ones labels
x_real, y_real = prepare_real_samples(data, lc=input_len)

                                                                                

In [29]:
x_real.shape

TensorShape([1, 500, 6, 1, 1])

In [30]:
y_real.shape

(1, 500, 6, 1, 1)

In [39]:
# use discriminator with generated data

In [40]:
# NOTE(review): as recorded in this cell's output, this call raises ValueError — the discriminator
# expects inputs of shape (None, 1, 3000, 500, 1) but x_real is (1, 500, 6, 1, 1).
discriminator.train_on_batch(x_real, y_real)

ValueError: in user code:

    File "/home/ec2-user/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "/home/ec2-user/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/keras/engine/training.py", line 1233, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/ec2-user/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/keras/engine/training.py", line 1222, in run_step  **
        outputs = model.train_step(data)
    File "/home/ec2-user/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/keras/engine/training.py", line 1023, in train_step
        y_pred = self(x, training=True)
    File "/home/ec2-user/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/home/ec2-user/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/keras/engine/input_spec.py", line 295, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "model_1" is incompatible with the layer: expected shape=(None, 1, 3000, 500, 1), found shape=(1, 500, 6, 1, 1)


In [None]:
# combination in gan
# Stack the generator and the discriminator into one compiled model
gan = define_gan(generator, discriminator)

In [None]:
# test gan created data and model

In [None]:
# Latent batch used as input for the training calls below
x_gan = generate_latent_space()
# x_gan = tf.reshape(x_gan, (-1, input_len, 6, 1, 1))
# create inverted labels for the fake samples
# (all ones, the value prepare_real_samples uses for real data)
y_gan = ones(([1,input_len, 6, 1, 1]))

In [None]:
x_gan.shape

In [None]:
y_gan.shape

In [None]:
# NOTE(review): x_gan is a raw latent batch shaped (1, input_len, 6, 1), not the
# (1, 3000, 500, 1) per-sample shape the discriminator declares — confirm this call is intended.
discriminator.train_on_batch(x_gan, y_gan)

In [None]:
# One training step of the combined model on the latent batch with all-ones labels.
# NOTE(review): y_gan is 5-D while the discriminator ends in a single Dense(1) unit — confirm the label shape.
gan.train_on_batch(x_gan, y_gan)

In [None]:
gan.summary()

In [None]:
# Run the training loop for a single epoch; with n_eval left at its default of 200
# the evaluation plot inside train() is not reached.
train(generator, discriminator, gan, n_epochs=1)

In [None]:
# create data based on the trained model
si = generate_latent_space()
# Sample from the generator itself: gan.predict() returns the discriminator's
# real/fake score for the batch, not generated data. The generator emits a
# distribution over the vocabulary for every position, so keep the most likely
# token id per position; that leaves input_len * 6 values to reshape.
sp = np.argmax(generator.predict(si), axis=-1)

In [None]:
sp.shape

In [None]:
# Lay the prediction out as (input_len, 6, 1); this requires sp to hold exactly input_len * 6 values
sp_ds = tf.reshape(sp, (input_len, 6, 1))

In [None]:
sp_ds.shape

In [None]:
np.max(sp_ds)

In [None]:
np.max(si)