In [1]:
# https://towardsdatascience.com/hands-on-generative-adversarial-networks-gan-for-signal-processing-with-python-ff5b8d78bd28

In [2]:
from numpy import hstack
from numpy import zeros
from numpy import ones
from numpy.random import rand
from numpy.random import randn
import numpy as np
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Input
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM
from matplotlib import pyplot
import matplotlib.pyplot as plt

In [3]:
# https://stackoverflow.com/questions/68036975/valueerror-shape-must-be-at-least-rank-3-but-is-rank-2-for-node-biasadd
# config for rank error in lstm
tf.keras.backend.set_image_data_format("channels_last")

# https://stackoverflow.com/questions/58352326/running-the-tensorflow-2-0-code-gives-valueerror-tf-function-decorated-functio
# tf.config.run_functions_eagerly(True)

In [4]:
# Set Config
embedding_dim = 64
max_length = 6
sequence_length = 6
max_features = 10000
padding_type = 'post'
trunc_type = 'post'
training_portion = 0.8

In [5]:
# prepare real data
%run ./read_file.ipynb



:: loading settings :: url = jar:file:/home/ec2-user/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ec2-user/.ivy2/cache
The jars for the packages stored in: /home/ec2-user/.ivy2/jars
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-feb3118b-3bbe-4166-a619-39d44d9cf4dd;1.0
	confs: [default]
	found graphframes#graphframes;0.8.2-spark3.2-s_2.12 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
downloading https://repos.spark-packages.org/graphframes/graphframes/0.8.2-spark3.2-s_2.12/graphframes-0.8.2-spark3.2-s_2.12.jar ...
	[SUCCESSFUL ] graphframes#graphframes;0.8.2-spark3.2-s_2.12!graphframes.jar (73ms)
downloading https://repo1.maven.org/maven2/org/slf4j/slf4j-api/1.7.16/slf4j-api-1.7.16.jar ...
	[SUCCESSFUL ] org.slf4j#slf4j-api;1.7.16!slf4j-api.jar (31ms)
:: resolution report :: resolve 818ms :: artifacts dl 107ms
	:: modules in use:
	graphframes#graphframes;0.8.2-spark3.2-s_2.12 from spark-packages in [default]
	org.slf4j#slf4j-api;1.7.16 from central in [default]
	-------

23/04/10 03:48:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/10 03:48:09 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [6]:
data = spark.read.parquet(*["s3a://sapient-bucket-trusted/prod/graph/encoded/real/23Sep3/*"]).cache()

                                                                                

In [7]:
def prepare_real_samples(ds, lc=300):
    # Calculate the number of malicious and non-malicious records
    malicious_ds = ds.filter(col("malicious") == 1).limit(lc)
    ds_events = malicious_ds.select('event_sequence').rdd.flatMap(lambda x: x).collect()
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    # Get our training data word index
    tokenizer.fit_on_texts(ds_events)
    word_index = tokenizer.word_index
    vocab_count = len(word_index)
    train_sequences = tokenizer.texts_to_sequences(ds_events)
    train_padded = tf.keras.utils.pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    train_ds = tf.reshape(train_padded, (-1, len(train_padded), 6, 1, 1))
    train_labels_ds = ones((1, input_len, 6, 1, 1))
    return train_ds, train_labels_ds

In [8]:
METRICS = [
        keras.metrics.TruePositives(name='tp'),
        keras.metrics.FalsePositives(name='fp'),
        keras.metrics.TrueNegatives(name='tn'),
        keras.metrics.FalseNegatives(name='fn'), 
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc'),
        keras.metrics.AUC(name='prc', curve='PR'), # precision-recall curve
]

In [9]:
batch_size = 32

In [10]:
# number of samples to make
input_len = 500

In [11]:
# define the standalone generator model
# https://www.tensorflow.org/tutorials/generative/dcgan#the_generator
def define_generator():
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(input_len, 6)),
        tf.keras.layers.LSTM(64, return_sequences=True),
        tf.keras.layers.Dense(6, activation='sigmoid'),
        tf.keras.layers.Lambda(lambda x: tf.expand_dims(x, axis=-1)),
    ])
    # Compile the model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=METRICS)
    return model

In [13]:
# define the standalone discriminator model
def define_discriminator():
    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=(input_len, 6, 1)), # Flatten the input tensor
        tf.keras.layers.Embedding(100 + 1, 16),
        tf.keras.layers.Reshape((input_len, 6, 16)), # Reshape embedding output to match Dense output
        tf.keras.layers.Flatten(), # Flatten the output of the Embedding layer
        tf.keras.layers.Dense(input_len * 6, activation='tanh'), # Add an additional Dense layer to get desired output shape
        tf.keras.layers.Reshape((input_len, 6, 1)) # Reshape output to desired shape
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=METRICS)
    return model

In [15]:
# define the combined generator and discriminator model, for updating the generator
def define_gan(generator, discriminator):
    # connect them
    model = Sequential()
    model.add(generator)
    model.add(discriminator)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=METRICS)
    return model

In [16]:
def generate_latent_space():
    n = tf.random.uniform(shape=[input_len, 6, 1], minval=1, maxval=100, dtype=tf.int32)
    n_ds = tf.reshape(n, (-1, input_len, 6, 1))
    return n_ds

In [56]:
def generate_fake_samples(generator, latent_dim=3, n=0):
    # generate points in latent space
    n_ds = generate_latent_space()
    # predict outputs
    X = generator.predict(n_ds, verbose=0)
    print("size of output from generator")
    print(X.shape)
    # add extra dimension to output tensor
    X = tf.expand_dims(X, axis=-1)
    print("size of input to discriminator after expansion")
    print(X.shape)
    # create class labels
    y = zeros((1, input_len, 6, 1, 1))
    print("size of labels to discriminator")
    print(y.shape)
    return X, y

# def generate_fake_samples(generator, latent_dim=3, n=0):
#     # generate points in latent space
#     n_ds = generate_latent_space()
#     # predict outputs
#     X = generator.predict(n_ds, verbose=0)
#     # create class labels
#     y = zeros((1, input_len, 6, 1, 1))
#     return X, y

In [18]:
# use the generator to generate n fake examples, with class labels
def train(g_model, d_model, gan_model, latent_dim=3, n_epochs=5, n_batch=128, n_eval=200):
    # determine half the size of one batch, for updating the discriminator
    half_batch = int(n_batch / 2)
    # manually enumerate epochs
    for i in range(n_epochs):
        # prepare real samples
        # x_real, y_real = generate_fake_samples(g_model, input_len)
        x_real, y_real = prepare_real_samples(data, lc=input_len)
        # prepare fake examples using the generator
        x_fake, y_fake = generate_fake_samples(g_model, input_len)
        # update discriminator
        d_model.train_on_batch(x_real, y_real)
        d_model.train_on_batch(x_fake, y_fake)
        # prepare points in latent space as input for the generator
        x_gan = generate_latent_space()
        # create inverted labels for the fake samples
        y_gan = ones((1, input_len, 6, 1))
        # create inverted labels for the fake samples
        # update the generator via the discriminator's error
        gan_model.train_on_batch(x_gan, y_gan)
        # evaluate the model every n_eval epochs
        if (i+1) % n_eval == 0:
            plt.title('Number of epochs = %i'%(i+1))
            pred_data = generate_fake_samples(generator, input_len)[0]
            real_data  = generate_fake_samples(generator, input_len)[0]
            plt.plot(pred_data[0],'.',label='Random Fake Sample',color='firebrick')
            plt.plot(real_data[0],'.',label = 'Random Real Sample',color='navy')
            plt.legend(fontsize=10)
            plt.show()

In [48]:
# Create test combination of generator and discriminator
ge = define_gen2()
disc = define_discriminator()
gan_m = define_gan(generator, discriminator)

In [44]:
# Test combination of generator and discriminator
generator = define_generator()
discriminator = define_discriminator()
gan = define_gan(generator, discriminator)

In [20]:
# use generator

In [57]:
x_real, y_real = generate_fake_samples(ge, input_len)

size of output from generator
(1, 3000, 500)
size of input to discriminator after expansion
(1, 3000, 500, 1)
size of labels to discriminator
(1, 500, 6, 1, 1)


In [55]:
x_real.shape

TensorShape([1, 3000, 500, 1])

In [51]:
y_real.shape

(1, 500, 6, 1, 1)

In [None]:
# use discriminator with generated data

In [32]:
disc.train_on_batch(x_real, y_real)

In [None]:
# test gan created data and model

In [None]:
x_gan = generate_latent_space()
# x_gan = tf.reshape(x_gan, (-1, input_len, 6, 1, 1))
# create inverted labels for the fake samples
y_gan = ones(([1,input_len, 6, 1, 1]))

In [None]:
x_gan.shape

In [None]:
y_gan.shape

In [None]:
disc.train_on_batch(x_gan, y_gan)

In [None]:
gan_m.train_on_batch(x_gan, y_gan)

In [None]:
gan_m.summary()

In [None]:
# Create real gan

In [36]:
train(generator, discriminator, gan, n_epochs=1)

                                                                                

size of input to generator
(1, 500, 6, 1)
size of labels to discriminator
(1, 500, 6, 1, 1)


In [37]:
# create data based on the trained model
si = generate_latent_space()
sp = gan.predict(si)



In [38]:
sp.shape

(1, 500, 6, 1)

In [39]:
sp_ds = tf.reshape(sp, (input_len, 6, 1))

In [40]:
sp_ds.shape

TensorShape([500, 6, 1])

In [41]:
np.max(sp_ds)

0.9388734

In [42]:
np.max(si)

99