# Conditional- WassersteinGAN

Used to generate new training data for the ransomware families to overcome the skewed distribution of training data towards the benign samples

In [3]:
# Packages
from tensorflow import keras
from tensorflow.keras import layers
from keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical
from keras.optimizers import RMSprop
from keras import backend

import matplotlib.pyplot as plt 
import tensorflow as tf
import numpy as np
import os
from PIL import Image

**Change parameters**

-----

In [4]:
# Batch size
batch_size = 64

# Color mode
ch = 'grayscale'

# Image size
iw, ih = 64,64
im_size = (iw,ih)

# Latent dim size
latent_dim = 64

# Number of Epochs
epoch_t = 2000 # Add this point this model has run 2000 epochs through using the checkpoints (see ckpt_cgan_v009)

# Computation environment: Kaggle (0) or Local (1)
cenv = 1


-----------

Automatic notebook preparation

In [5]:
if(ch == 'rgb'):
    chnum = 3
elif(ch == 'grayscale'):
    chnum = 1

Create new folder to save the output of this model

In [6]:
if cenv == 1:
    file_exists = []
    vnum = 1
    dir = "C:/Users/Max/Documents/GitHub/experiments_conditional_gan"
    for files in os.listdir(dir):
        if "cgan" in files:
            try:
                vnum = max(vnum, int(files[-3:]))
            except: 
                continue
            new_vnum = vnum + 1
            file_exists.append(True)
        else: 
            file_exists.append(False)
    # If this is the first notebook you want to save, a new folder will be created with version #001
    if sum(file_exists) == 0:
        new_vnum = 1
        print("No matches found")

    else: 
        print(f"{sum(file_exists)} matches(es) found")
        print("--------------")

    # Print new folder name
    print(f"New folder name: WASSERSTEIN-cgan-local-v{new_vnum:03}")
    print("--------------")
    
    # Create new folder with the name of the notebook and the version number
    new_dir = f"C://Users/Max/Documents/GitHub/conditional_gan/WASSERSTEIN-cgan-local-v{new_vnum:03}"
    os.makedirs(new_dir)

14 matches(es) found
--------------
New folder name: WASSERSTEIN-cgan-local-v018
--------------


FileExistsError: [WinError 183] Cannot create a file when that file already exists: 'C://Users/Max/Documents/GitHub/conditional_gan/WASSERSTEIN-cgan-local-v018'

**Data preprocessing**

In [7]:
if cenv == 0:
    path_root = "/kaggle/input/data-wo-benign"
    path_save_imgs = "/kaggle/working/numpy_arrays/"
if cenv == 1:
    path_root = "C:/Users/Max/Documents/image_data/data_wo_benign"
    path_save_imgs = f"C:/Users/Max/Documents/image_data/WASSERSTEIN-cgan-local-v{new_vnum:03}"

In [34]:
datagen = ImageDataGenerator(
    rescale = 1/255 # Pixel values need to be scaled between 0 and 1
)

In [51]:
prelim_dataset = datagen.flow_from_directory(
    directory = path_root,
    color_mode = ch,
    target_size = im_size,
    interpolation = 'bicubic',
    batch_size = 5000,
    shuffle=True
)
imgs, labels = next(prelim_dataset)

Found 12536 images belonging to 10 classes.


In [52]:
num_samples = prelim_dataset.samples
num_classes = max(prelim_dataset.labels) + 1

In [53]:
prelim_dataset.class_indices

{'BetterSurf': 0,
 'Eksor.A': 1,
 'Obfuscator.AFQ': 2,
 'Occamy.C': 3,
 'OnLineGames.CTB': 4,
 'Reveton.A': 5,
 'Sfone': 6,
 'VB.IL': 7,
 'Zbot': 8,
 'Zbot!CI': 9}

In [80]:
len(list(prelim_dataset.class_indices))

10

In [59]:
labels_argmax

array([2, 0, 1, ..., 8, 0, 4], dtype=int64)

In [66]:
index = np.where(labels_argmax == 0)
imgs[index]

(980, 64, 64, 1)

In [85]:
list(prelim_dataset.class_indices.keys())[0]

'BetterSurf'

Create tf.data.Dataset

In [25]:
dataset = tf.data.Dataset.from_tensor_slices((imgs, labels))
dataset = dataset.shuffle(buffer_size=1024).batch(batch_size)

Calculate number of input channel for Gen and Disc

In [26]:
generator_in_channels = latent_dim + num_classes
discriminator_in_channels = chnum + num_classes
print(generator_in_channels, discriminator_in_channels)

74 11


# Creating discriminator and generator

In [27]:
from keras.initializers import RandomNormal
from keras.constraints import Constraint

# clip model weights to a given hypercube
class ClipConstraint(Constraint):
	# set clip value when initialized
	def __init__(self, clip_value):
		self.clip_value = clip_value
 
	# clip model weights to hypercube
	def __call__(self, weights):
		return backend.clip(weights, -self.clip_value, self.clip_value)
 
	# get the config
	def get_config(self):
		return {'clip_value': self.clip_value}

In [28]:
# Create the discriminator.
init = RandomNormal(stddev=0.02)
const = ClipConstraint(0.01)
discriminator = keras.Sequential(
    [
        keras.layers.InputLayer((iw, ih, discriminator_in_channels)),
        layers.Conv2D(64, (3, 3), strides=(2, 2), padding="same", kernel_initializer=init, kernel_constraint=const),
        layers.LeakyReLU(alpha=0.2),
        layers.Conv2D(128, (3, 3), strides=(2, 2), padding="same", kernel_initializer=init, kernel_constraint=const)),
        layers.LeakyReLU(alpha=0.2),
        layers.Conv2D(256, (3, 3), strides=(2, 2), padding="same", kernel_initializer=init, kernel_constraint=const)),
        layers.LeakyReLU(alpha=0.2),
        layers.GlobalMaxPooling2D(),
        layers.Dense(1),
    ],
    name="discriminator",
)


In [30]:
# Create the generator.
init = RandomNormal(stddev=0.02)
const = ClipConstraint(0.01)
generator = keras.Sequential(
    [
        keras.layers.InputLayer((generator_in_channels,)),
        # We want to generate 128 + num_classes coefficients to reshape into a
        # 7x7x(128 + num_classes) map.
        layers.Dense(8 * 8 * generator_in_channels),
        layers.LeakyReLU(alpha=0.2),
        layers.Reshape((8, 8, generator_in_channels)),
        layers.Conv2DTranspose(128, (4, 4), strides=(2, 2), padding="same", kernel_initializer=init, kernel_constraint=const),
        layers.LeakyReLU(alpha=0.2),
        layers.Conv2DTranspose(128, (4, 4), strides=(2, 2), padding="same", kernel_initializer=init, kernel_constraint=const),
        layers.LeakyReLU(alpha=0.2),
        layers.Conv2DTranspose(128, (4, 4), strides=(2, 2), padding="same", kernel_initializer=init, kernel_constraint=const),
        layers.LeakyReLU(alpha=0.2),
        layers.Conv2D(1, (7, 7), padding="same", activation="sigmoid"),
    ],
    name="generator",
)

In [31]:
discriminator.summary()

Model: "discriminator"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 32, 32, 64)        6400      
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 32, 32, 64)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 16, 16, 128)       73856     
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 16, 16, 128)       0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 8, 8, 256)         295168    
_________________________________________________________________
leaky_re_lu_2 (LeakyReLU)    (None, 8, 8, 256)         0         
_________________________________________________________________
global_max_pooling2d (Global (None, 256)             

In [32]:
generator.summary()

Model: "generator"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 4736)              355200    
_________________________________________________________________
leaky_re_lu_3 (LeakyReLU)    (None, 4736)              0         
_________________________________________________________________
reshape (Reshape)            (None, 8, 8, 74)          0         
_________________________________________________________________
conv2d_transpose (Conv2DTran (None, 16, 16, 128)       151680    
_________________________________________________________________
leaky_re_lu_4 (LeakyReLU)    (None, 16, 16, 128)       0         
_________________________________________________________________
conv2d_transpose_1 (Conv2DTr (None, 32, 32, 128)       262272    
_________________________________________________________________
leaky_re_lu_5 (LeakyReLU)    (None, 32, 32, 128)       0 

**Create Conditional GAN**

In [33]:
class ConditionalGAN(keras.Model):
    def __init__(self, discriminator, generator, latent_dim):
        super(ConditionalGAN, self).__init__()
        self.discriminator = discriminator
        self.generator = generator
        self.latent_dim = latent_dim
        self.gen_loss_tracker = keras.metrics.Mean(name="generator_loss")
        self.disc_loss_tracker = keras.metrics.Mean(name="discriminator_loss")

    @property
    def metrics(self):
        return [self.gen_loss_tracker, self.disc_loss_tracker]

    def compile(self, d_optimizer, g_optimizer, loss_fn):
        super(ConditionalGAN, self).compile()
        self.d_optimizer = d_optimizer
        self.g_optimizer = g_optimizer
        self.loss_fn = loss_fn

    def train_step(self, data):
        # Unpack the data.
        real_images, one_hot_labels = data

        # Add dummy dimensions to the labels so that they can be concatenated with
        # the images. This is for the discriminator.
        image_one_hot_labels = one_hot_labels[:, :, None, None]
        image_one_hot_labels = tf.repeat(
            image_one_hot_labels, repeats=[ih * iw]
        )
        image_one_hot_labels = tf.reshape(
            image_one_hot_labels, (-1, iw, ih, num_classes)
        )

        # Sample random points in the latent space and concatenate the labels.
        # This is for the generator.
        batch_size = tf.shape(real_images)[0]
        random_latent_vectors = tf.random.normal(shape=(batch_size, self.latent_dim))
        random_vector_labels = tf.concat(
            [random_latent_vectors, one_hot_labels], axis=1
        )

        # Decode the noise (guided by labels) to fake images.
        generated_images = self.generator(random_vector_labels)

        # Combine them with real images. Note that we are concatenating the labels
        # with these images here.
        fake_image_and_labels = tf.concat([generated_images, image_one_hot_labels], -1)
        real_image_and_labels = tf.concat([real_images, image_one_hot_labels], -1)
        combined_images = tf.concat(
            [fake_image_and_labels, real_image_and_labels], axis=0
        )

        # Assemble labels discriminating real from fake images.
        labels = tf.concat(
            [tf.ones((batch_size, 1)), tf.zeros((batch_size, 1))], axis=0
        )

        # Train the discriminator.
        with tf.GradientTape() as tape:
            predictions = self.discriminator(combined_images)
            d_loss = self.loss_fn(labels, predictions)
        grads = tape.gradient(d_loss, self.discriminator.trainable_weights)
        self.d_optimizer.apply_gradients(
            zip(grads, self.discriminator.trainable_weights)
        )

        # Sample random points in the latent space.
        random_latent_vectors = tf.random.normal(shape=(batch_size, self.latent_dim))
        random_vector_labels = tf.concat(
            [random_latent_vectors, one_hot_labels], axis=1
        )

        # Assemble labels that say "all real images".
        misleading_labels = tf.zeros((batch_size, 1))

        # Train the generator (note that we should *not* update the weights
        # of the discriminator)!
        with tf.GradientTape() as tape:
            fake_images = self.generator(random_vector_labels)
            fake_image_and_labels = tf.concat([fake_images, image_one_hot_labels], -1)
            predictions = self.discriminator(fake_image_and_labels)
            g_loss = self.loss_fn(misleading_labels, predictions)
        grads = tape.gradient(g_loss, self.generator.trainable_weights)
        self.g_optimizer.apply_gradients(zip(grads, self.generator.trainable_weights))

        # Monitor loss.
        self.gen_loss_tracker.update_state(g_loss)
        self.disc_loss_tracker.update_state(d_loss)
        return {
            "g_loss": self.gen_loss_tracker.result(),
            "d_loss": self.disc_loss_tracker.result(),
        }

**Optimizers**

In [39]:
# Define optimizers
opt = RMSprop(learning_rate=0.00005)
d_optimizer=opt
g_optimizer=opt

**Checkpoints**

In [40]:
class GANMonitor(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        
        # Save the model every 5 epochs 
        if (epoch + 1) % 10 == 0:
          checkpoint.save(file_prefix = checkpoint_prefix)

In [41]:
if cenv == 0:
    checkpoint_dir = '/kaggle/working/checkpoints'
if cenv == 1:
    checkpoint_dir = f'{new_dir}'
    
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(generator_optimizer=g_optimizer,
                                 discriminator_optimizer=d_optimizer,
                                 generator=generator,
                                 discriminator=discriminator)

# Training C-GAN

In [42]:
# calculate wasserstein loss
def wasserstein_loss(y_true, y_pred):
    return backend.mean(y_true * y_pred)

In [45]:
cond_gan = ConditionalGAN(
    discriminator=discriminator, generator=generator, latent_dim=latent_dim
)


cond_gan.compile(
    d_optimizer=opt,
    g_optimizer=opt,
    loss_fn=wasserstein_loss,
)

cond_gan.fit(dataset, epochs=epoch_t, 
        callbacks=GANMonitor()
)


Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
 19/196 [=>............................] - ETA: 19s - g_loss: 0.0000e+00 - d_loss: -519.6472

KeyboardInterrupt: 

# Create new training images using the Conditional GAN

In [21]:
# We first extract the trained generator from our Conditiona GAN.
trained_gen = cond_gan.generator

# Number of images that are generated per class
num_interpolation = 50  # @param {type:"integer"}

# Sample noise for the interpolation.
interpolation_noise = tf.random.normal(shape=(1, latent_dim))
interpolation_noise = tf.repeat(interpolation_noise, repeats=num_interpolation)
interpolation_noise = tf.reshape(interpolation_noise, (num_interpolation, latent_dim))


def interpolate_class(first_number, second_number):
    # Convert the start and end labels to one-hot encoded vectors.
    first_label = keras.utils.to_categorical([first_number], num_classes)
    second_label = keras.utils.to_categorical([second_number], num_classes)
    first_label = tf.cast(first_label, tf.float32)
    second_label = tf.cast(second_label, tf.float32)

    # Calculate the interpolation vector between the two labels.
    percent_second_label = tf.linspace(0, 1, num_interpolation)[:, None]
    percent_second_label = tf.cast(percent_second_label, tf.float32)
    interpolation_labels = (
        first_label * (1 - percent_second_label) + second_label * percent_second_label
    )

    # Combine the noise and the labels and run inference with the generator.
    noise_and_labels = tf.concat([interpolation_noise, interpolation_labels], 1)
    fake = trained_gen.predict(noise_and_labels)
    return fake



In [22]:
# Create new directory for saving folder
os.makedirs(path_save_imgs)

In [23]:
# Retrieve class name based on number
classes_list = list(prelim_dataset.class_indices)

In [24]:
# Create images for every class and store in seperate folder
for i in range(num_classes):
    class_name = classes_list[i]
    class_dir = f"{path_save_imgs}/{class_name}"
    os.makedirs(class_dir)
    start_class = i
    end_class = i
    fake_images = interpolate_class(start_class, end_class)
    fake_images *= 255
    converted_images = fake_images.astype(np.uint8)
    converted_images = tf.image.resize(converted_images, (64, 64)).numpy().astype(np.uint8)
    for j in range(num_interpolation):
        np_array = np.squeeze(converted_images[j], axis=2)
        im = Image.fromarray((np_array))
        im.save(f"{class_dir}/gen_imgs_{class_name}_{j}.png")    