# DenseNet169 using Conditional GAN generated images

Import libraries

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras import datasets, layers, models, losses, Model
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
import numpy as np
import os
import math

# Changeable parameters 

---------

In [None]:
# Image size (height x width) in pixels; images are resized to this when loaded
ih = 64
iw = 64

# Colour mode handed to the image loader: 'rgb' or 'grayscale'
ch = 'rgb'

# Batch size 
batch_size = 64

# Layer adapt
# NOTE(review): ksize and ssize are not referenced anywhere else in the visible notebook
ksize = 4 # Kernel size : was '4' for 64x64 image
ssize = 2 # Stride size : was '2' for 64x64 image

# Size of the held-out (test + validation) part, as a fraction of the real data (0.3 = 30 %)
testsize = 0.3

# Number of epochs in model
epoch_t = 40

# Where computation is performed: Kaggle (0) or Local (1)
cenv = 0

--------

In [None]:
# Announce the configured environment; any value other than 0 or 1 prints nothing
for env_code, env_message in (
    (0, "Computation environment: Kaggle"),
    (1, "Computation environment: Local"),
):
    if cenv == env_code:
        print(env_message)

**Create new directory for version**

In [None]:
if cenv == 1:
    # Folder that holds one sub-folder per local run ("DenseNet-local-vNNN")
    base_dir = "C:/Users/Max/Documents/GitHub/DenseNet"

    # Collect the numeric suffix (last three characters) of every entry whose
    # name contains "DenseNet"; entries without a numeric suffix are ignored.
    found_versions = []
    for entry in os.listdir(base_dir):
        if "DenseNet" not in entry:
            continue
        try:
            found_versions.append(int(entry[-3:]))
        except ValueError:
            continue

    # If this is the first notebook you want to save, a new folder will be created with version #001
    if not found_versions:
        new_vnum = 1
        print("No matches found")

    else:
        # Versions start counting at 1, so the successor is never lower than 2
        new_vnum = max(1, max(found_versions)) + 1
        print(f"{len(found_versions)} matches(es) found")
        print("--------------")

    # Print new folder name
    print(f"New folder name: DenseNet-local-v{new_vnum:03}")
    print("--------------")

    # Create the new folder inside the same directory that was scanned above,
    # so the version lookup and the created folder can never point at different drives
    new_dir = f"{base_dir}/DenseNet-local-v{new_vnum:03}"
    os.makedirs(new_dir)

# Data

In [None]:
# Input locations for the Kaggle environment
if cenv == 0:
    # Root folder of the real image data (read with one sub-folder per class)
    path_root = "/kaggle/input/thesis-data"
    # Folder with the CGAN-generated images
    # NOTE(review): the double slash after "input" looks like a typo — confirm
    path_gen_images = "/kaggle/input//cganlocalv007/cgan-local-v007"
    
    # Directory where checkpoints of DCGAN are stored
    # NOTE(review): checkpoint_dir is not referenced elsewhere in the visible notebook
    checkpoint_dir = "/kaggle/input/checkpoints" 

# Input locations for the local environment
if cenv == 1:
    path_root = "C:/Users/Max/Documents/thesis_data"
    path_gen_images = "C:/Users/Max/Documents/image_data/cgan-local-v007"
    
    # Directory where checkpoints of DCGAN are stored
    checkpoint_dir = 'C:/Users/Max/Documents/GitHub/dcgan_kaggle_output/dcgan-kaggle-v002/checkpoints'
    

In [None]:
# Image size
# Image size as a (height, width) tuple
im_si = (ih, iw)

# Convert the colour mode to the corresponding number of channels
if ch == 'rgb':
    chnum = 3
elif ch == 'grayscale':
    chnum = 1
else:
    # Fail here with a clear message instead of leaving `chnum` undefined
    raise ValueError(f"Unsupported colour mode {ch!r}; expected 'rgb' or 'grayscale'")

Load the data. No data augmentation takes place

In [None]:
# Read the real images from `path_root`, resized to (ih, iw), with integer ("sparse") class labels.
# No augmentation arguments are passed to ImageDataGenerator.
# NOTE(review): batch_size=40000 is presumably chosen to exceed the dataset size so a single
# next() returns every image; a larger dataset would be truncated to one batch — confirm.
batches = ImageDataGenerator().flow_from_directory(
    directory  = path_root, 
    color_mode = ch, 
    target_size= (ih,iw), 
    interpolation="bicubic",
    class_mode = 'sparse',
    batch_size=40000
)
# Pull one batch: the image array and the matching label array
imgs, labels = next(batches)

Load the generated images. No data augmentation takes place

In [None]:
# Read the generated images from `path_gen_images` with the same settings as the real images.
# NOTE(review): labels are assigned from the folder layout of this directory; they only line up
# with the real data if both directories use the same class folder names — confirm.
batches_gen = ImageDataGenerator().flow_from_directory(
    directory  = path_gen_images, 
    color_mode = ch, 
    target_size= (ih,iw), 
    interpolation="bicubic",
    class_mode = 'sparse',
    batch_size=40000
)
# Pull one batch: the generated image array and the matching label array
imgs_gen, labels_gen = next(batches_gen)

Preprocess the images using Keras built-in DenseNet preprocessing method

In [None]:
# Apply the DenseNet-specific input preprocessing that ships with Keras to the real images
imgs = tf.keras.applications.densenet.preprocess_input(imgs)
# Make sure the labels are a NumPy array
labels = np.array(labels)

In [None]:
# Apply the same DenseNet preprocessing to the generated images
imgs_gen = tf.keras.applications.densenet.preprocess_input(imgs_gen)
# Make sure the generated labels are a NumPy array
labels_gen = np.array(labels_gen)

Split the data into train and test/val set using chosen split (70-30 in this case). Then separate the test/val split into validation and test set using 0.5 split. This leads to 70-15-15 split

In [None]:
# First split: hold out a `testsize` fraction of the real images; the rest is the training set.
# NOTE(review): no random_state is passed, so the split differs between runs, and no `stratify`
# is passed, so the class proportions of the subsets are not guaranteed to match.
X_train, X_test, y_train, y_test = train_test_split(imgs, labels, test_size=testsize)

# Second split: divide the held-out part in half into a test set and a validation set
X_test, X_val, y_test, y_val =  train_test_split(X_test, y_test, test_size=0.5)

Add the generated images and corresponding labels to the training set

In [None]:
# Append the generated images (and their labels) after the real training images.
# The order matters: real samples first, generated samples last.
X_train = np.concatenate((X_train, imgs_gen), axis = 0)
y_train = np.concatenate((y_train, labels_gen), axis = 0)

In [None]:
# Number of samples per subset (the training set already includes the generated images)
X_train_size = len(X_train)
X_test_size = len(X_test)
X_val_size = len(X_val)

# Report sizes and shapes; each line names the subset it actually describes
print(f"Size of training data: {X_train_size} | Shape of training data {X_train.shape}")
print(f"Size of test data: {X_test_size}  | Shape of test data {X_test.shape}")
print(f"Size of validation data: {X_val_size}  | Shape of validation data {X_val.shape}")
print(f"Shape of training labels {y_train.shape}")
print(f"Shape of test labels {y_test.shape}")

Every generated image gets a sample weight of 0.4, whereas the original (real) images get a sample weight of 1

In [None]:
# X_train holds the real training images first, followed by the appended generated images,
# so the weight vector is built in that same order.
# The number of real images is taken from X_train itself rather than re-derived from
# `testsize`, so the vector length always matches what train_test_split actually produced.
sample_weights1 = np.full(len(X_train) - len(labels_gen), 1)   # real images: weight 1
sample_weights2 = np.full(len(labels_gen), 0.4)                # generated images: weight 0.4
sample_weights = np.concatenate((sample_weights1, sample_weights2))

# DenseNet

Load the base model from the TensorFlow Keras library with the weights from ImageNet. These weights led to the highest performance

In [None]:
# ImageNet-pretrained DenseNet169 without its classification head.
# The input shape follows the configured image size and colour mode instead of a fixed
# 64x64x3, so changing `ih`/`iw` in the parameter cell keeps model and data in sync.
# (The ImageNet weights require three channels, i.e. ch == 'rgb'.)
base_model = tf.keras.applications.DenseNet169(weights = 'imagenet', include_top = False, input_shape = (ih, iw, chnum))

Add three layers at the output side to enable classification of the 11 classes

In [None]:
# Flatten the convolutional feature maps of the base model into one vector per image
x = layers.Flatten()(base_model.output)
# Fully connected hidden layer with 1000 units
x = layers.Dense(1000, activation='relu')(x)
# Output layer: one softmax probability for each of the 11 classes
predictions = layers.Dense(11, activation = 'softmax')(x)

In [None]:
# Full network: DenseNet169 input through to the new softmax output
head_model = Model(inputs = base_model.input, outputs = predictions)
# Sparse categorical cross-entropy matches the integer ("sparse") class labels;
# accuracy is tracked as the only metric
head_model.compile(optimizer='adam', loss=losses.sparse_categorical_crossentropy, metrics=['accuracy'])

Freeze all but the last eight layers for training. This prevents overfitting

In [None]:
# Freeze every layer except the last eight
for layer in head_model.layers[:-8]:
    layer.trainable = False

# Make sure the last eight layers are trainable
for layer in head_model.layers[-8:]:
    layer.trainable = True

# Re-compile with the same settings: Keras documents that changes to `trainable`
# are only guaranteed to be taken into account once compile() is called again.
head_model.compile(optimizer='adam', loss=losses.sparse_categorical_crossentropy, metrics=['accuracy'])

Callbacks: Reduce the learning rate after 5 epochs without improvement in validation accuracy. Also save a checkpoint of the best-performing model based on validation loss

In [None]:
# Halve the learning rate when validation accuracy has not improved for 5 epochs,
# never going below 1e-4
anne = ReduceLROnPlateau(
    monitor='val_accuracy',
    factor=0.5,
    patience=5,
    verbose=1,
    min_lr=1e-4,
)

# Keep only the best model on disk; the target file depends on the environment
if cenv == 0:
    checkpoint = ModelCheckpoint('model.h5', save_best_only=True, verbose=1)
elif cenv == 1:
    checkpoint = ModelCheckpoint(f'{new_dir}/model.h5', save_best_only=True, verbose=1)


Run the model

In [None]:
# Train on real + generated images, validating on the real validation set.
# The batch size and epoch count come from the parameter cell, so changing them
# there actually takes effect; generated samples are down-weighted via sample_weights.
history = head_model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epoch_t,
    validation_data=(X_val, y_val),
    sample_weight=sample_weights,
    callbacks=[anne, checkpoint],
)

Load the best performing model. N.B. the best performing model did so on the validation set. The model has never seen the test set at this point

In [None]:
# Reload the checkpoint written by the ModelCheckpoint callback for this environment
if cenv == 1:
    best_model = load_model(f"{new_dir}/model.h5")
elif cenv == 0:
    best_model = load_model("/kaggle/working/model.h5")

Evaluate the model based on the test set

In [None]:
# Evaluate the reloaded checkpoint on the held-out test set.
# scores[0] is the loss and scores[1] the accuracy (the only metric the model was compiled with).
scores = best_model.evaluate(X_test, y_test)
print(f"Overall CNN Accuracy: {scores[1]}\n(The number of correct predictions divided by the number of total predictions)")

# Plots

Compute the distribution of training data across the 11 different classes

In [None]:
# Tuple of (sorted unique class labels, number of training samples per label);
# y_train includes the generated images at this point
multi_distribution_train = np.unique(y_train, return_counts=True)
# Share of each class in the training labels, in percent
perc_train = (multi_distribution_train[1]/y_train.shape[0])*100

Compute the distribution of test/validation data across the 11 different classes (Have the same distribution)

In [None]:
# Tuple of (sorted unique class labels, number of test samples per label)
multi_distribution_test = np.unique(y_test, return_counts=True)
# Share of each class in the test labels, in percent
perc_test = (multi_distribution_test[1]/y_test.shape[0])*100

Create plot with the distribution of training and test/val data across the 11 classes

In [None]:
# Class names: the keys of the loader's name -> index mapping for the real data
classes = batches.class_indices.keys()

In [None]:
# One x position per class for the grouped bar chart
X_axis_mc = np.arange(len(list(classes)))

In [None]:
# Grouped bars: training share to the left and test share to the right of each class position.
# NOTE(review): this assumes every class occurs in both y_train and y_test, otherwise the
# percentage arrays are shorter than X_axis_mc — confirm.
plt.bar(X_axis_mc - 0.2, perc_train, 0.4, label = 'Training')
plt.bar(X_axis_mc + 0.2, perc_test, 0.4,label = 'Validation/Test')

plt.xticks(rotation='vertical')

plt.legend()
plt.title('Distribution of the training and validation/test data')
# Put the class names on the x axis
plt.xticks(X_axis_mc, list(classes))

plt.ylabel('Dataset distribution in percentage (%)')
# Save the figure to the working directory (Kaggle) or to the version folder (local)
if cenv == 0:
    plt.savefig("multi_data_dist.png", bbox_inches = 'tight', dpi = 150)
if cenv == 1:
    plt.savefig(f"{new_dir}/multi_data_dist.png", bbox_inches = 'tight', dpi = 150)

Create pie chart displaying the distribution of training, val, and test data with split in original and generated images

In [None]:
def my_fmt(x):
    """Format a pie-wedge percentage as 'P.P%' plus, on a second line, the absolute count.

    `x` is the wedge share in percent; the count is derived from the
    module-level `total` and rounded to a whole number.
    """
    absolute = total * x / 100
    return f'{x:.1f}%\n({absolute:.0f})'
# Total number of samples over all subsets; read by my_fmt to turn percentages into counts
total = X_train_size + X_test_size + X_val_size

In [None]:
# Wedges, clockwise from the top: real training images, generated training images,
# validation set, test set.
# The real-training wedge uses the actual count (training size minus generated images)
# instead of a hard-coded 70 % estimate, so the counts printed by my_fmt add up to `total`.
patches, texts, autotexts = plt.pie(
    [X_train_size - labels_gen.shape[0], labels_gen.shape[0], X_val_size, X_test_size],
    labels = ["", "Training", "Validation", "Test"],
    startangle=90,
    counterclock=False,
    autopct=my_fmt,
    # Blue = original data, orange = generated data (one colour per wedge)
    colors = ['cornflowerblue', 'darkorange', 'cornflowerblue', 'cornflowerblue'],
    radius=1.2,
    explode = (0,0,0.2,0.2)
)
# Move the "Training" label so it sits between the two training wedges
plt.setp(texts[1], position = (1.8,-0.3))
# Smaller percentage text for the narrow validation and test wedges
plt.setp(autotexts[2], size = 'x-small')
plt.setp(autotexts[3], size = 'x-small')

# The legend explains the two colours (taken from the first two wedges)
types = ['Original dataset', 'Generated dataset']
plt.legend(labels = types, loc = 4, bbox_to_anchor=(1.5,0))


plt.title("Training, validation, and test data distribution", y = 1.15)

# Save the figure to the working directory (Kaggle) or to the version folder (local)
if cenv == 0:
    plt.savefig("train_test_dist.png", bbox_inches = 'tight', dpi = 150)
if cenv == 1:
    plt.savefig(f"{new_dir}/train_test_dist.png", bbox_inches = 'tight', dpi = 150)

**Evaluating overfitting and other model performance measures**

In [None]:
# Two stacked panels: loss on top, accuracy below, each with a training and a validation curve
fig, axs = plt.subplots(2, 1, figsize=(15,15))

# (history key, y-axis label, panel title) for the top and the bottom panel
panel_specs = (
    ('loss', 'Loss', 'Training Loss vs Validation Loss'),
    ('accuracy', 'Accuracy', 'Training Accuracy vs Validation Accuracy'),
)
for panel, (hist_key, y_label, panel_title) in zip(axs, panel_specs):
    panel.plot(history.history[hist_key])
    panel.plot(history.history[f'val_{hist_key}'])
    panel.title.set_text(panel_title)
    panel.set_xlabel('Epochs')
    panel.set_ylabel(y_label)
    panel.legend(['Train', 'Val'])

# Save the figure to the working directory (Kaggle) or to the version folder (local)
if cenv == 0:
    plt.savefig("performance_figure.png", bbox_inches = 'tight')
elif cenv == 1:
    plt.savefig(f"{new_dir}/performance_figure.png", bbox_inches = 'tight')

(Y-axis is limited in the following plot)

In [None]:
# Same two panels as the previous figure, but with a fixed y range per panel
fig, axs = plt.subplots(2, 1, figsize=(15,15))

# (history key, y-axis label, panel title, y limits) for the top and the bottom panel
panel_specs = (
    ('loss', 'Loss', 'Training Loss vs Validation Loss', (0,0.3)),
    ('accuracy', 'Accuracy', 'Training Accuracy vs Validation Accuracy', (0.8,1)),
)
for panel, (hist_key, y_label, panel_title, y_limits) in zip(axs, panel_specs):
    panel.plot(history.history[hist_key])
    panel.plot(history.history[f'val_{hist_key}'])
    panel.title.set_text(panel_title)
    panel.set_xlabel('Epochs')
    panel.set_ylabel(y_label)
    panel.legend(['Train', 'Val'])
    panel.set_ylim(y_limits)

# Save the figure to the working directory (Kaggle) or to the version folder (local)
if cenv == 0:
    plt.savefig("performance_figure_ylim.png", bbox_inches = 'tight')
elif cenv == 1:
    plt.savefig(f"{new_dir}/performance_figure_ylim.png", bbox_inches = 'tight')

# Analyse performance

**Multiclass classification**

In [None]:
from sklearn import metrics
import pandas as pd
import seaborn as sns
from sklearn.metrics import f1_score, matthews_corrcoef, accuracy_score
from prettytable import PrettyTable, MSWORD_FRIENDLY

Multiclass performance table: Save a table with the performance on the test set

In [None]:
# `scores` was computed with best_model.evaluate(X_test, y_test), so the rows
# are labelled as test-set results.
t = PrettyTable(['Metric', 'Performance'])
t.add_row(['Test accuracy', round(scores[1],4)])
t.add_row(['Test loss', round(scores[0],4)])
t.header = True
t.align = "l"
t.title = "Perf. of multi-class classification CGAN - DenseNet169"
print(t)

In [None]:
# Saving PrettyTable
table = t.get_string()

if cenv == 0:
    with open('multi_performance_table.txt', 'w') as f:
        f.write(table)
if cenv == 1:
    with open(f'{new_dir}/multi_performance_table.txt', 'w') as f:
        f.write(table)

1D-array of the predicted class per image in the test set

In [None]:
# Predicted class per test image: index of the highest softmax probability.
# The reloaded best checkpoint is used, the same model that produced `scores`,
# so the confusion matrices and the reported accuracy describe one model.
y_pred = np.argmax(best_model.predict(X_test), axis=-1)

1D-array of the true class per image in the test set

In [None]:
# Alias for the true test labels (same array object, not a copy)
y_test2 = y_test

Create confusion matrix for the multi-class classification

In [None]:
# Multi-class confusion matrix: rows are true classes, columns are predicted classes.
# NOTE(review): only labels that occur in y_test2 or y_pred get a row/column — confirm
# all 11 classes are present before pairing the matrix with the full class-name list.
c_matrix = metrics.confusion_matrix(y_test2, y_pred)

In [None]:
def confusion_matrix(confusion_matrix, class_names, figsize = (10,7), fontsize=14):
    """Draw a confusion matrix as an annotated heatmap and save it as a PNG.

    Parameters:
        confusion_matrix: square matrix of integer counts.
        class_names: labels used for both the rows and the columns.
        figsize: size of the new figure.
        fontsize: font size of the tick labels.

    Raises:
        ValueError: if seaborn cannot render the values with integer formatting.

    The image is written as "multi_class_cmatrix.png" to the working directory
    (cenv == 0) or to `new_dir` (cenv == 1).
    """
    frame = pd.DataFrame(confusion_matrix, index=class_names, columns=class_names)
    plt.figure(figsize=figsize)
    try:
        heat_ax = sns.heatmap(frame, annot=True, fmt="d")
    except ValueError:
        raise ValueError("Confusion matrix values must be integers.")

    # Horizontal labels on the y axis, tilted labels on the x axis
    for axis, angle in ((heat_ax.yaxis, 0), (heat_ax.xaxis, 45)):
        axis.set_ticklabels(axis.get_ticklabels(), rotation=angle, ha='right', fontsize=fontsize)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

    if cenv == 0:
        plt.savefig("multi_class_cmatrix.png")
    elif cenv == 1:
        plt.savefig(f"{new_dir}/multi_class_cmatrix.png")

In [None]:
# Class names from the real-data loader label both axes of the heatmap
class_names= batches.class_indices.keys()
# Draw and save the multi-class confusion matrix in a wide figure
confusion_matrix(c_matrix, class_names, figsize = (20,7), fontsize=14)

# Binary classification
y_predbin and y_truebin are the binary classification arrays. 0 = ransomware and 1 = benign

In [None]:
# Collapse the 11 classes into two: class indices 0 to 9 are the ransomware
# families (-> 0), every other index is benign (-> 1).
y_truebin = [0 if true_class in range(10) else 1 for true_class in y_test2]
y_predbin = [0 if y_pred[position] in range(10) else 1 for position in range(len(y_test2))]

# Sanity check: report the size only when both lists line up
if len(y_truebin) == len(y_predbin):
    print(f"Length of the observations in test set: {len(y_truebin)}")

Plot the distribution of dataset as ransomware - benign

In [None]:
# multi_distribution_train is (class labels, counts). The counts are selected by class
# label rather than by position, so a class that is absent from the training labels
# cannot shift a count into the wrong group.
train_class_ids, train_class_counts = multi_distribution_train
train_is_ransomware = np.isin(train_class_ids, np.arange(10))  # labels 0 to 9 = ransomware families
rw_count_train = train_class_counts[train_is_ransomware].sum()
bn_count_train = train_class_counts[~train_is_ransomware].sum()
print(f"Ransomware Occurences: {rw_count_train}, Benign Occurences: {bn_count_train}")

In [None]:
# multi_distribution_test is (class labels, counts). The counts are selected by class
# label rather than by position, so a class that is absent from the test labels
# cannot shift a count into the wrong group.
test_class_ids, test_class_counts = multi_distribution_test
test_is_ransomware = np.isin(test_class_ids, np.arange(10))  # labels 0 to 9 = ransomware families
rw_count_test = test_class_counts[test_is_ransomware].sum()
bn_count_test = test_class_counts[~test_is_ransomware].sum()
print(f"Ransomware Occurences: {rw_count_test}, Benign Occurences: {bn_count_test}")

In [None]:
# Ransomware / benign share of the training labels, in percent
rw_perc_train = rw_count_train / (rw_count_train + bn_count_train) * 100
bn_perc_train = bn_count_train / (rw_count_train + bn_count_train) * 100

# Ransomware / benign share of the test labels, in percent
rw_perc_test = rw_count_test / (rw_count_test + bn_count_test) * 100
bn_perc_test = bn_count_test / (rw_count_test + bn_count_test) * 100

In [None]:
# Two x positions: ransomware and benign
X_axis_bc = np.arange(2)

# Grouped bars: training share to the left and test share to the right of each position
plt.bar(X_axis_bc - 0.2, [rw_perc_train, bn_perc_train], 0.4, label = 'Training')
plt.bar(X_axis_bc + 0.2, [rw_perc_test, bn_perc_test], 0.4, label = 'Validation/Test')

plt.xticks(rotation='horizontal')

plt.legend()
plt.title('Distribution of the training and validation/test data')
plt.xticks(X_axis_bc, ['Ransomware', 'Benign'])

plt.ylabel('Dataset distribution in percentage (%)')

# Both environments use the same "bin_" file name, matching the other binary outputs
# (bin_class_cmatrix.png, bin_performance_table.txt)
if cenv == 0:
    plt.savefig("bin_data_dist.png", bbox_inches = 'tight', dpi = 150)
if cenv == 1:
    plt.savefig(f"{new_dir}/bin_data_dist.png", bbox_inches = 'tight', dpi = 150)

Create confusion matrix for the binary classification

In [None]:
# Binary confusion matrix: rows are true labels, columns are predicted labels.
# Passing labels=[0, 1] pins the layout to 2x2 (index 0 = ransomware, 1 = benign)
# even if one of the two classes never occurs in the test data.
c_matrix_bin = metrics.confusion_matrix(y_truebin, y_predbin, labels=[0, 1])

In [None]:
def confusion_matrix_bin(confusion_matrix, class_names_bin, figsize = (5,2), fontsize=7):
    """Draw the binary confusion matrix as an annotated heatmap and save it as a PNG.

    Parameters:
        confusion_matrix: square matrix of integer counts.
        class_names_bin: labels used for both the rows and the columns.
        figsize: size of the new figure.
        fontsize: font size of the tick labels.

    Raises:
        ValueError: if seaborn cannot render the values with integer formatting.

    The image is written as "bin_class_cmatrix.png" to the working directory
    (cenv == 0) or to `new_dir` (cenv == 1).
    """
    df_cm = pd.DataFrame(
        confusion_matrix, index=class_names_bin, columns=class_names_bin,
    )
    plt.figure(figsize=figsize)
    try:
        heatmap = sns.heatmap(df_cm, annot=True, fmt="d")
    except ValueError:
        raise ValueError("Confusion matrix values must be integers.")
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # The classifier in this notebook is DenseNet169, so the title names that model
    plt.title("CGAN - DenseNet169")
    if cenv == 0:
        plt.savefig("bin_class_cmatrix.png", bbox_inches = 'tight', dpi = 150)
    if cenv == 1:
        plt.savefig(f"{new_dir}/bin_class_cmatrix.png", bbox_inches = 'tight', dpi = 150)

In [None]:
# Axis labels in label order: 0 = ransomware, 1 = benign
class_names_bin= ("ransomware", "benign")
# Draw and save the binary confusion matrix
confusion_matrix_bin(c_matrix_bin, class_names_bin, figsize = (5,2), fontsize=10)

**Compute performance measures**

True Positive Rate

In [None]:
# Ransomware (label 0) is the positive class: row 0 holds the true ransomware samples,
# [0,0] those predicted as ransomware and [0,1] those predicted as benign
TPR = c_matrix_bin[0,0]/(c_matrix_bin[0,0] + c_matrix_bin[0,1]) #True Positive Rate

Accuracy

In [None]:
# Fraction of test samples whose binary prediction equals the binary true label
ACC = accuracy_score(y_truebin, y_predbin) # Accuracy

F1 Score

In [None]:
# F1 for the ransomware class. Ransomware is encoded as 0, so it has to be selected with
# pos_label=0; `labels` expects an array-like and does not choose the positive class,
# and the default pos_label=1 would score the benign class instead.
F1 = f1_score(y_truebin, y_predbin, pos_label=0) # F1 Score

Matthews Correlation Coefficient

In [None]:
# Correlation between the binary true labels and the binary predictions
MCC = matthews_corrcoef(y_truebin, y_predbin) # Matthews Correlation Coefficient

**Show Performance of CNN**

And save it

In [None]:
# Summary table of the binary (ransomware vs. benign) metrics, rounded to four decimals
t = PrettyTable(['Metric', 'Performance'])
for metric_label, metric_value in (
    ('True Positive Rate', TPR),
    ('Accuracy', ACC),
    ('F1 Score', F1),
    ('Matthews Correlation Coefficient', MCC),
):
    t.add_row([metric_label, round(metric_value,4)])
t.header = True
t.align = "l"
t.title = "Performance of CGAN - DenseNet169"
print(t)

In [None]:
# Render the table as plain text and write it next to the other outputs
table = t.get_string()

if cenv == 0:
    with open('bin_performance_table.txt', 'w') as out_file:
        out_file.write(table)
elif cenv == 1:
    with open(f'{new_dir}/bin_performance_table.txt', 'w') as out_file:
        out_file.write(table)