<a href="https://colab.research.google.com/github/Hameon4/Kaggle-2022-GAN/blob/main/Group_3_mini_project_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **PART 1: BUILD A BASELINE MODEL ON THE TRAINING DATASET AND EVALUATE IT ON THE VALIDATION AND SOLUTION DATASETS**

In [10]:
from tensorflow.keras.layers import Input, Dense, LeakyReLU, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD, Adam
from sklearn.model_selection import train_test_split

import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import sys, os
import numpy as np

In [11]:
# Read the training table from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer='Train.csv')

In [12]:
# Separate the target vector from the feature matrix (every column except Label)
y = df['Label'].values
X = df.drop('Label', axis=1).copy().values

In [13]:
# Build the baseline model
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the baseline RMSE reproducible, so the delta
# computed at the end of the notebook reflects the GAN augmentation rather
# than run-to-run forest variance. n_jobs=-1 trains the 500 trees in parallel
# without changing the fitted result.
model = RandomForestClassifier(
    n_estimators=500,
    criterion='entropy',
    random_state=42,
    n_jobs=-1,
)
model.fit(X, y)

RandomForestClassifier(criterion='entropy', n_estimators=500)

In [14]:
# Read the held-out feature table and its ground-truth table
data_valid, data_solution = (
    pd.read_csv(csv_name) for csv_name in ('Validation.csv', 'Solution.csv')
)

# The ID column is only a row identifier, so strip it from both tables
X_valid = data_valid.drop('ID', axis=1).copy()
y_valid = data_solution.drop('ID', axis=1).copy()

In [None]:
# Acquire the baseline RMSE via validation set
from sklearn.metrics import mean_squared_error

p_test = model.predict(X_valid)
# `squared=False` was deprecated in scikit-learn 1.4 and later removed, so take
# the square root of the MSE explicitly; this works on every version.
rmse = float(np.sqrt(mean_squared_error(y_valid, p_test)))
print(f'RMSE: {round(rmse * 100, 7)}%')


# **PART 2: UTILIZE GAN TO GENERATE SYNTHETIC DATA**

In [16]:
# Read the training table again; the GAN stage works on its own copy
import pandas as pd

train_csv_path = 'Train.csv'
data = pd.read_csv(train_csv_path)

In [17]:
# OOP class for our GAN
from tqdm.auto import trange, tqdm

class SPARGAN():
    """Small fully-connected GAN for tabular rows.

    The generator maps `latent_dim`-dimensional Gaussian noise to rows of
    width `d_in`; the discriminator maps rows of width `d_in` to a single
    sigmoid "real" probability. The last column of a row is treated as the
    label by `synthesize`.

    Parameters
    ----------
    latent_dim : int
        Size of the noise vector fed to the generator.
    hidden_layers : sequence of int
        Hidden layer widths for the generator; the discriminator uses the
        same widths in reverse order.
    d_in : int
        Width of a data row (feature columns plus the label column).
    """

    def __init__(self, latent_dim=100, hidden_layers=(512, 256), d_in=129):
        # Copy into a fresh list so instances never share (or mutate) the default.
        self.hidden_layers = list(hidden_layers)
        self.latent_dim = latent_dim
        self.d_in = d_in
        # Populated by fit() via build_generator() / build_discriminator().
        self.generator = None
        self.discriminator = None

    @staticmethod
    def layer(x, layer_size):
        """Append a Dense(LeakyReLU) -> BatchNormalization -> Dropout stack to `x`."""
        x = Dense(layer_size, activation=LeakyReLU(alpha=0.2))(x)
        x = BatchNormalization(momentum=0.8)(x)
        x = Dropout(0.75)(x)
        return x

    # Generator Model
    def build_generator(self):
        """Build the noise -> row network and store it on `self.generator`."""
        input_layer = Input(shape=(self.latent_dim,))
        x = input_layer

        for layer_size in self.hidden_layers:
            x = SPARGAN.layer(x, layer_size)

        # ReLU output: every generated value (including the label column) is >= 0.
        x = Dense(self.d_in, activation='relu')(x)

        self.generator = Model(input_layer, x)

    # Discriminator Model
    def build_discriminator(self):
        """Build the row -> probability network and store it on `self.discriminator`."""
        input_layer = Input(shape=(self.d_in,))
        x = input_layer
        for layer_size in reversed(self.hidden_layers):
            x = SPARGAN.layer(x, layer_size)

        x = Dense(1, activation='sigmoid')(x)  # sigmoid - binary classification

        self.discriminator = Model(input_layer, x)

    def synthesize(self, diff, batch_size=100, max_batches=1000):
        """Generate `diff` synthetic rows whose label column is set to 1.

        Noise is pushed through the generator in batches; a generated row is
        kept only when its last (label) value is finite and strictly positive,
        and its label is then overwritten with exactly 1.

        Parameters
        ----------
        diff : int
            Number of synthetic rows to return. Values <= 0 yield an empty frame.
        batch_size : int
            Number of noise vectors sampled per generator call.
        max_batches : int
            Upper bound on generator calls, so a generator that never emits a
            positive label cannot loop forever.

        Returns
        -------
        pandas.DataFrame
            `diff` rows with columns F1..F{d_in-1} followed by "Label".

        Raises
        ------
        RuntimeError
            If no generator has been built, or if `max_batches` generator calls
            did not yield enough acceptable rows.
        """
        if getattr(self, 'generator', None) is None:
            raise RuntimeError('No generator available: call fit() before synthesize().')

        n_rows = max(int(diff), 0)
        columns = [f'F{i}' for i in range(1, self.d_in)] + ["Label"]
        if n_rows == 0:
            return pd.DataFrame(np.empty((0, self.d_in)), columns=columns)

        kept = []
        n_kept = 0
        for _ in range(max_batches):
            z = np.random.randn(batch_size, self.latent_dim)
            genz = np.asarray(self.generator.predict(z, verbose=0))

            labels = genz[:, -1]
            # Boolean indexing copies, so overwriting the label below cannot
            # touch the generator's output buffer.
            accepted = genz[(labels > 0) & np.isfinite(labels)][:n_rows - n_kept]
            if len(accepted):
                accepted[:, -1] = 1
                kept.append(accepted)
                n_kept += len(accepted)

            if n_kept >= n_rows:
                break
        else:
            raise RuntimeError(
                f'Generator produced only {n_kept} of {n_rows} requested rows '
                f'after {max_batches} batches.'
            )

        return pd.DataFrame(np.vstack(kept), columns=columns)

    # fit model
    def fit(self, df, batch_size=32, epochs=100, plot_loss=False):
        """Build both networks and train them adversarially on the rows of `df`.

        Each "epoch" here is a single step: one discriminator update on a random
        real batch, one on a generated batch, then one generator update through
        the combined model.

        Parameters
        ----------
        df : pandas.DataFrame or array-like
            Training rows; must have exactly `d_in` columns.
        batch_size : int
            Rows per training step.
        epochs : int
            Number of training steps.
        plot_loss : bool
            When True, plot discriminator and generator loss per step.

        Raises
        ------
        ValueError
            If `df` is not 2-D with `d_in` columns.
        """
        real_data = np.asarray(df)
        if real_data.ndim != 2 or real_data.shape[1] != self.d_in:
            raise ValueError(
                f'Expected training data with {self.d_in} columns, got shape {real_data.shape}.'
            )

        # Build and compile the discriminator
        self.build_discriminator()
        self.discriminator.compile(
            loss='binary_crossentropy',
            optimizer=Adam(0.0002, 0.5),
            metrics=['accuracy']
        )

        # Build the generator
        self.build_generator()

        # Input representing a noise sample from latent space.
        # `shape` must be a tuple: `(n)` is just the int n, `(n,)` is the 1-tuple.
        z = Input(shape=(self.latent_dim,))

        # Pass noise through the generator to get a synthetic row
        row = self.generator(z)

        # Freeze the discriminator inside the combined model so generator
        # updates leave its weights alone.
        # NOTE(review): the standalone discriminator was compiled before this
        # flag was flipped and is still trained below; confirm the installed
        # Keras version honours the flag per compiled model.
        self.discriminator.trainable = False

        # The true output is fake, but we label them real
        fake_pred = self.discriminator(row)

        # Create the combined model object
        combined_model = Model(z, fake_pred)

        # Compile the combined model
        # NOTE(review): this learning rate (0.002) is 10x the discriminator's
        # (0.0002); confirm it is intentional and not a typo.
        combined_model.compile(
            loss='binary_crossentropy',
            optimizer=Adam(0.002, 0.5)
        )

        # Batch labels to use when calling train_on_batch
        ones = np.ones(batch_size)
        zeros = np.zeros(batch_size)

        # Store the losses
        d_losses = []
        g_losses = []

        # Main training loop
        for epoch in trange(epochs):
            ###########################
            ### Train Discriminator ###
            ###########################

            # Select a random batch of real rows (sampled with replacement)
            idx = np.random.randint(0, real_data.shape[0], batch_size)
            real_rows = real_data[idx]

            # Generate fake rows; verbose=0 keeps predict() from printing a
            # progress bar on every training step.
            noise = np.random.randn(batch_size, self.latent_dim)
            fake_rows = self.generator.predict(noise, verbose=0)

            # Train the discriminator
            # both loss and accuracy are returned
            d_loss_real, d_acc_real = self.discriminator.train_on_batch(real_rows, ones)
            d_loss_fake, d_acc_fake = self.discriminator.train_on_batch(fake_rows, zeros)
            d_loss = 0.5 * (d_loss_real + d_loss_fake)
            d_acc = 0.5 * (d_acc_real + d_acc_fake)

            ###########################
            ##### Train Generator #####
            ###########################

            noise = np.random.randn(batch_size, self.latent_dim)
            g_loss = combined_model.train_on_batch(noise, ones)

            # Save the losses
            d_losses.append(d_loss)
            g_losses.append(g_loss)

            if epoch % 100 == 0:
                print(f'epoch: {epoch + 1}/{epochs}, d_loss: {d_loss: .2f}, d_acc: {d_acc: .2f}, g_loss: {g_loss:.2f}')

        if plot_loss:
            plt.plot(d_losses, label='discriminator_loss')
            plt.plot(g_losses, label='generator_loss')
            plt.xlabel('training step')
            plt.ylabel('loss')
            plt.legend()

In [None]:
# Train the model
# Seed NumPy (noise sampling, batch selection) and TensorFlow (weight init,
# dropout) so repeated runs on the same setup start from the same state.
np.random.seed(42)
tf.random.set_seed(42)

# Take the row width from the data instead of relying on the hard-coded default.
model = SPARGAN(d_in=data.shape[1])
model.fit(data, plot_loss=True)

In [19]:
# Count the number of ones and zeroes
# value_counts() orders by frequency, not by label, so look each class up by
# its label instead of unpacking positionally.
label_counts = data['Label'].value_counts()
zeroes = int(label_counts.get(0, 0))
ones = int(label_counts.get(1, 0))
print(f'Zeros: {zeroes} Ones: {ones}')

# synthesize() only produces Label == 1 rows, so extra rows are needed only
# when the positive class is the minority.
diff = max(zeroes - ones, 0)

Zeros: 3000 Ones: 1465


In [20]:
# Append synthetic data to the original data
synthetic_data = model.synthesize(diff)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement. ignore_index=True avoids duplicate index labels in the result.
malware_data = pd.concat([data, synthetic_data], ignore_index=True)
X = malware_data.drop(columns=['Label'])
# The synthetic rows are float, which upcasts Label; cast back to integer classes.
y = malware_data['Label'].astype(int)

In [None]:
# Build the model on the GAN-augmented data
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state keeps this RMSE reproducible, so the delta against the
# baseline is not dominated by run-to-run forest variance. n_jobs=-1 trains
# the 500 trees in parallel without changing the fitted result.
model = RandomForestClassifier(
    n_estimators=500,
    criterion='entropy',
    random_state=42,
    n_jobs=-1,
)
model.fit(X, y)

In [22]:
# Read the held-out features, dropping the ID column that only identifies rows
data_valid = pd.read_csv('Validation.csv')
X_valid = data_valid.drop(columns='ID').copy()

# Read the matching ground-truth labels the same way
data_solution = pd.read_csv('Solution.csv')
y_valid = data_solution.drop(columns='ID').copy()

In [None]:
# Acquire the new RMSE
from sklearn.metrics import mean_squared_error

p_test = model.predict(X_valid)
# `squared=False` was deprecated in scikit-learn 1.4 and later removed, so take
# the square root of the MSE explicitly; this works on every version.
rmse_gan = float(np.sqrt(mean_squared_error(y_valid, p_test)))
print(f'RMSE: {round(rmse_gan * 100, 7)}%')

In [None]:
# Calculate the RMSE's delta
# Keep the sign: a negative delta means the GAN-augmented model has a lower
# RMSE than the baseline, a positive delta means it got worse.
delta = round(float(rmse_gan - rmse), 3)
if delta < 0:
    direction = 'lower than baseline'
elif delta > 0:
    direction = 'higher than baseline'
else:
    direction = 'unchanged'
print(f'RMSE delta (GAN-augmented - baseline): {delta:+} ({direction})')