<a href="https://colab.research.google.com/github/Hameon4/Kaggle-2022-GAN/blob/main/Copy_of_Kaggle_Train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **PART 1: BUILD MODEL USING THE VALIDATION AND SOLUTION DATASET**

In [1]:
from tensorflow.keras.layers import Input, Dense, LeakyReLU, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD, Adam
from sklearn.model_selection import train_test_split

import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import sys, os
import numpy as np

In [2]:
# Load in the training data
df = pd.read_csv('/content/Train.csv')

In [3]:
# Assign variables for training while disregarding the Label column
X = df.drop(columns = ['Label']).copy().values
y = df['Label'].values

In [4]:
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

# Range all values between -1 and 1
x_train = x_train / 272070 * 2 - 1
x_test = x_test / 272070 * 2 - 1

N, D = df.values.shape

In [5]:
# Set the dimensionality of the latent space
latent_dim = 100

In [6]:
# Build the model
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=500, criterion='entropy')
model.fit(X, y)

ExtraTreesClassifier(criterion='entropy', n_estimators=500)

In [7]:
data_valid = pd.read_csv('/content/Validation.csv')
data_solution = pd.read_csv('/content/Solution.csv')

X_valid = data_valid.drop(columns = ['ID']).copy()
y_valid = data_solution.drop(columns = ['ID']).copy()

In [8]:
from sklearn.metrics import mean_squared_error

p_test = model.predict(X_valid)
rmse = mean_squared_error(y_valid, p_test, squared=False)
print(f'RMSE: {round(rmse * 100, 3)}%');


  f"X has feature names, but {self.__class__.__name__} was fitted without"


RMSE: 10.761%


# **PART 2: UTILIZE GAN TO GENERATE SYNTHETIC DATA**

In [9]:
import pandas as pd
data = pd.read_csv('/content/Train.csv')

In [11]:
# Generator Model 
def build_generator(latent_dim):
  i = Input(shape=(latent_dim, ))
  x = Dense(256, activation=LeakyReLU(alpha=0.2))(i)
  x = BatchNormalization(momentum=0.8)(x)
  x = Dense(512, activation=LeakyReLU(alpha=0.2))(x)
  x = BatchNormalization(momentum=0.8)(x)
  x = Dense(1024, activation=LeakyReLU(alpha=0.2))(x)
  x = BatchNormalization(momentum=0.8)(x)
  x = Dense(D, activation='tanh')(x) # use tanh cuz we centered our data b/w -1 and 1

  model = Model(i, x)
  return model

In [12]:
# Discrminator Model 
def build_discriminator(img_size):
  i = Input(shape=(img_size,))
  x = Dense(512, activation=LeakyReLU(alpha=0.2))(i)
  x = Dense(256, activation=LeakyReLU(alpha=0.2))(x)
  x = Dense(1, activation='sigmoid')(x) # sigmoid cuz binary classification
  
  model = Model(i, x)
  return model

In [13]:
# Compile both models in preparation for training

# Build and compile the discriminator
discriminator = build_discriminator(D)
discriminator.compile(
    loss='binary_crossentropy',
    optimizer=Adam(0.0002, 0.5),
    metrics=['accuracy']
)

# Build and compile the combined model 
generator = build_generator(latent_dim)

# Create and input to represent noise sample from latent space
z = Input(shape=(latent_dim))

# Pass noise through generator to get an image
img = generator(z)

# Make sure only the generator is trained
discriminator.trainable = False

# The true output is fake, but we label them real
fake_pred = discriminator(img)

# Create the combined model object
combined_model = Model(z, fake_pred)

# Compile the combined model
combined_model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(0.002, 0.5)
)

In [14]:
# Train the GAN

# Config
batch_size = 32
epochs = 100
sample_period = 10 # every 'sample_period' step generates and saves some data 

# Create batch labels to use when calling train_on_batch
ones = np.ones(batch_size)
zeros = np.zeros(batch_size)

# Store the losses
d_losses = []
g_losses = []

# Create a folder to store generated images
# if not os.path.exists('gan_images'):
#   os.makedirs('gan_images')

In [15]:
# Main training loop
for epoch in range(epochs):
  ###########################
  ### Train Discriminator ###
  ###########################

  # Select a random batch of images
  idx = np.random.randint(0, df.values.shape[0], batch_size)
  real_imgs = df.values[idx]

  # Generate fake images
  noise = np.random.randn(batch_size, latent_dim)
  fake_imgs = generator.predict(noise)

  # Train the discriminator
  # both loss and accuracy are returned
  d_loss_real, d_acc_real = discriminator.train_on_batch(real_imgs, ones)
  d_loss_fake, d_acc_fake = discriminator.train_on_batch(fake_imgs, zeros)
  d_loss = 0.5 * (d_loss_real + d_loss_fake)
  d_acc = 0.5 * (d_acc_real + d_acc_fake)

  ###########################
  ### Train Generator ###
  ###########################  

  noise = np.random.randn(batch_size, latent_dim)
  g_loss = combined_model.train_on_batch(noise, ones)

  # Save the losses
  d_losses.append(d_loss)
  g_losses.append(g_loss)

  if epoch % 100 == 0:
    print(f'epoch: {epoch + 1}/{epochs}, d_loss: {d_loss: .2f},\
    d_acc: {d_acc: .2f}, g_loss: {g_loss:.2f}')

epoch: 1/100, d_loss:  1.27,    d_acc:  0.53, g_loss: 0.55


In [16]:
x = data['Label']
z = 0
o = 0

for i in x:
  if i == 0:
    z += 1
  elif i == 1:
    o += 1
print(f'Zeros: {z} Ones: {o}')
diff = z - o

lst = []
while len(lst) <= diff-1:
  z = np.random.randn(1, latent_dim)
  for i in range(1):
    genz = generator.predict(z)
    lab = genz[:, -1]
    if lab[i] > 0 and lab[i] <= 1 and np.isfinite() and np.nan:
      lst.append(genz)
    else:
      break

Zeros: 3000 Ones: 1465


In [21]:
# merge fake with original
synthetic_data = np.reshape(lst, (1535, 129))
synthetic_data = pd.DataFrame(synthetic_data)
malware_data = data.append(synthetic_data)

In [53]:
# shuffle synthetic data with original data
shuffled_malware_data = malware_data.sample(frac=1)

In [54]:
X = malware_data.drop(columns = ['Label']).copy()
y = malware_data['Label']

In [57]:
X = np.nan_to_num(X)
y = np.nan_to_num(y)

In [58]:
# Build the model
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=500, criterion='entropy')
model.fit(X, y)

ExtraTreesClassifier(criterion='entropy', n_estimators=500)

In [59]:
X.shape

(6000, 257)

In [46]:
data_valid = pd.read_csv('/content/Validation.csv')
data_solution = pd.read_csv('/content/Solution.csv')

X_valid = data_valid.drop(columns = ['ID']).copy()
y_valid = data_solution.drop(columns = ['ID']).copy()

In [47]:
from sklearn.metrics import mean_squared_error

p_test = model.predict(X_valid)
rmse_gan = mean_squared_error(y_valid, p_test, squared=False)
print(f'RMSE: {round(rmse_gan * 100, 7)}%')

  f"X has feature names, but {self.__class__.__name__} was fitted without"


ValueError: ignored

In [None]:
delta = round(rmse_gan - rmse, 3)
print(delta)