# Data Augmentation with Generative Adversarial Network (GAN)
Generative Adversarial Network (GAN) adalah salah satu metode augmentasi data yang memanfaatkan deep learning. GAN terdiri atas dua network, yaitu Generator dan Discriminator. Generator bertugas membuat data sintetik yang menyerupai data asli, sedangkan Discriminator bertugas sebagai classifier yang membedakan data asli dari data sintetik.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import Dataset

import pandas as pd
import numpy as np

In [None]:
# Discriminator: the classifier that separates real rows from synthetic ones.
# LeakyReLU follows each hidden Linear layer; the author chose it because several
# reference GAN models use the same activation.

class Discriminator(nn.Module):
  """Feed-forward binary classifier over rows with ``n_features`` columns.

  Layout: Linear(n_features, 50) -> LeakyReLU(0.02) -> Linear(50, 25)
  -> LeakyReLU(0.02) -> Dropout(0.8) -> Linear(25, 1) -> Sigmoid.
  The final Sigmoid keeps every score between 0 and 1.
  """

  def __init__(self, n_features):
    super().__init__()
    layers = [
        nn.Linear(in_features=n_features, out_features=50),
        nn.LeakyReLU(negative_slope=0.02),
        nn.Linear(in_features=50, out_features=25),
        nn.LeakyReLU(negative_slope=0.02),
        nn.Dropout(p=0.8),
        nn.Linear(in_features=25, out_features=1),
        nn.Sigmoid(),
    ]
    # Unpacking into Sequential keeps the positional sub-module names (disc.0, disc.2, ...).
    self.disc = nn.Sequential(*layers)

  def forward(self, X):
    """Return one score per input row, shaped like ``X`` with a last dimension of 1."""
    scores = self.disc(X)
    return scores

# Generator: produces synthetic rows meant to resemble the real data as closely as possible.
# Hidden layers use LeakyReLU and the output layer uses Tanh; the author chose these
# because several reference GAN models use the same activations.

class Generator(nn.Module):
  """Feed-forward network mapping a noise vector to one synthetic row.

  Layout: Linear(latent_noise, 10) -> LeakyReLU(0.02) -> Linear(10, 50)
  -> LeakyReLU(0.02) -> Linear(50, n_features) -> Tanh.
  """

  def __init__(self, latent_noise, n_features):
    super().__init__()
    hidden_sizes = (10, 50)
    layers = [
        nn.Linear(in_features=latent_noise, out_features=hidden_sizes[0]),
        nn.LeakyReLU(negative_slope=0.02),
        nn.Linear(in_features=hidden_sizes[0], out_features=hidden_sizes[1]),
        nn.LeakyReLU(negative_slope=0.02),
        nn.Linear(in_features=hidden_sizes[1], out_features=n_features),
        nn.Tanh(),
    ]
    # Unpacking into Sequential keeps the positional sub-module names (gen.0, gen.2, gen.4).
    self.gen = nn.Sequential(*layers)

  def forward(self, X):
    """Return the synthetic rows generated from the noise batch ``X``."""
    synthetic = self.gen(X)
    return synthetic


# GAN: container that pairs a Generator with the Discriminator judging its output.
class GAN(nn.Module):
  """Bundle a Generator and a Discriminator that share one feature width.

  Args:
    latent_noise: Width of the noise vector fed to the generator.
    generator_n_output: Number of features the generator emits; also used as
      the discriminator's input width.
  """

  def __init__(self, latent_noise, generator_n_output):
    super().__init__()
    # Use the constructor argument, not a module-level `n_features`: the
    # generator's output width must match the discriminator's input width.
    self.generator = Generator(latent_noise, generator_n_output)
    self.discriminator = Discriminator(generator_n_output)


  def forward(self, X):
    """Generate rows from the noise batch ``X`` and score them.

    Returns:
      Tuple ``(generated_data, discriminator_output)``.
    """
    generated_data = self.generator(X)
    discriminator_output = self.discriminator(generated_data)
    return generated_data, discriminator_output

In [None]:
# Train for one epoch.
# The generator is trained to make the discriminator label synthetic rows as real,
# i.e. to make the discriminator fail at telling real and synthetic data apart.

def _infer_latent_size(generator):
  """Return the input width of the first nn.Linear layer found in ``generator``."""
  for module in generator.modules():
    if isinstance(module, nn.Linear):
      return module.in_features
  raise ValueError(
      "cannot infer the noise width: generator has no nn.Linear layer; "
      "pass latent_noise explicitly"
  )


def train(data_loader, generator, discriminator, gan_optim, disc_optim, criterion,
          latent_noise=None):
  """Run one epoch of alternating discriminator and generator updates.

  Args:
    data_loader: Iterable yielding batches of real rows as tensors.
    generator: Module mapping a noise batch to synthetic rows.
    discriminator: Module scoring rows; its output is compared with labels
      of shape ``(batch_size, 1)``.
    gan_optim: Optimizer stepping the generator's parameters.
    disc_optim: Optimizer stepping the discriminator's parameters.
    criterion: Loss called as ``criterion(prediction, labels)``.
    latent_noise: Width of the noise vector. When omitted it is taken from the
      generator's first nn.Linear layer, so the function does not depend on a
      module-level variable.

  Returns:
    Tuple ``(discriminator_loss, generator_loss)`` holding the losses of the
    last batch, detached from the autograd graph. Both are the integer 0 when
    ``data_loader`` yields no batch.

  Raises:
    ValueError: If ``latent_noise`` is omitted and cannot be inferred.
  """
  if latent_noise is None:
    latent_noise = _infer_latent_size(generator)

  device = "cuda" if torch.cuda.is_available() else "cpu"
  generator.to(device)
  discriminator.to(device)
  discriminator_loss = 0
  generator_loss = 0

  for real_data in data_loader:
    real_data = real_data.to(device)
    batch_size = real_data.size(0)

    real_labels = torch.ones(batch_size, 1).to(device)
    fake_labels = torch.zeros(batch_size, 1).to(device)

    # --- discriminator step: real rows labelled 1, synthetic rows labelled 0 ---
    disc_optim.zero_grad()

    noise = torch.randn(batch_size, latent_noise).to(device)
    generated_data = generator(noise)

    real_loss = criterion(discriminator(real_data), real_labels)
    # detach() keeps this update from back-propagating into the generator
    fake_loss = criterion(discriminator(generated_data.detach()), fake_labels)
    disc_batch_loss = real_loss + fake_loss

    disc_batch_loss.backward()
    disc_optim.step()

    # --- generator step: fresh noise, scored against the "real" labels ---
    gan_optim.zero_grad()

    noise = torch.randn(batch_size, latent_noise).to(device)
    generated_data = generator(noise)

    # The discriminator's loss is back-propagated through to the generator's layers.
    gen_batch_loss = criterion(discriminator(generated_data), real_labels)

    gen_batch_loss.backward()
    gan_optim.step()

    # Keep only the values: the caller has no use for the autograd graph.
    discriminator_loss = disc_batch_loss.detach()
    generator_loss = gen_batch_loss.detach()

  return discriminator_loss, generator_loss

In [None]:
# Seed torch's random number generator.
torch.manual_seed(251003)

# Hyperparameters
latent_noise = 13    # width of the noise vector fed to the generator
n_features = 13      # number of columns in each data row
batch_size = 32      # rows per mini-batch
num_epochs = 1_000   # upper bound on training epochs
# NOTE(review): 3e-7 is a very small step size for Adam - confirm it is intended.
lr = 3e-7

In [None]:
# Build the generator/discriminator pair from the hyperparameters.
gan = GAN(latent_noise=latent_noise, generator_n_output=n_features)

# Binary cross-entropy is the loss function.
criterion = nn.BCELoss()

# One Adam optimizer per network, both using the same learning rate.
disc_optim = optim.Adam(gan.discriminator.parameters(), lr=lr)
gan_optim = optim.Adam(gan.generator.parameters(), lr=lr)

### Data Preprocessing
Bagian penting dalam data preprocessing adalah memastikan tidak ada nilai null atau NaN di dalam dataset. Data juga dinormalisasi agar setiap kolom memiliki rentang nilai antara 0 dan 1.

In [None]:
# Load the pumpkin-seed dataset from its hard-coded location.
pumpkin_data = pd.read_excel('/content/drive/MyDrive/Datasets/Pumpkin_Seeds_Dataset.xlsx')

# Encode the two class names as 0/1. Series.map builds the numeric column directly;
# Series.replace with a str -> int dict relies on silent object-dtype downcasting,
# which pandas has deprecated. A label missing from class_map becomes NaN.
class_map = {'Çerçevelik': 0, 'Ürgüp Sivrisi':1}
pumpkin_data['Class'] = pumpkin_data['Class'].map(class_map)

pumpkin_data.describe()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Convex_Area,Equiv_Diameter,Eccentricity,Solidity,Extent,Roundness,Aspect_Ration,Compactness,Class
count,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0
mean,80658.2208,1130.279015,456.60184,225.794921,81508.0844,319.33423,0.860879,0.989492,0.693205,0.791533,2.041702,0.704121,0.48
std,13664.510228,109.256418,56.235704,23.297245,13764.092788,26.89192,0.045167,0.003494,0.060914,0.055924,0.315997,0.053067,0.4997
min,47939.0,868.485,320.8446,152.1718,48366.0,247.0584,0.4921,0.9186,0.468,0.5546,1.1487,0.5608,0.0
25%,70765.0,1048.82975,414.95785,211.245925,71512.0,300.167975,0.8317,0.9883,0.6589,0.7519,1.80105,0.663475,0.0
50%,79076.0,1123.672,449.4966,224.7031,79872.0,317.30535,0.8637,0.9903,0.71305,0.79775,1.9842,0.7077,0.0
75%,89757.5,1203.3405,492.73765,240.672875,90797.75,338.057375,0.897025,0.9915,0.740225,0.834325,2.262075,0.7435,1.0
max,136574.0,1559.45,661.9113,305.818,138384.0,417.0029,0.9481,0.9944,0.8296,0.9396,3.1444,0.9049,1.0


In [None]:
# Normalise the data: min-max scale every column.

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# Fit the per-column min/max and apply the scaling in a single call.
pumpkin_data_norm = scaler.fit_transform(pumpkin_data)
# Cast the scaled values to float32.
df = pumpkin_data_norm.astype('float32')
df.shape

(2500, 13)

In [None]:
# Torch Dataset wrapper around an in-memory, index-addressable table.

class TabularDataset(Dataset):
  """Expose the rows of ``data`` through the torch Dataset protocol."""

  def __init__(self, data):
    # `data` only needs to support len() and integer indexing.
    self.data = data

  def __len__(self):
    """Return the number of rows."""
    n_rows = len(self.data)
    return n_rows

  def __getitem__(self, idx):
    """Return the row stored at position ``idx``."""
    return self.data[idx]

In [None]:
# Wrap the normalised float32 array so a DataLoader can consume it.
dataset = TabularDataset(data=df)

In [None]:
# Serve the dataset in shuffled mini-batches of `batch_size` rows.
data_loader = DataLoader(
    dataset=dataset,
    batch_size=batch_size,
    shuffle=True,
)

### Start training model

In [None]:
# Start training: one call to train() per epoch, logging both losses.

for epoch in range(num_epochs):
  discriminator_loss, generator_loss = train(
      data_loader=data_loader,
      generator=gan.generator,
      discriminator=gan.discriminator,
      gan_optim=gan_optim,
      disc_optim=disc_optim,
      criterion=criterion,
  )
  print(f"Epoch: {epoch+1} | Generator Loss: {generator_loss} | Discriminator Loss: {discriminator_loss}")
  # Stop early once the generator loss equals 100.
  # NOTE(review): presumably 100 is the value nn.BCELoss saturates at - confirm.
  if generator_loss == 100:
    break

Epoch: 1 | Generator Loss: 0.7471600770950317 | Discriminator Loss: 1.4251220226287842
Epoch: 2 | Generator Loss: 0.7043226957321167 | Discriminator Loss: 1.3623614311218262
Epoch: 3 | Generator Loss: 0.686579704284668 | Discriminator Loss: 1.482797384262085
Epoch: 4 | Generator Loss: 0.6987943649291992 | Discriminator Loss: 1.36784827709198
Epoch: 5 | Generator Loss: 0.7196148633956909 | Discriminator Loss: 1.5038294792175293
Epoch: 6 | Generator Loss: 0.7213377952575684 | Discriminator Loss: 1.3956871032714844
Epoch: 7 | Generator Loss: 0.7780773043632507 | Discriminator Loss: 1.3711570501327515
Epoch: 8 | Generator Loss: 0.7424606084823608 | Discriminator Loss: 1.468957543373108
Epoch: 9 | Generator Loss: 0.6931590437889099 | Discriminator Loss: 1.3801465034484863
Epoch: 10 | Generator Loss: 0.7384734153747559 | Discriminator Loss: 1.4347805976867676
Epoch: 11 | Generator Loss: 0.7559648752212524 | Discriminator Loss: 1.4036240577697754
Epoch: 12 | Generator Loss: 0.7713848948478699

In [None]:
# `is_available` must be called: the bare function object is always truthy,
# which would select 'cuda' even on a machine without a GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Generate 5000 synthetic rows from random noise using the trained generator.
gan.generator.to(device)  # make sure the model and the noise share one device
with torch.no_grad():     # inference only, no autograd graph is needed
  noise = torch.randn(5000, latent_noise).to(device)
  generated_data = gan.generator(noise).to('cpu')
generated_data = generated_data.detach().numpy()
generated_data_df = pd.DataFrame(generated_data)
generated_data_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,0.305642,0.016291,0.086594,0.038966,0.212711,-0.042049,0.03268,0.145826,0.209107,-0.122492,-0.12233,-0.048662,-0.251433
std,0.117409,0.052289,0.037856,0.045703,0.058776,0.115424,0.073159,0.086471,0.07596,0.033151,0.081034,0.047164,0.095688
min,0.092367,-0.147534,-0.031817,-0.19499,0.047462,-0.592373,-0.185574,-0.060949,0.033925,-0.26336,-0.402556,-0.240171,-0.602665
25%,0.215331,-0.017553,0.063138,0.010936,0.175312,-0.104743,-0.019594,0.077723,0.15218,-0.14067,-0.17703,-0.075823,-0.316557
50%,0.291023,0.012822,0.080005,0.045219,0.2052,-0.015912,0.017164,0.135479,0.201937,-0.11818,-0.121602,-0.049596,-0.240793
75%,0.377398,0.046638,0.103771,0.073149,0.245125,0.039233,0.072636,0.201417,0.257349,-0.102867,-0.073821,-0.021929,-0.179946
max,0.814484,0.251262,0.359413,0.132292,0.590629,0.215448,0.401399,0.52522,0.580519,0.019473,0.219136,0.13982,-0.034373


In [None]:
# Reload the original (un-normalised) dataset; its column ranges drive the
# denormalisation of the synthetic data.
pumpkin_data = pd.read_excel('/content/drive/MyDrive/Datasets/Pumpkin_Seeds_Dataset.xlsx')

# Encode the two class names as 0/1. Series.map builds the numeric column directly;
# Series.replace with a str -> int dict relies on silent object-dtype downcasting,
# which pandas has deprecated. A label missing from class_map becomes NaN.
class_map = {'Çerçevelik': 0, 'Ürgüp Sivrisi':1}
pumpkin_data['Class'] = pumpkin_data['Class'].map(class_map)

real_data = pumpkin_data
real_data.describe()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Convex_Area,Equiv_Diameter,Eccentricity,Solidity,Extent,Roundness,Aspect_Ration,Compactness,Class
count,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0
mean,80658.2208,1130.279015,456.60184,225.794921,81508.0844,319.33423,0.860879,0.989492,0.693205,0.791533,2.041702,0.704121,0.48
std,13664.510228,109.256418,56.235704,23.297245,13764.092788,26.89192,0.045167,0.003494,0.060914,0.055924,0.315997,0.053067,0.4997
min,47939.0,868.485,320.8446,152.1718,48366.0,247.0584,0.4921,0.9186,0.468,0.5546,1.1487,0.5608,0.0
25%,70765.0,1048.82975,414.95785,211.245925,71512.0,300.167975,0.8317,0.9883,0.6589,0.7519,1.80105,0.663475,0.0
50%,79076.0,1123.672,449.4966,224.7031,79872.0,317.30535,0.8637,0.9903,0.71305,0.79775,1.9842,0.7077,0.0
75%,89757.5,1203.3405,492.73765,240.672875,90797.75,338.057375,0.897025,0.9915,0.740225,0.834325,2.262075,0.7435,1.0
max,136574.0,1559.45,661.9113,305.818,138384.0,417.0029,0.9481,0.9944,0.8296,0.9396,3.1444,0.9049,1.0


In [None]:
# Check the dimensions of the synthetic sample array.
generated_data.shape

(5000, 13)

In [None]:
# Rescale every synthetic column so that it spans exactly [0, 1] (column-wise min-max).
# Work on a private copy: DataFrame.values can alias the frame's own buffer, and is
# read-only under pandas Copy-on-Write, so writing into it in place is unsafe.
generated_data = generated_data_df.to_numpy(copy=True)

col_min = generated_data.min(axis=0)
col_span = generated_data.max(axis=0) - col_min
# A constant column has zero span; divide by 1 so it maps to 0 instead of NaN.
col_span[col_span == 0] = 1
generated_data = (generated_data - col_min) / col_span

generated_data = generated_data.astype('float32')

In [None]:
generated_df = pd.DataFrame(generated_data)
# Take an independent, writable array: the denormalisation step writes into it in
# place, which DataFrame.values does not allow under pandas Copy-on-Write.
generated_df_numpy = generated_df.to_numpy(copy=True)
generated_df_numpy.shape

(5000, 13)

In [None]:
# Inspect column 12 (the last of the 13 columns) of the rescaled synthetic data.
generated_df_numpy[:,12]

array([0.6675177 , 0.14279106, 0.58377266, ..., 0.41775852, 0.3089065 ,
       0.5215311 ], dtype=float32)

In [None]:
# Denormalise the synthetic data so its values cover a range similar to the real data.

def _denormalize(scaled, reference, label_col='Class', unfiltered_cols=('Area',)):
  """Map [0, 1]-scaled columns back onto the value ranges found in ``reference``.

  Column ``i`` of ``scaled`` is paired with the ``i``-th column of ``reference``.

  Args:
    scaled: 2-D array with one column per column of ``reference``.
    reference: DataFrame supplying each column's minimum and maximum.
    label_col: Column that is thresholded at 0.5 into 0/1 instead of rescaled.
    unfiltered_cols: Columns exempt from the lower-bound row filter.

  Returns:
    Array with the dtype of ``scaled`` holding the denormalised rows; rows whose
    value in any filtered column is not ``>=`` that column's minimum are dropped.
  """
  source = np.asarray(scaled)
  # Compute in float64 so the lower-bound comparison does not depend on how
  # float32 values are promoted against pandas' float64 minima.
  values = source.astype('float64')
  keep = np.ones(len(values), dtype=bool)

  for idx, col in enumerate(reference.columns):
    if col == label_col:
      # Binary label: scores below 0.5 become 0, everything else 1.
      values[:, idx] = np.where(values[:, idx] < 0.5, 0, 1)
      continue

    col_min = reference[col].min()
    col_max = reference[col].max()
    values[:, idx] = values[:, idx] * (col_max - col_min) + col_min

    if col not in unfiltered_cols:
      keep &= values[:, idx] >= col_min

  return values[keep].astype(source.dtype)


generated_df_numpy = _denormalize(generated_df_numpy, real_data)

new_df = pd.DataFrame(generated_df_numpy)
new_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,74117.023438,1152.331543,424.072998,262.005096,75752.382812,362.832336,0.661654,0.945339,0.583892,0.746354,2.048255,0.73422,0.7542
std,14411.225586,90.596733,33.002201,21.455608,9740.819336,24.282143,0.056834,0.011182,0.050252,0.045125,0.260128,0.04271,0.430604
min,47939.0,868.484985,320.844604,152.171799,48366.0,247.058395,0.4921,0.9186,0.468,0.5546,1.1487,0.5608,0.0
25%,63031.967773,1093.693512,403.623978,248.845806,69554.451172,349.643082,0.621045,0.936532,0.546232,0.72161,1.872664,0.709624,1.0
50%,72322.703125,1146.322205,418.328979,264.940659,74507.648438,368.330765,0.649601,0.944001,0.579149,0.752223,2.050594,0.733374,1.0
75%,82924.666016,1204.911926,439.047653,278.052536,81124.3125,379.931793,0.692695,0.952528,0.615807,0.773067,2.203974,0.758428,1.0
max,136574.0,1559.449951,661.911316,305.817993,138384.0,417.002899,0.9481,0.9944,0.8296,0.9396,3.1444,0.9049,1.0


In [None]:
# Columns at positions 0, 4 and 12 are stored as integers.
integer_positions = [0, 4, 12]
new_df[integer_positions] = new_df[integer_positions].astype(int)

# Give the synthetic frame the same column names as the real data.
new_df.columns = real_data.columns
new_df

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Convex_Area,Equiv_Diameter,Eccentricity,Solidity,Extent,Roundness,Aspect_Ration,Compactness,Class
0,55730,1155.412231,378.056366,263.222534,64748,377.582764,0.627790,0.937673,0.514988,0.760821,2.058789,0.754934,1
1,81850,1203.990967,564.665527,265.642395,113399,398.524017,0.666381,0.953716,0.723836,0.791000,2.833417,0.863898,0
2,56691,1230.360596,371.063477,244.080109,71541,373.999298,0.629373,0.946049,0.583005,0.750483,2.138185,0.791028,1
3,62410,1367.269531,415.794891,233.070724,80367,351.725372,0.677522,0.970288,0.644715,0.702836,2.291118,0.743717,1
4,59028,1090.964844,385.268921,254.597992,64327,380.620056,0.644424,0.931928,0.527692,0.785164,2.070366,0.755354,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,70415,1155.803589,418.090210,289.292908,76340,370.242432,0.602830,0.930423,0.535034,0.760331,1.810571,0.741103,1
4996,90195,1296.499878,458.215851,276.466553,83805,321.628723,0.651667,0.942690,0.581542,0.687192,1.626000,0.683104,1
4997,72713,1071.256348,479.345764,212.254517,68820,356.394806,0.696356,0.937712,0.605875,0.740397,2.792032,0.737174,0
4998,76544,1172.845581,432.220978,227.433502,75017,349.184204,0.586717,0.944879,0.568123,0.740454,2.236784,0.762664,0


In [None]:
# Save the synthetic data as CSV (without the index column) so that other
# machine-learning models can consume it.

new_df.to_csv(
    '/content/drive/MyDrive/Datasets/5000_Pumpkin_Seeds_GAN_Dataset.csv',
    index=False,
)