<a href="https://colab.research.google.com/github/GaurangSeth/GaurangSethMScDataScience/blob/master/00_git_CTGAN_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install tabgan

In [None]:
import warnings
warnings.filterwarnings('ignore')
import os
import io
import requests

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics

from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

from tabgan.sampler import GANGenerator

In [None]:
# Reading the actual operating dataset to generate synthetic data
url_original_datset='https://raw.githubusercontent.com/GaurangSeth/GaurangSethMScDataScience/master/Original%20Dataset_30%20Variables_Not%20to%20be%20Shared.csv'
original_data = pd.read_csv(url_original_datset)

# Splitting the dataset into train and test data. "Plant C5PLusYield" is our target variable
# 80-20 Test Train Split done
df_x_train, df_x_test, df_y_train, df_y_test = train_test_split(original_data.drop("Plant C5PlusYield", axis=1),
                                                                original_data["Plant C5PlusYield"],
                                                                test_size=0.20,random_state=42)
# Create dataframe versions for tabular GAN
df_x_test, df_y_test = df_x_test.reset_index(drop=True),df_y_test.reset_index(drop=True)
df_y_train = pd.DataFrame(df_y_train)
df_y_test = pd.DataFrame(df_y_test)

# Pandas to Numpy
x_train = df_x_train.values
x_test = df_x_test.values
y_train = df_y_train.values
y_test = df_y_test.values

In [None]:
## Identifying the optimal number of Hidden layers
# Define the Tabular GAN model builder function
def build_gan_model(num_layers):
    gan_model = keras.models.Sequential()

    gan_model.add(keras.layers.Dense(x_train.shape[1],input_shape=(x_train.shape[1],),activation='relu'))
    #gan_model.add(keras.layers.LeakyReLU(alpha=0.2))

    for i in range(num_layers):
        gan_model.add(keras.layers.Dense((int(x_train.shape[1]/pow(2,i+1))),activation='relu'))
        #gan_model.add(keras.layers.LeakyReLU(alpha=0.2))

    gan_model.add(keras.layers.Dense(1))

    gan_model.compile(loss='mean_squared_error', optimizer='adam')

    
    return gan_model

# Create a KerasRegressor object for GridSearchCV
tabular_gan_model = KerasRegressor(build_fn=build_gan_model, verbose=0)

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, 
    patience=5, verbose=1, mode='auto',
    restore_best_weights=True)

# Define the parameter grid for grid search
param_grid = {'num_layers': [1, 2, 3, 4, 5]}
# Perform cross-validation grid search
grid_search = GridSearchCV(estimator=tabular_gan_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error')

grid_search.fit(x_train, y_train)
#grid_search.fit(x_train,y_train,validation_data=(x_test,y_test),
#        callbacks=[monitor], verbose=2,epochs=1000)

# Print the best parameter and corresponding MSE score
best_num_layers = grid_search.best_params_['num_layers']
best_mse_score = -grid_search.best_score_
print("Best number of layers:", best_num_layers)
print("Best MSE score:", best_mse_score)

Best number of layers: 2
Best MSE score: 15.955705750056367


In [None]:
best_model = grid_search.best_estimator_.model
best_model.summary()

Model: "sequential_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_75 (Dense)            (None, 29)                870       
                                                                 
 dense_76 (Dense)            (None, 14)                420       
                                                                 
 dense_77 (Dense)            (None, 7)                 105       
                                                                 
 dense_78 (Dense)            (None, 1)                 8         
                                                                 
Total params: 1,403
Trainable params: 1,403
Non-trainable params: 0
_________________________________________________________________


In [None]:
%%time
# Build the neural network
model = Sequential()
# 1st layer
model.add(Dense(x_train.shape[1], input_dim=x_train.shape[1], activation='relu')) 
model.add(Dense((int(x_train.shape[1]/2)), activation='relu')) # Hidden 1
model.add(Dense((int(x_train.shape[1]/4)), activation='relu')) # Hidden 2
model.add(Dense((int(x_train.shape[1]/8)), activation='relu')) # Hidden 3
model.add(Dense(1)) # Output
model.compile(loss='mean_squared_error', optimizer='adam')

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, 
        patience=5, verbose=1, mode='auto',
        restore_best_weights=True)
model.fit(x_train,y_train,validation_data=(x_test,y_test),
        callbacks=[monitor], verbose=2,epochs=1000)

Epoch 1/1000
198/198 - 2s - loss: 526.0227 - val_loss: 3.4103 - 2s/epoch - 11ms/step
Epoch 2/1000
198/198 - 1s - loss: 3.5465 - val_loss: 3.2760 - 1s/epoch - 5ms/step
Epoch 3/1000
198/198 - 1s - loss: 3.4269 - val_loss: 3.2767 - 1s/epoch - 7ms/step
Epoch 4/1000
198/198 - 1s - loss: 3.2660 - val_loss: 3.0023 - 882ms/epoch - 4ms/step
Epoch 5/1000
198/198 - 1s - loss: 3.1024 - val_loss: 3.0470 - 739ms/epoch - 4ms/step
Epoch 6/1000
198/198 - 1s - loss: 2.9280 - val_loss: 3.2613 - 746ms/epoch - 4ms/step
Epoch 7/1000
198/198 - 1s - loss: 2.6862 - val_loss: 2.4167 - 641ms/epoch - 3ms/step
Epoch 8/1000
198/198 - 0s - loss: 2.5377 - val_loss: 2.2043 - 350ms/epoch - 2ms/step
Epoch 9/1000
198/198 - 0s - loss: 2.2289 - val_loss: 2.8296 - 359ms/epoch - 2ms/step
Epoch 10/1000
198/198 - 0s - loss: 2.0615 - val_loss: 1.7116 - 355ms/epoch - 2ms/step
Epoch 11/1000
198/198 - 0s - loss: 1.7333 - val_loss: 3.0405 - 353ms/epoch - 2ms/step
Epoch 12/1000
198/198 - 0s - loss: 1.5388 - val_loss: 1.6158 - 362ms/

<keras.callbacks.History at 0x7f1150d2ce20>

In [None]:
pred = model.predict(x_test)
score = np.sqrt(metrics.mean_squared_error(pred,y_test))
print("Final score (RMSE): {}".format(score))

Final score (RMSE): 1.0731030549199259


In [None]:
%%time
gen_x, gen_y = GANGenerator(gen_x_times=1.1, cat_cols=None,
           bot_filter_quantile=0.001, top_filter_quantile=0.999, \
              is_post_process=True,
           adversarial_model_params={
               "metrics": "rmse", "max_depth": 2, "max_bin": 100, 
               "learning_rate": 0.02, "random_state": \
                42, "n_estimators": 500,
           }, pregeneration_frac=2, only_generated_data=True,\
           gan_params = {"batch_size": 500, "patience": 25, \
          "epochs" : 500,}).generate_data_pipe(df_x_train, df_y_train,\
          df_x_test, deep_copy=True, only_adversarial=False, \
          use_adversarial=True)

Fitting CTGAN transformers for each column:   0%|          | 0/30 [00:00<?, ?it/s]

Training CTGAN, epochs::   0%|          | 0/500 [00:00<?, ?it/s]

CPU times: user 12min 43s, sys: 24 s, total: 13min 7s
Wall time: 12min 50s


In [None]:
#df_new=gen_x.merge(gen_y)
synthetic_data=pd.concat((gen_x,gen_y),axis=1)
synthetic_data.shape

(5975, 30)

In [None]:
# original_data: Original dataset as pandas DataFrame
# synthetic_data: Synthetically generated dataset from CTGAN as pandas DataFrame

# Find duplicate rows in the synthetic dataset
duplicates = synthetic_data.duplicated(subset=original_data.columns)

# Drop the duplicate rows from the synthetic dataset
synthetic_data = synthetic_data[~duplicates]

# Print the count of duplicate rows
print("Number of rows in synthetic dataset that are complete copies of original dataset and are dropped:", duplicates.sum())

Number of rows in synthetic dataset that are complete copies of original dataset and are dropped: 13


In [None]:
synthetic_data.shape

(5962, 30)

In [None]:
# Predict
pred = model.predict(gen_x.values)
score = np.sqrt(metrics.mean_squared_error(pred,gen_y.values))
print("Final score (RMSE): {}".format(score))

Final score (RMSE): 1.0296374700417914


In [None]:
## Checking the number of duplicate rows in Synthetic Data
duplicate_rows = synthetic_data.duplicated()
num_duplicates = duplicate_rows.sum()

print("Number of duplicate rows:", num_duplicates)

Number of duplicate rows: 0


In [None]:
#synthetic_data.to_csv('CTGAN Generated data_2.csv',index=False)