# Import Libraries

In [0]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import random
import argparse
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import set_random_seed

from keras import regularizers
from keras import backend as K
from keras.models import Model
from keras.utils import plot_model
from keras.losses import mse, binary_crossentropy
from keras.layers import Lambda, Input, Dense, Dropout

GLOBAL_SEED = 1
LOCAL_SEED = 42

set_random_seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)

Using TensorFlow backend.


In [0]:
# Define PATH to file
# path = 'gdrive/My Drive/Generators/DataSets/Selected/breast-cancer-wisconsin/wdbc.data'
# path = 'gdrive/My Drive/Generators/DataSets/Selected/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
# path = 'gdrive/My Drive/Generators/DataSets/Selected/balance-scale/balance-scale.data'
# path = 'gdrive/My Drive/Generators/DataSets/Selected/pima-indians-diabetes/pima-indians-diabetes.data'
# path = 'gdrive/My Drive/Generators/DataSets/Selected/tic-tac-toe/tic-tac-toe.data'
# path = 'gdrive/My Drive/Generators/DataSets/Selected/annealing/anneal.data'
# path = 'gdrive/My Drive/Generators/DataSets/Selected/breast-cancer/breast-cancer.data'
# path = 'gdrive/My Drive/Generators/DataSets/Selected/cylinder-bands/bands.data'
# path = 'gdrive/My Drive/Generators/DataSets/Selected/credit-screening/crx.data'
# path = 'gdrive/My Drive/Generators/DataSets/Selected/statlog/australian/australian.dat'
path = 'gdrive/My Drive/Generators/DataSets/Selected/statlog/german/german.data'
# path = 'gdrive/My Drive/Generators/DataSets/Selected/statlog/german/german.data-numeric'
# path = 'gdrive/My Drive/Generators/DataSets/Selected/spectrometer/lrs.data'
# path = 'gdrive/My Drive/Generators/DataSets/Selected/soybean/soybean-large.data'

In [0]:
intermediate_dim = 512

# Read Data

In [0]:
set_random_seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)
import pandas as pd
na_values = {'?', ' '}
df = pd.read_csv(path,
                 sep=' ',
                 header=None,
                 na_filter=True, 
                 verbose=False, 
                 skip_blank_lines=True, 
                 na_values=na_values,
                 keep_default_na=False)
print('Origin dataset:')                 
print(df.head())
# Drop N/A 
df.dropna(axis=1, how='any', inplace=True)
print(df.head())
df.replace('U',np.NaN, inplace=True)
# df.dropna(axis=0, how='any', inplace=True)
# Drop ID column
df.drop([0], axis=1, inplace=True)
df = df.reset_index(drop=True)

col_names = list(df)
new_names = {}
for i, name in enumerate(col_names):
    new_names[name] = 'X' + str(i)
df.rename(columns=new_names, inplace=True)
# For soybean
# colnums = len(df.columns)
# for i in df.columns:
#     df[i] = df[i].astype('category')

# For Prima diabetes
# df['X9'] = df['X9'].astype('category')
# df['X8'] = df['X8'].astype('category')
df = df.reindex(sorted(df.columns), axis=1)
print(df.head())

Origin dataset:
    0   1    2    3     4    5    6   ...    14 15    16  17    18    19 20
0  A11   6  A34  A43  1169  A65  A75  ...  A152  2  A173   1  A192  A201  1
1  A12  48  A32  A43  5951  A61  A73  ...  A152  1  A173   1  A191  A201  2
2  A14  12  A34  A46  2096  A61  A74  ...  A152  1  A172   2  A191  A201  1
3  A11  42  A32  A42  7882  A61  A74  ...  A153  1  A173   2  A191  A201  1
4  A11  24  A33  A40  4870  A61  A73  ...  A153  2  A173   2  A191  A201  2

[5 rows x 21 columns]
    0   1    2    3     4    5    6   ...    14 15    16  17    18    19 20
0  A11   6  A34  A43  1169  A65  A75  ...  A152  2  A173   1  A192  A201  1
1  A12  48  A32  A43  5951  A61  A73  ...  A152  1  A173   1  A191  A201  2
2  A14  12  A34  A46  2096  A61  A74  ...  A152  1  A172   2  A191  A201  1
3  A11  42  A32  A42  7882  A61  A74  ...  A153  1  A173   2  A191  A201  1
4  A11  24  A33  A40  4870  A61  A73  ...  A153  2  A173   2  A191  A201  2

[5 rows x 21 columns]
   X0   X1   X10  X11   X1

# Split dataset

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
np.random.seed(GLOBAL_SEED)
vals = np.copy(df.values)
total_nums = len(vals)

df_train, df_validation = train_test_split(df, test_size=0.5, 
                                           random_state=LOCAL_SEED, 
                                           shuffle=True)
# Write the test dataset
df_validation = df_validation.reindex(sorted(df_validation.columns), axis=1)
df_validation.to_csv(path + '_For_Test.csv', index=False)
print(df_validation.head())

     X0   X1   X10  X11   X12   X13  X14  ...    X3   X4   X5 X6   X7    X8  X9
521  18  A32  A121   24  A143  A152    1  ...  3190  A61  A73  2  A92  A101   2
737  18  A32  A123   35  A143  A152    1  ...  4380  A62  A73  3  A93  A101   4
740  24  A31  A123   32  A141  A152    1  ...  2325  A62  A74  2  A93  A101   3
660  12  A32  A121   23  A143  A151    1  ...  1297  A61  A73  3  A94  A101   4
411  33  A34  A123   35  A143  A152    2  ...  7253  A61  A74  3  A93  A101   2

[5 rows x 20 columns]


# Recognize categorical columns

In [0]:
df = df_train.copy(deep=True)
print(df.head())

     X0   X1   X10  X11   X12   X13  X14  ...     X3   X4   X5 X6   X7    X8  X9
680   6  A32  A124   56  A143  A152    1  ...   1538  A61  A72  1  A92  A101   2
177   6  A34  A123   52  A143  A152    2  ...    338  A63  A75  4  A93  A101   4
395  39  A33  A124   32  A143  A151    1  ...  11760  A62  A74  2  A93  A101   3
911  24  A34  A123   25  A141  A152    1  ...   4736  A61  A72  2  A92  A101   4
793  24  A32  A124   51  A143  A153    1  ...   2892  A61  A75  3  A91  A101   4

[5 rows x 20 columns]


In [0]:
# df['X9'] = df['X9'].astype('category')
# df['X8'] = df['X8'].astype('category')
colnums = len(df.columns)
for i in df.columns:
    try:
        if df[i].dtype.name == 'object':
            df[i] = df[i].astype('category')
        else:
            df[i].astype('float32')
    except:
        continue
print(df.head())
print(df.describe())

     X0   X1   X10  X11   X12   X13  X14  ...     X3   X4   X5 X6   X7    X8  X9
680   6  A32  A124   56  A143  A152    1  ...   1538  A61  A72  1  A92  A101   2
177   6  A34  A123   52  A143  A152    2  ...    338  A63  A75  4  A93  A101   4
395  39  A33  A124   32  A143  A151    1  ...  11760  A62  A74  2  A93  A101   3
911  24  A34  A123   25  A141  A152    1  ...   4736  A61  A72  2  A92  A101   4
793  24  A32  A124   51  A143  A153    1  ...   2892  A61  A75  3  A91  A101   4

[5 rows x 20 columns]
               X0         X11         X14  ...            X3          X6          X9
count  500.000000  500.000000  500.000000  ...    500.000000  500.000000  500.000000
mean    21.452000   35.478000    1.416000  ...   3449.456000    3.002000    2.840000
std     12.056634   11.299951    0.586201  ...   3073.550205    1.108356    1.084783
min      4.000000   19.000000    1.000000  ...    250.000000    1.000000    1.000000
25%     12.000000   27.000000    1.000000  ...   1364.000000    2.

In [0]:
# df['X9'] = df['X9'].astype('category')
# df['X8'] = df['X8'].astype('category')
categorical = df.select_dtypes(['category']).columns
print(categorical)
for f in categorical:
    dummies = pd.get_dummies(df[f], prefix = f, prefix_sep = '_')
    df = pd.concat([df, dummies], axis = 1)
    
# drop original categorical features
df.drop(categorical, axis = 1, inplace = True)

Index(['X1', 'X10', 'X12', 'X13', 'X15', 'X17', 'X18', 'X2', 'X4', 'X5', 'X7',
       'X8'],
      dtype='object')


In [0]:
df.to_csv(path + 'For_training.csv', index=False)

# VAE

## Split train and test data

In [0]:
set_random_seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)
df = pd.read_csv(path + 'For_training.csv')
vae_train = np.copy(df.values)
vae_train.astype('float32')
scaler = MinMaxScaler()
print(np.amax(vae_train[:, 2]))

vae_train = scaler.fit_transform(vae_train)
x_train, x_test = train_test_split(vae_train, test_size=0.5,
                                   random_state=LOCAL_SEED,
                                   shuffle=True)

print(x_train.shape)
print(x_test.shape)
print(np.amax(x_train))
print(np.amax(x_test))

4
(250, 58)
(250, 58)
1.0
1.0


In [0]:
set_random_seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)

original_dim = x_train.shape[1]
x_train = np.reshape(x_train, [-1, original_dim])
x_test = np.reshape(x_test, [-1, original_dim])
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
print(x_train.shape)
print(x_test.shape)

(250, 58)
(250, 58)


## Define VAE class

In [0]:
set_random_seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)
class VAE:
    def __init__(self, input_shape=(original_dim,), 
                 intermediate_dim=128, latent_dim=2, summary=False):
        
        self._build_model(input_shape,
                         intermediate_dim, 
                          latent_dim, summary)
    
    def _build_model(self, input_shape, intermediate_dim, latent_dim,
                    summary=False):
        inputs = Input(shape=input_shape, name='encoder_input')
        x = inputs
        x = Dense(intermediate_dim, activation='relu')(x)
        x = Dense(intermediate_dim//2, activation='relu')(x)
        
        z_mean = Dense(latent_dim, name='z_mean')(x)
        z_log_var = Dense(latent_dim, name='z_log_var')(x)

        z = Lambda(self.sampling, output_shape=(latent_dim,), 
                   name='z')([z_mean, z_log_var])

        self.encoder = Model(inputs, [z_mean, z_log_var, z], 
                        name='encoder')
        
        latent_inputs = Input(shape=(latent_dim,), 
                              name='z_sampling')
        x = latent_inputs
        x = Dense(intermediate_dim//2, activation='relu')(x)
        x = Dense(intermediate_dim, activation='relu')(x)
        outputs = Dense(original_dim, activation='sigmoid')(x)

        self.decoder = Model(latent_inputs, outputs, name='decoder')
        outputs = self.decoder(self.encoder(inputs)[2])
        self.vae = Model(inputs, outputs, name='vae_mlp')
        
        reconstruction_loss = binary_crossentropy(inputs, outputs)
        reconstruction_loss *= original_dim
        kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
        kl_loss = K.sum(kl_loss, axis=-1)
        kl_loss *= -0.5
        
        vae_loss = K.mean(reconstruction_loss + kl_loss)	
        
        self.vae.add_loss(vae_loss)
        self.vae.compile(optimizer='adam')
        if summary: 
            print(self.vae.summary())
        
    def sampling(self, args):
        z_mean, z_log_var = args
        batch = K.shape(z_mean)[0]
        dim = K.int_shape(z_mean)[1]
        epsilon = K.random_normal(shape=(batch, dim))
        return z_mean + K.exp(0.5 * z_log_var) * epsilon
        
    def fit(self, x_train, x_test, epochs=100, batch_size=100,
           verbose=1):
        self.vae.fit(x_train, 
            shuffle=True,
            epochs=epochs,
            batch_size=batch_size,
            verbose=verbose,
            validation_data=(x_test, None))
    
    def encoder_predict(self, x_test, batch_size=100):
        return self.encoder.predict(x_test,
                                   batch_size=batch_size)
    
    def generate(self, latent_val, batch_size=100):
        return self.decoder.predict(latent_val)
    
    def predict(self, x_test, batch_size=1):
        prediction = self.vae.predict(x_test)
        return prediction

## Training VAE

Just let the last value to test

In [0]:
print(x_train.shape)
print(np.amax(x_train))
print(np.amin(x_train))

(250, 58)
1.0
0.0


In [0]:
set_random_seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)

latent_dim = original_dim//2
if latent_dim < 2:
    latent_dim = 2
vae = VAE(intermediate_dim=intermediate_dim, latent_dim=latent_dim)
vae.fit(x_train, x_test, epochs=150)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Train on 250 samples, validate on 250 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/15

## Generate data with VAE

In [0]:
set_random_seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)

x_test = np.reshape(x_test, (-1, original_dim))
x_test_encoded = vae.encoder.predict(x_test)
x_test_encoded = np.asarray(x_test_encoded)

print(x_test_encoded.shape)

(3, 250, 29)


## Computing time

In [0]:
import time
set_random_seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)

computation = []
for _ in range(10):
    start = time.time()
    total_nums = 2
    results = []
    for i in range(x_test_encoded.shape[1]):
        latent_gen = []
        for _ in range(total_nums):
            epsilon = np.random.normal(0., 1., x_test_encoded.shape[2])
            latent_gen.extend([x_test_encoded[0, i, :] + np.exp(x_test_encoded[1, i, :]*0.5)*epsilon])
        latent_gen = np.asarray(latent_gen)
        results.append(vae.generate(latent_gen))

    results = np.asarray(results)
    results = np.reshape(results, (-1, original_dim))
    results = scaler.inverse_transform(results)

    end = time.time()
    computation.append(end-start)
print(np.mean(computation), np.std(computation))

0.17435188293457032 0.00878902653198987


In [0]:
set_random_seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)

total_nums = 2
results = []
for i in range(x_test_encoded.shape[1]):
    latent_gen = []
    for _ in range(total_nums):
        epsilon = np.random.normal(0., 1., x_test_encoded.shape[2])
        latent_gen.extend([x_test_encoded[0, i, :] + np.exp(x_test_encoded[1, i, :]*0.5)*epsilon])
    latent_gen = np.asarray(latent_gen)
    results.append(vae.generate(latent_gen))
    
results = np.asarray(results)
results = np.reshape(results, (-1, original_dim))
print(results.shape)
results = scaler.inverse_transform(results)

(500, 58)


## Handling generated data

In [0]:
print(len(results[:, 1]))
print(results[0, 0])

500
19.706902


In [0]:
d = {}
names = list(df)
for i, name in enumerate(names):
    d[name] = results[:, i]
df = pd.DataFrame(data=d)

## Re-categorical columns from generated data

In [0]:
names = list(df)
c_dict = {}
for n in names:
    if '_' in n:
        index = n.index('_')
        c_dict[n[:index]] = [c for c in names if n[:index+1] in c]
values = []
for key, items in c_dict.items():
    dummies = df[items]
    d_names = list(dummies)
    c_dict = {}
    for n in d_names:
        c_dict[n] = n[n.index('_')+1:]
    dummies.rename(columns=c_dict, 
                   inplace=True)
    df[key] = dummies.idxmax(axis=1)
    df.drop(items, axis=1, inplace=True)
print(df.head())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


          X0        X11       X14       X16       X19  ...   X2   X4   X5   X7    X8
0  19.706902  38.357006  1.294747  1.025787  1.067945  ...  A43  A65  A72  A92  A101
1  18.419031  34.398422  1.324374  1.005644  1.103526  ...  A42  A61  A72  A92  A101
2  23.244650  37.615776  1.678816  1.129324  1.262234  ...  A43  A65  A75  A93  A101
3  20.215624  37.397747  1.623275  1.133832  1.373835  ...  A42  A61  A72  A93  A101
4  21.632650  49.568626  2.123373  1.500253  1.052936  ...  A40  A61  A75  A93  A101

[5 rows x 20 columns]


In [0]:
df = df.reindex(sorted(df.columns), axis=1)
df.to_csv(path + '_vae.csv', index=False)

# Dropout VAE

## Split train and test data

In [0]:
df = pd.read_csv(path + 'For_training.csv')
train = np.copy(df.values)
train.astype('float32')
scaler = MinMaxScaler()
print(np.amax(train[:, 2]))

train = scaler.fit_transform(train)
x_train, x_test = train_test_split(train, test_size=0.5,
                                  random_state=LOCAL_SEED,
                                  shuffle=True)
print(x_train.shape)
print(x_test.shape)
print(np.amax(x_train))
print(np.amax(x_test))

4
(250, 58)
(250, 58)
1.0
1.0


In [0]:
set_random_seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)
original_dim = x_train.shape[1]
x_train = np.reshape(x_train, [-1, original_dim])
x_test = np.reshape(x_test, [-1, original_dim])
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
print(x_train.shape)
print(x_test.shape)

(250, 58)
(250, 58)


## Define Dropout VAE

In [0]:
from keras.regularizers import l2
from keras.losses import categorical_crossentropy
set_random_seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)

class DropoutVAE:
    def __init__(self, input_shape=(original_dim,), 
                 intermediate_dim=32, latent_dim=3, dropout=0.05, 
                 summary=False):
        
        self._build_model(input_shape,
                         intermediate_dim, 
                          latent_dim, summary,
                          dropout)
    
    def _build_model(self, input_shape, intermediate_dim, latent_dim,
                    summary=False, dropout=0.05):
        inputs = Input(shape=input_shape, name='encoder_input')
        x = inputs
        x = Dense(intermediate_dim, activation='relu')(x)
        x = Dense(intermediate_dim//2, activation='relu')(x)
        
        z_mean = Dense(latent_dim, name='z_mean')(x)
        z_log_var = Dense(latent_dim, name='z_log_var')(x)

        z = Lambda(self.sampling, output_shape=(latent_dim,), 
                   name='z')([z_mean, z_log_var])

        self.encoder = Model(inputs, [z_mean, z_log_var, z], 
                        name='encoder')
        
        latent_inputs = Input(shape=(latent_dim,), 
                              name='z_sampling')
        x = latent_inputs
        x = Dense(intermediate_dim//2, activation='relu',
                 kernel_regularizer=l2(1e-4),
                 bias_regularizer=l2(1e-4))(x)
        x = Dropout(dropout)(x)
        x = Dense(intermediate_dim, activation='relu',
                 kernel_regularizer=l2(1e-4),
                 bias_regularizer=l2(1e-4))(x)
        x = Dropout(dropout)(x)
        outputs = Dense(original_dim, activation='sigmoid',
                       kernel_regularizer=l2(1e-4),
                       bias_regularizer=l2(1e-4))(x)

        self.decoder = Model(latent_inputs, 
                             outputs, 
                             name='decoder')
        outputs = self.decoder(self.encoder(inputs)[2])
        self.vae = Model(inputs, outputs, 
                         name='vae_mlp')
        
        reconstruction_loss = binary_crossentropy(inputs, outputs)
        reconstruction_loss *= original_dim
        kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
        kl_loss = K.sum(kl_loss, axis=-1)
        kl_loss *= -0.5
        
        vae_loss = K.mean(reconstruction_loss + kl_loss)	
        
        self.vae.add_loss(vae_loss)
        self.vae.compile(optimizer='adam')
        if summary: 
            print(self.vae.summary())
        
    def sampling(self, args):
        z_mean, z_log_var = args
        batch = K.shape(z_mean)[0]
        dim = K.int_shape(z_mean)[1]
        epsilon = K.random_normal(shape=(batch, dim))
        return z_mean + K.exp(0.5 * z_log_var) * epsilon
        
    def fit(self, x_train, x_test, epochs=100, batch_size=100,
           verbose=1):
        self.vae.fit(x_train, 
            shuffle=True,
            epochs=epochs,
            batch_size=batch_size,
            verbose=verbose,
            validation_data=(x_test, None))
    
    def encoder_predict(self, x_test, batch_size=100):
        return self.encoder.predict(x_test,
                                   batch_size=batch_size)
    
    def generate(self, latent_val, batch_size=100):
        return self.decoder.predict(latent_val)
    
    def predict(self, x_test, batch_size=1, nums=1000):
        Yt_hat = []
        for _ in range(nums):
            Yt_hat.extend(self.vae.predict(x_test))
                          
        return np.asarray(Yt_hat)
                          

## Train and evaluate Dropout VAE

In [0]:
set_random_seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)

latent_dim = original_dim//2
if latent_dim < 2:
    latent_dim = 2
vae = DropoutVAE(intermediate_dim=intermediate_dim,
                 dropout=0.2, latent_dim=latent_dim,
                 summary=True)
vae.fit(x_train, x_test, epochs=150)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_input (InputLayer)   (None, 58)                0         
_________________________________________________________________
encoder (Model)              [(None, 29), (None, 29),  176442    
_________________________________________________________________
decoder (Model)              (None, 58)                169018    
Total params: 345,460
Trainable params: 345,460
Non-trainable params: 0
_________________________________________________________________
None
Train on 250 samples, validate on 250 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/

## Generate data with Dropout VAE

In [0]:
set_random_seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)

x_test = np.reshape(x_test, (-1, original_dim))
print(x_test.shape)
print(x_test[0].reshape(-1, original_dim).shape)

(250, 58)
(1, 58)


## Computing time

In [0]:
set_random_seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)

computation = []
total_nums = 2
for _ in range(10):
    start = time.time()
    results = []
    x_test_encoded = vae.predict(x_test,
                                 nums=total_nums)
    x_test_encoded = np.asarray(x_test_encoded)
    results = x_test_encoded
    results = np.asarray(results)
    results = scaler.inverse_transform(results)
    end = time.time()
    computation.append(end-start)
print(np.mean(computation), np.std(computation))

0.024167180061340332 0.027872028438232863


In [0]:
set_random_seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)
total_nums = 2
results = []
for i in range(x_test.shape[0]):
    x_test_encoded = vae.predict(x_test[i].reshape(-1, original_dim), 
                                 nums=total_nums)
    
    x_test_encoded = x_test_encoded.reshape(total_nums, original_dim)
    results.append(x_test_encoded)
results = np.asarray(results)
results = results.reshape(total_nums*results.shape[0], original_dim)
results = scaler.inverse_transform(results)

## Handling Generated data

In [0]:
d = {}
names = list(df)
for i, name in enumerate(names):
    d[name] = results[:, i]
df = pd.DataFrame(data=d)

## Re-categoricalize data from Generated data

In [0]:
names = list(df)
c_dict = {}
for n in names:
    if '_' in n:
        index = n.index('_')
        c_dict[n[:index]] = [c for c in names if n[:index+1] in c]
values = []
for key, items in c_dict.items():
    dummies = df[items]
    d_names = list(dummies)
    c_dict = {}
    for n in d_names:
        c_dict[n] = n[n.index('_')+1:]
    dummies.rename(columns=c_dict, 
                   inplace=True)
    df[key] = dummies.idxmax(axis=1)
    df.drop(items, axis=1, inplace=True)
print(df.head())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


          X0        X11       X14       X16       X19  ...   X2   X4   X5   X7    X8
0  25.646967  36.278141  1.612389  1.092323  1.528939  ...  A40  A61  A75  A92  A101
1  19.446032  31.034742  1.259892  1.087729  1.262563  ...  A43  A61  A73  A94  A101
2  23.790070  35.311882  1.587378  1.292253  1.226454  ...  A49  A61  A73  A93  A101
3  18.309425  36.927261  1.755193  1.321804  1.228635  ...  A43  A61  A73  A93  A101
4  24.141148  35.124191  1.367912  1.032213  1.157195  ...  A43  A61  A73  A93  A101

[5 rows x 20 columns]


In [0]:
df = df.reindex(sorted(df.columns), axis=1)
df.to_csv(path + '_dropout.csv', index=False)

# Encoding categorical data

In [0]:
df = pd.read_csv(path + '_For_Test.csv',
                 na_filter=True, 
                 verbose=False, 
                 skip_blank_lines=True, 
                 na_values=na_values,
                 keep_default_na=False)
df_mc = pd.read_csv(path + '_dropout.csv',
                 na_filter=True, 
                 verbose=False, 
                 skip_blank_lines=True, 
                 na_values=na_values,
                 keep_default_na=False)
df_vae = pd.read_csv(path + '_vae.csv',
                 na_filter=True, 
                 verbose=False, 
                 skip_blank_lines=True, 
                 na_values=na_values,
                 keep_default_na=False)
names = list(df)

In [0]:
from sklearn.preprocessing import LabelEncoder
colnums = len(df.columns)
for i in df.columns:
    try:
        if df[i].dtype.name == 'object':
            df[i] = df[i].astype('category')
    except:
        continue
cat_columns = df.select_dtypes(['category']).columns
print(cat_columns)
for col in cat_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].values)
    df_mc[col] = le.transform(df_mc[col].values)
    df_vae[col] = le.transform(df_vae[col].values)
    

Index(['X1', 'X10', 'X12', 'X13', 'X15', 'X17', 'X18', 'X2', 'X4', 'X5', 'X7',
       'X8'],
      dtype='object')


In [0]:
df = df.reindex(sorted(df.columns), axis=1)
df_mc = df_mc.reindex(sorted(df_mc.columns), axis=1)
df_vae = df_vae.reindex(sorted(df_vae.columns), axis=1)
df.to_csv(path + '_For_Test_encoded.csv')
df_mc.to_csv(path + '_dropout_encoded.csv')
df_vae.to_csv(path + '_vae_encoded.csv')