<a href="https://colab.research.google.com/github/Ladvien/gan_name_maker/blob/master/human_names_vae.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

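The goal of this notebook is to build a Variational Autoencoder (VAE) that takes one-hot encoded human first names and encodes them into a latent space. Note that the model assembled in the cells below is a plain (deterministic) autoencoder: it has no sampling layer and no KL-divergence term in its loss.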



# Parameters

In [0]:
# Engineering parameters.
data_set            = '93k' # Names file to load: "6k" or "93k".
pad_character       = '~'
allowed_chars       = f'abcdefghijklmnopqrstuvwxyz{pad_character}'
len_allow_chars     = len(allowed_chars)
max_name_length     = 10

# Inputs: one slot per allowed character for every position in a name.
inputs              = max_name_length * len_allow_chars

# Optimizer settings.
optimizer_name      = 'rmsprop'
learning_rate       = 0.0001

# Training loop settings.
epochs              = 45000
batch_size          = 32
num_samples         = 8

# Encoder (e_*) and decoder (d_*) currently share the same settings.
e_dropout = d_dropout               = 0.2
e_h_activation = d_h_activation     = 'relu' # Activation function for hidden layers.
e_batchnorm = d_batchnorm           = False

# Activation handed to the decoder as its `last_activation` argument.
activation          = 'sigmoid'

# Snapshot of the settings above, collected in one dictionary.
params = dict(
    epochs               = epochs,
    batch_size           = batch_size,
    learning_rate        = learning_rate,
    optimizer_name       = optimizer_name,
    inputs               = inputs,
    num_samples_per_step = num_samples,
    allowed_chars        = allowed_chars,
    max_name_length      = max_name_length,
    e_h_activation       = e_h_activation,
    d_h_activation       = d_h_activation,
    e_dropout            = e_dropout,
    d_dropout            = d_dropout,
    e_batchnorm          = e_batchnorm,
    d_batchnorm          = d_batchnorm,
)

# Load Data

In [0]:
import pandas as pd
import numpy as np

In [12]:
!git clone https://github.com/Ladvien/gan_name_maker

fatal: destination path 'gan_name_maker' already exists and is not an empty directory.


In [0]:
# Load the vectorized names that match the `data_set` parameter.
if data_set == '6k':
  # ~6k names
  df = pd.read_csv('./gan_name_maker/vectorized_names_6k.csv')
elif data_set == '93k':
  # ~93k names
  df = pd.read_csv('./gan_name_maker/vectorized_names_93k.csv')
  df = df.rename(columns = {'Name':'name'})
else:
  # Fail here instead of printing a hint: otherwise `df` is either undefined
  # (NameError a few lines below) or silently left over from an earlier run.
  raise ValueError(f"Unknown data_set {data_set!r}; expected '6k' or '93k'.")

params['data_set'] = data_set

cols = list(df)

# Move the name column to the beginning.
cols.insert(0, cols.pop(cols.index('name')))

# Reorder the columns, drop the yucky 'Unnamed: 0' column and sort the rows
# by name. Chained (no inplace=True) so the cell rebuilds `df` from the CSV
# in a single expression every time it runs.
df = (
    df.loc[:, cols]
      .drop(columns = 'Unnamed: 0')
      .sort_values(by = 'name', ascending = True)
)

# Libraries

In [0]:
#%tensorflow_version 2.x

In [15]:
# Personal tools.
!pip install git+https://github.com/Ladvien/ladvien_ml.git
from ladvien_ml import FeatureModel

import tensorflow as tf

from tensorflow.python.keras.layers import Dense, Dropout, Activation,\
                                     Input, LeakyReLU, BatchNormalization, ReLU
from tensorflow.python.keras.models import Model

from tensorflow.keras.callbacks import History 

fm = FeatureModel()

Collecting git+https://github.com/Ladvien/ladvien_ml.git
  Cloning https://github.com/Ladvien/ladvien_ml.git to /tmp/pip-req-build-3h32wb8y
  Running command git clone -q https://github.com/Ladvien/ladvien_ml.git /tmp/pip-req-build-3h32wb8y
Building wheels for collected packages: ladvien-ml
  Building wheel for ladvien-ml (setup.py) ... [?25l[?25hdone
  Created wheel for ladvien-ml: filename=ladvien_ml-0.0.1-cp36-none-any.whl size=10658 sha256=55f6dbf394f8e72c4554eb92c95f487165163dbb30a13327739d7a091f062cbe
  Stored in directory: /tmp/pip-ephem-wheel-cache-fukr9a5a/wheels/c3/84/cb/159d16e33d8e5df3db4d1eae4b5066b58b86cd5131cd82f985
Successfully built ladvien-ml


Using TensorFlow backend.


# Encoder

In [0]:
def encoder(input, e_activation, e_batchnorm, dropout = 0.1):
  """Build the encoder stack on top of `input`.

  Three Dense layers are stacked whose widths are 75%, 50% and 25% of the
  width of `input` (truncated to an int). Every Dense layer is optionally
  followed by BatchNormalization and always followed by Dropout.

  Args:
    input: Tensor to encode; `input.shape[1]` is used as the reference
      width. The name shadows the builtin but is kept so existing callers
      keep working.
    e_activation: Activation passed to every Dense layer.
    e_batchnorm: When truthy, BatchNormalization follows each Dense layer.
    dropout: Rate passed to every Dropout layer.

  Returns:
    The output of the last Dropout layer, with `_name` set to 'encoder'.
  """
  # int() accepts both a plain int and a TF1 `Dimension`, so this no longer
  # depends on the TF1-only `.value` attribute.
  input_width = int(input.shape[1])

  # Input layer
  E = input

  # Hidden layers, from widest to narrowest.
  for fraction in (0.75, 0.50, 0.25):
    E = Dense(int(input_width * fraction), activation = e_activation)(E)
    if e_batchnorm:
      E = BatchNormalization()(E)
    # Use the rate handed to this function. The previous code read the
    # global `d_dropout` (the decoder's setting) and ignored `dropout`.
    E = Dropout(dropout)(E)

  E._name = 'encoder'

  return E

In [0]:
def decoder(encoder, output_shape, optimizer, last_activation, activation, batch_norm, dropout = 0.1):
  """Build the decoder stack on top of the encoder output.

  Two hidden Dense layers widen the representation (encoder width divided
  by 0.75, then by 0.50, truncated to an int). Each is optionally followed
  by BatchNormalization and always followed by Dropout. A final Dense layer
  of `output_shape` units produces the reconstruction.

  Args:
    encoder: Tensor produced by the encoder; `encoder.shape[1]` is the
      reference width.
    output_shape: Number of units in the output layer.
    optimizer: Not used by this function; kept so existing calls still work.
    last_activation: Activation of the output layer.
    activation: Activation of the hidden layers.
    batch_norm: When truthy, BatchNormalization follows each hidden layer.
    dropout: Rate passed to the hidden layers' Dropout.

  Returns:
    The output-layer tensor, with `_name` set to 'decoder'.
  """
  # int() accepts both a plain int and a TF1 `Dimension`, so this no longer
  # depends on the TF1-only `.value` attribute.
  input_shape = int(encoder.shape[1])

  D = encoder

  # Hidden layers, from narrowest to widest.
  for divisor in (0.75, 0.50):
    D = Dense(int(input_shape / divisor), activation = activation)(D)
    if batch_norm:
      D = BatchNormalization()(D)
    D = Dropout(dropout)(D)

  # Output layer. It uses `last_activation`; the previous code ignored that
  # argument and reused the hidden-layer activation. No Dropout or
  # BatchNormalization follows it, because either would alter the
  # reconstruction that the loss is computed on.
  D = Dense(output_shape, activation = last_activation)(D)

  D._name = 'decoder'

  return D

In [28]:
# Number of columns in the loaded data frame.
# NOTE(review): not used by the cells shown here; the model width comes from
# `inputs` (allowed characters times the maximum name length) instead.
vectorized_name_length = df.shape[1]

# Select optimizer.
optimizer = fm.select_optimizer(optimizer_name, learning_rate)

# Select activation function for hidden layers.
hidden_activation_layers = {'relu': ReLU, 'lrelu': LeakyReLU}

for setting_name, setting in (('e_h_activation', e_h_activation),
                              ('d_h_activation', d_h_activation)):
  if setting not in hidden_activation_layers:
    # Without this check an unsupported name would leave e_activation /
    # d_activation undefined (or stale from an earlier run).
    raise ValueError(f"{setting_name} must be 'relu' or 'lrelu', got {setting!r}")

e_activation = hidden_activation_layers[e_h_activation]()
d_activation = hidden_activation_layers[d_h_activation]()

# `shape` must be a tuple; `(inputs)` without the trailing comma is an int.
I = Input(shape = (inputs,))

E = encoder(I, e_activation, e_batchnorm, dropout = e_dropout)
# The decoder gets the decoder settings (d_*). It was previously wired with
# e_batchnorm / e_dropout, so the d_* parameters had no effect.
D = decoder(E, inputs, optimizer, activation, d_activation, d_batchnorm, dropout = d_dropout)

autoencoder = Model(I, D)
autoencoder.summary()
autoencoder.compile(optimizer = optimizer, loss = 'binary_crossentropy', metrics = ['accuracy'])

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 270)]             0         
_________________________________________________________________
dense_6 (Dense)              (None, 202)               54742     
_________________________________________________________________
dropout_6 (Dropout)          (None, 202)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 135)               27405     
_________________________________________________________________
dropout_7 (Dropout)          (None, 135)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 67)                9112      
_________________________________________________________________
dropout_8 (Dropout)          (None, 67)                0   

# Prepare Data

In [0]:
from sklearn.model_selection import train_test_split

# Randomize inputs. The shuffle is seeded: with an unseeded shuffle the row
# order changes on every run, so the seeded split below would still produce
# different train/test sets each time.
df = df.sample(frac = 1, random_state = 42)

# Make sure no odd nans.
df = df.dropna()

# Keep every column except the first one (the load cell moves 'name' to
# the front).
X = df.iloc[:, 1:]

# Autoencoder: the targets are the inputs, so X is passed as both.
x_train, x_test, y_train, y_test = train_test_split(X, X, test_size = 0.2, random_state = 42)

# Train

In [0]:
# Fit the autoencoder to reproduce its own input: the feature frames are
# passed as both inputs and targets, for training and for validation.
# NOTE(review): epochs and batch_size are literals here rather than the
# `epochs` / `batch_size` values from the parameters cell -- confirm which
# is intended.
training_pair = (x_train, x_train)
validation_pair = (x_test, x_test)

autoencoder.fit(*training_pair,
                validation_data = validation_pair,
                shuffle = True,
                batch_size = 256,
                epochs = 100)

Train on 74310 samples, validate on 18578 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100

In [0]:
# Preview the first rows of the training features; as the last expression in
# the cell, the notebook displays the result.
x_train.head()