In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, losses, Input, backend
from tensorflow.keras.datasets import fashion_mnist
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [0]:
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

x_train = x_train.astype('float32') / 255.
x_test = x_test.astype('float32') / 255.

print (x_train.shape)
print (x_test.shape)

# Plain Autoencoder

In [None]:
class Autoencoder(Model):
  def __init__(self, encoder_input_shape, n_layers_encoder,
               n_layers_decoder, output_dim):
    super(Autoencoder, self).__init__()
    self.encoder_input_shape = (28, 28, 1)
    self.output_dim = 2

    ### The encoder
    self.encoder = tf.keras.Sequential([
      layers.Input(shape=self.encoder_input_shape, name='encoder_input'),
      layers.Conv2D(filters = 32, kernel_size = (3, 3), strides = 1, padding = 'same', activation = 'LeakyReLU'),
      layers.Conv2D(filters = 64, kernel_size = (3, 3), strides = 2, padding = 'same', activation = 'LeakyReLU'),
      layers.Conv2D(filters = 64, kernel_size = (3, 3), strides = 2, padding = 'same', activation = 'LeakyReLU'),
      layers.Conv2D(filters = 64, kernel_size = (3, 3), strides = 1, padding = 'same', activation = 'LeakyReLU'),
    ])

    ### The decoder
    self.decoder = tf.keras.Sequential([
      layers.Conv2DTranspose(filters = 64, kernel_size = (3, 3), strides=1, padding = 'same', activation = 'LeakyReLU'),
      layers.Conv2DTranspose(filters = 64, kernel_size = (3, 3), strides=2, padding = 'same', activation = 'LeakyReLU'),
      layers.Conv2DTranspose(filters = 32, kernel_size = (3, 3), strides=2, padding = 'same', activation = 'LeakyReLU'),
      layers.Conv2DTranspose(filters = 1,  kernel_size = (3, 3), strides=1, padding = 'same', activation = 'sigmoid')
    ])

  def call(self, x):
    encoded = self.encoder(x)
    shape_before_flattening = backend.int_shape(encoded)[1:]
    encoded = layers.Flatten()(encoded)
    encoder_output = layers.Dense(self.output_dim)(encoded)
    self.mu = layers.Dense(self.output_dim, name = 'mu')(encoded)
    self.log_var = layers.Dense(self.output_dim, name = 'log_var')(encoded)

    decoder_input = layers.Input(shape=self.output_dim, name = 'decoder_input')
    decoded = layers.Dense(np.prod(shape_before_flattening))(decoder_input)
    decoded = layers.Reshape(shape_before_flattening)(encoder_output)
    decoder_output = self.decoder(decoded)

    return decoder_output


input_shape = x_test.shape[1:]
n_layers_encoder = 4
n_layers_decoder = 4
output_dim = 2

# Joining the Encoder to the Decoder
autoencoder = Autoencoder(input_shape, n_layers_encoder, n_layers_decoder,
                          output_dim)


In [None]:
def r_loss(y_true, y_pred):
  return backend.mean(backend.square(y_true - y_pred), axis = [1, 2, 3])

autoencoder.compile(optimizer = 'adam', loss = r_loss)

28 x 28 image
3 x 3 filter = 9 params per filter
1 bias term per filter + 9 = 10 params per filter
32 filters = 10 x 32 params = 320 params
the result is a 28 x 28 x 32 (each filter picks out a particular feature)

stack the filters
28 x 28 x 32 input shape
64 new filters x 9 params per filter = 576 params
576 params per channel (filtered image)
576 x 32 channels = 18432 params
1 bias term per new filter = 1 x 64 = 64 params
18496 params

# Variational Autoencoder



In [None]:
from numpy.core.fromnumeric import nonzero
class Autoencoder(Model):
  def __init__(self, encoder_input_shape, n_layers_encoder,
               n_layers_decoder, output_dim):
    super(Autoencoder, self).__init__()
    self.encoder_input_shape = (28, 28, 1)
    self.output_dim = 2
    self.shape_before_flattening = (7, 7, 64)
    self.dense1 = layers.Dense(self.output_dim, name = 'mu')
    self.dense2 = layers.Dense(self.output_dim, name = 'log_var')
    self.dense3 = layers.Dense(np.prod(self.shape_before_flattening))

    ### The encoder
    self.encoder = tf.keras.Sequential([
      layers.Input(shape=self.encoder_input_shape, name='encoder_input'),
      layers.Conv2D(filters = 32, kernel_size = (3, 3), strides = 1, padding = 'same', activation = 'LeakyReLU'),
      layers.Conv2D(filters = 64, kernel_size = (3, 3), strides = 2, padding = 'same', activation = 'LeakyReLU'),
      layers.Conv2D(filters = 64, kernel_size = (3, 3), strides = 2, padding = 'same', activation = 'LeakyReLU'),
      layers.Conv2D(filters = 64, kernel_size = (3, 3), strides = 1, padding = 'same', activation = 'LeakyReLU'),
    ])

    ### The decoder
    self.decoder = tf.keras.Sequential([
      layers.Conv2DTranspose(filters = 64, kernel_size = (3, 3), strides=1, padding = 'same', activation = 'LeakyReLU'),
      layers.Conv2DTranspose(filters = 64, kernel_size = (3, 3), strides=2, padding = 'same', activation = 'LeakyReLU'),
      layers.Conv2DTranspose(filters = 32, kernel_size = (3, 3), strides=2, padding = 'same', activation = 'LeakyReLU'),
      layers.Conv2DTranspose(filters = 1,  kernel_size = (3, 3), strides=1, padding = 'same', activation = 'sigmoid')
    ])

  def call(self, x):
    encoded = self.encoder(x)                                      # 32x28x28x1
    self.shape_before_flattening = backend.int_shape(encoded)[1:]  # 32x7x7x64
    print(self.shape_before_flattening)
    encoded = layers.Flatten()(encoded)                            # 32x3136
    self.mu = self.dense1(encoded)                                 # 32x2 ; each image gets a pseudo mean value (3136 values translated into a 2 coords (the mean))
    self.log_var = self.dense2(encoded)                            # 32x2 ; and a pseudo log_var value (between -inf and inf)

    def sampling(args):
      # Samples a random point from the normal distribution (for a specific mu and log_var)
      mu, log_var = args
      epsilon = backend.random_normal(shape = backend.shape(mu), mean = 0, stddev = 1.) # random point in normal distribution
      return mu + backend.exp(log_var / 2) * epsilon

    encoder_output = layers.Lambda(sampling, name = 'encoder_output')([self.mu, self.log_var]) # 32 x 2 sampled points

    # identical decoder to a plain autoencoder
    decoder_input = layers.Input(shape=self.output_dim, name = 'decoder_input')
    decoded = self.dense3(encoder_output)
    decoded = layers.Reshape(self.shape_before_flattening)(decoded)

    decoder_output = self.decoder(decoded)
    print(f"decoder_output_shape = {decoder_output.shape}")
    return decoder_output

input_shape = x_test.shape[1:]
n_layers_encoder = 4
n_layers_decoder = 4
output_dim = 2

# Joining the Encoder to the Decoder
autoencoder = Autoencoder(input_shape, n_layers_encoder, n_layers_decoder,
                          output_dim)


# The Loss Function (Reconstruction Loss + KL Divergence Loss)

In [None]:
# compilation
recon_loss_factor = 1000
def vae_loss(y_true, y_pred):
  print(f"y_true_shape = {y_true.shape}")
  print(f"y_pred_shape = {y_pred.shape}")
  RMSE_recon_loss = tf.math.reduce_mean(backend.square(y_true - y_pred))
  print(f"RMSE_recon_loss = {RMSE_recon_loss}")
  kl_loss = tf.keras.losses.kullback_leibler_divergence(y_true, y_pred)
  return (recon_loss_factor * RMSE_recon_loss) + kl_loss

In [None]:
optimizer = Adam(learning_rate = 0.0005)
autoencoder.compile(optimizer = optimizer, loss = vae_loss)
                    # metrics = [vae_recon_loss, vae_kl_loss])

# Train the model

In [None]:
autoencoder.fit(x = x_train, y = x_train,
                batch_size = 32,
                shuffle = True,
                epochs = 10,
                validation_data=(x_test, x_test))

Epoch 1/10
(7, 7, 64)
decoder_output_shape = (32, 28, 28, 1)
y_true_shape = (32, 28, 28)
y_pred_shape = (32, 28, 28)
RMSE_recon_loss = Tensor("vae_loss/Mean:0", shape=(), dtype=float32)
(7, 7, 64)
decoder_output_shape = (32, 28, 28, 1)
y_true_shape = (32, 28, 28)
y_pred_shape = (32, 28, 28)
RMSE_recon_loss = Tensor("vae_loss/Mean:0", shape=(), dtype=float32)
decoder_output_shape = (None, 28, 28, 1)
y_true_shape = (None, 28, 28)
y_pred_shape = (None, 28, 28)
RMSE_recon_loss = Tensor("vae_loss/Mean:0", shape=(), dtype=float32)
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x78db5f9223b0>

In [None]:
autoencoder.decoder.summary()

Model: "sequential_61"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_transpose_120 (Conv  (None, 7, 7, 64)          36928     
 2DTranspose)                                                    
                                                                 
 conv2d_transpose_121 (Conv  (None, 14, 14, 64)        36928     
 2DTranspose)                                                    
                                                                 
 conv2d_transpose_122 (Conv  (None, 28, 28, 32)        18464     
 2DTranspose)                                                    
                                                                 
 conv2d_transpose_123 (Conv  (None, 28, 28, 1)         289       
 2DTranspose)                                                    
                                                                 
Total params: 92609 (361.75 KB)
Trainable params: 926

## Using VAEs to Generate Faces
**What's different from the digits example:**
1. The faces data has 3 input channels (RGB) instead of just 1. This means that the final convolutional transpose layer of the decoder needs to have 3 channels. Started in colour, ends in colour.

2. There will be 200 dimensions in the latent space instead of just 2. More dimensions means more features/complexity/detail will be encoded.

3. Batch normalization after each convolution layer to speed up training. Dropout is also used to prevent overfitting.

4. The reconstruction loss factor is increased to 10000. 10000 was found to generate good results.  

5. We use a generator to feed images to the VAE from a folder, rather than loading all the images into memory first.

# Data

In [4]:
from keras.preprocessing.image import ImageDataGenerator
from PIL import Image as PImage
from os import listdir

In [24]:
DATA_FOLDER = 'img_align_celeba_dir'
NUM_IMAGES = 202599
INPUT_SHAPE = (64, 64, 3)
BATCH_SIZE = 32
LATENT_DIM = 200

In [25]:
data_gen = ImageDataGenerator(rescale=1./255)

data_flow = data_gen.flow_from_directory(DATA_FOLDER,
                                         target_size = INPUT_SHAPE[:2],
                                         batch_size = BATCH_SIZE,
                                         shuffle = True,
                                         class_mode = 'input',
                                         interpolation = 'bilinear')

Found 202599 images belonging to 1 classes.


# Architecture

In [56]:
from numpy.core.fromnumeric import nonzero
class Autoencoder(Model):
  def __init__(self, encoder_input_shape, latent_dim):
    super(Autoencoder, self).__init__()
    self.encoder_input_shape = encoder_input_shape
    self.latent_dim = latent_dim
    self.shape_before_flattening = (8, 8, 64)
    self.dense1 = layers.Dense(self.latent_dim, name = 'mu')
    self.dense2 = layers.Dense(self.latent_dim, name = 'log_var')
    self.dense3 = layers.Dense(np.prod(self.shape_before_flattening))

    ### The encoder
    self.encoder = tf.keras.Sequential([
      layers.Input(shape=self.encoder_input_shape, name='encoder_input'),
      layers.Conv2D(filters = 32, kernel_size = (3, 3), strides = 2, padding = 'same'),
      layers.BatchNormalization(),
      layers.LeakyReLU(),
      layers.Dropout(rate = 0.25),
      layers.Conv2D(filters = 64, kernel_size = (3, 3), strides = 2, padding = 'same'),
      layers.BatchNormalization(),
      layers.LeakyReLU(),
      layers.Dropout(rate = 0.25),
      layers.Conv2D(filters = 64, kernel_size = (3, 3), strides = 2, padding = 'same'),
      layers.BatchNormalization(),
      layers.LeakyReLU(),
      layers.Dropout(rate = 0.25),
      layers.Conv2D(filters = 64, kernel_size = (3, 3), strides = 2, padding = 'same'),
      layers.BatchNormalization(),
      layers.LeakyReLU(),
      layers.Dropout(rate = 0.25)
    ])

    ### The decoder
    self.decoder = tf.keras.Sequential([
      layers.Conv2DTranspose(filters = 64, kernel_size = (3, 3), strides=2, padding = 'same'),
      layers.BatchNormalization(),
      layers.LeakyReLU(),
      layers.Dropout(rate = 0.25),
      layers.Conv2DTranspose(filters = 64, kernel_size = (3, 3), strides=2, padding = 'same'),
      layers.BatchNormalization(),
      layers.LeakyReLU(),
      layers.Dropout(rate = 0.25),
      layers.Conv2DTranspose(filters = 32, kernel_size = (3, 3), strides=2, padding = 'same'),
      layers.BatchNormalization(),
      layers.LeakyReLU(),
      layers.Dropout(rate = 0.25),
      layers.Conv2DTranspose(filters = 3,  kernel_size = (3, 3), strides=2, padding = 'same', activation = 'sigmoid')
    ])

  def call(self, x):
    print(f'input_shape = {x.shape}')
    encoded = self.encoder(x)
    self.shape_before_flattening = backend.int_shape(encoded)[1:]
    print(self.shape_before_flattening)
    encoded = layers.Flatten()(encoded)
    mu = self.dense1(encoded)
    log_var = self.dense2(encoded)
    
    def sampling(args):
      # Samples a random point from the normal distribution (for a specific mu and log_var)
      func_mu, func_log_var = args
      epsilon = backend.random_normal(shape = backend.shape(func_mu), mean = 0, stddev = 1.) # random point in normal distribution
      return mu + backend.exp(func_log_var / 2) * epsilon

    encoder_output = layers.Lambda(sampling, name = 'encoder_output')([mu, log_var]) # samples a point from each distribution
    print(f'encoder_output_shape = {encoder_output.shape}')

    # identical decoder to a plain autoencoder
    decoder_input = layers.Input(shape=self.latent_dim, name = 'decoder_input')
    decoded = self.dense3(encoder_output)
    print(f'decoded_shape_b4_reshape = {decoded.shape}')
    decoded = layers.Reshape(self.shape_before_flattening)
    print(f'decoded_shape_after_reshape = {decoded.shape}')
    decoder_output = self.decoder(decoded)
    print(f'decoder_output_shape = {decoder_output.shape}')
    return decoder_output


In [52]:
# Build the model
VAE = Autoencoder(INPUT_SHAPE, LATENT_DIM)

# Training

In [53]:
# compilation
recon_loss_factor = 10000
def vae_loss(y_true, y_pred):
  print(f"y_true_shape = {y_true.shape}")
  print(f"y_pred_shape = {y_pred.shape}")
  RMSE_recon_loss = tf.math.reduce_mean(backend.square(y_true - y_pred))
  print(f"RMSE_recon_loss = {RMSE_recon_loss}")
  kl_loss = tf.keras.losses.kullback_leibler_divergence(y_true, y_pred)
  return (recon_loss_factor * RMSE_recon_loss) + kl_loss

In [54]:
optimizer = Adam(learning_rate = 0.0005)
VAE.compile(optimizer = optimizer, loss = vae_loss)

In [55]:
VAE.fit(
    x = data_flow,
    epochs = 200
)

input_shape = (32, 64, 64, 3)
(4, 4, 64)
length of mu = 200
length of log_var = 200
encoder_output_shape = (32, 200)
decoded_shape_b4_reshape = (32, 4096)


AttributeError: Exception encountered when calling layer 'autoencoder_9' (type Autoencoder).

'Reshape' object has no attribute 'shape'

Call arguments received by layer 'autoencoder_9' (type Autoencoder):
  • x=tf.Tensor(shape=(32, 64, 64, 3), dtype=float32)

# Generating New Faces

In [None]:
n_to_show = 30

new_output_dim = np.random.normal(size = (n_to_show, VAE.output_dim))

reconst = VAE.decoder.predict(np.array(new_output_dim))

fig = plt.figure(figsize = (18, 5))
fig.subplots_adjust(hspace = 0.4, wspace = 0.4)
for i in range(n_to_show):
  ax = fig.add_subplot(3, 10, i + 1)
  ax.imshow(reconst[i, :, :, :])
  ax.axis('off')

plt.show()