In [None]:
# Optional: shell out to nvidia-smi to see which GPUs are present and how busy they are.
!nvidia-smi  # check GPU usage -- can ignore this

In [None]:
# DO NOT run this cell -- stuff for our server
import os

# expose only GPU 0 to this process and route HTTP(S) traffic through the server's proxy
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "0",
    "HTTP_PROXY": "http://proxy:3128/",
    "HTTPS_PROXY": "http://proxy:3128/",
})

In [None]:
# imports
import os

import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
from matplotlib import image as mpimage
from matplotlib import pyplot as plt

from data.utils import parse_image_example
from modeling.layers import ConvNormAct, ResidualBlock

tfkl = tf.keras.layers

In [None]:
# dataset
# this assumes a preprocessed flickr-faces dataset (code is also in the repository)
# you can replace this with MNIST or CIFAR or whatever you'd like :)

batch_size = 128

# I use the 32x32 version here
# training pipeline: shuffle the raw records, decode them, then batch
train_data = (
    tf.data.TFRecordDataset("data/flickr_32_train.TFR")
    .shuffle(60000)
    .map(parse_image_example)
    .batch(batch_size)
)
# test pipeline: same decoding and batching, but in file order (no shuffling)
test_data = (
    tf.data.TFRecordDataset("data/flickr_32_test.TFR")
    .map(parse_image_example)
    .batch(batch_size)
)

In [None]:
# look at some images to confirm they look good
# the whole test set is gathered into one array, which later cells reuse
test_images = np.concatenate(list(test_data), axis=0)

# show the first 64 test images on an 8x8 grid
plt.figure(figsize=(15,15))
for position, image in enumerate(test_images[:64], start=1):
    plt.subplot(8, 8, position)
    plt.imshow(image)
    plt.axis("off")
plt.show()

In [None]:
# a few options for loss functions
# all of these SUM over the image dimensions (height, width, channels)
# -- recall the discussion in the exercise

def squared_loss(y_true, y_pred):
    """Sum of squared errors per example, averaged over the batch.

    This is what we get if assuming a gaussian likelihood and unknown fixed sigma.
    Both tensors are flattened to (batch, -1), so the sum covers every
    non-batch dimension.
    """
    n_examples = tf.shape(y_true)[0]
    flat_shape = [n_examples, -1]
    residual = tf.reshape(y_true, flat_shape) - tf.reshape(y_pred, flat_shape)

    per_example = tf.reduce_sum(residual**2, axis=-1)
    return tf.reduce_mean(per_example)


def logloss(y_true, y_pred):
    """Log of the per-example L2 error norm, averaged over the batch.

    This is what we get if assuming a gaussian likelihood and choosing optimal sigma.
    Both tensors are flattened to (batch, -1) before the norm is taken.
    """
    n_examples = tf.shape(y_true)[0]
    flat_shape = [n_examples, -1]
    residual = tf.reshape(y_true, flat_shape) - tf.reshape(y_pred, flat_shape)

    error_norm = tf.norm(residual, axis=-1)
    return tf.reduce_mean(tf.math.log(error_norm))


def bernoulli_loss(y_true, y_pred):
    """Sigmoid cross-entropy summed per example, averaged over the batch.

    This is what we get if assuming a bernoulli likelihood. `y_pred` is
    treated as logits (no sigmoid applied beforehand).
    """
    n_examples = tf.shape(y_true)[0]
    flat_shape = [n_examples, -1]
    targets = tf.reshape(y_true, flat_shape)
    logits = tf.reshape(y_pred, flat_shape)

    cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=targets, logits=logits)
    per_example = tf.reduce_sum(cross_entropy, axis=-1)
    return tf.reduce_mean(per_example)


def continuous_bernoulli_log_normalizer(lam, l_lim=0.49, u_lim=0.51):
    """Elementwise log normalizing constant of the continuous bernoulli distribution.

    Taken from https://github.com/cunningham-lab/cb_and_cc

    For `lam` outside [l_lim, u_lim] the closed-form expression is used.
    Inside that interval (around 0.5, where both logarithms in the closed
    form approach log(0)) a Taylor expansion around 0.5 is returned instead.
    """
    away_from_half = tf.logical_or(tf.less(lam, l_lim), tf.greater(lam, u_lim))

    # substitute a harmless value near 0.5 so the closed form stays finite there;
    # those entries are discarded by the final tf.where anyway
    safe_lam = tf.where(away_from_half, lam, l_lim * tf.ones_like(lam))
    closed_form = (
        tf.math.log(tf.abs(2.0 * tf.math.atanh(1 - 2.0 * safe_lam)))
        - tf.math.log(tf.abs(1 - 2.0 * safe_lam))
    )

    series = (
        tf.math.log(2.0)
        + 4.0 / 3.0 * tf.pow(lam - 0.5, 2)
        + 104.0 / 45.0 * tf.pow(lam - 0.5, 4)
    )
    return tf.where(away_from_half, closed_form, series)


def continuous_bernoulli_loss(y_true, y_pred):
    """Negative log likelihood under a continuous bernoulli distribution.

    It's really just binary cross-entropy (on logits) minus the log
    normalizing constant, summed per example and averaged over the batch.
    """
    n_examples = tf.shape(y_true)[0]
    flat_shape = [n_examples, -1]
    targets = tf.reshape(y_true, flat_shape)
    logits = tf.reshape(y_pred, flat_shape)

    cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=targets, logits=logits)
    # the normalizer takes the distribution parameter, clipped away from 0 and 1
    lam = tf.clip_by_value(tf.nn.sigmoid(logits), 1e-4, 1 - 1e-4)
    log_normalizer = continuous_bernoulli_log_normalizer(lam)

    per_example = tf.reduce_sum(cross_entropy - log_normalizer, axis=-1)
    return tf.reduce_mean(per_example)


def continuous_bernoulli_expected_value(lam, l_lim=0.49, u_lim=0.51):
    """Elementwise expected value of a continuous bernoulli with parameter `lam`.

    Our networks output distribution parameters, but we want the expected value.
    For gaussian and bernoulli distributions the expected value is just equal to
    (one of) the distribution parameter(s); for the continuous bernoulli it is a
    bit more complicated. Inside [l_lim, u_lim] the closed form divides by
    values close to zero, so 0.5 is returned there instead.
    """
    away_from_half = tf.logical_or(tf.less(lam, l_lim), tf.greater(lam, u_lim))

    # placeholder value near 0.5 keeps the closed form finite; masked out below
    safe_lam = tf.where(away_from_half, lam, l_lim * tf.ones_like(lam))
    closed_form = safe_lam / (2*safe_lam - 1) + 1 / (2*tf.math.atanh(1 - 2*safe_lam))

    return tf.where(away_from_half, closed_form, 0.5*tf.ones_like(closed_form))

In [None]:
# autoencoders have target = input.
# having to provide data as (image, image) tuples is annoying, so
# I write a custom train step that does not require labels.
class Autoencoder(tf.keras.Model):
    """Keras model that is trained to reproduce its own input.

    `train_step` and `test_step` are overridden so that `fit` / `evaluate`
    can be fed batches of inputs only: each batch is used both as the model
    input and as the target of the compiled loss and metrics.
    """

    def train_step(self, data):
        """Run one optimization step on a batch of inputs.

        Args:
            data: one batch as yielded by the dataset; used as input and target.

        Returns:
            Dict mapping metric names to their current values.
        """
        with tf.GradientTape() as tape:
            reconstructions = self(data, training=True)

            # pass self.losses so layer regularizers / add_loss terms are not
            # silently ignored (this mirrors the stock Keras train_step)
            recon_loss = self.compiled_loss(data, reconstructions,
                                            regularization_losses=self.losses)

        variables = self.trainable_variables
        gradients = tape.gradient(recon_loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))

        self.compiled_metrics.update_state(data, reconstructions)
        # Return a dict mapping metric names to current value
        return {m.name: m.result() for m in self.metrics}

    def test_step(self, data):
        """Evaluate one batch of inputs without updating any weights.

        Args:
            data: one batch as yielded by the dataset; used as input and target.

        Returns:
            Dict mapping metric names to their current values.
        """
        reconstructions = self(data, training=False)

        # the return value is not needed here; the call is kept because it
        # updates the loss tracker that is reported through self.metrics
        self.compiled_loss(data, reconstructions, regularization_losses=self.losses)

        self.compiled_metrics.update_state(data, reconstructions)
        # Return a dict mapping metric names to current value
        return {m.name: m.result() for m in self.metrics}

In [None]:
def residual_stack(inputs, filters, strides, blocks_per_level, mode, name):
    """Chain `ResidualBlock` layers level by level and return the final output.

    Each (filters, stride) pair defines one level made of `blocks_per_level`
    blocks. Within a level only the last block uses the level's stride; all
    earlier blocks use stride 1.

    Args:
        inputs: tensor fed into the first block.
        filters: iterable with one filter count per level.
        strides: iterable with one stride per level; same length as `filters`.
        blocks_per_level: number of blocks built for every level.
        mode: forwarded unchanged to `ResidualBlock` (this notebook passes
            "conv" and "transpose").
        name: prefix for layer names; blocks are named
            "<name>_<level>_<block>", both counted from 1.

    Returns:
        Output of the last block, or `inputs` itself if no block is built.

    Raises:
        ValueError: if `filters` and `strides` differ in length.
    """
    # materialize first: callers may pass one-shot iterators such as reversed(...)
    filters = list(filters)
    strides = list(strides)
    if len(filters) != len(strides):
        # zip() would silently drop the surplus levels and build a shallower network
        raise ValueError(
            "filters and strides must have the same length, got "
            f"{len(filters)} and {len(strides)}")

    last_block_ind = blocks_per_level - 1
    outputs = inputs
    for level_ind, (level_filters, level_stride) in enumerate(zip(filters, strides), start=1):
        for block_ind in range(blocks_per_level):
            block = ResidualBlock(level_filters,
                                  mode,
                                  strides=level_stride if block_ind == last_block_ind else 1,
                                  name="_".join([name, str(level_ind), str(block_ind + 1)]))
            outputs = block(outputs)

    return outputs

In [None]:
tf.keras.backend.clear_session()


blocks_per_level = 2
filters = [64, 128, 256, 256]
strides = [2, 2, 2, 1]

# note, this architecture encodes 32,32,3 to 4,4,64
# this is quite big compared to the input (downsampling by a factor of 3 only)
# this architecture could probably use some more tuning :)

# encoder: residual stack followed by a 1x1 convolution producing the code
encoder_input = tf.keras.Input(shape=(32, 32, 3))
encoder_output = residual_stack(encoder_input, filters, strides, blocks_per_level,
                                mode="conv", name="encoder")
encoder_final = tfkl.Conv2D(filters=64, kernel_size=1)(encoder_output)

encoder = tf.keras.Model(inputs=encoder_input, outputs=encoder_final, name="encoder")
code_shape = encoder.output_shape[1:]

# decoder: same levels with the filter order flipped, then a 1x1 convolution to 3 channels
decoder_input = tf.keras.Input(shape=code_shape)
decoder_output = residual_stack(decoder_input, filters[::-1], strides, blocks_per_level,
                                mode="transpose", name="decoder")
decoder_final = tfkl.Conv2D(filters=3, kernel_size=1)(decoder_output)

decoder = tf.keras.Model(inputs=decoder_input, outputs=decoder_final, name="decoder")

# full autoencoder = decoder applied to the encoder output
model = Autoencoder(encoder_input, decoder(encoder(encoder_input)))
model.summary(expand_nested=True)

In [None]:
# pick exactly one loss; the choice also decides how outputs are post-processed further down
#loss_function = tf.losses.BinaryCrossentropy(from_logits=True)  # bernoulli likelihood, but AVERAGES over image dimensions
loss_function = bernoulli_loss # bernoulli likelihood
#loss_function = squared_loss  # gaussian likelihood with fixed sigma
#loss_function = logloss  # gaussian likelihood with optimal sigma
#loss_function = continuous_bernoulli_loss  # continuous bernoulli likelihood

# training budget is given in optimizer steps and converted to whole epochs
n_steps = 100000
n_data = 60000
n_epochs = n_steps // (n_data // batch_size)

# cosine learning-rate decay over the full step budget
decay_function = tf.keras.optimizers.schedules.CosineDecay(
    initial_learning_rate=0.001, decay_steps=n_steps)
optimizer = tf.optimizers.Adam(learning_rate=decay_function)

model.compile(optimizer=optimizer, loss=loss_function, jit_compile=True)

In [None]:
class ReconstructionCallback(tf.keras.callbacks.Callback):
    """Plots inputs next to their reconstructions every `frequency` epochs.

    Args:
        frequency: plot whenever `epoch % frequency == 0`.
        images: optional array of images to reconstruct. If None, the
            notebook-level `test_images` array is looked up when plotting.
        n_images: how many images (taken from the front) are shown.
        **kwargs: forwarded to the Keras `Callback` constructor.
    """

    def __init__(self, frequency, images=None, n_images=32, **kwargs):
        super().__init__(**kwargs)
        self.frequency = frequency
        self.images = images
        self.n_images = n_images

    def on_epoch_end(self, epoch, logs=None):
        """Show a grid of original/reconstruction pairs if this epoch is due."""
        if epoch % self.frequency:
            return

        # fall back to the global test set only at call time, so the callback
        # can be constructed before that array exists
        source = test_images if self.images is None else self.images
        cropped_test = source[:self.n_images]

        # IF using gaussian likelihood: remove the sigmoid here
        generated_batch = tf.nn.sigmoid(self.model(cropped_test, training=False)).numpy()
        # IF using continuous bernoulli likelihood: add this line after the sigmoid
        #generated_batch = continuous_bernoulli_expected_value(tf.clip_by_value(generated_batch, 1e-4, 1-1e-4)).numpy()

        # 4 pairs per row; row count follows n_images (8 rows for the default 32)
        n_cols = 4
        n_rows = (self.n_images + n_cols - 1) // n_cols

        plt.figure(figsize=(15,15))
        for ind, (original, reconstruction) in enumerate(zip(cropped_test, generated_batch)):
            # original on the left, reconstruction on the right
            comparison = np.concatenate((original, reconstruction), axis=1)
            plt.subplot(n_rows, n_cols, ind+1)
            plt.imshow(comparison)
            plt.axis("off")
        plt.suptitle("Test set reconstructions")
        plt.show()


do_train = True
weights_path = "weights/weights_assignment0.h5"

if do_train:
    # create the output folder before training: otherwise a missing directory
    # would only surface in save_weights, after the whole run has finished
    os.makedirs(os.path.dirname(weights_path), exist_ok=True)

    #lr_schedule = tf.keras.callbacks.ReduceLROnPlateau(factor=0.2, patience=5, verbose=1)
    early_stop = tf.keras.callbacks.EarlyStopping(patience=30, restore_best_weights=True, verbose=1)
    reconstruct = ReconstructionCallback(10)  # plots reconstructions every 10 epochs

    history = model.fit(train_data, epochs=n_epochs, validation_data=test_data,
                        callbacks=[early_stop, reconstruct])
    model.save_weights(weights_path)
    # note: if using the continuous bernoulli loss, the loss will likely be < 0.
    # this might seem a bit weird, but is actually not an issue.
    # recall that this loss is the negative log likelihood.
    # - if the NLL is negative, that means the log likelihood is positive
    # - if the log likelihood is > 0, that implies that the likelihood is > 1
    # - a p > 1 is nothing unusual for continuous distributions, where we are using *density* functions
else:
    # instead of training, could load a previously trained model
    model.load_weights(weights_path)

In [None]:
# loss of the (trained or loaded) model on the test set
model.evaluate(x=test_data)

In [None]:
# this line if you are using bernoulli likelihood
reconstructions = tf.nn.sigmoid(model.predict(test_data)).numpy()
# this line IN ADDITION to the above if you are using *continuous* bernoulli likelihood
#reconstructions = continuous_bernoulli_expected_value(tf.clip_by_value(reconstructions, 1e-4, 1-1e-4)).numpy()

# this line instead of the above, if you are using gaussian likelihood
#reconstructions = model.predict(test_data)

# compare some inputs and reconstructions
plt.figure(figsize=(15, 15))
for position, (original, reconstruction) in enumerate(zip(test_images[:32], reconstructions[:32]), start=1):
    plt.subplot(8, 4, position)
    # original on the left, reconstruction on the right
    concat = np.concatenate((original, reconstruction), axis=1)
    plt.imshow(concat, vmin=0, vmax=1)
    plt.axis("off")
plt.show()

In [None]:
# collecting codes on the training set, flattened to one row per image
all_codes = np.reshape(encoder.predict(train_data), (-1, np.prod(code_shape)))

In [None]:
# some hamfisted attempts at generating random codes and applying the decoder
def generate_images(method):
    """Sample 64 random codes, decode them, plot the results on an 8x8 grid.

    The sampling distribution is fitted to the notebook-level `all_codes`
    array (one flattened code per row).

    Args:
        method: how the codes are sampled:
            - "uniform": per dimension, uniform between the observed min and max.
            - "gaussian": per dimension, normal with the observed mean and std.
            - "gaussian_full": multivariate normal with the observed mean and
              full covariance matrix.

    Returns:
        Numpy array holding the decoded (sigmoid-activated) images.

    Raises:
        ValueError: if `method` is not one of the names above.
    """
    known_methods = ("uniform", "gaussian", "gaussian_full")
    if method not in known_methods:
        # fail early with a clear message instead of hitting an unbound variable below
        raise ValueError(f"unknown method {method!r}; expected one of {known_methods}")

    n_samples = 64
    code_dim = np.prod(code_shape)

    if method == "uniform":
        random_codes = tf.random.uniform((n_samples, code_dim), all_codes.min(axis=0), all_codes.max(axis=0))
    elif method == "gaussian":
        random_codes = tf.random.normal((n_samples, code_dim), all_codes.mean(axis=0), np.std(all_codes, axis=0))
    else:  # "gaussian_full"
        covariance = np.cov(all_codes, rowvar=False).astype(np.float32)
        # MultivariateNormalFullCovariance is deprecated; its deprecation notice
        # names MultivariateNormalTriL with the Cholesky factor as the replacement
        distr = tfp.distributions.MultivariateNormalTriL(
            loc=all_codes.mean(axis=0), scale_tril=tf.linalg.cholesky(covariance))
        random_codes = distr.sample(n_samples)

    # back from flat vectors to the shape the decoder expects
    random_codes = tf.reshape(random_codes, (-1,) + code_shape)

    generated = tf.nn.sigmoid(decoder(random_codes)).numpy()
    # again: the below is specific to using continuous bernoulli likelihood
    # (`generated` already went through the sigmoid above, so it is only clipped here)
    #generated = continuous_bernoulli_expected_value(tf.clip_by_value(generated, 1e-4, 1-1e-4)).numpy()

    plt.figure(figsize=(15, 15))
    for ind, img in enumerate(generated):
        plt.subplot(8, 8, ind + 1)
        plt.imshow(img)
        plt.axis("off")
    plt.show()

    return generated

In [None]:
# codes drawn uniformly between the per-dimension min and max of the training codes
gen = generate_images(method="uniform")

In [None]:
# codes drawn from independent per-dimension gaussians fitted to the training codes
gen = generate_images(method="gaussian")

In [None]:
# codes drawn from a gaussian with the full covariance matrix of the training codes
gen = generate_images(method="gaussian_full")

In [None]:
# look at distribution of the first few code dimensions. looks reasonably gaussian!
# one histogram per code dimension, first 64 dimensions on an 8x8 grid
plt.figure(figsize=(15, 15))
for position, code_values in enumerate(all_codes[:, :64].T, start=1):
    plt.subplot(8, 8, position)
    plt.hist(code_values, bins=50)
plt.show()

In [None]:
# covariance matrix reveals dependencies between dimensions
covariance = np.cov(all_codes, rowvar=False).astype(np.float32)

# symmetric color range around zero, taken from the largest absolute entry
absmax = np.abs(covariance).max()

plt.figure(figsize=(15, 15))
# only the top-left 64x64 corner is drawn
plt.imshow(covariance[:64, :64], cmap="coolwarm", vmin=-absmax, vmax=absmax)
plt.colorbar()
plt.show()

In [None]:
# better: *correlation* matrix
# stored under its own name so the covariance matrix is not overwritten
correlation = np.corrcoef(all_codes, rowvar=False).astype(np.float32)

plt.figure(figsize=(15, 15))
# correlations lie in [-1, 1], so the color range is fixed instead of data-driven
plt.imshow(correlation[:64, :64], vmin=-1, vmax=1, cmap="coolwarm")
plt.colorbar()
plt.show()