In [22]:
import pdb
import time
import os, ssl, sys
sys.path.append('../code/')

import numpy as np
import tensorflow as tf
import gpflow

from gpflow.likelihoods import Gaussian
from gpflow.kernels import SquaredExponential, White
from gpflow.utilities import print_summary
from gpflow.base import Parameter
from gpflow.config import default_float
from scipy.cluster.vq import kmeans2
from scipy.stats import norm
from scipy.special import logsumexp

from datasets import Datasets

import warnings
# Suppress all Python warnings for the rest of the session. NOTE(review): this
# also hides deprecation/numerical warnings that may matter while debugging.
warnings.filterwarnings('ignore')

# Make float64 GPflow's default floating-point type.
gpflow.config.set_default_float(np.float64)
# Seed the NumPy and TensorFlow global random generators for repeatable runs.
np.random.seed(0)
tf.random.set_seed(0)

In [30]:
# --- Data -------------------------------------------------------------------
dataset = 'boston'          # key looked up in `datasets.all_datasets`
splits = 20                 # length of the per-split result arrays

# --- Model / optimisation ---------------------------------------------------
num_inducing = 100          # number of k-means centres used as inducing points
learning_rate = 0.01        # step size handed to the Adam optimizer
batch_size, test_batch_size = 10000, 100   # training / prediction mini-batch sizes

# --- Training loop ----------------------------------------------------------
iterations, logging_iter_freq = 1000, 500  # total steps / steps between ELBO prints
log_dir = './log/'          # forwarded to the training loop as `logdir`

Get the dataset:

In [31]:
# Instantiate the project-local `Datasets` helper (imported from the `datasets`
# module made importable via the '../code/' path entry), pointing it at
# '../data/'. Individual datasets are later looked up through
# `datasets.all_datasets[<name>]`.
datasets = Datasets(data_path='../data/')

In [32]:
# Scalar totals accumulated over all splits (error, log likelihood, wall time).
running_err = running_loss = running_time = 0

# One slot per split for the test error, test log likelihood and training time.
test_errs, test_nlls, test_times = (np.zeros(splits) for _ in range(3))

def optimisation_step(model, X, Y, optimizer):
    """Run one gradient step that maximises the model's ELBO on a mini-batch.

    Args:
        model: object exposing ``elbo(data)`` and ``trainable_variables``
            (here a GPflow SVGP model).
        X: mini-batch inputs.
        Y: mini-batch targets.
        optimizer: object exposing ``apply_gradients(grads_and_vars)``
            (here a ``tf.optimizers`` optimizer).

    Returns:
        None. The model's trainable variables are updated in place by the
        optimizer.
    """
    variables = model.trainable_variables

    # Only the forward pass is recorded. Trainable variables are watched
    # automatically by a default GradientTape, so no explicit `watch` is needed.
    with tf.GradientTape() as tape:
        objective = -model.elbo((X, Y))

    # Differentiate *outside* the tape context: calling `gradient` inside it
    # makes the tape record the backward pass as well, which costs memory and
    # time and is only useful for higher-order derivatives.
    grads = tape.gradient(objective, variables)
    optimizer.apply_gradients(zip(grads, variables))

def monitored_training_loop(model, train_dataset, logdir, iterations,
                            logging_iter_freq, optimizer):
    """Train `model` for a fixed number of mini-batch steps, printing the ELBO.

    Args:
        model: model passed to `optimisation_step`; must expose ``elbo(data)``.
        train_dataset: iterable yielding ``(X, Y)`` mini-batches; it must be
            able to supply at least `iterations` batches.
        logdir: accepted for interface compatibility; not used by this loop.
        iterations: number of optimisation steps to run.
        logging_iter_freq: the batch ELBO is printed every this many steps.
        optimizer: optimizer forwarded to `optimisation_step`.
    """
    compiled_step = tf.function(optimisation_step)
    batch_iterator = iter(train_dataset)

    for step in range(1, iterations + 1):
        X, Y = next(batch_iterator)
        compiled_step(model, X, Y, optimizer)

        if step % logging_iter_freq != 0:
            continue

        # ELBO of the batch just trained on, evaluated after the update.
        batch_elbo = model.elbo((X, Y))
        print('Epoch {}: ELBO (batch) {}'.format(step, batch_elbo))

In [33]:
# Train and evaluate one SVGP model per train/test split.
#
# For every split this cell fits the model, then records the training time,
# the test RMSE and the mean test log likelihood (both in the original target
# units, via `Y_std`) into `test_times`, `test_errs` and `test_nlls`.

# Start the totals from zero so that re-running this cell does not double count.
running_err = running_loss = running_time = 0

# Iterate over `splits` (not a literal) so the loop always matches the length
# of the per-split result arrays.
for i in range(splits):
    print('Split: {}'.format(i))
    print('Getting dataset...')
    data = datasets.all_datasets[dataset].get_data(i, normalize=True)
    X, Y, Xs, Ys, Y_std = [data[key] for key in ['X', 'Y', 'Xs', 'Ys', 'Y_std']]
    # Initialise the inducing inputs at k-means centres of the training inputs.
    Z = kmeans2(X, num_inducing, minit='points')[0]

    num_train = X.shape[0]
    # Clamp into a local name: overwriting the global `batch_size` would leak a
    # small split's size into every later split and into re-runs of this cell.
    split_batch_size = min(batch_size, num_train)
    # Element order is unchanged; prefetching is moved to the end of the
    # pipeline so that whole batches are prepared ahead of the training step.
    train_dataset = (
        tf.data.Dataset.from_tensor_slices((X, Y))
        .repeat()
        .shuffle(buffer_size=num_train // 2)
        .batch(split_batch_size)
        .prefetch(1)
    )

    print('Setting up SVGP model...')
    kernel = SquaredExponential()
    likelihood = Gaussian(variance=0.05)
    # `num_data` lets the model rescale the mini-batch likelihood term to the
    # full training set; without it the ELBO is mis-scaled whenever
    # split_batch_size < num_train.
    model = gpflow.models.SVGP(kernel=kernel, likelihood=likelihood,
                               inducing_variable=Z, num_data=num_train)

    print('Training SVGP model...')
    optimizer = tf.optimizers.Adam(learning_rate)
    t0 = time.time()
    monitored_training_loop(model, train_dataset, logdir=log_dir,
                            iterations=iterations,
                            logging_iter_freq=logging_iter_freq,
                            optimizer=optimizer)
    train_seconds = time.time() - t0
    test_times[i] = train_seconds
    print('Time taken to train: {}'.format(train_seconds))
    running_time += train_seconds

    # Predict in chunks of at most `test_batch_size` rows; a test set that fits
    # in one chunk yields a single iteration.
    means, variances = [], []
    for start in range(0, len(Xs), test_batch_size):
        m, v = model.predict_y(Xs[start:start + test_batch_size, :])
        means.append(m)
        variances.append(v)

    # Stack the chunks along the data axis (axis 0). Concatenating along axis 1
    # would place the chunks side by side as extra columns and fails outright
    # when the last chunk is shorter than the others.
    mean_ND = np.concatenate(means, axis=0)
    var_ND = np.concatenate(variances, axis=0)

    # RMSE rescaled by Y_std back to the original target units.
    test_err = np.mean(Y_std * np.mean((Ys - mean_ND) ** 2.0) ** 0.5)
    test_errs[i] = test_err
    print('Average RMSE: {}'.format(test_err))
    running_err += test_err

    # Mean predictive log density in original units. Despite the `nll` naming
    # of the storage array, this is a log likelihood (higher is better).
    test_nll = np.mean(norm.logpdf(Ys * Y_std, mean_ND * Y_std, var_ND ** 0.5 * Y_std))
    test_nlls[i] = test_nll
    print('Average test log likelihood: {}'.format(test_nll))
    running_loss += test_nll

Split: 0
Getting dataset...
Setting up SVGP model...
Training SVGP model...
Epoch 500: ELBO (batch) -376.1104228025378
Epoch 1000: ELBO (batch) -334.1774254068115
Time taken to train: 7.233223915100098
Average RMSE: 2.1966386933601365
Average test log likelihood: -2.3495337431086654
Split: 1
Getting dataset...
Setting up SVGP model...
Training SVGP model...
Epoch 500: ELBO (batch) -432.00234444588386
Epoch 1000: ELBO (batch) -363.91983822920474
Time taken to train: 7.293781995773315
Average RMSE: 2.4725995617981327
Average test log likelihood: -2.3906873315563733
Split: 2
Getting dataset...
Setting up SVGP model...
Training SVGP model...
Epoch 500: ELBO (batch) -354.44459400209166
Epoch 1000: ELBO (batch) -328.0347315174917
Time taken to train: 8.118633031845093
Average RMSE: 2.7913412357717187
Average test log likelihood: -2.454332121430589
Split: 3
Getting dataset...
Setting up SVGP model...
Training SVGP model...
Epoch 500: ELBO (batch) -350.6846205246418
Epoch 1000: ELBO (batch) -3

KeyboardInterrupt: 