# DGP for regression

Here we'll show the DGP for regression, using small to medium data sets. 

In [1]:
import sys
sys.path.append('/homes/mlghomes/mh740/GPflow')
sys.path.append('/homes/mlghomes/mh740/Doubly-Stochastic-DGP/doubly_stochastic_dgp')
sys.path.append('/homes/mlghomes/mh740/Doubly-Stochastic-DGP/approx_EP_dgp')

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline 

from gpflow.likelihoods import Gaussian
from gpflow.kernels import RBF, White
from gpflow.mean_functions import Constant
from gpflow.sgpr import SGPR, GPRFITC
from gpflow.svgp import SVGP
from gpflow.gpr import GPR

from scipy.cluster.vq import kmeans2
from sklearn.model_selection import ParameterGrid

from get_data import get_regression_data
from dgp import DGP
from aepdgp import GPNetwork
import time


Let's use the kin8nm data set

In [2]:
# Load split 0 of the kin8nm data: (X, Y) are used below as the training set
# and (Xs, Ys) as the test set.
X, Y, Xs, Ys = get_regression_data('kin8nm', split=0)
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print('N: {}, D: {}, Ns: {}'.format(X.shape[0], X.shape[1], Xs.shape[0]))

N: 7372, D: 8, Ns: 820


## Single layer models

Our baseline model is a sparse GP, but since the dataset is small we can also train without minibatches, so we'll also compare to a collapsed sparse GP (with analytically optimal $q(\mathbf u)$), which is known as SGPR in GPflow terminology, and we'll also compare to FITC.

In [3]:
def make_single_layer_models(X, Y, Z):
    """Build the three single-layer baselines: SGPR, SVGP and GPRFITC.

    Every model receives its own RBF kernel (variance initialised to
    std(Y)**2), its own copy of the inducing inputs ``Z`` and its own
    Constant mean function set to the average of ``Y``. Afterwards each
    mean function is flagged as fixed and each likelihood variance is
    initialised to 0.1 * std(Y).

    Returns:
        The tuple ``(m_sgpr, m_svgp, m_fitc)``.
    """
    input_dim = X.shape[1]
    target_mean = np.average(Y)
    target_std = np.std(Y)

    def fresh_kernel():
        # A separate kernel instance per model, so no parameters are shared.
        return RBF(input_dim, variance=target_std**2)

    def fresh_mean():
        return Constant(target_mean)

    m_sgpr = SGPR(X, Y, fresh_kernel(), Z.copy(), mean_function=fresh_mean())
    m_svgp = SVGP(X, Y, fresh_kernel(), Gaussian(), Z.copy(), mean_function=fresh_mean())
    m_fitc = GPRFITC(X, Y, fresh_kernel(), Z.copy(), mean_function=fresh_mean())

    models = (m_sgpr, m_svgp, m_fitc)
    for single_layer_model in models:
        single_layer_model.mean_function.fixed = True
        single_layer_model.likelihood.variance = 0.1 * target_std
    return models


## DGP models

We'll include a DGP with a single layer here for comparison. We've used a large minibatch size of $\text{min}(10000, N)$, but it works just fine for smaller batches.

In [4]:
def make_dgp(X, Y, inducing_points, final_inducing_points, hidden_units, units, share_inducing_inputs):
    """Construct a DGP and apply this notebook's initialisation.

    All arguments are forwarded to ``DGP`` unchanged, in the same order.
    After construction:

    * the likelihood variance is set to 0.1 * std(Y);
    * the last layer's mean function is replaced by ``Constant(mean(Y))``
      and flagged as fixed;
    * ``q_sqrt`` of every node in every layer except the last is scaled
      by 1e-5.

    Returns:
        The initialised DGP model.
    """
    Y_mean, Y_std = np.average(Y), np.std(Y)

    model = DGP(X, Y, inducing_points, final_inducing_points, hidden_units, units, share_inducing_inputs)

    # same final layer inits we used for the single layer model
    model.likelihood.variance = Y_std * 0.1
    model.layers[-1].mean_function = Constant(Y_mean)
    model.layers[-1].mean_function.fixed = True

    # start the inner layers almost deterministically
    for layer in model.layers[:-1]:
        for node in layer.nodes:
            node.q_sqrt = node.q_sqrt.value * 1e-5

    return model

def make_aepdgp(X, Y, ind_points, L, W):
    """Build a GPNetwork with L layers and no linear mean.

    The layer specification is ``L - 1`` entries of ``(W, ind_points)``
    followed by a single ``(1, ind_points)`` entry.
    NOTE(review): presumably each pair is (layer width, number of inducing
    points) -- confirm against GPNetwork.
    """
    hidden_layers = [(W, ind_points) for _ in range(L - 1)]
    output_layer = (1, ind_points)
    return GPNetwork(X, Y, hidden_layers + [output_layer], linear_mean=False)

## Prediction

We'll calculate test rmse and likelihood in batches (so the larger datasets don't cause memory problems)

For the DGP models we need to take an average over the samples for the rmse. The `predict_density` function already does this internally


In [None]:
def batch_assess(model, assess_model, X, Y):
    """Evaluate ``model`` on (X, Y) in chunks and aggregate the results.

    X and Y are split into ``max(int(N / 1000), 1)`` chunks, where N is the
    number of rows of X. ``assess_model(model, X_chunk, Y_chunk)`` is called
    on each chunk and must return a pair ``(lik, sq_diff)`` of arrays.

    Returns:
        ``(average of all lik values, square root of the average sq_diff)``.
    """
    num_chunks = max(int(X.shape[0] / 1000.), 1)
    x_chunks = np.array_split(X, num_chunks)
    y_chunks = np.array_split(Y, num_chunks)

    per_chunk = [assess_model(model, x_chunk, y_chunk)
                 for x_chunk, y_chunk in zip(x_chunks, y_chunks)]
    lik_parts, sq_parts = zip(*per_chunk)

    all_lik = np.concatenate(lik_parts, 0)
    all_sq = np.array(np.concatenate(sq_parts, 0), dtype=float)
    return np.average(all_lik), np.average(all_sq)**0.5

def assess_single_layer(model, X_batch, Y_batch):
    """Assess a single-layer model on one batch.

    Returns:
        ``(lik, sq_diff)`` where ``lik`` is whatever
        ``model.predict_density(X_batch, Y_batch)`` returns and ``sq_diff``
        is the element-wise squared difference between the first output of
        ``model.predict_y(X_batch)`` and ``Y_batch``.
    """
    density = model.predict_density(X_batch, Y_batch)
    # predict_y returns a (mean, variance) pair; only the mean is needed here.
    predicted_mean, _ = model.predict_y(X_batch)
    residual = predicted_mean - Y_batch
    return density, residual**2

# Default number of samples drawn when assessing the sampled (DGP) models.
S = 100


def assess_sampled(model, X_batch, Y_batch, num_samples=None):
    """Assess a sampling-based model on one batch.

    The same sample count is passed to both ``predict_density`` and
    ``predict_y`` so the two metrics are always computed consistently.

    Args:
        model: object exposing ``predict_density(X, Y, n)`` and
            ``predict_y(X, n)``.
        X_batch: batch of inputs.
        Y_batch: batch of targets.
        num_samples: number of samples to request; defaults to the
            module-level ``S`` when omitted.

    Returns:
        ``(lik, sq_diff)`` where ``lik`` is the result of ``predict_density``
        and ``sq_diff`` is the squared difference between ``Y_batch`` and the
        first output of ``predict_y`` averaged over axis 0.
    """
    if num_samples is None:
        num_samples = S
    lik = model.predict_density(X_batch, Y_batch, num_samples)
    mean_samples, _ = model.predict_y(X_batch, num_samples)
    # Average the per-sample means (axis 0) before computing the error.
    mean = np.average(mean_samples, 0)
    sq_diff = (mean - Y_batch)**2
    return lik, sq_diff

def assess_aepdgp(model, X_batch, Y_batch):
    """Assess an AEP DGP model on one batch.

    Returns:
        ``(lik, sq_diff)`` where ``lik`` is whatever
        ``model.predictBatchDensity(X_batch, Y_batch)`` returns and
        ``sq_diff`` is the element-wise squared difference between the first
        output of ``model.predictBatch(X_batch)`` and ``Y_batch``.
    """
    density = model.predictBatchDensity(X_batch, Y_batch)
    # predictBatch returns a (mean, variance) pair; only the mean is needed here.
    predicted_mean, _ = model.predictBatch(X_batch)
    residual = predicted_mean - Y_batch
    return density, residual**2

## Training 

We'll optimize the single layer models using L-BFGS and the DGP models with Adam. It will be interesting to compare the result of `m_svgp` with that of `m_dgp1`: if there is a difference it will be down to the optimizer.

We need to take the data in batches to predict for the larger data sets, so we'll define that now

In [None]:
# Hyperparameter grid for the DGP; ParameterGrid yields every combination.
grid = {'hidden_units': [0, 1, 2, 3, 4],
        'inducing_points': [10, 50, 100],
        'final_inducing_points': [10, 50, 100],
        'units': [1, 2, 4, 8, 16, 32],
        'share_inducing_inputs': [True, False]}

for params in ParameterGrid(grid):
    print(params)
    # NOTE(review): Z is not passed to make_dgp in this cell; kept because other
    # cells may read it -- confirm whether this k-means run is still needed.
    Z = kmeans2(X, params['inducing_points'], minit='points')[0]
    model = make_dgp(X, Y,
                     params['inducing_points'],
                     params['final_inducing_points'],
                     params['hidden_units'],
                     params['units'],
                     params['share_inducing_inputs'])
    t = time.time()
    model.optimize(tf.train.AdamOptimizer(0.01), maxiter=5000)
    # Stop the clock before evaluating, so "Training time" covers optimisation only.
    train_time = time.time() - t
    lik, rmse = batch_assess(model, assess_sampled, X, Y)
    print('Training lik: {:.4f}, rmse: {:.4f}. Training time: {:.4f}'.format(lik, rmse, train_time))
    lik, rmse = batch_assess(model, assess_sampled, Xs, Ys)
    print('Test lik: {:.4f}, rmse: {:.4f}. Training time: {:.4f}'.format(lik, rmse, train_time))


{'units': 1, 'final_inducing_points': 10, 'share_inducing_inputs': True, 'hidden_units': 0, 'inducing_points': 10}


In [None]:
# Train the two 2-layer DGPs in 10 rounds of 1000 Adam iterations each,
# reporting test metrics after every round.
# NOTE(review): m_dgp2_w and m_dgp2 are not created in the cells shown here;
# they must be defined before this cell runs.
for i in range(10):
    # Formatted as one string so the output is identical on Python 2 and 3.
    print('iter {}'.format(i))
    for m, name in zip([m_dgp2_w, m_dgp2], ['dgp2_wide', 'dgp2']):
        t = time.time()
        m.optimize(tf.train.AdamOptimizer(0.01), maxiter=1000)
        # Stop the clock before evaluating, so "Training time" covers optimisation only.
        train_time = time.time() - t
        lik, rmse = batch_assess(m, assess_sampled, Xs, Ys)
        print('{:<16}  lik: {:.4f}, rmse: {:.4f}. Training time: {:.4f}'.format(name, lik, rmse, train_time))


Now for the non DGP models:

In [None]:
# Optimise each single-layer model with its default optimiser and report test metrics.
# NOTE(review): none of these models is created in the cells shown here
# (make_single_layer_models is defined but never called in view); they must be
# defined before this cell runs.
single_layer_models = [m_sgpr, m_svgp, m_fitc, m_sgpr_500, m_svgp_500, m_fitc_500]
single_layer_names = ['col sgp', 'sgp', 'fitc', 'col sgp 500', 'sgp 500', 'fitc 500']
for m, name in zip(single_layer_models, single_layer_names):
    t = time.time()
    m.optimize()
    # Stop the clock before evaluating, so "Training time" covers optimisation only.
    train_time = time.time() - t
    lik, rmse = batch_assess(m, assess_single_layer, Xs, Ys)
    print('{:<16}  lik: {:.4f}, rmse: {:.4f}. Training time: {:.4f}'.format(name, lik, rmse, train_time))
