# MNIST classification

Timings here are for a machine with a K80 GPU (an Azure NC6 instance). Running with a CPU only machine is going to quite a bit slower. 

**This requires tensorflow 1.0. For some reason things are breaking on tensorflow 1.1 (https://github.com/GPflow/GPflow/issues/415)**

In [1]:
import matplotlib.pyplot as plt
%matplotlib inline

import sys
sys.path.append('/homes/mlghomes/mh740/GPflow')
sys.path.append('/homes/mlghomes/mh740/Doubly-Stochastic-DGP/doubly_stochastic_dgp')

print(sys.path)

['', '/usr/lib/python2.7', '/usr/lib/python2.7/plat-x86_64-linux-gnu', '/usr/lib/python2.7/lib-tk', '/usr/lib/python2.7/lib-old', '/usr/lib/python2.7/lib-dynload', '/usr/local/lib/python2.7/dist-packages', '/usr/lib/python2.7/dist-packages', '/usr/lib/python2.7/dist-packages/PILcompat', '/usr/lib/python2.7/dist-packages/gtk-2.0', '/usr/lib/pymodules/python2.7', '/usr/lib/python2.7/dist-packages/ubuntu-sso-client', '/usr/lib/python2.7/dist-packages/wx-2.8-gtk2-unicode', '/usr/local/lib/python2.7/dist-packages/IPython/extensions', '/homes/mlghomes/mh740/.ipython', '/homes/mlghomes/mh740/GPflow', '/homes/mlghomes/mh740/Doubly-Stochastic-DGP/doubly_stochastic_dgp']


In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import numpy as np
import tensorflow as tf

from gpflow.likelihoods import MultiClass
from gpflow.kernels import RBF, White, Linear, Matern32, Matern52
from gpflow.svgp import SVGP
from gpflow.gpr import GPR

from gpflow.param import AutoFlow

from scipy.stats import mode
from scipy.cluster.vq import kmeans2

from get_data import get_mnist_data
from dgp import DGP

import time

X, Y, Xs, Ys = get_mnist_data()

Extracting ../data/MNIST_data/train-images-idx3-ubyte.gz
Extracting ../data/MNIST_data/train-labels-idx1-ubyte.gz
Extracting ../data/MNIST_data/t10k-images-idx3-ubyte.gz
Extracting ../data/MNIST_data/t10k-labels-idx1-ubyte.gz


We'll use 100 inducing points 

In [3]:
M = 30
Z = kmeans2(X, M, minit='points')[0]

Slightly annoyingly,  `AutoFlow` takes `Ynew` as a `float_type` in `predict_density`, but for the mutliclass likelihood the input is `tf.int32` (also the number of dimensions are different). We defined both versions in our `DGP` class, but as a workaround for `SVGP` we just override the behaviour:


In [4]:
class MultiClassSVPG(SVGP):
    @AutoFlow((tf.float64, [None, None]), (tf.int32, [None,]))
    def predict_density(self, Xnew, Ynew):
        pred_f_mean, pred_f_var = self.build_predict(Xnew)
        return self.likelihood.predict_density(pred_f_mean, pred_f_var, Ynew)

We'll compare three models: an ordinary sparse GP and DGPs with 2 and 3 layers. 

We'll use a batch size of 1000 for all models 

In [5]:
m_sgp = MultiClassSVPG(X, Y.reshape(-1, 1).astype(np.float64), RBF(784, lengthscales=2, variance=2), 
            MultiClass(10), Z, 
            num_latent=10, minibatch_size=10000, whiten=True)

In [6]:
def make_dgp(L):
    kernels = [RBF(784, lengthscales=2., variance=2.)]
    for l in range(L-1):
        kernels.append(RBF(30, lengthscales=2., variance=2.))
    model = DGP(X, Y, Z, kernels, MultiClass(10), 
                num_samples=1,
                minibatch_size=10000,
                num_latent_Y=10)

    for layer in model.layers[:-1]:
        layer.q_sqrt = layer.q_sqrt.value * 1e-5 
    
    return model
m_dgp2 = make_dgp(2)
m_dgp3 = make_dgp(3)

For the SGP model we'll calcuate accuracy by simply taking the max mean prediction:

In [7]:
def assess_model_sgp(model, X_batch, Y_batch):
    m, v = model.predict_y(X_batch)
    l = model.predict_density(X_batch, Y_batch)
    a = (np.argmax(m, 1)==Y_batch)
    return l, a

For the DGP models we have stochastic predictions. We need a single prediction for each datum, so to do this we take $S$ samples for the one-hot predictions ($(S, N, 10)$ matrices for mean and var), then we take the max over the class means (to give a $(S, N)$ matrix), and finally we take the modal class over the samples (to give a vector of length $N$):

We'll use 100 samples

In [8]:
S = 100
def assess_model_dgp(model, X_batch, Y_batch):
    m, v = model.predict_y(X_batch, S)
    l = model.predict_density(X_batch, Y_batch, S)
    a = (mode(np.argmax(m, 2), 0)[0].flatten()==Y_batch)
    return l, a

We need batch predictions (we might run out of memory otherwise)

In [9]:
def batch_assess(model, assess_model, X, Y):
    n_batches = int(len(X)/1000)
    lik, acc = [], []
    for X_batch, Y_batch in zip(np.split(X, n_batches), np.split(Y, n_batches)):
        l, a = assess_model(model, X_batch, Y_batch)
        lik.append(l)
        acc.append(a)
    lik = np.concatenate(lik, 0)
    acc = np.array(np.concatenate(acc, 0), dtype=float)
    return np.average(lik), np.average(acc)

Finally, we'll use the following callback to log what's going on. We'll train for 10000 iterations, printing every 1000 to see how convergence is doing. We'll predict also at the training data to see what's going (we don't use a validation set). 

In [10]:
class CB(object):
    def __init__(self, model, assess_model):
        self.model = model
        self.assess_model = assess_model
        self.i = 0
        self.t = time.time()
        self.train_time = 0
        self.ob = []
        self.train_lik = []
        self.train_acc = []
    def cb(self, x):
        self.i += 1
        if self.i % 100 == 0:
            # time how long we've be training 
            self.train_time += time.time() - self.t
            self.t = time.time()
            
            # assess the model on the training data
            self.model.set_state(x)
            lik, acc = batch_assess(self.model, self.assess_model, X, Y)
            self.train_lik.append(lik)
            self.train_acc.append(acc)
            
            # calculate the objective, averaged over S samples 
            ob = 0
            for _ in range(1):
                ob += self.model.compute_log_likelihood()/float(1)
            self.ob.append(ob)
            
            st = 'it: {}, ob: {:.1f}, train lik: {:.4f}, train acc {:.4f}'
            print st.format(self.i, ob, lik, acc)

Now we're ready to go

The sparse GP:

In [11]:
cb_sgp = CB(m_sgp, assess_model_sgp)

In [12]:
m_sgp.optimize(tf.train.AdamOptimizer(0.01), maxiter=10000, callback=cb_sgp.cb)

it: 100, ob: -364953.7, train lik: -1.2899, train acc 0.7831
it: 200, ob: -226536.2, train lik: -0.6776, train acc 0.8724
it: 300, ob: -130017.8, train lik: -0.4011, train acc 0.8982
it: 400, ob: -78092.8, train lik: -0.2813, train acc 0.9201
it: 500, ob: -67103.4, train lik: -0.2514, train acc 0.9301
it: 600, ob: -59758.3, train lik: -0.2338, train acc 0.9360
it: 700, ob: -54670.9, train lik: -0.2257, train acc 0.9390
it: 800, ob: -53411.8, train lik: -0.2188, train acc 0.9418
it: 900, ob: -51405.3, train lik: -0.2129, train acc 0.9442
it: 1000, ob: -48470.8, train lik: -0.2088, train acc 0.9459
it: 1100, ob: -48456.3, train lik: -0.2031, train acc 0.9473
it: 1200, ob: -46191.2, train lik: -0.2030, train acc 0.9484
it: 1300, ob: -45351.6, train lik: -0.1992, train acc 0.9504
it: 1400, ob: -43376.3, train lik: -0.1960, train acc 0.9510
it: 1500, ob: -43182.0, train lik: -0.1957, train acc 0.9516
it: 1600, ob: -42432.6, train lik: -0.1918, train acc 0.9526
it: 1700, ob: -43986.7, train 

     fun: 38445.024020967016
     jac: array([-0.        , -0.        , -0.        , ..., -0.2239916 ,
       -0.22270208,  0.33455356])
 message: 'Finished iterations.'
  status: 'Finished iterations.'
 success: True
       x: array([ 0.        ,  0.        ,  0.        , ..., -0.05303204,
       -0.04260791,  0.23412898])

In [13]:
print 'sgp total train time {:.4f}'.format(cb_sgp.train_time)
l, a = batch_assess(m_sgp, assess_model_sgp, Xs, Ys)
print 'spg test lik: {:.4f}, test acc {:.4f}'.format(l, a)

sgp total train time 1895.5112
spg test lik: -0.1594, test acc 0.9592


Using more inducing points improves things, but at the expense of very slow computation (500 inducing points takes about a day)

The two layer DGP:

In [14]:
cb_dgp2 = CB(m_dgp2, assess_model_dgp)
m_dgp2.optimize(tf.train.AdamOptimizer(0.01), maxiter=10000, callback=cb_dgp2.cb)
print 'dgp2 total train time {:.4f}'.format(cb_dgp2.train_time)
l, a = batch_assess(m_dgp2, assess_model_dgp, Xs, Ys)
print 'dgp2 test lik: {:.4f}, test acc {:.4f}'.format(l, a)

(TensorShape([Dimension(None), Dimension(None)]), TensorShape([Dimension(None), Dimension(None)]))


ValueError: Cannot feed value of shape (1000,) for Tensor u'Placeholder_10:0', which has shape '(?, ?)'

And the three layer:

In [None]:
cb_dgp3 = CB(m_dgp3, assess_model_dgp)
m_dgp3.optimize(tf.train.AdamOptimizer(0.01), maxiter=10000, callback=cb_dgp3.cb)
print 'dgp3 total train time {:.4f}'.format(cb_dgp3.train_time)
l, a = batch_assess(m_dgp3, assess_model_dgp, Xs, Ys)
print 'dgp3 test lik: {:.4f}, test acc {:.4f}'.format(l, a)

The 3 layer DGP is best on both accuracy and likelihood, though the improvement over the 2 layer is slight. 

We can see how they've done over the training

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(cb_sgp.train_acc, label='sgp')
plt.plot(cb_dgp2.train_acc, label='dgp2')
plt.plot(cb_dgp3.train_acc, label='dgp3')
plt.title('train accuray')
plt.legend()
plt.show()

In [None]:
plt.plot(cb_sgp.train_lik, label='sgp')
plt.plot(cb_dgp2.train_lik, label='dgp2')
plt.plot(cb_dgp3.train_lik, label='dgp3')
plt.title('train likelihood')
plt.legend()
plt.show()