In [None]:
import GPy
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from IPython.display import display

# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'   # high-resolution (HiDPI) inline figures
# Default figure size (width, height) in inches for every plot in the notebook.
plt.rcParams["figure.figsize"] = [8,8]

### Generate a GP model

In [None]:
# RBF kernel over a single input dimension, with unit variance and lengthscale
# as starting values.
kernel = GPy.kern.RBF(input_dim=1, variance=1., lengthscale=1)
# Many kernels are already implemented in GPy.
# A comprehensive list can be found by typing GPy.kern.<tab>
# Also, kernels can be combined: https://nbviewer.jupyter.org/github/SheffieldML/notebook/blob/master/GPy/basic_kernels.ipynb

# NOTE(review): X and Y are not defined in this cell; they must already exist
# in the kernel session (presumably 2-D arrays with one column each - confirm).
m = GPy.models.GPRegression(X,Y,kernel)
# Optimize the model's hyperparameters, printing progress messages.
m.optimize(messages=True)
# or, with several random initializations
m.optimize_restarts(num_restarts = 10)

# Show the fitted model summary, then plot the model with its predictive density.
display(m)
fig = m.plot(plot_density=True)

### Generate samples of f(x) i.e. functions

In [None]:
# Dense grid of 1000 test inputs on [-5, 5], as a column vector of shape (1000, 1).
X_true = np.linspace(-5,5,1000)[:,None]
# Draw `nsamples` functions from the posterior over f at the test inputs and
# lay them out with one column per sample.
# NOTE(review): `m` (a fitted GPy model) and `nsamples` are not defined in this
# cell and must exist beforehand; the reshape presumably assumes a single
# output dimension - confirm.
# Fix: the original line was missing the closing parenthesis of `.reshape(...)`.
posteriorYtest = m.posterior_samples_f(X_true, full_cov=True, size=nsamples).reshape([-1,nsamples])

### Automatic Relevance Determination in RBF kernel (ARD-RBF)
**Automatic Relevance Determination** kernel: In the RBF-ARD kernel,  each input dimension has a different lengthscale, hence enabling interpretability:

\begin{align}
k(\mathbf{x},\mathbf{x}^*) = \sigma^2 \exp \left(- \sum_{d=1}^{D} \frac{(x_d-x_d^*)^2}{2\ell_d^2}\right)
\end{align}

*That is, a larger $\ell_d$ means that the covariance decays more slowly along feature $d$: points that are far apart in that dimension remain strongly correlated, so the function changes little as feature $d$ varies.*

*In short, larger $\ell_d$ means that feature $d$ is less relevant.*



In [None]:
# RBF kernel with ARD enabled: one lengthscale per input dimension (2 here).
# NOTE(review): X and Y are not defined in this cell; X is assumed to have two
# columns to match the kernel's input dimension - confirm.
ker4 = GPy.kern.RBF(2,ARD=True)
m4 = GPy.models.GPRegression(X,Y,ker4)
m4.optimize(messages=True,max_f_eval = 1000)

# Let's get a list of all kernel hyperparameters and their values.
# `ker4` is a plain RBF kernel, not a sum of kernels, so its parameters are
# attributes of the kernel itself: there is no `rbf` or `white` sub-kernel,
# and `ker4.rbf...` / `ker4.white...` would raise AttributeError.
print(ker4.parameter_names())
print(ker4.variance.values)
print(ker4.lengthscale.values)
# No white-noise kernel is part of `ker4`; the noise variance of this model is
# held by the Gaussian likelihood of the regression model instead.
print(m4.likelihood.variance.values)

### GP classification

### Sparse GPs
GPs are kernelized methods that involve the inversion of the full kernel matrix (size NxN, where N is the number of training points) in order to make predictions. This is a computationally expensive procedure ( $\mathcal{O}(N^3)$ ).

Therefore, sparse GPs summarize the full GP with a small set of $N'$ representative (inducing) points, with $N' \ll N$, which makes the kernel matrix that has to be inverted much smaller.

In [None]:
# Initial inducing inputs: `npoints` random locations drawn uniformly from [0, 1).
# NOTE(review): `npoints`, `X` and `Y` are not defined in this cell and must
# exist beforehand; the inducing inputs presumably should lie within the range
# of X - confirm.
Z = np.random.rand(npoints,1)
# Fix: the targets are named `Y` in every other cell of this notebook (was `y`).
m = GPy.models.SparseGPRegression(X,Y,Z=Z)

m.optimize('bfgs')

# Fix: `m_full` was used below without ever being created. Fit a full
# (non-sparse) GP on the same data as the baseline for the comparison.
m_full = GPy.models.GPRegression(X,Y)
m_full.optimize('bfgs')

# Plot both fits and compare their log likelihoods (sparse first, full second).
m.plot()
m_full.plot()
print(m.log_likelihood(), m_full.log_likelihood())