# This notebook reproduces results from the doubly stochastic variational inference paper on UCI datasets

In [1]:
# Types 
from torch import Tensor  


# Imports 
import torch 
import pandas as pd 
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate
from gpytorch.variational import VariationalStrategy, CholeskyVariationalDistribution
from gpytorch.models.deep_gps import DeepGPLayer, DeepGP
from gpytorch.kernels import RBFKernel, ScaleKernel, MaternKernel
from gpytorch.means import ConstantMean, LinearMean
from gpytorch.distributions import MultivariateNormal
from gpytorch.likelihoods import GaussianLikelihood, MultitaskGaussianLikelihood
from gpytorch.mlls import VariationalELBO, DeepApproximateMLL
from gpytorch.metrics import negative_log_predictive_density
from tqdm.autonotebook import tqdm
from math import ceil
from scipy.special import logsumexp
from scipy.cluster.vq import kmeans2, ClusterError
from mdgp.experiments.uci.data.datasets import UCIDataset, Power, Kin8mn, Energy

  from tqdm.autonotebook import tqdm
INFO: Using numpy backend


# Global settings
The datasets are small and can fit on a GPU, so there is not need to move data in and out of the GPU. Thus, simply setting the default device should be a harmless way to run on GPU if available.

In [2]:
DTYPE = torch.float32
torch.set_default_dtype(DTYPE)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Settings taken from the paper 

In [3]:
# Data
TEST_SIZE = 0.1

# Model 
LIKELIHOOD_VARIANCE = 0.01
LENGTHSCALE = 2.0
INNER_LAYER_VARIANCE = 1e-5
OUTPUT_LAYER_VARIANCE = 1.0 # This is a (reasonable) guess
NUM_INDUCING_POINTS = 100
MAX_HIDDEN_DIMS = 30

# Training 
LR = 0.01
NUM_ITERATIONS = 20_000
BATCH_SIZE = 10_000

# Euclidean deep GP initialized according to the paper

In [4]:
from gpytorch.variational import IndependentMultitaskVariationalStrategy


def get_hidden_dims(dataset: UCIDataset) -> int:
    return min(MAX_HIDDEN_DIMS, dataset.dimension)


def empty_cluster_safe_kmeans(x: Tensor, k: int, num_retries: int = 100) -> Tensor:
    """
    Initialize inducing points using kmeans. (from paper)
    """
    for _ in range(num_retries):
        try:
            return torch.from_numpy(kmeans2(x, k, missing='raise')[0]).to(x.device, x.dtype)
        except ClusterError:
            continue 
    return torch.from_numpy(kmeans2(x, k)[0]).to(x.device, x.dtype)
    raise ClusterError(f"Failed to find {k} clusters in {num_retries} retries.")


def get_inducing_points(dataset: UCIDataset, num_inducing_points: int) -> Tensor:
    """
    Initialize inducing points using kmeans. (from paper)
    """
    return empty_cluster_safe_kmeans(dataset.train_x, num_inducing_points)


class EuclideanDeepGPLayer(DeepGPLayer):
    def __init__(self, inducing_points, output_dims, hidden: bool = False):
        input_dims = inducing_points.size(-1)
        batch_shape = torch.Size([output_dims]) if output_dims is not None else torch.Size([])

        variational_distribution = CholeskyVariationalDistribution(
            num_inducing_points=inducing_points.size(0), 
            batch_shape=batch_shape,
        )
        variational_strategy = VariationalStrategy(
            self,
            inducing_points,
            variational_distribution,
            learn_inducing_locations=True,
        )

        super().__init__(variational_strategy, input_dims, output_dims)

        # base_kernel = MaternKernel(nu=1.5, batch_shape=batch_shape)
        base_kernel = RBFKernel(batch_shape=batch_shape)

        base_kernel.lengthscale = LENGTHSCALE
        # Use ard_num_dims=input_dims adds a lengthscale for each input dimension 
        # "we choose the RBF kernel with a lengthscale for each dimension" (from paper)
        self.covar_module = ScaleKernel(base_kernel, batch_shape=batch_shape, ard_num_dims=input_dims)
        if hidden:
            self.mean_module = LinearMean(input_dims, batch_shape=batch_shape)
            self.covar_module.outputscale = INNER_LAYER_VARIANCE
        else:
            self.mean_module = ConstantMean(batch_shape=batch_shape)
            self.covar_module.outputscale = OUTPUT_LAYER_VARIANCE

    def forward(self, x):
        covar = self.covar_module(x)
        mean = self.mean_module(x)
        return MultivariateNormal(mean, covar)
    

class EuclideanDeepGP(DeepGP):
    def __init__(self, dataset: UCIDataset, num_layers: int, num_inducing_points: int = NUM_INDUCING_POINTS):
        super().__init__()
        num_hidden_dims = get_hidden_dims(dataset)
        inducing_points = get_inducing_points(dataset, num_inducing_points)

        self.layers = torch.nn.ModuleList(
            [EuclideanDeepGPLayer(inducing_points, num_hidden_dims, hidden=True) for _ in range(num_layers - 1)] + 
            [EuclideanDeepGPLayer(inducing_points, dataset.num_outputs, hidden=False)]
        )
        self.likelihood = MultitaskGaussianLikelihood(dataset.num_outputs)
        self.likelihood.noise = LIKELIHOOD_VARIANCE

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x 


# Train and evaluate model according to the paper

In [30]:
def collate_and_to_device(x):
    return tuple(_x.to(DEVICE) for _x in default_collate(x))


def batch_size(dataset):
    return min(BATCH_SIZE, dataset.train_x.size(0))
        

def num_epochs(dataset) -> int:
    iterations_per_epoch = ceil(dataset.train_x.size(0) / batch_size(dataset))
    return ceil(NUM_ITERATIONS / iterations_per_epoch)


def train_step(x: Tensor, y: Tensor, model: EuclideanDeepGP, optimizer: torch.optim.Optimizer, elbo: VariationalELBO) -> float:
    optimizer.zero_grad()
    output = model(x)
    loss = elbo(output, y)
    loss.backward()
    optimizer.step()
    return loss.item()


def train(dataset: UCIDataset, model: EuclideanDeepGP, epochs: int = 1000) -> list[float]: 
    optimizer = torch.optim.Adam(model.parameters(), lr=LR, maximize=True)
    elbo = DeepApproximateMLL(VariationalELBO(model.likelihood, model, dataset.train_y.size(0)))
    train_loader = DataLoader(dataset.train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_and_to_device)

    losses = []
    for _ in (pbar := tqdm(range(epochs), desc='Epochs')):
        epoch_loss = 0
        for x_batch, y_batch in train_loader:
            loss = train_step(x=x_batch, y=y_batch, model=model, optimizer=optimizer, elbo=elbo)
            epoch_loss += loss
        losses.append(epoch_loss)
        pbar.set_postfix({'ELBO': epoch_loss})

    return losses 

        
def test_log_likelihood(outputs: MultivariateNormal, targets: Tensor, y_std: Tensor) -> Tensor:
    mean, stddev = outputs.mean, outputs.stddev
    logpdf = torch.distributions.Normal(loc=mean, scale=stddev).log_prob(targets) - torch.log(y_std)
    # average over likelihood samples 
    logpdf = torch.atleast_2d(logpdf)
    logpdf = logsumexp(logpdf.numpy(), axis=0, b=1 / mean.size(0))
    # average over data points
    return torch.from_numpy(logpdf).mean()


def mean_squared_error(outputs: MultivariateNormal, targets: Tensor, y_std: Tensor) -> Tensor:
    mean = outputs.mean.mean(0) if outputs.mean.ndim > 1 else outputs.mean
    return ((mean - targets) ** 2 * y_std ** 2).mean()
        

def evaluate(dataset: UCIDataset, model: EuclideanDeepGP) -> dict[str, float]:
    with torch.no_grad():
        out = model.likelihood(model(dataset.test_x)).to_data_independent_dist()
        tll = test_log_likelihood(out, dataset.test_y, dataset.test_y_std)
        mse = mean_squared_error(out, dataset.test_y, dataset.test_y_std)
        metrics = {
            'tll': tll.mean().item(), 
            'mse': mse.mean().item(),
            'nlpd': negative_log_predictive_density(out, dataset.test_y).mean().item()
        }
        print(f"TLL: {metrics['tll']}, MSE: {metrics['mse']}")
    return metrics 


def reproduce_results(dataset, num_layers: int, num_inducing_points: int = NUM_INDUCING_POINTS, num_runs: int = 5):
    print(f"Reproducing results for {dataset.name}".center(80, '-') + '\n')

    metrics = []
    for run in range(num_runs):
        print(f"Run {run + 1}".center(80, '-'))
        torch.random.manual_seed(run)
        model = EuclideanDeepGP(dataset, num_layers=num_layers, num_inducing_points=num_inducing_points)
        train(dataset, model)
        run_metrics = evaluate(dataset, model)
        metrics.append(run_metrics)
    df = pd.DataFrame(metrics)

    print("Metrics mean".center(80, '-'))
    print(df.mean())

    print("Metrics STD".center(80, '-'))
    print(df.std())

    return df 

# Test Shallow Euclidean GP on Energy

In [23]:
from gpytorch.models import ApproximateGP


class SGP(ApproximateGP):
    def __init__(self, inducing_points: Tensor):
        batch_shape = torch.Size([])
        variational_distribution = CholeskyVariationalDistribution(
            num_inducing_points=inducing_points.size(0), 
            batch_shape=batch_shape,
        )
        variational_strategy = VariationalStrategy(
            self,
            inducing_points,
            variational_distribution,
            learn_inducing_locations=True,
        )
        super().__init__(variational_strategy)

        self.mean_module = ConstantMean(batch_shape=batch_shape)
        self.covar_module = ScaleKernel(RBFKernel(batch_shape=batch_shape), batch_shape=batch_shape)
        self.covar_module.base_kernel.lengthscale = LENGTHSCALE
        self.likelihood = GaussianLikelihood()
        self.likelihood.noise = LIKELIHOOD_VARIANCE

    def forward(self, x):
        mean = self.mean_module(x)
        covar = self.covar_module(x)
        return MultivariateNormal(mean, covar)


In [34]:
from mdgp.experiments.uci.data.datasets import Energy

dataset = Energy()
x, y = dataset.train_x, dataset.train_y.squeeze()

inducing_points = empty_cluster_safe_kmeans(x, 100)
model = SGP(inducing_points)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01, maximize=True)
elbo = VariationalELBO(model.likelihood, model, y.size(0))


for _ in (pbar := tqdm(range(10000), desc='Epochs')):
    optimizer.zero_grad()
    output = model(x)
    loss = elbo(output, y)
    loss.backward()
    optimizer.step()
    pbar.set_postfix({'ELBO': loss.item()})



Epochs:   0%|          | 0/10000 [00:00<?, ?it/s]

In [35]:
with torch.no_grad():
    out = model(dataset.test_x)
    mean = out.mean
    print((dataset.test_y_std ** 2 * (mean - dataset.test_y) ** 2).mean().sqrt())
    print(mean_squared_error(out, dataset.test_y, dataset.test_y_std))
    print(negative_log_predictive_density(out, dataset.test_y).mean().item())

tensor(13.5059)
tensor(182.4084)
783.7306518554688


In [25]:
from datasets_dsvi import Energy as EnergyDSVI


data = EnergyDSVI().get_data()
X, Y, Xs, Ys, Y_std = [data[_] for _ in ['X', 'Y', 'Xs', 'Ys', 'Y_std']]
x, y = torch.from_numpy(X).to(DTYPE), torch.from_numpy(Y).to(DTYPE).squeeze(-1)
test_x, test_y = torch.from_numpy(Xs).to(DTYPE), torch.from_numpy(Ys).to(DTYPE).squeeze(-1)
test_y_std = torch.from_numpy(Y_std).to(DTYPE).squeeze(-1)

Normalizing X with mean [[7.63950796e-01 6.71782200e+02 3.18748191e+02 1.76517004e+02
  5.25759768e+00 3.47756874e+00 2.32995658e-01 2.82344428e+00]] and std [[ 0.10901257 89.79381901 41.44677061 45.98855734  1.74867228  1.16290081
   0.13683384  1.56965762]]
Normalizing Y with mean [[22.33689725]] and std [[9.9405047]]


In [26]:
inducing_points = empty_cluster_safe_kmeans(x, 100)
model = SGP(inducing_points)



In [27]:
inducing_points = empty_cluster_safe_kmeans(x, 100)
model = SGP(inducing_points)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01, maximize=True)
elbo = VariationalELBO(model.likelihood, model, y.size(0))


for _ in (pbar := tqdm(range(20000), desc='Epochs')):
    optimizer.zero_grad()
    output = model(x)
    loss = elbo(output, y)
    loss.backward()
    optimizer.step()
    pbar.set_postfix({'ELBO': loss.item()})

Epochs:   0%|          | 0/20000 [00:00<?, ?it/s]

In [33]:
with torch.no_grad():
    out = model(test_x)
    mean = out.mean
    print((test_y_std ** 2 * (mean - test_y) ** 2).mean().sqrt())
    print(mean_squared_error(out, test_y, test_y_std))
    print(negative_log_predictive_density(out, test_y).mean().item())

tensor(0.6582)
tensor(0.4333)
-0.7124626636505127


In [13]:
with torch.no_grad():
    out = model.likelihood(model(test_x))
    tll = test_log_likelihood(out, test_y, test_y_std)
    mse = mean_squared_error(out, test_y, test_y_std)
    metrics = {
        'tll': tll.mean().item(), 
        'mse': mse.mean().item(),
        'nlpd': negative_log_predictive_density(out, test_y).mean().item()
    }
    print(f"TLL: {metrics['tll']}, MSE: {metrics['mse']}")

TLL: -8.11833064314576, MSE: 98.90326690673828
