# This notebook reproduces results from the doubly stochastic variational inference paper on UCI datasets

In [1]:
# Types 
from torch import Tensor  


# Imports 
import os
import torch 
import pandas as pd 
from torch.utils.data import TensorDataset, DataLoader, Dataset
from torch.utils.data.dataloader import default_collate
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile
from gpytorch.variational import VariationalStrategy, CholeskyVariationalDistribution
from gpytorch.models.deep_gps import DeepGPLayer, DeepGP
from gpytorch.kernels import RBFKernel, ScaleKernel
from gpytorch.means import ConstantMean, LinearMean
from gpytorch.distributions import MultivariateNormal
from gpytorch.likelihoods import GaussianLikelihood
from gpytorch.mlls import VariationalELBO, DeepApproximateMLL
from tqdm.autonotebook import tqdm
from math import ceil
from scipy.special import logsumexp
from scipy.cluster.vq import kmeans2, ClusterError

  from tqdm.autonotebook import tqdm


# Global settings
The datasets are small and can fit on a GPU, so there is no need to move data in and out of the GPU. Thus, simply setting the default device should be a harmless way to run on GPU if available.

In [2]:
# Floating-point dtype used as torch's default and when converting the CSV data to tensors.
DTYPE = torch.float32
torch.set_default_dtype(DTYPE)
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Settings taken from the paper 

In [3]:
# Data
# Fraction of the (shuffled) rows held out as the test set.
TEST_SIZE = 0.1

# Model 
# Initial noise variance of the Gaussian likelihood.
LIKELIHOOD_VARIANCE = 0.01
# Initial lengthscale of the RBF kernel in every layer.
LENGTHSCALE = 2.0
# Initial kernel output scale of the hidden layers.
INNER_LAYER_VARIANCE = 1e-5
# Initial kernel output scale of the final layer.
OUTPUT_LAYER_VARIANCE = 1.0 # This is a (reasonable) guess
# Number of inducing points per layer.
NUM_INDUCING_POINTS = 100
# Upper bound on the width of the hidden layers.
MAX_HIDDEN_DIMS = 30

# Training 
# Adam learning rate.
LR = 0.01
# Target number of optimizer steps; converted into epochs by num_epochs().
NUM_ITERATIONS = 20_000
# Maximum mini-batch size.
BATCH_SIZE = 10_000

# UCI Datasets processed according to the paper 

In [4]:
def normalize(x: Tensor) -> Tensor:
    """
    Standardize ``x`` along its first dimension.

    The mean over dim 0 is subtracted and the result is divided by the
    standard deviation over dim 0 (torch's default estimator).
    """
    centered = x - x.mean(dim=0)
    scale = x.std(dim=0, keepdim=True)
    return centered / scale


def train_test_split(x: Tensor, y: Tensor, test_size: float = TEST_SIZE) -> tuple[Tensor, Tensor, Tensor, Tensor]: 
    """
    Partition ``x`` and ``y`` into train and test parts without shuffling.

    The first ``int(test_size * len(x))`` rows become the test set and the
    remaining rows the training set.

    Returns:
        ``(train_x, train_y, test_x, test_y)``.
    """
    num_test = int(test_size * x.size(0))
    test_rows = slice(None, num_test)
    train_rows = slice(num_test, None)
    return x[train_rows], y[train_rows], x[test_rows], y[test_rows]


def joint_shuffle(*args: Tensor, generator: torch.Generator) -> tuple[Tensor, ...]:
    """
    Shuffle several tensors along dim 0 with one shared random permutation.

    Args:
        *args: One or more tensors that all have the same size along dim 0.
        generator: Random generator used to draw the permutation.

    Returns:
        A tuple with one shuffled tensor per input, in the same order.

    Raises:
        ValueError: If no tensor is given or the sizes along dim 0 differ.
    """
    if not args:
        raise ValueError("joint_shuffle requires at least one tensor.")

    num_rows = args[0].size(0)
    # A longer second tensor would otherwise be silently truncated to a
    # subset of its rows, breaking the row-wise pairing.
    if any(t.size(0) != num_rows for t in args):
        sizes = [t.size(0) for t in args]
        raise ValueError(f"All tensors must have the same number of rows, got {sizes}.")

    perm_idx = torch.randperm(num_rows, generator=generator)
    return tuple(t[perm_idx] for t in args)


class UCIDataset:

    UCI_BASE_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/'

    def __init__(self, name: str, url: str, path: str = '../../data/uci/', seed: int | None = None): 
        self.generator = torch.Generator()
        if seed is not None: 
            self.generator.manual_seed(seed)

        self.name = name 
        self.url = url
        self.path = path 
        self.csv_path = os.path.join(self.path, self.name + '.csv')

        # Load, shuffle, split, and normalize data (input and outputs).
        x, y = self.load_data()
        x, y = joint_shuffle(x, y, generator=self.generator)     
        train_x, train_y, test_x, test_y = train_test_split(x, y)
        self.train_x, self.train_y, self.test_x, self.test_y = map(normalize, (train_x, train_y, test_x, test_y))

        # Keeping the stardard deviation allows up to "restore the output scaling for evaluation" (from paper)
        self.test_y_std = test_y.std(dim=0, keepdim=True)
        
    @property
    def dimension(self) -> int:
        return self.train_x.shape[-1]

    @property 
    def train_dataset(self) -> Dataset:
        return TensorDataset(self.train_x, self.train_y)
    
    @property
    def test_dataset(self) -> Dataset:
        return TensorDataset(self.test_x, self.test_y)
    
    def download_data(self) -> None:
        NotImplementedError

    def read_data(self) -> tuple[Tensor, Tensor]:
        xy = torch.from_numpy(pd.read_csv(self.csv_path).values).to(DTYPE)
        return xy[:, :-1], xy[:, -1]

    def load_data(self, overwrite: bool = False) -> tuple[Tensor, Tensor]:
        if overwrite or not os.path.isfile(self.csv_path):
            self.download_data()
        return self.read_data()


class Kin8mn(UCIDataset):
    """The kin8nm dataset, downloaded as a ready-made CSV file."""

    DEFAULT_URL = 'https://raw.githubusercontent.com/liusiyan/UQnet/master/datasets/UCI_datasets/kin8nm/dataset_2175_kin8nm.csv'

    def __init__(self, path: str = '../../data/uci/', seed: int | None = None, url: str | None = None):
        # Any falsy url (None or an empty string) falls back to the default location.
        resolved_url = url if url else Kin8mn.DEFAULT_URL
        super().__init__(name='kin8nm', path=path, url=resolved_url, seed=seed)

    def download_data(self) -> None:
        """Read the remote CSV and store a copy at ``self.csv_path``."""
        frame = pd.read_csv(self.url)
        os.makedirs(self.path, exist_ok=True)
        frame.to_csv(self.csv_path, index=False)


class Power(UCIDataset):
    """The 'power' dataset, distributed as an Excel workbook inside a zip archive."""

    DEFAULT_URL = UCIDataset.UCI_BASE_URL + "00294/CCPP.zip"
    # Name of the workbook inside the downloaded archive.
    ARCHIVE_MEMBER = 'CCPP/Folds5x2_pp.xlsx'

    def __init__(self, path: str = '../../data/uci/', seed: int | None = None, url: str | None = None):
        url = url or Power.DEFAULT_URL
        super().__init__(name='power', path=path, url=url, seed=seed)

    def download_data(self):
        """
        Download the archive and store the workbook as CSV at ``self.csv_path``.

        The workbook is read straight from the in-memory archive, so nothing is
        extracted to a hard-coded temporary directory and no files are left behind.
        """
        with urlopen(self.url) as zipresp:
            archive_bytes = zipresp.read()

        with ZipFile(BytesIO(archive_bytes)) as zfile:
            workbook = BytesIO(zfile.read(Power.ARCHIVE_MEMBER))

        df = pd.read_excel(workbook)
        os.makedirs(self.path, exist_ok=True)
        df.to_csv(self.csv_path, index=False)


class Concrete(UCIDataset):
    """The 'concrete' dataset, downloaded as an Excel workbook."""

    DEFAULT_URL = UCIDataset.UCI_BASE_URL + 'concrete/compressive/Concrete_Data.xls'

    def __init__(self, path: str = '../../data/uci/', seed: int | None = None, url: str | None = None):
        url = url or Concrete.DEFAULT_URL
        # Forward `path` as well; otherwise a caller-supplied cache directory is ignored.
        super().__init__(name='concrete', path=path, url=url, seed=seed)

    def download_data(self):
        """Read the remote workbook and store it as CSV at ``self.csv_path``."""
        df = pd.read_excel(self.url)
        os.makedirs(self.path, exist_ok=True)
        df.to_csv(self.csv_path, index=False)

# Euclidean deep GP initialized according to the paper

In [5]:
def get_hidden_dims(dataset: UCIDataset) -> int:
    """Width of the hidden layers: the input dimension, capped at ``MAX_HIDDEN_DIMS``."""
    input_dims = dataset.dimension
    return input_dims if input_dims < MAX_HIDDEN_DIMS else MAX_HIDDEN_DIMS


def empty_cluster_safe_kmeans(x: Tensor, k: int, num_retries: int = 10) -> Tensor:
    """
    Run k-means on ``x`` and retry whenever a cluster ends up empty.

    Args:
        x: Data to cluster, one point per row.
        k: Number of clusters.
        num_retries: Maximum number of k-means attempts.

    Returns:
        The ``k`` centroids, on the device and with the dtype of ``x``.

    Raises:
        ClusterError: If every attempt produced an empty cluster.
    """
    for _attempt in range(num_retries):
        try:
            centroids, _labels = kmeans2(x, k, missing='raise')
        except ClusterError:
            # An empty cluster occurred; try again.
            continue
        return torch.from_numpy(centroids).to(x.device, x.dtype)
    raise ClusterError(f"Failed to find {k} clusters in {num_retries} retries.")


def get_inducing_points(dataset: UCIDataset, num_inducing_points: int) -> Tensor:
    """
    Pick the initial inducing points as k-means centroids of the training
    inputs. (from paper)
    """
    train_inputs = dataset.train_x
    return empty_cluster_safe_kmeans(train_inputs, num_inducing_points)


class EuclideanDeepGPLayer(DeepGPLayer):
    """
    One sparse variational GP layer with a scaled RBF kernel.

    Hidden layers use a linear mean and the small ``INNER_LAYER_VARIANCE``
    output scale; the final layer uses a constant mean and
    ``OUTPUT_LAYER_VARIANCE``.
    """

    def __init__(self, inducing_points, output_dims, hidden: bool = False):
        """
        Args:
            inducing_points: Initial inducing locations; the last dimension is
                the layer's input dimension and dim 0 the number of points.
            output_dims: Number of output GPs, or ``None`` for a single output.
            hidden: Whether this is a hidden layer (``True``) or the final one.
        """
        input_dims = inducing_points.size(-1)
        batch_shape = torch.Size([output_dims]) if output_dims is not None else torch.Size([])

        variational_distribution = CholeskyVariationalDistribution(
            num_inducing_points=inducing_points.size(0), 
            batch_shape=batch_shape,
        )
        variational_strategy = VariationalStrategy(
            self,
            inducing_points,
            variational_distribution,
            learn_inducing_locations=True,
        )
        super().__init__(variational_strategy, input_dims, output_dims)

        # ard_num_dims=input_dims adds a lengthscale for each input dimension:
        # "we choose the RBF kernel with a lengthscale for each dimension" (from paper).
        # It has to be given to the RBF kernel, which owns the lengthscale; passing it
        # to ScaleKernel leaves the RBF kernel with a single shared lengthscale.
        base_kernel = RBFKernel(batch_shape=batch_shape, ard_num_dims=input_dims)
        base_kernel.lengthscale = LENGTHSCALE
        self.covar_module = ScaleKernel(base_kernel, batch_shape=batch_shape)
        if hidden:
            self.mean_module = LinearMean(input_dims, batch_shape=batch_shape)
            self.covar_module.outputscale = INNER_LAYER_VARIANCE
        else:
            self.mean_module = ConstantMean(batch_shape=batch_shape)
            self.covar_module.outputscale = OUTPUT_LAYER_VARIANCE

    def forward(self, x):
        """Return the GP prior at ``x`` as a ``MultivariateNormal``."""
        covar = self.covar_module(x)
        mean = self.mean_module(x)
        return MultivariateNormal(mean, covar)
    

class EuclideanDeepGP(DeepGP):
    """
    Deep GP made of ``num_layers - 1`` hidden layers followed by one output
    layer, with a Gaussian likelihood on top.
    """

    def __init__(self, dataset: UCIDataset, num_layers: int, num_inducing_points: int = NUM_INDUCING_POINTS):
        super().__init__()
        num_hidden_dims = get_hidden_dims(dataset)
        inducing_points = get_inducing_points(dataset, num_inducing_points)

        # Every layer starts from the same inducing locations.
        gp_layers = []
        for _ in range(num_layers - 1):
            gp_layers.append(EuclideanDeepGPLayer(inducing_points, num_hidden_dims, hidden=True))
        gp_layers.append(EuclideanDeepGPLayer(inducing_points, None, hidden=False))
        self.layers = torch.nn.ModuleList(gp_layers)

        self.likelihood = GaussianLikelihood()
        self.likelihood.noise = LIKELIHOOD_VARIANCE

    def forward(self, x):
        """Feed ``x`` through all layers in order and return the last output."""
        hidden = x
        for gp_layer in self.layers:
            hidden = gp_layer(hidden)
        return hidden


def get_model(dataset: UCIDataset, num_layers: int, num_inducing_points: int = NUM_INDUCING_POINTS) -> EuclideanDeepGP:
    """
    Build a deep GP for ``dataset`` and place it on ``DEVICE``, the device the
    experiment runs on.
    """
    deep_gp = EuclideanDeepGP(dataset, num_layers, num_inducing_points)
    return deep_gp.to(DEVICE)

# Train and evaluate model according to the paper

In [6]:
def collate_and_to_device(x):
    """Collate a list of samples with the default collate function and move every resulting item to ``DEVICE``."""
    collated = default_collate(x)
    return tuple(map(lambda item: item.to(DEVICE), collated))


def batch_size(dataset):
    """Effective batch size: ``BATCH_SIZE``, capped at the number of training points."""
    num_train = dataset.train_x.size(0)
    return min(BATCH_SIZE, num_train)
        

def num_epochs(dataset) -> int:
    """Number of epochs needed to perform at least ``NUM_ITERATIONS`` optimizer steps."""
    num_train = dataset.train_x.size(0)
    steps_per_epoch = ceil(num_train / batch_size(dataset))
    return ceil(NUM_ITERATIONS / steps_per_epoch)


def train_step(x: Tensor, y: Tensor, model: EuclideanDeepGP, optimizer: torch.optim.Optimizer, elbo: VariationalELBO) -> float:
    """
    Perform a single optimizer step on one mini-batch.

    Returns:
        The value of the objective ``elbo(model(x), y)`` before the step.
    """
    optimizer.zero_grad()
    objective = elbo(model(x), y)
    objective.backward()
    optimizer.step()
    return objective.item()


def train(dataset: UCIDataset, model: EuclideanDeepGP) -> list[float]: 
    """
    Fit ``model`` on the training split of ``dataset`` with Adam.

    The optimizer is created with ``maximize=True`` because the objective is
    an ELBO rather than a loss.

    Returns:
        One entry per epoch: the sum of the ELBO values of that epoch's batches.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=LR, maximize=True)
    num_train = dataset.train_y.size(0)
    elbo = DeepApproximateMLL(VariationalELBO(model.likelihood, model, num_train))
    train_loader = DataLoader(dataset.train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_and_to_device)

    losses = []
    pbar = tqdm(range(num_epochs(dataset)), desc='Epochs')
    for _ in pbar:
        epoch_loss = sum(
            train_step(x=x_batch, y=y_batch, model=model, optimizer=optimizer, elbo=elbo)
            for x_batch, y_batch in train_loader
        )
        losses.append(epoch_loss)
        pbar.set_postfix({'ELBO': epoch_loss})

    return losses 

        
def test_log_likelihood(outputs: MultivariateNormal, targets: Tensor, y_std: Tensor) -> Tensor:
    mean, stddev = outputs.mean, outputs.stddev
    logpdf = torch.distributions.Normal(loc=mean, scale=stddev).log_prob(targets) - torch.log(y_std)
    # average over likelihood samples 
    logpdf = logsumexp(logpdf.numpy(), axis=0, b=1 / mean.size(0))
    # average over data points
    return torch.from_numpy(logpdf).mean()


def mean_squared_error(outputs: MultivariateNormal, targets: Tensor, y_std: Tensor) -> Tensor:
    """
    Mean squared error of the predictive mean, rescaled by ``y_std ** 2``.

    The predictive mean is the average of ``outputs.mean`` over dim 0.
    """
    predictive_mean = outputs.mean.mean(0)
    squared_residuals = (predictive_mean - targets) ** 2
    return y_std ** 2 * squared_residuals.mean(0)
        

def evaluate(dataset: UCIDataset, model: EuclideanDeepGP) -> dict[str, float]:
    """
    Compute and print the test log-likelihood and mean squared error.

    The test tensors are moved to the device the model's parameters live on,
    so the model is never called with inputs from a different device.

    Returns:
        Dictionary with the keys ``'tll'`` and ``'mse'``.
    """
    device = next(model.parameters()).device
    test_x = dataset.test_x.to(device)
    test_y = dataset.test_y.to(device)
    test_y_std = dataset.test_y_std.to(device)

    with torch.no_grad():
        out = model.likelihood(model(test_x))
        tll = test_log_likelihood(out, test_y, test_y_std)
        mse = mean_squared_error(out, test_y, test_y_std)
        metrics = {
            'tll': tll.mean().item(), 
            'mse': mse.mean().item(),
        }
        print(f"TLL: {metrics['tll']}, MSE: {metrics['mse']}")
    return metrics 


def reproduce_results(dataset, num_layers: int, num_inducing_points: int = 100, num_runs: int = 5):
    """
    Train and evaluate a fresh model ``num_runs`` times and summarize the metrics.

    Run ``i`` (starting at 0) seeds torch's global generator with ``i`` before
    the model is built.

    Returns:
        DataFrame with one row of metrics per run.
    """
    def banner(text: str) -> str:
        # Center the text in an 80-character line padded with dashes.
        return text.center(80, '-')

    print(banner(f"Reproducing results for {dataset.name}") + '\n')

    metrics = []
    for run in range(num_runs):
        print(banner(f"Run {run + 1}"))
        torch.random.manual_seed(run)
        model = get_model(dataset, num_layers=num_layers, num_inducing_points=num_inducing_points)
        train(dataset, model)
        metrics.append(evaluate(dataset, model))
    df = pd.DataFrame(metrics)

    print(banner("Metrics mean"))
    print(df.mean())

    print(banner("Metrics STD"))
    print(df.std())

    return df 

# Test on the kin8nm dataset

In [7]:
# Seed torch's global generator and the dataset shuffle so that the
# train/test split does not change between runs of this cell.
SEED = 0
torch.manual_seed(SEED)

dataset = Kin8mn(seed=SEED)
num_layers = 1
model = get_model(dataset, num_layers=num_layers)
# Keep the per-epoch ELBO values in a variable instead of letting the whole
# list become the cell's displayed output.
losses = train(dataset, model)

Epochs:   0%|          | 0/20000 [00:00<?, ?it/s]

KeyboardInterrupt: 