# Train and evaluate a PC

In [2]:
import random
import torch
import numpy as np
import matplotlib.pyplot as plt

In [3]:
device = torch.device("cpu")  # The device to use, e.g., "cpu", "cuda", "cuda:1"

%load_ext autoreload
%autoreload 2

In [4]:
%reload_ext autoreload

Set the random seeds.

In [5]:
# Seed Python's `random`, NumPy and PyTorch with the same value (4).
random.seed(4)
np.random.seed(4)
torch.manual_seed(4)
# NOTE(review): the disabled CUDA seeding below uses 42 while the active seeds
# use 4 -- confirm which value is intended before re-enabling it.
# if 'cuda' in device.type:
#     torch.cuda.manual_seed(42)

<torch._C.Generator at 0x7fa9089f0890>

## Load MNIST Dataset

Load the training and test splits of MNIST, and preprocess them by flattening the tensor images.

In [None]:
from torchvision import transforms, datasets
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Lambda(lambda x: (255 * x.view(-1)).long())
])
data_train = datasets.MNIST('datasets', train=True, download=True, transform=transform)
data_test = datasets.MNIST('datasets', train=False, download=True, transform=transform)
num_variables = data_train[0][0].shape[0]
height, width = 28, 28
print(f"Number of variables: {num_variables}")

In [None]:
plt.matshow(data_train[0][0].reshape(28, 28), cmap='gray')
plt.title(f"Class: {data_train[0][1]}")
plt.show()

In [6]:
import torch.nn.functional as F

from uci_datasets import Dataset

from ignite.engine import Events, Engine
from ignite.metrics import Average, Loss
from ignite.contrib.handlers import ProgressBar

import gpytorch
from gpytorch.mlls import VariationalELBO
from gpytorch.likelihoods import GaussianLikelihood

import pandas as pd
import numpy as np


## Instantiating the region graph

In [7]:
from cirkit.region_graph.poon_domingos import PoonDomingos
from cirkit.region_graph.random_binary_tree import RandomBinaryTree
from cirkit.region_graph.fully_factorized import FullyFactorized

In [8]:
from cirkit.models.gp import CircuitGP, initial_values

Initialize a _Quad Graph_ region graph.

Other available region graphs are _Poon Domingos_ and _QuadTree_, whose imports are shown below.

## Choosing the layers

Now we have to choose both the input and inner layers of our circuit. As input layer we select the _CategoricalLayer_ with 256 categories (the number of pixel values). For the inner layer instead, we choose the _uncollapsed CP_ layer with rank 1.

In [9]:
from cirkit.layers.input.exp_family import CategoricalLayer
from cirkit.layers.sum_product import CPLayer
from cirkit.layers.input.rbf_kernel_flatten import RBFKernelFlattenLayer

efamily_cls = RBFKernelFlattenLayer   # Flatten
efamily_kwargs = {}
layer_cls = CPLayer
layer_kwargs = {'rank': 1}

## Building the tensorized PC

We can now build our tensorized PC by specifying the region graph and layers we chose previously. In addition, we can scale the architecture by increasing the number of input and inner units. We can also have circuits with multiple output units by choosing _num_classes > 1_. However, in this notebook we only estimate the distribution of the images and marginalize out the class variable.

To ensure weights are non-negative we reparametrize them via exponentiation. Several reparametrization functions are available.

In [10]:
from cirkit.region_graph.quad_tree import QuadTree
# region_graph = QuadTree(width, height, struct_decomp=False)
# region_graph = RandomBinaryTree(num_vars=8, depth=3, num_repetitions=6)
region_graph = FullyFactorized(num_vars=8)

In [11]:
from cirkit.reparams.leaf import ReparamExp, ReparamLogSoftmax, ReparamSoftmax
from cirkit.models.tensorized_circuit import TensorizedPC
pc = TensorizedPC.from_region_graph(
    region_graph,
    num_inner_units=10,
    num_input_units=10,
    efamily_cls=efamily_cls,
    efamily_kwargs=efamily_kwargs,
    layer_cls=layer_cls,
    layer_kwargs=layer_kwargs,
    num_classes=1,
    reparam=ReparamSoftmax # ReparamLogSoftmax #  ReparamSoftmax
)
pc.to(device)
print(pc)

TensorizedPC(
  (input_layer): RBFKernelFlattenLayer(
    (params): ReparamExp()
  )
  (scope_layer): ScopeLayer()
  (inner_layers): ModuleList(
    (0): CollapsedCPLayer(
      (params_in): ReparamSoftmax()
    )
  )
)


In [12]:
for param in pc.parameters(): 
    print (param.shape)

torch.Size([8, 10])
torch.Size([1, 10, 1])


In [205]:
total_params = sum(p.numel() for p in pc.parameters() if p.requires_grad)
print(f"Total number of parameters: {total_params}")

Total number of parameters: 2700


In [13]:
data = Dataset("kin40k")
x_train, y_train, x_test, y_test = data.get_split(split=2)

kin40k dataset, N=40000, d=8


In [14]:
x_train.shape, x_test.shape

((36000, 8), (4000, 8))

In [15]:
x_train_real = x_train[:32000] #32000 # 2053   36584    36584     39063   13281    2672   # RE-RUN # 13279   # 1279   4701  824
y_train_real = y_train[:32000]
y_train_real = y_train_real.squeeze()
x_val = x_train[32000:]
y_val = y_train[32000:]
y_val = y_val.squeeze()
y_test = y_test.squeeze()

In [16]:
mean = x_train_real.mean(axis=0)
std = x_train_real.std(axis=0)

x_train_real_normalized = (x_train_real - mean) / std
x_val_normalized = (x_val - mean) / std
x_test_normalized = (x_test - mean) / std

In [17]:
import torch.nn as nn

class IdentityMapping(nn.Module):
    """Parameter-free module that hands its input back unchanged.

    Used as a no-op feature extractor so the GP sees the raw inputs.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return ``x`` itself (same object, no copy or transformation)."""
        return x

In [123]:
for i in range(5):
    print(ds_train[i])

(tensor([ 4.3568e+03,  2.0165e+02, -7.6162e-02,  5.0537e+01,  6.1556e+05,
         6.7965e+01,  8.5420e+02, -4.7975e+01, -6.8170e+00]), tensor(-1.8912))
(tensor([-2.8508e+03, -4.6753e+02,  6.0788e-02, -4.6812e+01, -4.3633e+05,
        -1.7177e+01,  8.4624e+01, -4.2975e+01, -1.1136e-01]), tensor(0.9737))
(tensor([ 4.6787e+03,  5.9385e+02, -5.4212e-02,  9.8102e+01,  6.6254e+05,
         1.0982e+02,  9.7232e+02,  6.9025e+01, -7.8412e+00]), tensor(-0.5134))
(tensor([ 1.1472e+04,  2.6860e+03, -3.5182e-02,  1.8665e+02,  1.5292e+06,
         1.7410e+02,  2.5700e+03,  2.1602e+02, -1.5885e+01]), tensor(-0.9630))
(tensor([ 7.2802e+03,  1.6162e+03, -3.2252e-02,  1.2412e+02,  1.0695e+06,
         1.3790e+02,  1.5315e+03,  2.8025e+01, -1.2329e+01]), tensor(-0.5702))


In [20]:
class SimpleNN(nn.Module):
    """Single linear layer followed by a ReLU activation.

    Args:
        in_features: Size of each input sample. Defaults to 8.
        out_features: Number of units produced by the linear layer.
            Defaults to 128.
    """

    def __init__(self, in_features=8, out_features=128):
        super().__init__()
        # Affine map in_features -> out_features (8 -> 128 with the defaults).
        self.linear = nn.Linear(in_features=in_features, out_features=out_features)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``relu(linear(x))``; the last dimension becomes ``out_features``."""
        return self.relu(self.linear(x))

In [18]:
# Re-seed NumPy and PyTorch for this experiment (Python's `random` keeps the
# seed set at the top of the notebook).
np.random.seed(24)
torch.manual_seed(24)  # change this value to run with a different seed

batch_size = 32  # mini-batch size of the training loader only

# Wrap the normalized NumPy splits as float32 tensor datasets.
# The training loader shuffles and drops the last incomplete batch.
ds_train = torch.utils.data.TensorDataset(torch.from_numpy(x_train_real_normalized).float(), torch.from_numpy(y_train_real).float())
dl_train = torch.utils.data.DataLoader(ds_train, batch_size=batch_size, shuffle=True, drop_last=True)  # shuffle every epoch

# Validation and test loaders keep the data order and use batches of 512.
ds_val = torch.utils.data.TensorDataset(torch.from_numpy(x_val_normalized).float(), torch.from_numpy(y_val).float())
dl_val = torch.utils.data.DataLoader(ds_val, batch_size=512, shuffle=False)

ds_test = torch.utils.data.TensorDataset(torch.from_numpy(x_test_normalized).float(), torch.from_numpy(y_test).float())
dl_test = torch.utils.data.DataLoader(ds_test, batch_size=512, shuffle=False)

# Number of passes over the training loader (passed to `trainer.run` as max_epochs).
epochs = 50
print(f"Training with {len(x_train_real)} datapoints for {epochs} epochs")

# NOTE(review): a previous comment here mentioned a boolean switch for SNGP,
# but no such flag exists in this cell.

input_dim = 8  # number of input features (kin40k has d=8)

num_outputs = 1 # regression with 1D output

# Inputs are passed through unchanged (no learned feature extractor).
feature_extractor = IdentityMapping()

n_inducing_points = 50
# NOTE(review): `kernel` is unused -- the `kernel=` argument below is commented
# out and the model is built from `circuit=pc` instead. "HBF" may be a typo for
# "RBF"; confirm before re-enabling.
kernel = "HBF"

# Initial inducing points and lengthscale come from the project helper
# `initial_values`, evaluated on the training dataset.
initial_inducing_points, initial_lengthscale = initial_values(
        ds_train, feature_extractor, n_inducing_points
)

gp_model = CircuitGP(
    num_outputs=num_outputs,
    num_features=input_dim,          # presumably must match the feature extractor's output size -- TODO confirm
    initial_lengthscale=initial_lengthscale,
    initial_inducing_points=initial_inducing_points,
    circuit=pc
    # kernel=kernel,
)
    
# Gaussian observation model and the variational ELBO scaled to the training set size.
likelihood = GaussianLikelihood()
elbo_fn = VariationalELBO(likelihood, gp_model, num_data=len(ds_train))


def loss_fn(x, y):
    """Negative ELBO of model output ``x`` against targets ``y`` (the quantity to minimize)."""
    return -elbo_fn(x, y)


# One Adam parameter group per module, both using the same learning rate.
lr = 1e-3
parameters = [
    {"params": module.parameters(), "lr": lr}
    for module in (gp_model, likelihood)
]
optimizer = torch.optim.Adam(parameters)

pbar = ProgressBar()

# Global iteration counter, incremented by `step`.
step_counter = 0

def step(engine, batch):
    """Run one optimization step on a mini-batch and return its loss.

    Args:
        engine: The ignite ``Engine`` driving training; it is asked to
            terminate when the loss becomes NaN.
        batch: An ``(x, y)`` pair of input and target tensors.

    Returns:
        The batch loss (``loss_fn`` applied to the model output) as a Python
        float. On a NaN loss the model weights are saved to
        ``model_weights_before_nan.pt``, no parameter update is performed and
        the NaN value is returned.
    """
    global step_counter
    step_counter += 1

    gp_model.train()
    likelihood.train()

    optimizer.zero_grad()

    # Move the batch to the notebook-wide `device` (where `pc` was placed)
    # instead of to CUDA whenever it happens to be available; the latter puts
    # the data on a different device than the model when `device` is the CPU.
    x, y = batch
    x = x.to(device)
    y = y.to(device)

    y_pred = gp_model(x)
    loss = loss_fn(y_pred, y)

    if torch.isnan(loss).any():
        print(f"Step {step_counter}: NaN detected in loss.")
        print("loss", loss)
        print("y_pred", y_pred)
        print("NaN detected in loss, saving model and stopping.")
        # Save model weights before termination
        torch.save(gp_model.state_dict(), 'model_weights_before_nan.pt')
        engine.terminate()
        # Return a number rather than None: the `Average` metric attached to
        # the trainer consumes this output and presumably rejects None
        # (TODO confirm against the installed ignite version).
        return loss.item()

    loss.backward()
    optimizer.step()

    return loss.item()


def eval_step(engine, batch):
    """Evaluate the GP on one batch without tracking gradients.

    Args:
        engine: The ignite ``Engine`` running evaluation (unused).
        batch: An ``(x, y)`` pair of input and target tensors.

    Returns:
        ``(y_pred, y)`` where ``y_pred`` is whatever ``gp_model(x)`` returns
        (presumably a predictive distribution, since it is later passed to
        ``likelihood.expected_log_prob``) and ``y`` are the targets.
    """
    gp_model.eval()
    likelihood.eval()

    # Use the notebook-wide `device` rather than CUDA-if-available, so the data
    # ends up on the same device as the model.
    x, y = batch
    x = x.to(device)
    y = y.to(device)

    # Evaluation needs no autograd graph.
    with torch.no_grad():
        y_pred = gp_model(x)
    return y_pred, y

    
# One engine runs the optimization step, the other runs evaluation batches.
trainer = Engine(step)
evaluator = Engine(eval_step)

# Training "loss": average of the per-iteration values returned by `step`.
metric = Average()
metric.attach(trainer, "loss")
pbar.attach(trainer)

# Evaluation "loss": negative mean of the likelihood's expected log-probability
# of the targets. Note that `metric` is rebound here; the `Average` instance
# created above stays attached to the trainer.
metric = Loss(lambda y_pred, y: - likelihood.expected_log_prob(y, y_pred).mean())

metric.attach(evaluator, "loss")

@trainer.on(Events.EPOCH_COMPLETED(every=int(epochs/20) + 1))
def log_results(trainer):
    """Run the evaluator on the validation loader and print both losses for this epoch."""
    evaluator.run(dl_val)
    epoch = trainer.state.epoch
    val_loss = evaluator.state.metrics['loss']
    train_loss = trainer.state.metrics['loss']
    print(f"Results - Epoch: {epoch} - "
          f"Val Loss: {val_loss:.2f} - "
          f"Train Loss: {train_loss:.2f}")



Training with 32000 datapoints for 50 epochs
f_X_samples torch.Size([1000, 8])
initial_lengthscale tensor(3.8970)
All circuit parameters shape: 
torch.Size([8, 10])
torch.Size([1, 10, 1])


  super()._check_params_vs_input(X, default_n_init=3)
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

  from tqdm.autonotebook import tqdm


In [19]:
for index, param in enumerate(gp_model.parameters()): 
    # if (index==2):
    print(param.shape)

torch.Size([50, 8])
torch.Size([50])
torch.Size([50, 50])
torch.Size([8, 10])
torch.Size([1, 10, 1])
torch.Size([1])
torch.Size([])


In [20]:
trainer.run(dl_train, max_epochs=epochs)

torch.linalg.solve_triangular has its arguments reversed and does not return a copy of one of the inputs.
X = torch.triangular_solve(B, A).solution
should be replaced with
X = torch.linalg.solve_triangular(A, B). (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/native/BatchLinearAlgebra.cpp:2198.)
  res = torch.triangular_solve(right_tensor, self.evaluate(), upper=self.upper).solution


[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

Results - Epoch: 3 - Val Loss: 1.18 - Train Loss: 1.22


[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

Results - Epoch: 6 - Val Loss: 1.03 - Train Loss: 1.03


[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

Results - Epoch: 9 - Val Loss: 0.99 - Train Loss: 0.98


[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

Results - Epoch: 12 - Val Loss: 0.96 - Train Loss: 0.95


[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

Results - Epoch: 15 - Val Loss: 0.93 - Train Loss: 0.91


[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

Results - Epoch: 18 - Val Loss: 0.89 - Train Loss: 0.88


[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

Results - Epoch: 21 - Val Loss: 0.84 - Train Loss: 0.84


[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

Results - Epoch: 24 - Val Loss: 0.81 - Train Loss: 0.81


[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

Results - Epoch: 27 - Val Loss: 0.78 - Train Loss: 0.78


[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

Results - Epoch: 30 - Val Loss: 0.75 - Train Loss: 0.75


[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

Results - Epoch: 33 - Val Loss: 0.73 - Train Loss: 0.73


[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

Results - Epoch: 36 - Val Loss: 0.70 - Train Loss: 0.70


[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

Results - Epoch: 39 - Val Loss: 0.67 - Train Loss: 0.67


[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

Results - Epoch: 42 - Val Loss: 0.65 - Train Loss: 0.65


[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

Results - Epoch: 45 - Val Loss: 0.63 - Train Loss: 0.64


[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

Results - Epoch: 48 - Val Loss: 0.61 - Train Loss: 0.62


[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

State:
	iteration: 50000
	epoch: 50
	epoch_length: 1000
	max_epochs: 50
	output: 0.52861088514328
	batch: <class 'list'>
	metrics: <class 'dict'>
	dataloader: <class 'torch.utils.data.dataloader.DataLoader'>
	seed: <class 'NoneType'>
	times: <class 'dict'>

In [None]:
for param in likelihood.parameters(): 
    print (param.shape)

In [21]:
from ignite.metrics import RootMeanSquaredError
import torch

# Assuming you have a function to compute RMSE, or you're using Ignite's RMSE metric

def eval_step(engine, batch):
    """Evaluate the GP on one batch and return point predictions for RMSE.

    Args:
        engine: The ignite ``Engine`` running evaluation (unused).
        batch: An ``(x, y)`` pair of input and target tensors.

    Returns:
        ``(y_pred, y)`` where ``y_pred`` is the ``.mean`` of the model output
        and ``y`` are the targets.
    """
    gp_model.eval()  # Ensure model is in evaluation mode
    likelihood.eval()

    # Use the notebook-wide `device` rather than CUDA-if-available, so the data
    # ends up on the same device as the model.
    x, y = batch
    x = x.to(device)
    y = y.to(device)

    # Assumes the model outputs a distribution exposing `.mean`.
    with torch.no_grad():  # Disable gradient computation for evaluation
        distribution = gp_model(x)
        y_pred = distribution.mean  # Use the mean of the distribution as the prediction

    return y_pred, y

# Update the evaluator engine
evaluator = Engine(eval_step)

# Attach the RMSE metric to the evaluator
rmse = RootMeanSquaredError()
rmse.attach(evaluator, "RMSE")

# After training, run the evaluator on the test dataset to compute the RMSE
evaluator.run(dl_test)

# Retrieve and display the RMSE
test_rmse = evaluator.state.metrics['RMSE']
print(f"Test RMSE: {test_rmse:.2f}")


Test RMSE: 0.37


In [None]:
pc.input_layer.params.param.shape
# (self.num_vars, self.num_output_units, self.num_replicas, self.num_suff_stats)

In [None]:
pc.scope_layer.scope.shape

In [None]:
pc.inner_layers[0].params_in() #.param #.shape #.param.shape
# (F, H, I, O)
# (fold count, arity, input, output)

In [None]:
from cirkit.models.rbf_kernel import RBFCircuitKernel

circuit_kernel = RBFCircuitKernel(pc, batch_shape=torch.Size([]))


In [None]:
circuit_kernel(x1.squeeze(), x2.squeeze()).evaluate()

In [None]:
x1.squeeze().shape

In [None]:
# set parameters

pc.input_layer.params.param = torch.nn.Parameter(torch.log(torch.ones(tuple(pc.input_layer.params.shape))*3.3))
# pc.inner_layers[0].params_in.param = torch.nn.Parameter(torch.log(0.25*torch.ones(tuple(pc.inner_layers[0].params_in.shape))))
# pc.inner_layers[0].params_in = torch.nn.Parameter(torch.ones(tuple(pc.inner_layers[0].params_in.shape))*3.3)
# pc.inner_layers[1].params_in = torch.nn.Parameter(torch.ones(tuple(pc.inner_layers[1].params_in.shape))*3.3)
# pc.inner_layers[2].params_in = torch.nn.Parameter(torch.ones(tuple(pc.inner_layers[2].params_in.shape))*3.3)
# pc.inner_layers[3].params_in = torch.nn.Parameter(torch.ones(tuple(pc.inner_layers[3].params_in.shape))*3.3)

In [None]:
pc.inner_layers[0].params_in() #.shape

In [None]:
x1 = torch.randn(3, 8, 1)
x2 = torch.randn(3, 8, 1)

In [None]:
pc(x1, x2).squeeze()

In [None]:
def eval_pc(x1, x2):
    """Call ``pc`` on both inputs with a trailing singleton dimension appended.

    The last dimension of the circuit output is squeezed away before returning.
    """
    lhs = x1.unsqueeze(-1)
    rhs = x2.unsqueeze(-1)
    return pc(lhs, rhs).squeeze(-1)

eval_pc(x1.squeeze(), x2.squeeze())

In [None]:
from gpytorch.kernels import RBFKernel

# x = torch.randn(3, 5)
covar_module = RBFKernel()
covar_module.lengthscale = torch.tensor(3.3)
covar_module(x1.squeeze(), x2.squeeze()).evaluate()

In [None]:
x1.squeeze().shape

In [None]:
from gpytorch.kernels import RBFKernel
x = torch.randn(3, 2)
# Keep a reference to the kernel: assigning the lengthscale on a temporary
# `RBFKernel()` configures an object that is discarded immediately.
rbf_kernel = RBFKernel()
rbf_kernel.lengthscale = torch.tensor(3.3)

In [None]:
# Test RBF input output = RBF kernel 

In [None]:
from gpytorch.kernels import RBFKernel, SpectralMixtureKernel

x = torch.randn(3, 5)
covar_module = SpectralMixtureKernel(num_mixtures=2, ard_num_dims=5)
covar_module.mixture_scales = torch.tensor(3.3).expand(1, 2, 1, 5)
covar_module.mixture_means = torch.tensor(2.2).expand(1, 2, 1, 5)
covar_module.mixture_weights = torch.tensor([0.5]).expand(1, 2, 1, 5)
covar_module(x).evaluate()
# covar_module.lengthscale

In [None]:
from cirkit.layers.input.sm_kernel import SMKernelLayer
input_la = SMKernelLayer(num_vars=5, num_output_units=1)

input_la.params = torch.nn.Parameter(torch.ones((5,1))*3.3)

# input_la(x1, x2).squeeze().shape

# input_la(x.unsqueeze(-1), x.unsqueeze(-1)).shape

torch.prod(torch.exp(input_la(x.unsqueeze(-1), x.unsqueeze(-1)).squeeze()), dim=2)

In [None]:
input_la = RBFKernelLayer(num_vars=20, num_output_units=1)

input_la.params = torch.nn.Parameter(torch.ones((20,1))*3.3)

# input_la(x1, x2).squeeze().shape
torch.prod(input_la(x1, x1).squeeze(), dim=2)

In [None]:
from gpytorch.kernels import RBFKernel

x = torch.randn(3, 5)
covar_module = RBFKernel()
covar_module.lengthscale = torch.tensor(3.3)
covar_module(x).evaluate()
# covar_module.lengthscale

In [None]:
from cirkit.layers.input.rbf_kernel import RBFKernelLayer
input_la = RBFKernelLayer(num_vars=5, num_output_units=1)

input_la.params.param = torch.nn.Parameter(torch.log(torch.ones(tuple(input_la.params.shape))*3.3))
# pc.input_layer.params.param = torch.nn.Parameter(torch.log(torch.ones(tuple(pc.input_layer.params.shape))*3.3))

# input_la(x1, x2).squeeze().shape

# input_la(x.unsqueeze(-1), x.unsqueeze(-1)).shape

torch.prod(torch.exp(input_la(x.unsqueeze(-1), x.unsqueeze(-1)).squeeze()), dim=2)

In [None]:
import math  # imported here because the notebook's other `import math` sits in a later cell

# Three evenly spaced inputs in [0, 1] and one period of a sine evaluated on them.
train_x = torch.linspace(0, 1, 3)
torch.sin(train_x * (2 * math.pi))

In [None]:
import math


# train_x = torch.linspace(0, 1, 3)
# train_y = torch.sin(train_x * (2 * math.pi))
train_x = torch.rand((3, 5))
train_y = torch.rand((3))

covar_module = SpectralMixtureKernel(num_mixtures=4, ard_num_dims=5)
covar_module.initialize_from_data(train_x, train_y)
covar_module(train_x).evaluate()

In [None]:
from cirkit.layers.input.sm_kernel import SMKernelLayer
input_la = SMKernelLayer(num_vars=1, num_output_units=4)

input_la.params_mu.param = torch.nn.Parameter(covar_module.mixture_means)
input_la.params_sigma.param = torch.nn.Parameter(torch.log(covar_module.mixture_scales))


to_be_weighted = input_la(train_x.unsqueeze(-1), train_x.unsqueeze(-1))

to_be_weighted = torch.prod(to_be_weighted, dim=2, keepdim=True) / 5

tensor1_expanded = covar_module.mixture_weights.expand_as(to_be_weighted.squeeze(-1))

# Element-wise multiplication and then sum over the inner product dimension (dimension 3 after squeeze)
(tensor1_expanded * to_be_weighted.squeeze(-1)).sum(dim=3).squeeze()

# torch.prod(finalfinal, dim=-1, keepdim=False)

In [None]:
to_be_weighted.shape

In [None]:
covar_module.mixture_scales.shape

In [None]:
covar_module(x1).evaluate().shape

In [None]:
train_x.unsqueeze(-1).unsqueeze(-1).shape

In [None]:
x_2 = torch.tensor([[-0.6281], [ 0.1011], [ 0.0664]])

In [None]:
from cirkit.layers.input.rbf_kernel import RBFKernelLayer
input_la = RBFKernelLayer(num_vars=2, num_output_units=1)

input_la.params = torch.nn.Parameter(torch.ones((1,1))*3.3)

input_la(x_2.unsqueeze(-1), x_2.unsqueeze(-1)).squeeze()

In [None]:
input_la.params

In [None]:
torch.ones((2,1))*3.3

In [None]:
train_x.unsqueeze(-1).unsqueeze(-1).shape

In [None]:
input_la(x.unsqueeze(-1), x.unsqueeze(-1)).squeeze()

In [None]:
x_2.unsqueeze(-1).shape

In [None]:
torch.cdist(x1, x2, p=2)

In [None]:
from torch import optim
from torch.utils.data import DataLoader
train_dataloader = DataLoader(data_train, shuffle=True, batch_size=256)
test_dataloader = DataLoader(data_test, shuffle=False, batch_size=256)
optimizer = optim.SGD(pc.parameters(), lr=0.1, momentum=0.9)

Since the constructed PC is not necessarily normalized, we construct the integral circuit that will compute the partition function. Note that parameters are shared and therefore there is no additional memory required.

In [None]:
from cirkit.models.functional import integrate
pc_pf = integrate(pc)

Finally, we optimize the parameters for 5 epochs by minimizing the negative log-likelihood.

In [None]:
# Maximum-likelihood training: minimize the average negative log-likelihood
# of the training images for `num_epochs` passes over the data.
num_epochs = 5
for epoch_idx in range(num_epochs):
    running_loss = 0.0
    for batch, _ in train_dataloader:
        batch = batch.to(device).unsqueeze(dim=-1)  # Add a channel dimension
        log_score = pc(batch)
        log_pf = pc_pf(batch)     # Compute the partition function
        lls = log_score - log_pf  # Compute the log-likelihood
        loss = -torch.mean(lls)   # The loss is the negative average log-likelihood
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # Accumulate a plain Python float; adding the loss tensor itself would
        # keep autograd history attached to the running total.
        running_loss += loss.item() * len(batch)
        # Clamp the parameters to ensure they are in the intended domain.
        # This is needed if we do not use any reparametrization to ensure parameter non-negativity.
        # In our case, clamping is disabled because the parameters are reparameterized (see above).
        #for layer in pc.inner_layers:
        #    layer.clamp_params()
    print(f"Epoch {epoch_idx}: Average NLL: {running_loss / len(data_train):.3f}")

We then evaluate our model on test data by computing the average log-likelihood and bits per dimension.

In [None]:
# Switch the circuit to evaluation mode and score the test set without gradients.
pc.eval()
with torch.no_grad():
    # The partition function is evaluated once (on an empty placeholder input)
    # and reused for every test batch.
    log_pf = pc_pf(torch.empty((), device=device))
    test_lls = 0.0
    for images, _ in test_dataloader:
        batch_lls = pc(images.to(device).unsqueeze(dim=-1)) - log_pf
        test_lls += batch_lls.sum().item()

# Average log-likelihood per image, and the same quantity in bits per variable.
average_ll = test_lls / len(data_test)
bpd = -average_ll / (num_variables * np.log(2.0))
print(f"Average test LL: {average_ll:.3f}")
print(f"Bits per dimension: {bpd}")

In [None]:
#!/usr/bin/env python3

import gpytorch

# from ..functions import RBFCovariance
# from ..settings import trace_mode
from gpytorch.kernels import Kernel


def postprocess_rbf(dist_mat):
    """Turn squared distances into RBF values ``exp(-d / 2)``, in place.

    The input tensor is overwritten and the same tensor object is returned.
    """
    dist_mat.div_(-2)
    dist_mat.exp_()
    return dist_mat


class TestRBFKernel(Kernel):
    r"""
    Stand-alone RBF (squared exponential) kernel built on :class:`gpytorch.kernels.Kernel`.

    For inputs :math:`\mathbf{x_1}` and :math:`\mathbf{x_2}` it evaluates

    .. math::

       \begin{equation*}
          k_{\text{RBF}}(\mathbf{x_1}, \mathbf{x_2}) = \exp \left( -\frac{1}{2}
          (\mathbf{x_1} - \mathbf{x_2})^\top \Theta^{-2} (\mathbf{x_1} - \mathbf{x_2}) \right)
       \end{equation*}

    where :math:`\Theta` is the :attr:`lengthscale` parameter.

    .. note::

        The kernel has no ``outputscale`` parameter and defines no constructor
        of its own, so its constructor options are those of the base
        ``Kernel`` class.

    Example:
        >>> test_kernel = TestRBFKernel()
        >>> test_kernel.lengthscale = torch.tensor(3.3)
        >>> covar = test_kernel(x1, x2).evaluate()
    """

    has_lengthscale = True

    def forward(self, x1, x2, diag=False, **params):
        """Scale both inputs by the lengthscale and delegate to ``covar_dist``.

        Squared distances are requested and post-processed by
        ``postprocess_rbf``; ``diag`` and any extra ``params`` are forwarded
        unchanged.
        """
        scaled_x1, scaled_x2 = (inputs.div(self.lengthscale) for inputs in (x1, x2))
        return self.covar_dist(
            scaled_x1,
            scaled_x2,
            square_dist=True,
            diag=diag,
            dist_postprocess_func=postprocess_rbf,
            postprocess=True,
            **params,
        )

In [None]:
test_kernel = TestRBFKernel()
test_kernel.lengthscale = torch.tensor(3.3)

In [None]:
test_kernel.lengthscale

In [None]:
test_kernel(x1.squeeze(),x2.squeeze()).evaluate()

In [None]:
x1.shape