# Train and evaluate a PC

In [1]:
import random
import torch
import numpy as np
import matplotlib.pyplot as plt

In [2]:
device = torch.device("cpu")  # The device to use, e.g., "cpu", "cuda", "cuda:1"

%load_ext autoreload
%autoreload 2

In [3]:
%reload_ext autoreload

Set the random seeds.

In [4]:
random.seed(4)
np.random.seed(4)
torch.manual_seed(4)
# if 'cuda' in device.type:
#     torch.cuda.manual_seed(42)

<torch._C.Generator at 0x7f84f0a348d0>

## Load MNIST Dataset

Load the training and test splits of MNIST, and preprocess them by flattening the tensor images.

In [None]:
from torchvision import transforms, datasets
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Lambda(lambda x: (255 * x.view(-1)).long())
])
data_train = datasets.MNIST('datasets', train=True, download=True, transform=transform)
data_test = datasets.MNIST('datasets', train=False, download=True, transform=transform)
num_variables = data_train[0][0].shape[0]
height, width = 28, 28
print(f"Number of variables: {num_variables}")

In [None]:
plt.matshow(data_train[0][0].reshape(28, 28), cmap='gray')
plt.title(f"Class: {data_train[0][1]}")
plt.show()

In [None]:
import torch
from torch.utils.data import DataLoader, random_split
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch import nn
import numpy as np

# Set random seed for reproducibility
torch.manual_seed(24)

# MNIST Data loading and preprocessing
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))  # MNIST mean and std
])

# Load datasets
mnist_train = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
mnist_test = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

# Split training data into training and validation sets
train_size = int(0.8 * len(mnist_train))
val_size = len(mnist_train) - train_size
ds_train, ds_val = random_split(mnist_train, [train_size, val_size])

# Data loaders
batch_size = 128
dl_train = DataLoader(ds_train, batch_size=batch_size, shuffle=True)
dl_val = DataLoader(ds_val, batch_size=512, shuffle=False)
dl_test = DataLoader(mnist_test, batch_size=512, shuffle=False)

In [None]:
ds_train

In [None]:


# Model parameters
epochs = 50
input_dim = 28 * 28  # MNIST images are 28x28 pixels
num_classes = 10  # MNIST has 10 classes

# Define your GP model here. Assuming GP, GaussianLikelihood, VariationalELBO are already defined or available through a library
# Since we are working with MNIST, we can use a simpler feature extractor or even use the raw pixels directly

feature_extractor = IdentityMapping()

n_inducing_points = 50
initial_inducing_points, initial_lengthscale = initial_values(
            ds_train, feature_extractor, n_inducing_points
)

gp_model = CircuitGP(
        num_outputs=num_classes,
        num_features=input_dim,          # CHANGE features / input_dim
        initial_lengthscale=initial_lengthscale,
        initial_inducing_points=initial_inducing_points,
        circuit=pc
        # kernel=kernel,
)

    # model = DKL(feature_extractor, gp)
model = gp_model

likelihood = GaussianLikelihood(num_classes=num_classes)
elbo_fn = VariationalELBO(likelihood, model, num_data=len(ds_train), classification=True)
loss_fn = lambda x, y: -elbo_fn(x, y)



likelihood = GaussianLikelihood(num_classes=num_classes)  # Adapted for classification
elbo_fn = VariationalELBO(likelihood, model, num_data=len(ds_train), classification=True)
loss_fn = lambda x, y: -elbo_fn(x, y)

# Optimizer
optimizer = torch.optim.Adam([
    {"params": model.parameters(), "lr": 1e-3},
    {"params": likelihood.parameters(), "lr": 1e-3}
])

# Training and evaluation loops
# Define `step` and `eval_step` functions similarly to the CIFAR example but adapted for MNIST

# Train the model
for epoch in range(epochs):
    model.train()
    for x_batch, y_batch in dl_train:
        optimizer.zero_grad()
        output = model(x_batch)
        loss = loss_fn(output, y_batch)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch}, Loss: {loss.item()}')

# Evaluation logic here, similar to the training loop but without backpropagation steps


## Instantiating the region graph

Initialize a _Quad Graph_ region graph.

In [109]:
from cirkit.region_graph.quad_tree import QuadTree
# region_graph = QuadTree(28, 28, struct_decomp=True)
# region_graph = RandomBinaryTree(num_vars=8, depth=3, num_repetitions=1)
region_graph = FullyFactorized(num_vars=8)

In [None]:
region_graph

In [None]:
region_graph._nodes

Others available region graphs are _Poon Domingos_ and _QuadTree_, whose imports are showed below.

In [110]:
from cirkit.region_graph.poon_domingos import PoonDomingos
from cirkit.region_graph.random_binary_tree import RandomBinaryTree
from cirkit.region_graph.fully_factorized import FullyFactorized

## Choosing the layers

Now we have to choose both the input and inner layers of our circuit. As input layer we select the _CategoricalLayer_ with 256 categories (the number of pixel values). For the inner layer instead, we choose the _uncollapsed CP_ layer with rank 1.

In [122]:
from cirkit.layers.input.exp_family import CategoricalLayer
from cirkit.layers.sum_product import CPLayer
from cirkit.layers.input.rbf_kernel import RBFKernelLayer
from cirkit.layers.input.sm_kernel import SMKernelLayer

efamily_cls = SMKernelLayer # RBFKernelLayer
efamily_kwargs = {}
layer_cls = CPLayer
layer_kwargs = {'rank': 1}

## Building the tensorized PC

We can now build our tensorized PC by specifying the region graph and layers we chose previously. In addition, we can scale the architecture by increasing the number of input and inner units. We can also have circuits with multiple output units by choosing _num_classes > 1_. However, in this notebook we only estimate the distribution of the images and marginalize out the class variable.

To ensure weights are non-negative we reparametrize them via exponentiation. Several reparametrization functions are available.

In [123]:
from cirkit.reparams.leaf import ReparamExp, ReparamLogSoftmax, ReparamSoftmax
from cirkit.models.tensorized_circuit import TensorizedPC
pc = TensorizedPC.from_region_graph(
    region_graph,
    num_inner_units=100, # 76
    num_input_units=100,
    efamily_cls=efamily_cls,
    efamily_kwargs=efamily_kwargs,
    layer_cls=layer_cls,
    layer_kwargs=layer_kwargs,
    num_classes=1,
    reparam=ReparamSoftmax # ReparamLogSoftmax # ReparamExp
)
pc.to(device)
print(pc)

TensorizedPC(
  (input_layer): SMKernelLayer(
    (params_sigma): ReparamExp()
    (params_mu): ReparamIdentity()
  )
  (scope_layer): ScopeLayer()
  (inner_layers): ModuleList(
    (0): CollapsedCPLayer(
      (params_in): ReparamSoftmax()
    )
  )
)


In [124]:
for param in pc.parameters(): 
    print (param.shape)

torch.Size([100, 1, 8])
torch.Size([100, 1, 8])
torch.Size([1, 8, 100, 1])


In [63]:
from cirkit.models.rbf_kernel import RBFCircuitKernel


circuit_kernel = RBFCircuitKernel(pc, batch_shape=torch.Size([]))

All circuit parameters shape: 
torch.Size([8, 81])
torch.Size([1, 8, 81, 1])


In [17]:
from cirkit.models.gp import CircuitGP, initial_values

In [114]:
import torch.nn.functional as F

from uci_datasets import Dataset

from ignite.engine import Events, Engine
from ignite.metrics import Average, Loss
from ignite.contrib.handlers import ProgressBar

import gpytorch
from gpytorch.mlls import VariationalELBO
from gpytorch.likelihoods import GaussianLikelihood

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [115]:
data = Dataset("kin40k")
x_train, y_train, x_test, y_test = data.get_split(split=1)

kin40k dataset, N=40000, d=8


In [102]:
x_train.shape, x_test.shape

((36000, 8), (4000, 8))

In [116]:
x_train_real = x_train[:32000] #32000 # 2053   36584    36584     39063   13281    2672   # RE-RUN # 13279   # 1279   4701  824
y_train_real = y_train[:32000]
y_train_real = y_train_real.squeeze()
x_val = x_train[32000:]
y_val = y_train[32000:]
y_val = y_val.squeeze()
y_test = y_test.squeeze()

In [117]:
mean = x_train_real.mean(axis=0)
std = x_train_real.std(axis=0)

x_train_real_normalized = (x_train_real - mean) / std
x_val_normalized = (x_val - mean) / std
x_test_normalized = (x_test - mean) / std

In [None]:
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        # Define the linear layer with input dimension 8 and output dimension 256
        self.linear = nn.Linear(in_features=8, out_features=256)
        # Define the ReLU activation function
        self.relu = nn.ReLU()
        
    def forward(self, x):
        # Apply linear layer and then ReLU activation to the input x
        x = self.linear(x)
        x = self.relu(x)
        return x

In [104]:
import torch.nn as nn

class IdentityMapping(nn.Module):
    def __init__(self):
        super(IdentityMapping, self).__init__()
    
    def forward(self, x):
        return x

In [125]:
np.random.seed(24)
torch.manual_seed(24) ####################### CHANGE

batch_size = 32 # 64

# X_train, y_train = make_data(n_samples)
# X_test, y_test = X_train, y_train

# x_train, y_train, x_test, y_test

ds_train = torch.utils.data.TensorDataset(torch.from_numpy(x_train_real_normalized).float(), torch.from_numpy(y_train_real).float())
dl_train = torch.utils.data.DataLoader(ds_train, batch_size=batch_size, shuffle=True, drop_last=True) # suffle 

ds_val = torch.utils.data.TensorDataset(torch.from_numpy(x_val_normalized).float(), torch.from_numpy(y_val).float())
dl_val = torch.utils.data.DataLoader(ds_val, batch_size=512, shuffle=False)

ds_test = torch.utils.data.TensorDataset(torch.from_numpy(x_test_normalized).float(), torch.from_numpy(y_test).float())
dl_test = torch.utils.data.DataLoader(ds_test, batch_size=512, shuffle=False)

# steps = 5e3
epochs = 50
print(f"Training with {len(x_train_real)} datapoints for {epochs} epochs")

# Change this boolean to False for SNGP
DUE = True

input_dim = 8 # input di  # 128
# features = 1024 # hidden    128
# depth = 2   # 4  6
num_outputs = 1 # regression with 1D output
# spectral_normalization = True
# coeff = 0.95
# n_power_iterations = 1
# dropout_rate = 0.01

# feature_extractor = FCResNet(
#     input_dim=input_dim, 
#     features=features, 
#     depth=depth, 
#     spectral_normalization=spectral_normalization, 
#     coeff=coeff, 
#     n_power_iterations=n_power_iterations,
#     dropout_rate=dropout_rate
# )

feature_extractor = IdentityMapping()

if DUE:
    n_inducing_points = 10
    kernel = "HBF" ################# change 
    
    initial_inducing_points, initial_lengthscale = initial_values(
            ds_train, feature_extractor, n_inducing_points
    )

    gp_model = CircuitGP(
        num_outputs=num_outputs,
        num_features=input_dim,          # CHANGE features / input_dim
        initial_lengthscale=initial_lengthscale,
        initial_inducing_points=initial_inducing_points,
        circuit=pc
        # kernel=kernel,
    )

    # model = DKL(feature_extractor, gp)

    likelihood = GaussianLikelihood()
    elbo_fn = VariationalELBO(likelihood, gp_model, num_data=len(ds_train))
    loss_fn = lambda x, y: -elbo_fn(x, y)
    
    # mse_loss_fn = F.mse_loss
# else:
    # Nothing 
#     num_gp_features = 128
#     num_random_features = 1024
#     normalize_gp_features = True
#     feature_scale = 2
#     ridge_penalty = 1
    
#     model = Laplace(feature_extractor,
#                     features,
#                     num_gp_features,
#                     normalize_gp_features,
#                     num_random_features,
#                     num_outputs,
#                     len(ds_train),
#                     batch_size,
#                     ridge_penalty=ridge_penalty,
#                     feature_scale=feature_scale
#                    )

#     loss_fn = F.mse_loss # MSE

if torch.cuda.is_available():
    gp_model = gp_model.cuda()
    if DUE:
        likelihood = likelihood.cuda()

# learning rate   
lr = 1e-3

parameters = [
    {"params": gp_model.parameters(), "lr": lr},
]

if DUE:
    parameters.append({"params": likelihood.parameters(), "lr": lr})
    
    
optimizer = torch.optim.Adam(parameters)
pbar = ProgressBar()

def step(engine, batch):
    gp_model.train()
    if DUE:
        likelihood.train()
    
    optimizer.zero_grad()
    
    x, y = batch
    if torch.cuda.is_available():
        x = x.cuda()
        y = y.cuda()

    y_pred = gp_model(x) # get y
    
    if not DUE:
        y_pred.squeeze_()
    
#     print("y_pred", y_pred)
#     print("y_pred_real", likelihood(y_pred).mean.cpu())
#     print("y", y)
    loss = loss_fn(y_pred, y) # loss
    
    loss.backward()
    optimizer.step()
    
    return loss.item()


def eval_step(engine, batch):
    gp_model.eval() # set to eval
    if DUE:
        likelihood.eval()
    
    x, y = batch
    if torch.cuda.is_available():
        x = x.cuda()
        y = y.cuda()

    y_pred = gp_model(x)
    
    # eval_mes_loss = mse_loss_fn(y_pred, y) # MSE eval
            
    return y_pred, y

    
trainer = Engine(step)
evaluator = Engine(eval_step)

metric = Average()
metric.attach(trainer, "loss")
pbar.attach(trainer)

if DUE:
    metric = Loss(lambda y_pred, y: - likelihood.expected_log_prob(y, y_pred).mean())
    # metric = Loss(lambda y_pred, y: F.mse_loss(likelihood(y_pred).mean.cpu(), y))
else:
    metric = Loss(lambda y_pred, y: F.mse_loss(y_pred[0].squeeze(), y))


metric.attach(evaluator, "loss")

@trainer.on(Events.EPOCH_COMPLETED(every=int(epochs/20) + 1))
def log_results(trainer):
    evaluator.run(dl_val) # val dataset
    print(f"Results - Epoch: {trainer.state.epoch} - "
          f"Val Loss: {evaluator.state.metrics['loss']:.2f} - "
          f"Train Loss: {trainer.state.metrics['loss']:.2f}")

    
if not DUE:
    @trainer.on(Events.EPOCH_STARTED)
    def reset_precision_matrix(trainer):
        gp_model.reset_precision_matrix()

Training with 32000 datapoints for 50 epochs
f_X_samples torch.Size([1000, 8])
initial_lengthscale tensor(3.8926)
All circuit parameters shape: 
torch.Size([100, 1, 8])
torch.Size([100, 1, 8])
torch.Size([1, 8, 100, 1])


  super()._check_params_vs_input(X, default_n_init=3)


AttributeError: 'SMKernelLayer' object has no attribute 'params'

In [119]:
for param in gp_model.parameters(): 
    print(param.shape)
    # print(param)

torch.Size([10, 8])
torch.Size([10])
torch.Size([10, 10])
torch.Size([8, 100])
torch.Size([1, 8, 100, 1])
torch.Size([1])
torch.Size([])


In [120]:
trainer.run(dl_train, max_epochs=epochs)

[1/1000]   0%|           [00:00<?]

[1/1000]   0%|           [00:00<?]

Engine run is terminating due to exception: 


KeyboardInterrupt: 

In [108]:
gp_model.eval()
if DUE:
    likelihood.eval()

all_mse = []
    
with torch.no_grad(), gpytorch.settings.num_likelihood_samples(100):
    
    xx_split = np.array_split(x_test, 40)       ############# CHANGE
    yy_split = np.array_split(y_test, 40)
    
    for index in range(len(xx_split)):
    
        xx = torch.from_numpy(xx_split[index]).float()
        yy = torch.from_numpy(yy_split[index]).float()
        pred_test = gp_model(xx)
        ol = likelihood(pred_test)
        output = ol.mean.cpu()
        mse = F.mse_loss(output, yy)
        all_mse.append(mse)
    
    
average_mse = sum(all_mse) / len(all_mse)
average_mse

tensor(0.6853)

In [None]:
pc.input_layer.params.param.shape
# (self.num_vars, self.num_output_units, self.num_replicas, self.num_suff_stats)

In [None]:
pc.scope_layer.scope.shape

In [None]:
pc.inner_layers[0].params_in() #.param #.shape #.param.shape
# (F, H, I, O)
# (fold count, arity, input, output)

In [None]:
from cirkit.models.rbf_kernel import RBFCircuitKernel

circuit_kernel = RBFCircuitKernel(pc, batch_shape=torch.Size([]))


In [None]:
circuit_kernel(x1.squeeze(), x2.squeeze()).evaluate()

In [None]:
x1.squeeze().shape

In [None]:
# set parameters

pc.input_layer.params.param = torch.nn.Parameter(torch.log(torch.ones(tuple(pc.input_layer.params.shape))*3.3))
# pc.inner_layers[0].params_in.param = torch.nn.Parameter(torch.log(0.25*torch.ones(tuple(pc.inner_layers[0].params_in.shape))))
# pc.inner_layers[0].params_in = torch.nn.Parameter(torch.ones(tuple(pc.inner_layers[0].params_in.shape))*3.3)
# pc.inner_layers[1].params_in = torch.nn.Parameter(torch.ones(tuple(pc.inner_layers[1].params_in.shape))*3.3)
# pc.inner_layers[2].params_in = torch.nn.Parameter(torch.ones(tuple(pc.inner_layers[2].params_in.shape))*3.3)
# pc.inner_layers[3].params_in = torch.nn.Parameter(torch.ones(tuple(pc.inner_layers[3].params_in.shape))*3.3)

In [None]:
pc.inner_layers[0].params_in() #.shape

In [None]:
x1 = torch.randn(3, 8, 1)
x2 = torch.randn(3, 8, 1)

In [None]:
pc(x1, x2).squeeze()

In [None]:
def eval_pc(x1, x2): 
    return pc(x1.unsqueeze(-1), x2.unsqueeze(-1)).squeeze(-1)

eval_pc(x1.squeeze(), x2.squeeze())

In [None]:
from gpytorch.kernels import RBFKernel

# x = torch.randn(3, 5)
covar_module = RBFKernel()
covar_module.lengthscale = torch.tensor(3.3)
covar_module(x1.squeeze(), x2.squeeze()).evaluate()

In [None]:
x1.squeeze().shape

In [None]:
from gpytorch.kernels import RBFKernel
x = torch.randn(3, 2)
RBFKernel().lengthscale = torch.tensor(3.3)

In [None]:
# Test RBF input output = RBF kernel 

In [None]:
from gpytorch.kernels import RBFKernel

x = torch.randn(3, 5)
covar_module = RBFKernel()
covar_module.lengthscale = torch.tensor(3.3)
covar_module(x).evaluate()
# covar_module.lengthscale

In [None]:
from cirkit.layers.input.rbf_kernel import RBFKernelLayer
input_la = RBFKernelLayer(num_vars=5, num_output_units=1)

input_la.params = torch.nn.Parameter(torch.ones((5,1))*3.3)

# input_la(x1, x2).squeeze().shape

# input_la(x.unsqueeze(-1), x.unsqueeze(-1)).shape

torch.prod(torch.exp(input_la(x.unsqueeze(-1), x.unsqueeze(-1)).squeeze()), dim=2)

In [None]:
input_la = RBFKernelLayer(num_vars=20, num_output_units=1)

input_la.params = torch.nn.Parameter(torch.ones((20,1))*3.3)

# input_la(x1, x2).squeeze().shape
torch.prod(input_la(x1, x1).squeeze(), dim=2)

In [None]:
covar_module(x1).evaluate().shape

In [None]:
x1.shape

In [None]:
x_2 = torch.tensor([[-0.6281], [ 0.1011], [ 0.0664]])

In [None]:
from cirkit.layers.input.rbf_kernel import RBFKernelLayer
input_la = RBFKernelLayer(num_vars=2, num_output_units=1)

input_la.params = torch.nn.Parameter(torch.ones((1,1))*3.3)

input_la(x_2.unsqueeze(-1), x_2.unsqueeze(-1)).squeeze()

In [None]:
input_la.params

In [None]:
torch.ones((2,1))*3.3

In [None]:
input_la(x.unsqueeze(-1), x.unsqueeze(-1)).squeeze()

In [None]:
x_2.unsqueeze(-1).shape

In [None]:
torch.cdist(x1, x2, p=2)

In [None]:
from torch import optim
from torch.utils.data import DataLoader
train_dataloader = DataLoader(data_train, shuffle=True, batch_size=256)
test_dataloader = DataLoader(data_test, shuffle=False, batch_size=256)
optimizer = optim.SGD(pc.parameters(), lr=0.1, momentum=0.9)

Since the constructed PC is not necessarily normalized, we construct the integral circuit that will compute the partition function. Note that parameters are shared and therefore there is no additional memory required.

In [None]:
from cirkit.models.functional import integrate
pc_pf = integrate(pc)

Finally, we optimize the parameters for 5 epochs by minimizing the negative log-likelohood.

In [None]:
num_epochs = 5
for epoch_idx in range(num_epochs):
    running_loss = 0.0
    for batch, _ in train_dataloader:
        batch = batch.to(device).unsqueeze(dim=-1)  # Add a channel dimension
        log_score = pc(batch)
        log_pf = pc_pf(batch)     # Compute the partition function
        lls = log_score - log_pf  # Compute the log-likelihood
        loss = -torch.mean(lls)   # The loss is the negative average log-likelihood
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        running_loss += loss * len(batch)
        # Clamp the parameters to ensure they are in the intended domain
        # This is needed if we do not use any reparametrization to ensure parameters non-negativity
        # In our case, clamping is disable becuase we reparameterize via exponentiation (see above)
        #for layer in model.inner_layers:
        #    layer.clamp_params()
    print(f"Epoch {epoch_idx}: Average NLL: {running_loss / len(data_train):.3f}")

We then evaluate our model on test data by computing the average log-likelihood and bits per dimension.

In [None]:
with torch.no_grad():
    pc.eval()
    log_pf = pc_pf(torch.empty((), device=device))  # Compute the partition function once for testing
    test_lls = 0.0
    for batch, _ in test_dataloader:
        log_score = pc(batch.to(device).unsqueeze(dim=-1))
        lls = log_score - log_pf
        test_lls += lls.sum().item()
    average_ll = test_lls / len(data_test)
    bpd = -average_ll / (num_variables * np.log(2.0))
    print(f"Average test LL: {average_ll:.3f}")
    print(f"Bits per dimension: {bpd}")

In [None]:
#!/usr/bin/env python3

import gpytorch

# from ..functions import RBFCovariance
# from ..settings import trace_mode
from gpytorch.kernels import Kernel


def postprocess_rbf(dist_mat):
    return dist_mat.div_(-2).exp_()


class TestRBFKernel(Kernel):
    r"""
    Computes a covariance matrix based on the RBF (squared exponential) kernel
    between inputs :math:`\mathbf{x_1}` and :math:`\mathbf{x_2}`:

    .. math::

       \begin{equation*}
          k_{\text{RBF}}(\mathbf{x_1}, \mathbf{x_2}) = \exp \left( -\frac{1}{2}
          (\mathbf{x_1} - \mathbf{x_2})^\top \Theta^{-2} (\mathbf{x_1} - \mathbf{x_2}) \right)
       \end{equation*}

    where :math:`\Theta` is a :attr:`lengthscale` parameter.
    See :class:`gpytorch.kernels.Kernel` for descriptions of the lengthscale options.

    .. note::

        This kernel does not have an `outputscale` parameter. To add a scaling parameter,
        decorate this kernel with a :class:`gpytorch.kernels.ScaleKernel`.

    Args:
        :attr:`ard_num_dims` (int, optional):
            Set this if you want a separate lengthscale for each
            input dimension. It should be `d` if :attr:`x1` is a `n x d` matrix. Default: `None`
        :attr:`batch_shape` (torch.Size, optional):
            Set this if you want a separate lengthscale for each
            batch of input data. It should be `b` if :attr:`x1` is a `b x n x d` tensor. Default: `torch.Size([])`.
        :attr:`active_dims` (tuple of ints, optional):
            Set this if you want to compute the covariance of only a few input dimensions. The ints
            corresponds to the indices of the dimensions. Default: `None`.
        :attr:`lengthscale_prior` (Prior, optional):
            Set this if you want to apply a prior to the lengthscale parameter.  Default: `None`.
        :attr:`lengthscale_constraint` (Constraint, optional):
            Set this if you want to apply a constraint to the lengthscale parameter. Default: `Positive`.
        :attr:`eps` (float):
            The minimum value that the lengthscale can take (prevents divide by zero errors). Default: `1e-6`.

    Attributes:
        :attr:`lengthscale` (Tensor):
            The lengthscale parameter. Size/shape of parameter depends on the
            :attr:`ard_num_dims` and :attr:`batch_shape` arguments.

    Example:
        >>> x = torch.randn(10, 5)
        >>> # Non-batch: Simple option
        >>> covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())
        >>> # Non-batch: ARD (different lengthscale for each input dimension)
        >>> covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel(ard_num_dims=5))
        >>> covar = covar_module(x)  # Output: LazyTensor of size (10 x 10)
        >>>
        >>> batch_x = torch.randn(2, 10, 5)
        >>> # Batch: Simple option
        >>> covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())
        >>> # Batch: different lengthscale for each batch
        >>> covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel(batch_shape=torch.Size([2])))
        >>> covar = covar_module(x)  # Output: LazyTensor of size (2 x 10 x 10)
    """

    has_lengthscale = True

    def forward(self, x1, x2, diag=False, **params):

        x1_ = x1.div(self.lengthscale)
        x2_ = x2.div(self.lengthscale)
        
        # print ("x1, x2", x1_, x2_)
        
        return self.covar_dist(
            x1_, x2_, square_dist=True, diag=diag, dist_postprocess_func=postprocess_rbf, postprocess=True, **params
        )

In [None]:
test_kernel = TestRBFKernel()
test_kernel.lengthscale = torch.tensor(3.3)

In [None]:
test_kernel.lengthscale

In [None]:
test_kernel(x1.squeeze(),x2.squeeze()).evaluate()

In [126]:
x1.shape

NameError: name 'x1' is not defined

In [129]:
import random
import torch
import numpy as np
import matplotlib.pyplot as plt

device = torch.device("cpu")

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if 'cuda' in device.type:
    torch.cuda.manual_seed(42)

In [152]:
from torchvision import transforms, datasets
# transform = transforms.Compose([
#     transforms.ToTensor(),
#     transforms.Lambda(lambda x: (255 * x.view(-1)).long())
# ])
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Lambda(lambda x: x.view(-1))
])
data_train = datasets.MNIST('datasets', train=True, download=True, transform=transform)
data_test = datasets.MNIST('datasets', train=False, download=True, transform=transform)
num_variables = data_train[0][0].shape[0]
height, width = 28, 28
print(f"Number of variables: {num_variables}")

Number of variables: 784


In [153]:
data_train[0][0].shape

torch.Size([784])

: 

In [138]:
from cirkit.region_graph.quad_tree import QuadTree
region_graph = QuadTree(width, height, struct_decomp=False)

from cirkit.layers.input.exp_family import CategoricalLayer
from cirkit.layers.sum_product import CPLayer
efamily_cls = CategoricalLayer
efamily_kwargs = {'num_categories': 256}
layer_cls = CPLayer
layer_kwargs = {'rank': 1}

from cirkit.reparams.leaf import ReparamExp
from cirkit.models.tensorized_circuit import TensorizedPC
pc = TensorizedPC.from_region_graph(
    region_graph,
    num_inner_units=2,
    num_input_units=2,
    efamily_cls=efamily_cls,
    efamily_kwargs=efamily_kwargs,
    layer_cls=layer_cls,
    layer_kwargs=layer_kwargs,
    num_classes=1,
    reparam=ReparamExp
)
pc.to(device)
print(pc)

TensorizedPC(
  (input_layer): CategoricalLayer(
    (params): ReparamEFCategorical()
  )
  (scope_layer): ScopeLayer()
  (inner_layers): ModuleList(
    (0-1): 2 x CollapsedCPLayer(
      (params_in): ReparamExp()
    )
    (2): SumLayer(
      (params): ReparamExp()
    )
    (3): CollapsedCPLayer(
      (params_in): ReparamExp()
    )
    (4): SumLayer(
      (params): ReparamExp()
    )
    (5): CollapsedCPLayer(
      (params_in): ReparamExp()
    )
    (6): SumLayer(
      (params): ReparamExp()
    )
    (7): CollapsedCPLayer(
      (params_in): ReparamExp()
    )
    (8): SumLayer(
      (params): ReparamExp()
    )
    (9): CollapsedCPLayer(
      (params_in): ReparamExp()
    )
    (10): SumLayer(
      (params): ReparamExp()
    )
    (11-12): 2 x CollapsedCPLayer(
      (params_in): ReparamExp()
    )
    (13): SumLayer(
      (params): ReparamExp()
    )
    (14-15): 2 x CollapsedCPLayer(
      (params_in): ReparamExp()
    )
    (16): SumLayer(
      (params): ReparamExp(

In [142]:
from torch import optim
from torch.utils.data import DataLoader
train_dataloader = DataLoader(data_train, shuffle=True, batch_size=32)
test_dataloader = DataLoader(data_test, shuffle=False, batch_size=32)
optimizer = optim.SGD(pc.parameters(), lr=0.1, momentum=0.9)

In [143]:
from cirkit.models.functional import integrate
pc_pf = integrate(pc)

In [145]:
num_epochs = 5
for epoch_idx in range(num_epochs):
    running_loss = 0.0
    for batch, _ in train_dataloader:
        log_score = pc.forward(batch)
        log_pf = pc_pf()          # Compute the partition function
        lls = log_score - log_pf  # Compute the log-likelihood
        loss = -torch.mean(lls)   # The loss is the negative average log-likelihood
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        running_loss += loss * len(batch)
        # Clamp the parameters to ensure they are in the intended domain
        # This is needed if we do not use any reparametrization to ensure parameters non-negativity
        # In our case, clamping is disable becuase we reparameterize via exponentiation (see above)
        #for layer in model.inner_layers:
        #    layer.clamp_params()
    print(f"Epoch {epoch_idx}: Average NLL: {running_loss / len(data_train):.3f}")

suff_stats torch.Size([32, 200704])


RuntimeError: einsum(): subscript d has size 32 for operand 1 which does not broadcast with previously seen size 784