# Train and Visualize a PC on the Moons Dataset

In [None]:
import random
import torch
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Device that the circuit and every data batch are moved to in the cells below.
device = torch.device("cpu")  # The device to use, e.g., "cpu", "cuda", "cuda:1"

Set the random seeds.

In [None]:
# Seed Python's, NumPy's and PyTorch's generators with the same value so that
# repeated runs of the notebook are reproducible.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(42)
if 'cuda' in device.type:
    # Also seed the CUDA generator when running on a GPU.
    torch.cuda.manual_seed(42)

Generate the Dataset

In [None]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# Draw 3000 noisy samples from scikit-learn's two-moons generator; `data` keeps
# the raw (features, labels) pair.
X, y = data = make_moons(n_samples=3000, noise=0.1, random_state=0)

# Hold out 20% of the samples as a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Create the PC

In [None]:
import cirkit.new.region_graph as rg

# region_graph = rg.algorithms.FullyFactorized(num_vars=2)
# The moons data has two features, so the grid handed to the quad-tree needs
# width * height == 2 cells (one per variable).
# NOTE(review): `width` and `height` were not defined anywhere before this cell
# (NameError); 2 x 1 is assumed here. Confirm the argument convention of
# QuadTree in the installed cirkit version.
width, height = 2, 1
region_graph = rg.algorithms.QuadTree(width, height, struct_decomp=False)

In [None]:
# Check the structural properties of the region graph before building a circuit
# on top of it, then print it for inspection.
assert (
    region_graph.is_smooth
    and region_graph.is_decomposable
    and region_graph.is_structured_decomposable
    and region_graph.is_omni_compatible
)
print(region_graph)

In [None]:
from cirkit.new.symbolic import (
    SymbolicTensorizedCircuit,
)
from cirkit.new.layers import (
    CPLayer,
    NormalLayer,
)
from cirkit.new.utils.type_aliases import ReparamFactory, SymbLayerCfg
from cirkit.new.reparams import EFNormalReparam, SoftmaxReparam


In [None]:
# Input layers: NormalLayer, built with the EFNormalReparam factory.
input_layer_cls = NormalLayer
input_reparam: ReparamFactory = EFNormalReparam

# Sum layers: CPLayer with no extra kwargs, built with the SoftmaxReparam factory.
sum_layer_cls = CPLayer
sum_layer_kwargs = {}
sum_reparam = SoftmaxReparam

# Product layers: CPLayer with no extra kwargs and no reparameterization factory.
prod_layer_cls = CPLayer
prod_layer_kwargs = {}

# One symbolic layer configuration per layer kind.
input_cfg = SymbLayerCfg(
    layer_cls=input_layer_cls,
    reparam_factory=input_reparam,
)
sum_cfg = SymbLayerCfg(
    layer_cls=sum_layer_cls,
    layer_kwargs=sum_layer_kwargs,
    reparam_factory=sum_reparam,
)
prod_cfg = SymbLayerCfg(
    layer_cls=prod_layer_cls,
    layer_kwargs=prod_layer_kwargs,
)

# Symbolic circuit defined over the region graph built above.
symb_circuit = SymbolicTensorizedCircuit(
    region_graph,
    num_input_units=1024,
    num_sum_units=512,
    input_cfg=input_cfg,
    sum_cfg=sum_cfg,
    prod_cfg=prod_cfg,
)

In [None]:
from cirkit.new.model.tensorized_circuit import TensorizedCircuit
# Build the tensorized circuit from the symbolic circuit defined above.
tens_circuit = TensorizedCircuit(symb_circuit)

In [None]:
# Move the circuit to the selected device, then print it for inspection.
tens_circuit.to(device)
print(tens_circuit)

Prepare the dataset for PyTorch.

In [None]:
from torch import optim
from torch.utils.data import DataLoader
# Mini-batch loaders over the feature arrays only (the labels are not passed to
# the model); training batches are reshuffled, test batches keep their order.
train_dataloader = DataLoader(X_train, batch_size=64, shuffle=True)
test_dataloader = DataLoader(X_test, batch_size=256, shuffle=False)

# Adam over all circuit parameters; the SGD variant is kept for reference.
#optimizer = optim.SGD(tens_circuit.parameters(), lr=0.5, momentum=0.9)
optimizer = optim.Adam(params=tens_circuit.parameters(), lr=0.05)

Set up the data for the plots.

In [None]:
# Plotting window as (min, max) per axis, and the number of grid points per axis.
x1_bounds = (-1.5,2.5)
x2_bounds = (-1,1.5)
num_samples = 400

# Evenly spaced coordinates: map the unit interval affinely onto each window.
x1_lo, x1_hi = x1_bounds
x2_lo, x2_hi = x2_bounds
unit_steps = np.linspace(0, 1, num_samples)
x1 = unit_steps * (x1_hi - x1_lo) + x1_lo
x2 = unit_steps * (x2_hi - x2_lo) + x2_lo

# Full 2D grid, flattened to a (num_samples * num_samples, 2) list of points.
x1v, x2v = np.meshgrid(x1, x2)
X_meshgrid_np = np.stack((x1v, x2v), axis=-1).reshape(-1, 2)

# float32 torch copy of the grid points, used as model input for the plots.
X_meshgrid = torch.from_numpy(X_meshgrid_np).float()

In [None]:
# Per-column minimum and maximum of the grid points, displayed as a sanity check.
X_meshgrid.min(0).values, X_meshgrid.max(0).values

In [None]:
from matplotlib.pyplot import xlim, ylim

def print_density(title=None, scatter=False):
    """Show the circuit's output over the plotting grid as an image.

    Evaluates the global ``tens_circuit`` on every point of ``X_meshgrid``,
    reshapes the result to ``(num_samples, num_samples)`` and displays it with
    ``plt.imshow`` over the window given by ``x1_bounds`` and ``x2_bounds``.

    Args:
        title: Optional figure title; no title is set when ``None``.
        scatter: If true, overlay the training points ``X_train`` on the image.
    """
    # Plotting only: gradients are never needed here, so skip building the
    # autograd graph for the whole grid.
    with torch.no_grad():
        log_score = tens_circuit(X_meshgrid.to(device).float().unsqueeze(dim=-1))
    log_score = log_score.reshape(num_samples, num_samples)
    plt.imshow(log_score.cpu().detach().numpy(), extent=(*x1_bounds, *x2_bounds), origin="lower")
    if title is not None:
        plt.title(title)
    if scatter:
        plt.scatter(X_train[:, 0], X_train[:, 1], alpha=0.5)
    plt.show()

In [None]:
# Plot the circuit's output before the training loop below has run.
print_density("initial density")

In [None]:
# Scatter plot of the training points, coloured by their class label.
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, edgecolors="k")
plt.title("data samples")
plt.show()

In [None]:
# One average training loss value is appended here per epoch by the training loop.
losses = []

In [None]:
num_epochs = 500
# How often to plot the density / log the loss. max(1, ...) keeps the modulo
# below valid when num_epochs is smaller than the divisor.
plot_every = max(1, num_epochs // 5)
log_every = max(1, num_epochs // 100)
for epoch_idx in range(num_epochs):
    running_loss = 0.0
    for batch in train_dataloader:
        batch = batch.to(device).float().unsqueeze(dim=-1)  # Add a channel dimension
        log_score = tens_circuit(batch)
        lls = log_score
        loss = -torch.mean(lls)   # The loss is the negative average log-likelihood
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # .item() yields a plain float, so no autograd history is carried from
        # one batch to the next while accumulating the epoch loss.
        running_loss += loss.item() * len(batch)

    # Sample-weighted average NLL over the whole training set for this epoch.
    avg_nll = running_loss / len(X_train)
    losses.append(avg_nll)
    if epoch_idx % plot_every == 0:
        print_density(f"Epoch: {epoch_idx}")
    if epoch_idx % log_every == 0:
        print(f"Epoch {epoch_idx}: Average NLL: {avg_nll:.3f}")
print_density(f"final Epoch: {epoch_idx}", scatter=False)
print_density(f"final Epoch: {epoch_idx} \nwith training data-set", scatter=True)

Evaluate the average log-likelihood on the held-out test set and compare it with the training set.

In [None]:
# Sample-weighted sum of the negative log-likelihood over the held-out test set.
test_running_loss = 0.0
with torch.no_grad():
    for batch in test_dataloader:
        batch = batch.to(device).float().unsqueeze(dim=-1)  # Add a channel dimension
        log_score = tens_circuit(batch)
        lls = log_score
        loss = -torch.mean(lls)   # The loss is the negative average log-likelihood
        test_running_loss += loss.item() * len(batch)

# Same quantity over the training set, for comparison.
train_running_loss = 0.0
with torch.no_grad():
    for batch in train_dataloader:
        batch = batch.to(device).float().unsqueeze(dim=-1)  # Add a channel dimension
        log_score = tens_circuit(batch)
        lls = log_score
        loss = -torch.mean(lls)   # The loss is the negative average log-likelihood
        train_running_loss += loss.item() * len(batch)

# The running sums are *negative* log-likelihoods; negate them so the printed
# numbers really are the average log-likelihoods the label announces.
test_avg_ll = -test_running_loss / len(X_test)
train_avg_ll = -train_running_loss / len(X_train)
print(f"hold-out avg log-like.: {test_avg_ll:.2f} vs train: {train_avg_ll:.2f}")

A 3D surface plot gives a better view of the resulting density.

In [None]:
from matplotlib import cbook, cm
from matplotlib.colors import LightSource
from matplotlib import colormaps


# Evaluate the circuit on the plotting grid. This is inference only, so
# no_grad avoids building an autograd graph for all grid points.
with torch.no_grad():
    log_score = tens_circuit(X_meshgrid.to(device).float().unsqueeze(dim=-1)).detach().cpu()
log_score = log_score.reshape(num_samples,num_samples).numpy()
nrows, ncols = log_score.shape

# Set up plot
fig, ax = plt.subplots(figsize=(15,5), subplot_kw=dict(projection='3d'))
ax.set_title("3D surface plot")
ls = LightSource(270, 25)

# To use a custom hillshading mode, override the built-in shading and pass
# in the rgb colors of the shaded surface calculated from "shade".
rgb = ls.shade(log_score, cmap=colormaps["magma"], vert_exag=1.0, blend_mode='soft')
ax.view_init(elev=35.)
surf = ax.plot_surface(x1v, x2v, log_score, rstride=1, cstride=1, facecolors=rgb,
                       linewidth=0, antialiased=False, shade=False)

plt.show()

Our training loss (the average negative log-likelihood) per epoch:

In [None]:
# Line plot of the values collected in `losses` by the training loop.
plt.title("loss over iterations")
plt.plot(losses)
plt.show()

Our circuit is normalized by construction, which we can verify like this:

In [None]:
from cirkit.new.model.functional import integrate
# NOTE(review): presumably this builds the circuit that computes the partition
# function of `tens_circuit` — confirm against the cirkit documentation.
pc_pf = integrate(tens_circuit)

In [None]:
# `batch` is whatever the most recently executed data loop left behind.
# NOTE(review): presumably the integrated circuit does not depend on the input
# values — confirm.
log_pf = pc_pf(batch)
# The check passes only if the output equals 0 up to the tolerance.
assert torch.allclose(log_pf, torch.tensor(0.), atol=1e-6)

In [None]:
#!/usr/bin/env python3

import gpytorch

# from ..functions import RBFCovariance
# from ..settings import trace_mode
from gpytorch.kernels import Kernel


def postprocess_rbf(dist_mat):
    """Turn squared distances into RBF kernel values, ``exp(-d / 2)``.

    The computation is done in place: ``dist_mat`` itself is overwritten and
    returned.
    """
    dist_mat.div_(-2)
    dist_mat.exp_()
    return dist_mat


class TestRBFKernel(Kernel):
    r"""
    Debugging variant of the RBF (squared exponential) kernel: it prints its
    lengthscale-scaled inputs on every forward pass.

    For inputs :math:`\mathbf{x_1}` and :math:`\mathbf{x_2}` the covariance is

    .. math::

       \begin{equation*}
          k_{\text{RBF}}(\mathbf{x_1}, \mathbf{x_2}) = \exp \left( -\frac{1}{2}
          (\mathbf{x_1} - \mathbf{x_2})^\top \Theta^{-2} (\mathbf{x_1} - \mathbf{x_2}) \right)
       \end{equation*}

    with :math:`\Theta` the :attr:`lengthscale` parameter.
    :class:`gpytorch.kernels.Kernel` describes the available lengthscale options.

    .. note::

        There is no `outputscale` parameter here. Wrap the kernel in a
        :class:`gpytorch.kernels.ScaleKernel` to add a scaling parameter.

    Args:
        :attr:`ard_num_dims` (int, optional):
            Use a separate lengthscale per input dimension. Should be `d` when
            :attr:`x1` is a `n x d` matrix. Default: `None`
        :attr:`batch_shape` (torch.Size, optional):
            Use a separate lengthscale per batch of input data. Should be `b` when
            :attr:`x1` is a `b x n x d` tensor. Default: `torch.Size([])`.
        :attr:`active_dims` (tuple of ints, optional):
            Indices of the input dimensions the covariance is computed over, when
            only a subset should be used. Default: `None`.
        :attr:`lengthscale_prior` (Prior, optional):
            Prior to apply to the lengthscale parameter. Default: `None`.
        :attr:`lengthscale_constraint` (Constraint, optional):
            Constraint to apply to the lengthscale parameter. Default: `Positive`.
        :attr:`eps` (float):
            Smallest value the lengthscale may take (guards against division by zero).
            Default: `1e-6`.

    Attributes:
        :attr:`lengthscale` (Tensor):
            The lengthscale parameter. Its size/shape depends on the
            :attr:`ard_num_dims` and :attr:`batch_shape` arguments.

    Example:
        >>> x = torch.randn(10, 5)
        >>> # Non-batch: Simple option
        >>> covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())
        >>> # Non-batch: ARD (different lengthscale for each input dimension)
        >>> covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel(ard_num_dims=5))
        >>> covar = covar_module(x)  # Output: LazyTensor of size (10 x 10)
        >>>
        >>> batch_x = torch.randn(2, 10, 5)
        >>> # Batch: Simple option
        >>> covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())
        >>> # Batch: different lengthscale for each batch
        >>> covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel(batch_shape=torch.Size([2])))
        >>> covar = covar_module(batch_x)  # Output: LazyTensor of size (2 x 10 x 10)
    """

    has_lengthscale = True

    def forward(self, x1, x2, diag=False, **params):
        """Scale both inputs by the lengthscale, print them, and return the RBF covariance.

        The covariance is obtained from ``self.covar_dist`` on the scaled inputs,
        with ``postprocess_rbf`` applied to the squared distances.
        """
        scaled_x1 = x1.div(self.lengthscale)
        scaled_x2 = x2.div(self.lengthscale)

        # Debug output of the scaled inputs.
        print ("x1, x2", scaled_x1, scaled_x2)

        return self.covar_dist(
            scaled_x1,
            scaled_x2,
            square_dist=True,
            diag=diag,
            dist_postprocess_func=postprocess_rbf,
            postprocess=True,
            **params,
        )

In [None]:
# Three 2-dimensional sample points for experimenting with the kernel.
x = torch.tensor([[-0.6281,  2.3329], [ 0.1011, -0.2601], [ 0.0664, -0.8459]])
x

In [None]:
# The first coordinate of each point in `x`, kept as a 3 x 1 column.
x_2 = torch.tensor([[-0.6281], [ 0.1011], [ 0.0664]])

In [None]:
# Display the shape of the single-column input.
x_2.shape

In [None]:
# Instantiate the debug kernel with ard_num_dims=2, then overwrite its
# lengthscale with the scalar value 3.3.
test_kernel = TestRBFKernel(ard_num_dims=2)
test_kernel.lengthscale = torch.tensor(3.3)

In [None]:
# Display the lengthscale as stored by the kernel after the assignment.
test_kernel.lengthscale

In [None]:
# Apply the kernel to the single-column input against itself and evaluate the result.
# NOTE(review): the kernel was created with ard_num_dims=2 but x_2 has one column — confirm this is intended.
test_kernel(x_2,x_2).evaluate()

In [None]:
# Same experiment with ard_num_dims=1, which matches the single column of x_2.
test_kernel = TestRBFKernel(ard_num_dims=1)
test_kernel.lengthscale = torch.tensor(3.3)

test_kernel(x_2,x_2).evaluate()

In [None]:
# NOTE(review): `sq_dist` is only imported/defined in later cells, so running the
# notebook top to bottom raises a NameError here.
sq_dist(x_2, x_2)

In [None]:
# Reference value: squared pairwise Euclidean distances computed with torch.cdist.
torch.cdist(x_2, x_2, p=2)**2

In [None]:
from gpytorch.kernels.kernel import sq_dist

In [None]:
def sq_dist(x1, x2, x1_eq_x2=False):
    """Return the squared Euclidean distances between the rows of x1 and x2.

    Equivalent to the square of `torch.cdist` with p=2.

    Args:
        x1: Tensor whose second-to-last dimension indexes points.
        x2: Tensor with the same layout as x1.
        x1_eq_x2: Set to True when x1 and x2 are the same points. If neither
            input requires grad, x2 is then not recomputed and the diagonal of
            the result is set to exactly zero.

    Returns:
        Tensor of squared distances, clamped to be non-negative.
    """
    # TODO: use torch squared cdist once implemented: https://github.com/pytorch/pytorch/pull/25799
    # Shift both inputs by the mean of x1.
    shift = x1.mean(-2, keepdim=True)
    x1 = x1 - shift

    # The shortcut for identical inputs is only taken when no gradients are needed.
    reuse_x1 = x1_eq_x2 and not x1.requires_grad and not x2.requires_grad

    x1_sq = x1.pow(2).sum(dim=-1, keepdim=True)
    x1_ones = torch.ones_like(x1_sq)
    if reuse_x1:
        x2, x2_sq, x2_ones = x1, x1_sq, x1_ones
    else:
        x2 = x2 - shift  # x1 and x2 should be identical in all dims except -2 at this point
        x2_sq = x2.pow(2).sum(dim=-1, keepdim=True)
        x2_ones = torch.ones_like(x2_sq)

    # Quadratic expansion |a|^2 - 2 a.b + |b|^2, evaluated as a single matmul of
    # the augmented matrices [-2a, |a|^2, 1] and [b, 1, |b|^2].
    lhs = torch.cat([-2.0 * x1, x1_sq, x1_ones], dim=-1)
    rhs = torch.cat([x2, x2_ones, x2_sq], dim=-1)
    res = lhs.matmul(rhs.transpose(-2, -1))

    if reuse_x1:
        res.diagonal(dim1=-2, dim2=-1).fill_(0)

    # Zero out negative values
    return res.clamp_min_(0)

In [None]:
from gpytorch.kernels import SpectralMixtureKernel
import math

In [None]:
# Toy regression data: 15 evenly spaced inputs on [0, 1] and one full period of
# a sine wave as targets.
train_x = torch.linspace(start=0, end=1, steps=15)
train_y = torch.sin((2 * math.pi) * train_x)

In [None]:
# Spectral mixture kernel with 4 mixture components, initialized from the
# training data.
covar_module = gpytorch.kernels.SpectralMixtureKernel(num_mixtures=4)
covar_module.initialize_from_data(train_x, train_y)

In [None]:
class SpectralMixtureGPModel(gpytorch.models.ExactGP):
    """Exact GP with a constant mean and a 4-component spectral mixture kernel.

    The kernel is initialized from the training data passed to the constructor.
    """

    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.SpectralMixtureKernel(num_mixtures=4)
        self.covar_module.initialize_from_data(train_x, train_y)

    def forward(self,x):
        """Return the multivariate normal given by the mean and covariance modules at ``x``."""
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x),
            self.covar_module(x),
        )


# Gaussian likelihood and the GP model built on the toy training data.
likelihood = gpytorch.likelihoods.GaussianLikelihood()
model = SpectralMixtureGPModel(train_x, train_y, likelihood)

In [None]:
# Print the shape of every parameter of the model.
for param in model.parameters(): 
    print (param.shape)

In [None]:
from cirkit.models.rbf_kernel import RBFCircuitKernel

# Bare name: displays the imported class as the cell output.
RBFCircuitKernel

In [None]:
import torch

def pdist_per_dim(input_tensor):
    """Return per-dimension absolute differences for every unordered pair of rows.

    Args:
        input_tensor: 2D tensor of shape ``(n, d)``, one point per row.

    Returns:
        Tensor of shape ``(n * (n - 1) / 2, d)``. Row ``k`` holds
        ``|input_tensor[i] - input_tensor[j]|`` for the k-th pair ``i < j``,
        with pairs in row-major order: (0, 1), (0, 2), ..., (1, 2), ...
    """
    num_points = input_tensor.shape[0]

    # Indices of the strict upper triangle, i.e. each pair i < j exactly once.
    # Building them on the input's device and gathering only these rows avoids
    # materializing the full (n, n, d) difference tensor.
    rows, cols = torch.triu_indices(
        num_points, num_points, offset=1, device=input_tensor.device
    )

    return torch.abs(input_tensor[rows] - input_tensor[cols])

# Example usage
input_tensor = torch.rand(3, 2)  # 3 points with 2 dimensions each
# Call the function under its actual name; `efficient_custom_pdist` is not
# defined anywhere in this notebook.
result = pdist_per_dim(input_tensor)
print(result.shape)  # 3 pairs x 2 dimensions -> torch.Size([3, 2])


In [None]:
import torch

def test_efficient_custom_pdist():
    """Check `pdist_per_dim` against hand-computed values for four 2D points."""
    # Define a small, manually verifiable input tensor
    input_tensor = torch.tensor([[5.0, 6.0], [2.0, -4.0], [-5.0, -6.0], [12.0, 7.0]])

    # Expected per-dimension absolute differences, one row per pair of points
    # (points numbered from 1, pairs in row-major order)
    expected_output = torch.tensor([
        [3.0, 10.0],   # Difference between points 1 and 2
        [10.0, 12.0],  # Difference between points 1 and 3
        [7.0, 1.0],    # Difference between points 1 and 4
        [7.0, 2.0],    # Difference between points 2 and 3
        [10.0, 11.0],  # Difference between points 2 and 4
        [17.0, 13.0]   # Difference between points 3 and 4
    ])

    # Calculate the output using the function under test (defined in this
    # notebook as `pdist_per_dim`; `efficient_custom_pdist` does not exist)
    actual_output = pdist_per_dim(input_tensor)

    # Verify the shape of the output
    assert actual_output.shape == expected_output.shape, "Output shape is incorrect."

    print("actual_output", actual_output)
    print("expected_output", expected_output)
    # Verify the contents of the output
    assert torch.allclose(actual_output, expected_output, atol=1e-5), "Output values are incorrect."

    print("Test passed!")

# Run the test function
test_efficient_custom_pdist()
