In [1]:
import numpy as np

### Image data and discrete and continuous likelihoods

#### Dequantization

Pixels generally take a finite number of brightness values, e.g. $z_i\in\{0,1,\dots,255\}$. Modeling discretized data using a real-valued distribution $p(\mathbf{x})$ can lead to arbitrarily high density values, by placing a narrow high-density spike on each of the possible discrete values. In order to avoid this ‘cheating’ solution, one should add noise uniformly distributed between 0 and 1 to the value of each pixel and then divide by 256, making each pixel take a value in the range [0, 1] and giving the image a smooth distribution over pixel values [RNADE: The real-valued neural autoregressive density-estimator, 2013].
$$
x_i = \frac{z_i + u}{256}, \quad u\sim\mathcal{U}(0,1)
$$
This preprocessing was used in [NICE: Non-Linear Independent Components Estimation, 2015].

A somewhat common alternative preprocessing is to compute the log-likelihood in "logit-space" by transforming
$$
x_i = \text{logit}\left(\lambda+(1-2\lambda)\frac{z_i}{256}\right)
$$
where $\lambda$ is a small number a bit larger than the smallest value of $z_i/256$. This preprocessing was used in [Masked Autoregressive Flow for Density Estimation, 2018].

#### Conversion of continuous log-likelihood to discrete log-likelihood

By the change of variables formula for probability density functions, given a known density $p_\mathbf{x}(\mathbf{x})$ and an invertible map $\mathbf{x}=g(\mathbf{z})$, i.e. $\mathbf{z}=g^{-1}(\mathbf{x})$, we can compute the density $p_\mathbf{z}(\mathbf{z})$:
$$
p_\mathbf{z}(\mathbf{z}) = p_\mathbf{x}(g(\mathbf{z})) \left| \frac{d\mathbf{x}}{d\mathbf{z}} \right|
$$
In $D$ dimensions, the derivative is the Jacobian matrix, and the scaling factor is the absolute value of its determinant. For an element-wise transform such as the one above, this Jacobian is diagonal.

For the first transformation
$$
\begin{aligned}
J_{g,ii} &= \frac{d}{dz_{i}} \left(\frac{z_i + u}{256}\right) = \frac{1}{256}\\
\det\mathbf{J} &= 256^{-D}
\end{aligned}
$$
such that 
$$
\begin{aligned}
p_\mathbf{z}(\mathbf{z}) &= p_\mathbf{x}(\mathbf{x})\, 256^{-D}\\
\log p_\mathbf{z}(\mathbf{z}) &= \log p_\mathbf{x}(\mathbf{x}) - D\log(256)
\end{aligned}
$$

#### Bits per dimension	

$$
\text{nats}/\text{dim} = -\left( \left(\dfrac{\log_e p(x)}{hwc}\right)-\log_e q \right),
\qquad
\text{bits}/\text{dim} = \frac{\text{nats}/\text{dim}}{\log_e 2}
$$

where $\log_e p(x)$ is the data log-likelihood in nats, $h, w$ and $c$ are the height, width and depth dimensions of the data (colour image) and $q$ is the number of pixel values allowed in the original quantized data before each quantized pixel $p_q$ was transformed by
$$
p_c = \frac{p_q + u}{q}, \quad u \sim \mathcal{U}(0,1)
$$



In [2]:
def nats_to_bits(nats):
    """Convert a quantity expressed in nats (natural logarithm) to bits (base-2 logarithm)."""
    nats_per_bit = np.log(2)  # one bit equals log(2) nats
    return nats / nats_per_bit


def bits_to_nats(bits):
    """Convert a quantity expressed in bits (base-2 logarithm) to nats (natural logarithm)."""
    bits_per_nat = np.log2(np.e)  # one nat equals log2(e) bits
    return bits / bits_per_nat


def px_to_pz_scale_transform(log_e_px, z_bits, dim):
    """
    Convert a log-likelihood in x space to a log-likelihood in z space, where z is the original
    data space and x is the input fed to the model on which the px log-likelihood is computed.

    The transform is:
        x = (z + u) / 2**z_bits

    It is element-wise with derivative 1 / 2**z_bits in every dimension, hence:
        log p_z(z) = log p_x(x) - dim * log(2**z_bits)

    Args:
        log_e_px (float or array like): Log-likelihoods in x space, in nats.
        z_bits (int): Number of bits used to encode a single number in z space, e.g. 8 for 256 values.
        dim (int): Dimensionality of the input e.g. 28 x 28 image has dim=784.

    Returns:
        The corresponding log-likelihoods in z space, in nats.
    """
    # log(2**z_bits) == z_bits * log(2). Using the product avoids building 2**z_bits,
    # which overflows fixed-width integers and cannot be passed to np.log once it
    # exceeds the 64-bit range.
    return log_e_px - dim * z_bits * np.log(2)

## Datasets

### MNIST

#### Binarized

In [3]:
# Input dimensionality: 28 x 28 = 784 values per image.
dim = 28**2
# NOTE(review): z_bits is not read anywhere in this section (no rescaling is undone below).
z_bits = 8

In [4]:
# Log-likelihood in nats.
# NOTE(review): the origin of this figure is not recorded in the notebook - confirm.
log_e_px = -84

In [5]:
# No change of variables is applied in this section: the value is carried over unchanged.
log_e_pz = log_e_px
print(log_e_pz)

-84


In [6]:
# Change the logarithm base from e to 2 (nats -> bits).
log_2_pz = nats_to_bits(log_e_pz)
print(log_2_pz)

-121.18638343467293


In [7]:
# Bits per dimension: negative base-2 log-likelihood divided by the number of dimensions.
bpd = - log_2_pz / dim
print(bpd)

0.15457446866667465


#### Continuous

In [8]:
# Input dimensionality: 28 x 28 = 784 values per image.
dim = 28**2
# Bits per value in the original data; the rescaling undone below divides by 2**z_bits.
z_bits = 8

In [9]:
# Log-likelihood in nats in the rescaled x space (mapped to z space in the next step).
# NOTE(review): the origin of this figure is not recorded in the notebook - confirm.
log_e_px = 3400

In [10]:
# Map the x-space log-likelihood back to the original data space
# (subtracts dim * log(2**z_bits) nats).
log_e_pz = px_to_pz_scale_transform(log_e_px, z_bits, dim)
print(log_e_pz)

-947.4191164719768


In [11]:
# Change the logarithm base from e to 2 (nats -> bits).
log_2_pz = nats_to_bits(log_e_pz)
print(log_2_pz)

-1366.8368609775243


In [12]:
# Bits per dimension: negative base-2 log-likelihood divided by the number of dimensions.
bpd = - log_2_pz / dim
print(bpd)

1.7434143634917403


### FashionMNIST

#### Binarized

In [13]:
# Input dimensionality: 28 x 28 = 784 values per image.
dim = 28**2
# NOTE(review): z_bits is not read anywhere in this section (no rescaling is undone below).
z_bits = 8

In [14]:
# Log-likelihood in nats.
# NOTE(review): the origin of this figure is not recorded in the notebook - confirm.
log_e_px = -230

In [15]:
# No change of variables is applied in this section: the value is carried over unchanged.
log_e_pz = log_e_px
print(log_e_pz)

-230


In [16]:
# Change the logarithm base from e to 2 (nats -> bits).
log_2_pz = nats_to_bits(log_e_pz)
print(log_2_pz)

-331.81985940446157


In [17]:
# Bits per dimension: negative base-2 log-likelihood divided by the number of dimensions.
bpd = - log_2_pz / dim
print(bpd)

0.42323961658732345


#### Continuous

In [18]:
# Input dimensionality: 28 x 28 = 784 values per image.
dim = 28**2
# Bits per value in the original data; the rescaling undone below divides by 2**z_bits.
z_bits = 8

In [19]:
# Log-likelihood in nats in the rescaled x space (mapped to z space in the next step).
# NOTE(review): the origin of this figure is not recorded in the notebook - confirm.
log_e_px = 2350

In [20]:
# Map the x-space log-likelihood back to the original data space
# (subtracts dim * log(2**z_bits) nats).
log_e_pz = px_to_pz_scale_transform(log_e_px, z_bits, dim)
print(log_e_pz)

-1997.4191164719768


In [21]:
# Change the logarithm base from e to 2 (nats -> bits).
log_2_pz = nats_to_bits(log_e_pz)
print(log_2_pz)

-2881.666653910936


In [22]:
# Bits per dimension: negative base-2 log-likelihood divided by the number of dimensions.
bpd = - log_2_pz / dim
print(bpd)

3.6755952218251733


## Paper references

### NICE

#### Continuous MNIST

In [23]:
# Input dimensionality: 28 x 28 = 784 values per image.
dim = 28**2
# Bits per value in the original data; the rescaling undone below divides by 2**z_bits.
z_bits = 8

In [24]:
# Log-likelihood in nats in the rescaled x space (mapped to z space in the next step).
# NOTE(review): presumably the figure reported in the NICE paper for this dataset - verify.
log_e_px = 1980.50

In [25]:
# Map the x-space log-likelihood back to the original data space
# (subtracts dim * log(2**z_bits) nats).
log_e_pz = px_to_pz_scale_transform(log_e_px, z_bits, dim)
print(log_e_pz)

-2366.919116471977


In [26]:
# Change the logarithm base from e to 2 (nats -> bits).
log_2_pz = nats_to_bits(log_e_pz)
print(log_2_pz)

-3414.742471519408


In [27]:
# Bits per dimension: negative base-2 log-likelihood divided by the number of dimensions.
bpd = - log_2_pz / dim
print(bpd)

4.355538866733939


#### Continuous CIFAR10

In [28]:
# Input dimensionality: 32 x 32 positions with 3 values each (presumably colour channels) = 3072.
dim = 32**2 * 3
# The rescaling undone below divides by 2**7 = 128.
# NOTE(review): differs from the 8 bits used in the MNIST sections - confirm it matches
# the preprocessing behind the reported likelihood.
z_bits = 7

In [29]:
# Log-likelihood in nats in the rescaled x space (mapped to z space in the next step).
# NOTE(review): presumably the figure reported in the NICE paper for this dataset - verify.
log_e_px = 5371.78

In [30]:
# Map the x-space log-likelihood back to the original data space
# (subtracts dim * log(2**z_bits) nats).
log_e_pz = px_to_pz_scale_transform(log_e_px, z_bits, dim)
print(log_e_pz)

-9533.656970761065


In [31]:
# Change the logarithm base from e to 2 (nats -> bits).
log_2_pz = nats_to_bits(log_e_pz)
print(log_2_pz)

-13754.159633253486


In [32]:
# Bits per dimension: negative base-2 log-likelihood divided by the number of dimensions.
bpd = - log_2_pz / dim
print(bpd)

4.477265505616369


#### Continuous SVHN

In [33]:
# Input dimensionality: 32 x 32 positions with 3 values each (presumably colour channels) = 3072.
dim = 32**2 * 3
# The rescaling undone below divides by 2**7 = 128.
# NOTE(review): differs from the 8 bits used in the MNIST sections - confirm it matches
# the preprocessing behind the reported likelihood.
z_bits = 7

In [34]:
# Log-likelihood in nats in the rescaled x space (mapped to z space in the next step).
# NOTE(review): presumably the figure reported in the NICE paper for this dataset - verify.
log_e_px = 11496.55

In [35]:
# Map the x-space log-likelihood back to the original data space
# (subtracts dim * log(2**z_bits) nats).
log_e_pz = px_to_pz_scale_transform(log_e_px, z_bits, dim)
print(log_e_pz)

-3408.8869707610647


In [36]:
# Change the logarithm base from e to 2 (nats -> bits).
log_2_pz = nats_to_bits(log_e_pz)
print(log_2_pz)

-4917.984327667989


In [37]:
# Bits per dimension: negative base-2 log-likelihood divided by the number of dimensions.
bpd = - log_2_pz / dim
print(bpd)

1.600906356662757


###

###

###