# Simple Gaussian Model with BBVI

In [1]:
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt
import seaborn as sns

# Synthetic observations: 100 i.i.d. draws from a Normal with mean 10 and
# standard deviation 1. These are the "x" values the model is fitted to.
data = np.random.normal(10, 1, 100)

# Manual estimation of the gradient of the ELBO for the above model

In [2]:
# Gradient estimator using sampling -- vanilla BBVI
# We here assume the model X ~ Normal(mu, 1)
# with unknown mu, which is itself Normal with mean 0 and standard deviation 1000,
# so effectively an uninformative prior.
# The variational distribution for mu is also Normal, with mean parameter q_mu_lambda
# -- taking the role of lambda in the calculations -- and variance 1.

def grad_estimate(q_mu_lambda, samples = 1, observations = None):
    """Score-function (vanilla BBVI) estimate of the ELBO gradient w.r.t. lambda.

    Model: X ~ Normal(mu, 1) with prior mu ~ Normal(0, 1000). The variational
    distribution is q(mu | lambda) = Normal(lambda, 1).

    Each Monte Carlo draw mu_s ~ q contributes
        [log p(x | mu_s) + log p(mu_s) - log q(mu_s | lambda)]
            * d/d lambda log q(mu_s | lambda)
    and the contributions are averaged over the draws.

    Parameters
    ----------
    q_mu_lambda : scalar or length-1 array
        Current mean (lambda) of the variational distribution.
    samples : int
        Number of draws from q to average over; must be at least 1.
    observations : array-like, optional
        Observed values x. Defaults to the module-level ``data``.

    Returns
    -------
    numpy.ndarray
        Array of shape (1,) holding the averaged gradient estimate (each draw
        is sampled with ``size=1``, and that shape carries through).

    Raises
    ------
    ValueError
        If ``samples`` is smaller than 1.
    """
    if samples < 1:
        raise ValueError("samples must be a positive integer")
    if observations is None:
        # Fall back to the data set defined at module level.
        observations = data

    # Running sum of the per-draw gradient contributions.
    sum_grad_estimate = 0
    for _ in range(samples):
        # One draw from the current variational distribution q(mu | lambda).
        mu_sample = np.random.normal(loc=q_mu_lambda, scale=1, size=1)

        # log p(x | mu_sample) + log p(mu_sample) - log q(mu_sample | lambda).
        # The parentheses are essential: without them the "+" and "-" lines
        # are separate no-op statements and only the likelihood term is kept.
        value = (
            np.sum(norm.logpdf(observations, loc=mu_sample, scale=1))
            + norm.logpdf(mu_sample, loc=0, scale=1000)
            - norm.logpdf(mu_sample, loc=q_mu_lambda, scale=1)
        )

        # Score function of a unit-variance Normal w.r.t. its mean:
        # d/d lambda log q(mu | lambda) = mu - lambda.
        grad_q = mu_sample - q_mu_lambda

        sum_grad_estimate = sum_grad_estimate + grad_q * value

    # Average over the draws -- the Monte Carlo estimate of the expectation.
    return sum_grad_estimate / samples

In [3]:
# Plain stochastic gradient ascent on the ELBO, starting far from the data
# mean and using a single-sample gradient estimate with a fixed step size.
q_mu_lambda = -10

for step in range(1000):
    q_mu_lambda = q_mu_lambda + 0.0001 * grad_estimate(q_mu_lambda, samples = 1)

    # Report progress every tenth iteration.
    if step % 10 != 0:
        continue
    print(f"Iteration {step}: {q_mu_lambda}")

Iteration 0: [-11.60604789]
Iteration 10: [-5.33705271]
Iteration 20: [-5.58847896]
Iteration 30: [-3.01285956]
Iteration 40: [0.64420794]
Iteration 50: [-2.25654332]
Iteration 60: [0.59912873]
Iteration 70: [4.42624524]
Iteration 80: [4.00348581]
Iteration 90: [5.2545006]
Iteration 100: [5.77074475]
Iteration 110: [6.3213011]
Iteration 120: [6.57219151]
Iteration 130: [6.69834762]
Iteration 140: [7.27265731]
Iteration 150: [7.43427746]
Iteration 160: [7.73134501]
Iteration 170: [8.10168]
Iteration 180: [8.24512834]
Iteration 190: [8.30006127]
Iteration 200: [8.18304935]
Iteration 210: [8.42605926]
Iteration 220: [8.56945759]
Iteration 230: [8.9500815]
Iteration 240: [8.85885982]
Iteration 250: [8.95452735]
Iteration 260: [9.03092096]
Iteration 270: [9.10848151]
Iteration 280: [9.06763766]
Iteration 290: [9.25002527]
Iteration 300: [9.28287303]
Iteration 310: [9.28340427]
Iteration 320: [9.48627303]
Iteration 330: [9.74189249]
Iteration 340: [9.7073375]
Iteration 350: [9.67765973]
Iter

# Check effect of sample count

In [None]:
import time

# Number of gradient-ascent steps taken for every sample-count setting.
no_loops = 500
for sample_count in [1, 2, 3, 4, 5, 10, 15, 20, 25, 30, 40, 50]:
    # Every run starts from the same point, far from the data mean.
    q_mu = -10
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # elapsed intervals; time.time() follows the (adjustable) wall clock.
    start = time.perf_counter()

    for t in range(no_loops):
        q_grad = grad_estimate(q_mu, samples=sample_count)
        # Constant step size. (A decaying schedule was sketched here
        # originally but is disabled, so the rate does not depend on t.)
        lr = 1E-4
        q_mu = q_mu + lr * q_grad

    elapsed = time.perf_counter() - start

    # grad_estimate returns a length-1 array, so q_mu is one too. Calling
    # float() directly on an array with ndim > 0 is deprecated in recent
    # NumPy; squeeze it to 0-d first to extract the scalar explicitly.
    estimate = float(np.squeeze(q_mu))
    # Relative error in percent w.r.t. the true mean 10: 100 * |q - 10| / 10.
    error_pct = 10 * abs(estimate - 10)

    print("{:4d} sample(s) -- Estimate: {:9.5f}; error {:5.1f}%  --  Calc.time: {:5.2f} sec.".format(
        sample_count, estimate, error_pct, elapsed))

# Checking the variation in gradient estimate

In [None]:
# To check the variation / "unreliability" of the gradient estimate we repeat
# the estimation several times for the same lambda value and compare results.

# Location to check -- close to the data mean (at +10).
# The prior will move the variational optimum **slightly** away from the data mean,
# but due to the large prior variance of mu this should be a very limited effect.
# We should therefore expect a positive derivative (since we want to move
# q_mu_lambda towards the data mean, that is, **increase** it)
q_mu_lambda = 9

plt.figure(figsize=(8,6))
sns.set()
# Do with different sample sizes
for sample_count in [1, 2, 3, 4, 5, 10, 25]:

    # Repeat the estimate 500 times at the same lambda. Each call returns a
    # length-1 array, so flatten the collection into a plain 1-D vector.
    q_grad = np.ravel(
        [grad_estimate(q_mu_lambda, samples=sample_count) for _ in range(500)]
    )

    # distplot is deprecated in seaborn; kdeplot draws the same density curve
    # that distplot(..., hist=False) produced.
    sns.kdeplot(q_grad, label="$M = {:d}$".format(sample_count))

    # Report back
    print("M = {:2d} sample(s) in BBVI -- Mean of gradient: {:7.3f}; Std.dev. of gradient: {:7.3f}".format(
        sample_count, np.mean(q_grad), np.std(q_grad)))

# kdeplot only attaches the label to the line; the legend must be drawn explicitly.
plt.legend()
plt.xlim([-500, 500])
plt.show()



In [None]:
# Tabulate the decaying step-size schedule (t/500 + 1)^(-1.5) for t = 0, ..., 499.
schedule = (np.power((step / 500. + 1), -1.5) for step in range(500))
for lr in schedule:
    print(lr)