# Simple exercise on Bayesian analysis

For the first hands-on exercise, we will perform a simple Bayesian analysis example with only one dataset.

**These exercises are interactive**: a number of cells include `_______`, which signals that you should fill in the required value. If you're unable to execute a cell, carefully check whether you've filled in all of the required values.

## Step 0: load all relevant modules

In [None]:
from __future__ import print_function

import subprocess

from sklearn.gaussian_process import GaussianProcessRegressor as GPR
from sklearn.gaussian_process import kernels
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import joblib

import matplotlib.cm as cm
import matplotlib.pyplot as plt

from IPython.display import display, clear_output

from scipy.linalg import lapack
from scipy import stats
import emcee
import numpy as np

import os
import pickle
from pathlib import Path

import src.reader as Reader

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

## Step 1: prepare input files

We need to perform some setup to load and format the data properly for the Bayesian analysis framework.

### Load measured data, design points, and prediction from text files

Check the available files in the `input/SimpleExample` directory.

In [None]:
# Read the measured data file.
# Replace every `______` below with a file name from `input/SimpleExample/`.
# The result is indexed with ["Data"] when AllData is assembled.
RawData1       = Reader.ReadData('input/SimpleExample/______')

# Read the design points.
# The result is indexed with ["Parameter"] and ["Design"] when AllData is assembled.
RawDesign      = Reader.ReadDesign('input/SimpleExample/______')

# Read the model predictions.
# Each prediction corresponds to a design point.
# The result is indexed with ["Prediction"] when AllData is assembled.
RawPrediction1 = Reader.ReadPrediction('input/SimpleExample/______')

### Run this block to prepare the inputs

Now we combine the input data into a dictionary formatted according to the input expected by the framework. `AllData` will contain all data, design points, predictions, and the covariance between the data points. It also contains general information about the inputs (parameters, ranges, etc).

For our example, we are measuring some observable `Y`. We label each measurement of an observable as `C{N}`, where `C` stands for a "column name" in the data (for example, `C0` is the first measurement).

Once we've fully constructed the input information, it will be stored onto the filesystem. Later steps will access this information.

In [None]:
# Measured data, nested as [system][observable][measurement]
Data = {"PbPb5020": {"Y": {"C0": RawData1["Data"]}}}

# Model predictions use the same nesting; the x values are taken from the measured data
Prediction = {
    "PbPb5020": {
        "Y": {
            "C0": {
                "Y": RawPrediction1["Prediction"],
                "x": RawData1["Data"]['x'],
            },
        },
    },
}

# Covariance matrices - the indices are [system][measurement1][measurement2], each one is a block of matrix
Covariance = Reader.InitializeCovariance(Data)
Covariance["PbPb5020"][("Y", "C0")][("Y", "C0")] = Reader.EstimateCovariance(
    RawData1,
    RawData1,
    SysLength = {"default": 0.2},
)

# This is how we can add off-diagonal matrices
# Covariance["PbPb5020"][("R_AA", "C0")][("R_AA", "C1")] = Reader.EstimateCovariance(RawData1, RawData2, SysLength = {"default": 100}, SysStrength = {"default": 0.1})
# Covariance["PbPb5020"][("R_AA", "C1")][("R_AA", "C0")] = Reader.EstimateCovariance(RawData2, RawData1, SysLength = {"default": 100}, SysStrength = {"default": 0.1})

# This is how we can supply external pre-generated matrices
# Covariance["PbPb5020"][("R_AA", "C0")][("R_AA", "C0")] = RawCov55E["Matrix"]
# Covariance["PbPb5020"][("R_AA", "C1")][("R_AA", "C1")] = RawCov66E["Matrix"]

# Collect the general information and all of the pieces built above in one dictionary.
# "keys" and "labels" both refer to the parameter list read from the design file.
AllData = {
    "systems": ["PbPb5020"],
    "keys": RawDesign["Parameter"],
    "labels": RawDesign["Parameter"],
    "ranges": [(0, 1), (0, 1), (0, 1)],
    "observables": [('Y', ['C0'])],
    "design": RawDesign["Design"],
    "model": Prediction,
    "data": Data,
    "cov": Covariance,
}

# Store the dictionary on the filesystem so that later steps can access it
with open('input/default.p', 'wb') as PickleFile:
    pickle.dump(AllData, PickleFile, protocol = pickle.HIGHEST_PROTOCOL)

### Exercise 1-a: plot data

The measured data are stored under the `data` key in AllData.  Take a look at the print statement output to understand the structure.

In [None]:
# Show the nested structure of the measured data before indexing into it.
print(AllData['data'])

In [None]:
figure, axes = plt.subplots(figsize = (5, 5), ncols = 1, nrows = 1)

# Fill in each blank with the matching entry of AllData['data'].
DataX    = ______
DataY    = ______
DataStat = ______
DataSys  = ______
# Total uncertainty: statistical and systematic errors added in quadrature.
DataErr  = np.sqrt(DataStat**2 + DataSys**2)

# Red circles with vertical error bars.
axes.errorbar(DataX, DataY, yerr = DataErr, fmt='ro', label="Measurements")

### Exercise 1-b: plot "theory" predictions on top of the data

The theory predictions are stored under the `model` key in AllData.  Take a look at the print statement output to understand the structure.

In [None]:
# Show the nested structure of the model predictions before indexing into it.
print(AllData["model"])

In [None]:
# This just repeats the definitions from exercise 1-a.
DataX    = ______
DataY    = ______
DataStat = ______
DataSys  = ______
# Total uncertainty: statistical and systematic errors added in quadrature.
DataErr  = np.sqrt(DataStat**2 + DataSys**2)

# Fill in from AllData["model"]. PredictionsY is iterated below, one curve per item.
PredictionsY = ______
PredictionsX = ______

figure, axes = plt.subplots(figsize = (5, 5), ncols = 1, nrows = 1)

axes.errorbar(DataX, DataY, yerr = DataErr, fmt='ro', label="Measurements")
# Draw each prediction as a faint blue line so that overlapping curves stay readable.
for Item in PredictionsY:
    axes.plot(PredictionsX, Item, 'b-', alpha=0.25)

### Exercise 1-c: plot the design points

The design points are stored under the `design` key in AllData. Take a look at the print statement output to understand the structure.

In [None]:
# Show the design points before selecting columns to plot.
print(AllData['design'])

In [None]:
# One panel per pair of parameters, side by side.
figure, axes = plt.subplots(figsize = (15, 5), ncols = 3, nrows = 1)

# For every scatter call, the first blank is the x coordinate and the second is the y coordinate.

# Plot A vs B
axes[0].set_xlabel('A')
axes[0].set_ylabel('B')
axes[0].scatter(______, ______)
# Plot A vs C
axes[1].set_xlabel('A')
axes[1].set_ylabel('C')
axes[1].scatter(______, ______)
# Plot B vs C
axes[2].set_xlabel('B')
axes[2].set_ylabel('C')
axes[2].scatter(______, ______)

### Exercise 1-d: plot covariance matrix

The covariance matrix between the measured data is stored under the `cov` key in AllData. Take a look at the print statement output to understand the structure.

In [None]:
# Show the nested structure of the covariance blocks before indexing into it.
print(AllData['cov'])

In [None]:
figure, axes = plt.subplots(figsize = (5, 5), ncols = 1, nrows = 1)

# Fill in with one covariance block from AllData['cov'].
# NOTE: this rebinds the name `Covariance` that was used while building AllData.
Covariance = ______

axes.set_xlabel('Bin index')
axes.set_ylabel('Bin index')
# extent=[0,6,6,0] labels both axes from 0 to 6, with the vertical axis running downwards.
# NOTE(review): this presumably assumes six data bins - adjust the extent if the dataset differs.
axes.imshow(Covariance, cmap = 'Blues', interpolation = 'nearest', extent=[0,6,6,0])

### Optional: clean past files

Results are cached automatically so that we can avoid repeating expensive calculations when possible.

In [None]:
# Remove the MCMC chain left over from a previous run, if there is one
ChainFile = 'cache/mcmc_chain.hdf'
if os.path.exists(ChainFile):
    os.remove(ChainFile)

# Remove the cached emulator of every system, if there is one
for system in AllData["systems"]:
    EmulatorFile = 'cache/emulator/' + system + ".pkl"
    if os.path.exists(EmulatorFile):
        os.remove(EmulatorFile)

## Step 2: run emulator

Now that we've set up our data, we can set up the Gaussian Process Emulator. It will automatically load our data and train according to the settings specified in this section.

After training, we access the emulator predictions via the `EmulatorPbPb5020` object defined below.

In [None]:
# Run the emulator module as a script in a subshell.
# NOTE(review): `--npc 3` presumably selects three principal components and `--nrestarts 50`
# the number of optimizer restarts - confirm against src/emulator.py.
! python3 -m src.emulator --retrain --npc 3 --nrestarts 50 

In [None]:
from src import lazydict, emulator
# Load the emulator for the PbPb5020 system from the cache.
EmulatorPbPb5020 = emulator.Emulator.from_cache('PbPb5020')

### Exercise 2: plot emulator prediction for some random point, compare to "truth"

We can check the performance of the emulator by asking it to predict a random point, and then comparing it to the truth. Remember that our "truth" is generated according to:

$$y = A + B\frac{x}{100} + C(\frac{x}{100})^{2}$$

In [None]:
# One point in parameter space.
# NOTE(review): presumably ordered as (A, B, C), matching AllData["keys"] - confirm.
RandomPoint = [0.1, 0.6, 0.8]

# NOTE: Since we ask for a single point, the prediction array has shape (1, 6)
# This rebinds the name `Prediction` that was used while building AllData.
Prediction = {"PbPb5020": EmulatorPbPb5020.predict([RandomPoint])}

print(Prediction)

In [None]:
figure, axes = plt.subplots(figsize = (5, 5), ncols = 1, nrows = 1)

# Evaluate the "truth" formula on the integers 0..99 with the parameters of RandomPoint.
TruthX = np.array(range(100))
TruthY = ______

# Fill in the x values of the data and the emulator prediction from the `Prediction` dictionary.
DataX     = ______
Predicted = ______

axes.set_xlabel('X')
axes.set_ylabel('Y')
# Truth as a red line, emulator prediction as blue circles.
axes.plot(TruthX, TruthY, 'r-')
axes.plot(DataX, Predicted, 'bo')

## Step 3: MCMC sampling

We now use MCMC to calculate the posterior, utilizing the GPE and the experimental data.

In [None]:
# Remove the chain left over from a previous run, if there is one, before sampling again.
if os.path.exists('cache/mcmc_chain.hdf'):
    os.remove("cache/mcmc_chain.hdf")
# Run the sampler module as a script in a subshell.
# NOTE(review): the trailing positional `1000` is presumably the number of steps taken
# after the burn-in - confirm against src/mcmc.py.
! python3 -m src.mcmc --nwalkers 100 --nburnsteps 1000 1000

In [None]:
import src
src.Initialize()
from src import mcmc
# `chain` is also used later, through chain.dataset(), to plot the walkers.
chain = mcmc.Chain()
# The samples are indexed below as a 2-D array: [sample, parameter].
MCMCSamples = chain.load()

### Exercise 3-a: plot posterior function directly for A = 0.3

In [None]:
# Hold A fixed and scan B and C over 0.1, 0.2, ..., 0.9.
A = 0.3
Fractions = [Percent * 0.01 for Percent in range(10, 100, 10)]

# 81 points [A, B, C]; B changes slowest and C changes fastest.
Grid = [
    [A, B, C]
    for B in Fractions
    for C in Fractions
]

In [None]:
# Fill in: one posterior value per point of Grid (81 values, reshaped to 9 x 9 below).
Posterior = ______

figure, axes = plt.subplots(figsize = (5, 5), ncols = 1, nrows = 1)

axes.set_xlabel('B')
axes.set_ylabel('C')
# Grid is built with B in the outer loop and C in the inner loop, so after
# reshape(9, 9) the row index follows B and the column index follows C.
# imshow draws rows along the vertical axis, so transpose to put B on the
# x axis and C on the y axis, in agreement with the axis labels.
# With this extent the vertical axis runs from 0.05 (top) to 0.95 (bottom).
axes.imshow(Posterior.reshape(9, 9).T, cmap = 'Blues', extent = [0.05,0.95,0.95,0.05])

### Exercise 3-b: plot "chi^2" from data and "truth", again for A = 0.3

In [None]:
# Fill in each blank with the matching entry of AllData['data'].
DataX    = ______
DataY    = ______
DataStat = ______
DataSys  = ______
# Total uncertainty: statistical and systematic errors added in quadrature.
DataErr  = np.sqrt(DataStat**2 + DataSys**2)

# One chi^2 value per grid point; p is the parameter triple [A, B, C].
Chi2 = np.zeros(len(Grid))
for i, p in enumerate(Grid):
    # Fill in: the "truth" formula evaluated at x with the parameters p.
    TruthY = [______ for x in DataX]
    # Fill in: the chi^2 between TruthY and the data.
    Chi2[i] = ______

figure, axes = plt.subplots(figsize = (5, 5), ncols = 1, nrows = 1)

axes.set_xlabel('B')
axes.set_ylabel('C')
# Grid is built with B in the outer loop and C in the inner loop, so after
# reshape(9, 9) the row index follows B and the column index follows C.
# imshow draws rows along the vertical axis, so transpose to put B on the
# x axis and C on the y axis, in agreement with the axis labels.
# With this extent the vertical axis runs from 0.05 (top) to 0.95 (bottom).
axes.imshow(Chi2.reshape(9, 9).T, cmap = 'Greens', extent = [0.05,0.95,0.95,0.05])

## Step 4: analyze the posterior samples

### MCMC samples plot

Plot the parameters explored by each walker of the MCMC. They should cover the parameter values of interest.

In [None]:
# The figure is written to plots/; create the directory if it is missing.
os.makedirs('plots', exist_ok = True)

# Draw the trace of every walker, one panel per parameter.
with chain.dataset() as d:
    W = d.shape[0]     # number of walkers
    S = d.shape[1]     # number of steps
    N = d.shape[2]     # number of parameters
    # "Thinning": keep roughly 200 points per walker. The stride must be at
    # least 1, otherwise chains shorter than 200 steps give a zero step.
    T = max(1, S // 200)
    # Fade the individual lines as the number of walkers grows; matplotlib
    # rejects alpha > 1, so cap it. The name LineAlpha avoids overwriting
    # the model parameter `A` defined for the posterior grid.
    LineAlpha = min(1.0, 20.0 / W)
    # squeeze = False keeps `axes` two-dimensional even for a single parameter.
    figure, axes = plt.subplots(figsize = (15, 2 * N), ncols = 1, nrows = N, squeeze = False)
    for i, ax in enumerate(axes[:, 0]):
        for j in range(0, W):
            ax.plot(range(0, S, T), d[j, ::T, i], alpha = LineAlpha)
    # tight_layout takes keyword arguments only; the defaults are fine here.
    plt.tight_layout()
    plt.savefig('plots/MCMCSamples.pdf', dpi = 192)

### Posterior on parameters

Shows the probability distribution for parameters according to the posterior.

In [None]:
# The figure is written to plots/; create the directory if it is missing.
os.makedirs('plots', exist_ok = True)

NDimension = len(AllData["labels"])
# After the transpose, Ranges[:, k] holds the (lower, upper) bounds of parameter k.
Ranges = np.array(AllData["ranges"]).T
# squeeze = False keeps `axes` two-dimensional even for a single parameter.
figure, axes = plt.subplots(figsize = (3 * NDimension, 3 * NDimension),
                            ncols = NDimension, nrows = NDimension, squeeze = False)
Names = AllData["labels"]
for i, row in enumerate(axes):
    for j, ax in enumerate(row):
        if i == j:
            # Diagonal: 1-D histogram of parameter i.
            ax.hist(MCMCSamples[:,i], bins=50,
                    range=Ranges[:,i], histtype='step', color='green')
            ax.set_xlabel(Names[i])
            ax.set_xlim(*Ranges[:,j])
        elif i > j:
            # Lower triangle: 2-D histogram with parameter j on x and parameter i on y.
            ax.hist2d(MCMCSamples[:, j], MCMCSamples[:, i],
                      bins=50, range=[Ranges[:,j], Ranges[:,i]],
                      cmap='Greens')
            ax.set_xlabel(Names[j])
            ax.set_ylabel(Names[i])
            ax.set_xlim(*Ranges[:,j])
            ax.set_ylim(*Ranges[:,i])
        else:
            # Upper triangle: would only repeat the lower triangle, so hide it.
            ax.axis('off')
# tight_layout takes keyword arguments only; the defaults are fine here.
plt.tight_layout()
plt.savefig('plots/Correlation.pdf', dpi = 192)
# figure

### Posterior on top of data

Compare the parameters determined according to the posterior to the experimental data.

In [None]:
# The figure is written to plots/; create the directory if it is missing.
os.makedirs('plots', exist_ok = True)

# Pick 5000 rows of the posterior samples at random (np.random.choice draws
# with replacement by default). An integer argument selects from
# 0 .. len(MCMCSamples) - 1 without building a range first.
Examples = MCMCSamples[np.random.choice(len(MCMCSamples), 5000), :]

# Emulator prediction for each selected sample.
TempPrediction = {"PbPb5020": EmulatorPbPb5020.predict(Examples)}

figure, axes = plt.subplots(figsize = (5, 5), ncols = 1, nrows = 1)

axes.set_xlabel(r"$X$")
axes.set_ylabel(r"$Y$")
DataX    = AllData["data"]['PbPb5020']['Y']['C0']['x']
DataY    = AllData["data"]['PbPb5020']['Y']['C0']['y']
DataStat = AllData["data"]['PbPb5020']['Y']['C0']['yerr']['stat'][:,0]
DataSys  = AllData["data"]['PbPb5020']['Y']['C0']['yerr']['sys'][:,0]
# Total uncertainty: statistical and systematic errors added in quadrature.
DataErr  = np.sqrt(DataStat**2 + DataSys**2)

# Each curve is nearly transparent, so regions crossed by many curves look darker.
# Only the first curve carries the label.
for i, y in enumerate(TempPrediction['PbPb5020']['Y']['C0']):
    axes.plot(DataX, y, 'b-', alpha=0.0025, label="Posterior" if i==0 else '')
axes.errorbar(DataX, DataY, yerr = DataErr, fmt='ro', label="Measurements")

# tight_layout takes keyword arguments only; the defaults are fine here.
plt.tight_layout()
figure.savefig('plots/ObservablePosterior.pdf', dpi = 192)
# figure

In [None]:
# Close all open figures to release the memory they hold.
plt.close('all')