# Simple exercise on Bayesian analysis

For the next hands-on exercise, we will use an additional dataset.

**These exercises are interactive**: a number of cells include `_______`, which signals that you should fill in the required value. If you're unable to execute a cell, carefully check whether you've filled in all of the required values.

## Step 0: load all relevant modules

In [None]:
from __future__ import print_function

import subprocess

from sklearn.gaussian_process import GaussianProcessRegressor as GPR
from sklearn.gaussian_process import kernels
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import joblib

import matplotlib.cm as cm
import matplotlib.pyplot as plt

from IPython.display import display, clear_output

from scipy.linalg import lapack
from scipy import stats
import emcee
import numpy as np

import os
import pickle
from pathlib import Path

import src.reader as Reader

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

## Step 1: prepare input files

### Load stuff from text files

In [None]:
# Load every raw text input of the simple example through the project's
# Reader module. The returned objects are indexed by key further down in this
# notebook: RawData*["Data"], RawDesign["Parameter"] / RawDesign["Design"] and
# RawPrediction*["Prediction"].

# Measurements, one file per selection
RawData1       = Reader.ReadData('input/SimpleExample/Data_Selection1.dat')
RawData2       = Reader.ReadData('input/SimpleExample/Data_Selection2.dat')

# Design points (the parameter sets at which the model was evaluated --
# presumably one row per design point; confirm against Reader.ReadDesign)
RawDesign      = Reader.ReadDesign('input/SimpleExample/Design.dat')

# Model predictions, one file per selection
RawPrediction1 = Reader.ReadPrediction('input/SimpleExample/Prediction_Selection1.dat')
RawPrediction2 = Reader.ReadPrediction('input/SimpleExample/Prediction_Selection2.dat')

### Run this block to prepare the inputs

In [None]:
# Assemble the single dictionary that the rest of the analysis reads, then
# store it as a pickle. Keys are inserted in the same order as before:
# systems, keys, labels, ranges, observables, design, model, data, cov.

# General description of the analysis: one system, parameter names taken from
# the design file, a (0, 1) range per parameter and one observable "Y" with
# the two selections "C0" and "C1".
AllData = {
    "systems": ["PbPb5020"],
    "keys": RawDesign["Parameter"],
    "labels": RawDesign["Parameter"],
    "ranges": [(0, 1), (0, 1), (0, 1)],
    "observables": [('Y', ['C0', 'C1'])],
}

# Measurements, indexed as [system][observable][selection]
Data = {
    "PbPb5020": {
        "Y": {
            "C0": RawData1["Data"],
            "C1": RawData2["Data"],
        },
    },
}

# Model output at the design points; each entry carries the predicted values
# ("Y") together with the x positions of the matching measurement
Prediction = {
    "PbPb5020": {
        "Y": {
            "C0": {"Y": RawPrediction1["Prediction"], "x": RawData1["Data"]['x']},
            "C1": {"Y": RawPrediction2["Prediction"], "x": RawData2["Data"]['x']},
        },
    },
}

# Covariance blocks are addressed as [system][measurement1][measurement2].
# Only the two diagonal blocks are filled here.
Covariance = Reader.InitializeCovariance(Data)
for Selection, Raw in (("C0", RawData1), ("C1", RawData2)):
    Covariance["PbPb5020"][("Y", Selection)][("Y", Selection)] = Reader.EstimateCovariance(Raw, Raw, SysLength = {"default": 0.2})

# Template for adding off-diagonal blocks (adapt the measurement keys to the
# observables actually defined above):
# Covariance["PbPb5020"][("R_AA", "C0")][("R_AA", "C1")] = Reader.EstimateCovariance(RawData1, RawData2, SysLength = {"default": 100}, SysStrength = {"default": 0.1})
# Covariance["PbPb5020"][("R_AA", "C1")][("R_AA", "C0")] = Reader.EstimateCovariance(RawData2, RawData1, SysLength = {"default": 100}, SysStrength = {"default": 0.1})

# Template for plugging in externally generated matrices instead:
# Covariance["PbPb5020"][("R_AA", "C0")][("R_AA", "C0")] = RawCov55E["Matrix"]
# Covariance["PbPb5020"][("R_AA", "C1")][("R_AA", "C1")] = RawCov66E["Matrix"]

# Attach design, predictions, measurements and covariance
AllData.update(
    design = RawDesign["Design"],
    model = Prediction,
    data = Data,
    cov = Covariance,
)

# Write everything to the pickle file
with open('input/default.p', 'wb') as OutputFile:
    pickle.dump(AllData, OutputFile, protocol = pickle.HIGHEST_PROTOCOL)

### Exercise 1-a: plot data

In [None]:
# Dump the nested measurement dictionary ([system][observable][selection]) to
# see which fields are available for plotting.
print(AllData['data'])

In [None]:
# Exercise 1-a: draw the two measurement selections side by side, each with
# its total uncertainty.
Measured1 = AllData["data"]['PbPb5020']['Y']['C0']
Measured2 = AllData["data"]['PbPb5020']['Y']['C1']

# Selection C0: x positions, central values and the first column of the
# statistical / systematic error arrays
DataX1, DataY1 = Measured1['x'], Measured1['y']
DataStat1 = Measured1['yerr']['stat'][:,0]
DataSys1  = Measured1['yerr']['sys'][:,0]
DataErr1  = np.sqrt(DataStat1**2 + DataSys1**2)   # added in quadrature

# Selection C1: same quantities
DataX2, DataY2 = Measured2['x'], Measured2['y']
DataStat2 = Measured2['yerr']['stat'][:,0]
DataSys2  = Measured2['yerr']['sys'][:,0]
DataErr2  = np.sqrt(DataStat2**2 + DataSys2**2)   # added in quadrature

# One panel per selection, C0 on the left and C1 on the right
figure, axes = plt.subplots(nrows = 1, ncols = 2, figsize = (10, 5))
Panels = ((DataX1, DataY1, DataErr1), (DataX2, DataY2, DataErr2))
for Panel, (X, Y, Err) in zip(axes, Panels):
    Panel.errorbar(X, Y, yerr = Err, fmt='ro', label="Measurements")

### Exercise 1-b: plot "theory" predictions on top of the data

In [None]:
# Dump the model predictions; each selection holds the predicted values under
# "Y" and the matching x positions under "x".
print(AllData["model"])

In [None]:
# Exercise 1-b: overlay every model prediction on the measurements.
Measured1 = AllData["data"]['PbPb5020']['Y']['C0']
Measured2 = AllData["data"]['PbPb5020']['Y']['C1']
Model1 = AllData["model"]['PbPb5020']['Y']['C0']
Model2 = AllData["model"]['PbPb5020']['Y']['C1']

# Measurements of selection C0 with the total (stat + sys in quadrature) error
DataX1, DataY1 = Measured1['x'], Measured1['y']
DataStat1 = Measured1['yerr']['stat'][:,0]
DataSys1  = Measured1['yerr']['sys'][:,0]
DataErr1  = np.sqrt(DataStat1**2 + DataSys1**2)

# Measurements of selection C1, same treatment
DataX2, DataY2 = Measured2['x'], Measured2['y']
DataStat2 = Measured2['yerr']['stat'][:,0]
DataSys2  = Measured2['yerr']['sys'][:,0]
DataErr2  = np.sqrt(DataStat2**2 + DataSys2**2)

# Predictions: iterating over "Y" yields one curve at a time, all drawn
# against the shared "x" positions
PredictionsY1, PredictionsX1 = Model1['Y'], Model1['x']
PredictionsY2, PredictionsX2 = Model2['Y'], Model2['x']

figure, axes = plt.subplots(nrows = 1, ncols = 2, figsize = (10, 5))

Panels = (
    (DataX1, DataY1, DataErr1, PredictionsX1, PredictionsY1),
    (DataX2, DataY2, DataErr2, PredictionsX2, PredictionsY2),
)
for Panel, (X, Y, Err, CurveX, Curves) in zip(axes, Panels):
    # Data points first, then the faint prediction curves
    Panel.errorbar(X, Y, yerr = Err, fmt='ro', label="Measurements")
    for Curve in Curves:
        Panel.plot(CurveX, Curve, 'b-', alpha=0.25)

### Exercise 1-c: plot the design points

In [None]:
# Dump the design array; the plotting cell below reads its columns 0, 1 and 2
# as the parameters A, B and C.
print(AllData['design'])

In [None]:
# Exercise 1-c: scatter the design points in every pair of parameters.
figure, axes = plt.subplots(nrows = 1, ncols = 3, figsize = (15, 5))

DesignPoints = AllData['design']
ColumnNames = ['A', 'B', 'C']

# (x column, y column) shown in each panel: A vs B, A vs C, B vs C
ColumnPairs = [(0, 1), (0, 2), (1, 2)]

for Panel, (ColumnX, ColumnY) in zip(axes, ColumnPairs):
    Panel.set_xlabel(ColumnNames[ColumnX])
    Panel.set_ylabel(ColumnNames[ColumnY])
    Panel.scatter(DesignPoints[:,ColumnX], DesignPoints[:,ColumnY])

### Exercise 1-d: plot covariance matrix

In [None]:
# Dump the covariance blocks, indexed as [system][measurement1][measurement2]
# with each measurement given as an (observable, selection) tuple.
print(AllData['cov'])

In [None]:
# Exercise 1-d: show the two diagonal covariance blocks as images.
figure, axes = plt.subplots(nrows = 1, ncols = 2, figsize = (10, 5))

Covariance1 = AllData['cov']['PbPb5020'][('Y','C0')][('Y','C0')]
Covariance2 = AllData['cov']['PbPb5020'][('Y','C1')][('Y','C1')]

# Each panel gets its matrix and the upper edge used for the axis extent
# (6 for the first selection, 11 for the second)
for Panel, Matrix, Edge in zip(axes, (Covariance1, Covariance2), (6, 11)):
    Panel.set_xlabel('Bin index')
    Panel.set_ylabel('Bin index')
    # extent = [left, right, bottom, top]; bottom > top keeps row 0 at the top
    Panel.imshow(Matrix, cmap = 'Blues', interpolation = 'nearest', extent=[0,Edge,Edge,0])

### Optional: clean past files

In [None]:
# Remove results cached by earlier runs so that the following steps start
# from scratch.

# Stored MCMC chain
ChainFile = 'cache/mcmc_chain.hdf'
if os.path.exists(ChainFile):
    os.remove(ChainFile)

# Stored emulator, one pickle per collision system
for system in AllData["systems"]:
    EmulatorFile = 'cache/emulator/' + system + ".pkl"
    if os.path.exists(EmulatorFile):
        os.remove(EmulatorFile)

## Step 2: run emulator

In [None]:
# Run the project's emulator module in a separate python3 process (IPython
# shell escape). Presumably --retrain forces a fresh fit, --npc sets the
# number of principal components and --nrestarts the number of optimizer
# restarts -- confirm against the argument parser in src/emulator.
! python3 -m src.emulator --retrain --npc 8 --nrestarts 50 

In [None]:
# Load the emulator for the PbPb5020 system from the cache; it is used at the
# end of the notebook to turn posterior samples into predictions.
# NOTE(review): lazydict is imported but not referenced in this notebook --
# possibly needed for unpickling the cached emulator; confirm before removing.
from src import lazydict, emulator
EmulatorPbPb5020 = emulator.Emulator.from_cache('PbPb5020')

## Step 3: MCMC sampling

In [None]:
# Delete any chain left over from a previous run before sampling again.
if os.path.exists('cache/mcmc_chain.hdf'):
    os.remove("cache/mcmc_chain.hdf")
# Run the project's MCMC module in a separate python3 process with 100
# walkers. Presumably this means 1000 burn-in steps followed by 1000
# production steps (the trailing positional argument) -- confirm against the
# argument parser in src/mcmc.
! python3 -m src.mcmc --nwalkers 100 --nburnsteps 1000 1000

In [None]:
# Initialize the src package, then open the MCMC chain and read the samples.
# MCMCSamples is indexed below as [sample, parameter].
# NOTE(review): src.Initialize() is called before src.mcmc is imported --
# presumably the import depends on it; keep this order unless verified.
import src
src.Initialize()
from src import mcmc
chain = mcmc.Chain()
MCMCSamples = chain.load()

### Exercise 3-a: plot posterior function directly for A = 0.3

In [None]:
# Scan points for the direct posterior / chi^2 evaluation: A is held fixed
# while B and C each run over 0.1, 0.2, ..., 0.9.
A = 0.3
_GridValues = [Step * 0.01 for Step in range(10, 100, 10)]

# B varies slowest (outer loop), C fastest (inner loop): 9 x 9 = 81 points
Grid = []
for _ValueB in _GridValues:
    for _ValueC in _GridValues:
        Grid.append([A, _ValueB, _ValueC])

In [None]:
# Exercise 3-a: evaluate the log-posterior on the (B, C) grid at fixed A and
# show it as an image.
Posterior = chain.log_posterior(Grid)

figure, axes = plt.subplots(figsize = (5, 5), ncols = 1, nrows = 1)

axes.set_xlabel('B')
axes.set_ylabel('C')

# Grid is built with B as the outer loop and C as the inner loop, so
# reshape(9, 9) puts B along the rows and C along the columns. imshow draws
# rows vertically, so the array is transposed to put B on the horizontal axis
# and C on the vertical axis, as the labels state. origin = 'lower' with an
# increasing extent makes C grow upwards, like in the corner plot of Step 4.
# Each cell is 0.1 wide and centred on its grid value (0.1 ... 0.9).
axes.imshow(Posterior.reshape(9, 9).T, cmap = 'Blues', origin = 'lower',
            extent = [0.05, 0.95, 0.05, 0.95])

### Exercise 3-b: plot "chi^2" from data and "truth", again for A = 0.3

In [None]:
# Exercise 3-b: compare the data with the quadratic "truth"
#     y(x) = A + B * (0.01 x) + C * (0.01 x)^2
# on the same (B, C) grid at fixed A, using only the per-point total errors.

# Selection C0: positions, values and total (stat + sys in quadrature) error
DataX1    = AllData["data"]['PbPb5020']['Y']['C0']['x']
DataY1    = AllData["data"]['PbPb5020']['Y']['C0']['y']
DataStat1 = AllData["data"]['PbPb5020']['Y']['C0']['yerr']['stat'][:,0]
DataSys1  = AllData["data"]['PbPb5020']['Y']['C0']['yerr']['sys'][:,0]
DataErr1  = np.sqrt(DataStat1**2 + DataSys1**2)

# Selection C1: same quantities
DataX2    = AllData["data"]['PbPb5020']['Y']['C1']['x']
DataY2    = AllData["data"]['PbPb5020']['Y']['C1']['y']
DataStat2 = AllData["data"]['PbPb5020']['Y']['C1']['yerr']['stat'][:,0]
DataSys2  = AllData["data"]['PbPb5020']['Y']['C1']['yerr']['sys'][:,0]
DataErr2  = np.sqrt(DataStat2**2 + DataSys2**2)

# One chi^2 value per grid point, summed over both selections
Chi2 = np.zeros(len(Grid))
for i, p in enumerate(Grid):
    TruthY1 = [(p[0] + p[1] * x * 0.01 + p[2] * x**2 * 0.01**2) for x in DataX1]
    TruthY2 = [(p[0] + p[1] * x * 0.01 + p[2] * x**2 * 0.01**2) for x in DataX2]
    Chi2[i] = np.sum(((TruthY1 - DataY1) / DataErr1)**2) + np.sum(((TruthY2 - DataY2) / DataErr2)**2)

figure, axes = plt.subplots(figsize = (5, 5), ncols = 1, nrows = 1)

axes.set_xlabel('B')
axes.set_ylabel('C')

# Grid is built with B as the outer loop and C as the inner loop, so
# reshape(9, 9) puts B along the rows and C along the columns. imshow draws
# rows vertically, so the array is transposed to put B on the horizontal axis
# and C on the vertical axis, as the labels state. origin = 'lower' with an
# increasing extent makes C grow upwards. Each cell is 0.1 wide and centred on
# its grid value (0.1 ... 0.9).
axes.imshow(Chi2.reshape(9, 9).T, cmap = 'Greens_r', origin = 'lower',
            extent = [0.05, 0.95, 0.05, 0.95])

## Step 4: analyze the posterior samples

### MCMC samples plot

In [None]:
# Trace plot: one panel per parameter, one line per walker.
with chain.dataset() as d:
    W = d.shape[0]     # number of walkers
    S = d.shape[1]     # number of steps
    N = d.shape[2]     # number of parameters
    # "thinning": plot roughly 200 points per walker. The stride is kept at
    # least 1 so that chains shorter than 200 steps do not produce a zero step.
    T = max(1, S // 200)
    # Line opacity shrinks with the number of walkers; matplotlib only accepts
    # alpha in [0, 1], so it is capped for fewer than 20 walkers.
    A = min(1.0, 20 / max(W, 1))
    # squeeze = False always returns a 2-D array of axes (N rows, 1 column),
    # so the loop below also works for a single parameter.
    figure, axes = plt.subplots(figsize = (15, 2 * N), ncols = 1, nrows = N, squeeze = False)
    for i, ax in enumerate(axes[:, 0]):
        for j in range(0, W):
            ax.plot(range(0, S, T), d[j, ::T, i], alpha = A)
    # pad must be passed by keyword: recent matplotlib rejects positional
    # arguments here. The former positional True was interpreted as pad = 1.
    plt.tight_layout(pad = 1.0)
    # savefig does not create missing directories
    os.makedirs('plots', exist_ok = True)
    plt.savefig('plots/MCMCSamples.pdf', dpi = 192)

### Posterior on parameters

In [None]:
# Corner plot of the posterior samples: 1-D histograms on the diagonal, 2-D
# histograms below it, nothing above it.
NDimension = len(AllData["labels"])
Ranges = np.array(AllData["ranges"]).T    # column i holds (min, max) of parameter i
# squeeze = False keeps axes two-dimensional even for a single parameter
figure, axes = plt.subplots(figsize = (3 * NDimension, 3 * NDimension),
                            ncols = NDimension, nrows = NDimension, squeeze = False)
Names = AllData["labels"]
for i, row in enumerate(axes):
    for j, ax in enumerate(row):
        if i == j:
            # Diagonal: marginal distribution of parameter i
            ax.hist(MCMCSamples[:,i], bins=50,
                    range=Ranges[:,i], histtype='step', color='green')
            ax.set_xlabel(Names[i])
            ax.set_xlim(*Ranges[:,j])
        elif i > j:
            # Lower triangle: joint distribution, parameter j on x and i on y
            ax.hist2d(MCMCSamples[:, j], MCMCSamples[:, i],
                      bins=50, range=[Ranges[:,j], Ranges[:,i]],
                      cmap='Greens')
            ax.set_xlabel(Names[j])
            ax.set_ylabel(Names[i])
            ax.set_xlim(*Ranges[:,j])
            ax.set_ylim(*Ranges[:,i])
        else:
            # Upper triangle would only mirror the lower one
            ax.axis('off')
# pad must be passed by keyword: recent matplotlib rejects positional
# arguments here. The former positional True was interpreted as pad = 1.
plt.tight_layout(pad = 1.0)
# savefig does not create missing directories
os.makedirs('plots', exist_ok = True)
plt.savefig('plots/Correlation.pdf', dpi = 192)
# Uncomment to display the figure as the cell output:
# figure

### Posterior on top of data

In [None]:
# Draw emulator predictions for a random subset of posterior samples on top
# of the measurements.

# 5000 sample indices drawn with replacement (np.random.choice default)
Examples = MCMCSamples[ np.random.choice(len(MCMCSamples), 5000), :]

TempPrediction = {"PbPb5020": EmulatorPbPb5020.predict(Examples)}

figure, axes = plt.subplots(figsize = (10, 5), ncols = 2, nrows = 1)

axes[0].set_xlabel(r"$X$")
axes[0].set_ylabel(r"$Y$")
axes[1].set_xlabel(r"$X$")
axes[1].set_ylabel(r"$Y$")

# Selection C0: positions, values and total (stat + sys in quadrature) error
DataX1    = AllData["data"]['PbPb5020']['Y']['C0']['x']
DataY1    = AllData["data"]['PbPb5020']['Y']['C0']['y']
DataStat1 = AllData["data"]['PbPb5020']['Y']['C0']['yerr']['stat'][:,0]
DataSys1  = AllData["data"]['PbPb5020']['Y']['C0']['yerr']['sys'][:,0]
DataErr1  = np.sqrt(DataStat1**2 + DataSys1**2)

# Selection C1: same quantities
DataX2    = AllData["data"]['PbPb5020']['Y']['C1']['x']
DataY2    = AllData["data"]['PbPb5020']['Y']['C1']['y']
DataStat2 = AllData["data"]['PbPb5020']['Y']['C1']['yerr']['stat'][:,0]
DataSys2  = AllData["data"]['PbPb5020']['Y']['C1']['yerr']['sys'][:,0]
DataErr2  = np.sqrt(DataStat2**2 + DataSys2**2)

# Every predicted curve is nearly transparent so that the density of curves
# shows up as shading; only the first one carries a legend label.
for i, y in enumerate(TempPrediction['PbPb5020']['Y']['C0']):
    axes[0].plot(DataX1, y, 'b-', alpha=0.0025, label="Posterior" if i==0 else '')
axes[0].errorbar(DataX1, DataY1, yerr = DataErr1, fmt='ro', label="Measurements")

for i, y in enumerate(TempPrediction['PbPb5020']['Y']['C1']):
    axes[1].plot(DataX2, y, 'b-', alpha=0.0025, label="Posterior" if i==0 else '')
axes[1].errorbar(DataX2, DataY2, yerr = DataErr2, fmt='ro', label="Measurements")

# pad must be passed by keyword: recent matplotlib rejects positional
# arguments here. The former positional True was interpreted as pad = 1.
plt.tight_layout(pad = 1.0)
# savefig does not create missing directories
os.makedirs('plots', exist_ok = True)
figure.savefig('plots/ObservablePosterior.pdf', dpi = 192)
# Uncomment to display the figure as the cell output:
# figure

In [None]:
# Close every figure opened by the cells above so that matplotlib can release
# the memory they hold.
plt.close('all')