# Demo notebook for the calibrated predictive distributions implementation in RAIL

Author: Luca Tortorelli, Bitrateep Dey

last run successfully: Nov 7, 2022

The purpose of this notebook is to demonstrate the implementation of the calibrated predictive distribution (Dey et al. 2022) in RAIL.
Bitrateep provided a test data set in .npz format (src/rail/examples/testdata/bpz_test_red.npz) that contains:
- a galaxy catalogue with spectroscopic redshifts, magnitudes and their errors
- conditional density estimates for each galaxy, i.e. PDFs evaluated with a photo-z method on a representative sample of objects
- redshift grid of the conditional density estimate

In [None]:
import numpy as np
import os
import pandas as pd
import qp
from src.rail.evaluation.metrics.pit import ConditionPIT

We do a small amount of preprocessing before feeding the data into RAIL.

In [None]:
# Load Bitrateep's test data set; the path is relative to the current
# working directory.
root = 'src/rail/examples/testdata/'
npz_path = os.path.join(root, 'bpz_test_red.npz')

# NOTE: allow_pickle=True lets NumPy unpickle object arrays, which can run
# arbitrary code -- only use it on trusted files such as this test data.
data = np.load(npz_path, allow_pickle=True)

In [None]:
# list the names under which the arrays are stored in the loaded archive
for key in data.keys():
    print(key)

In [None]:
# wrap the galaxy catalogue in a pandas DataFrame for convenient column access
cat = pd.DataFrame(data=data["test_cat"])

In [None]:
# create new features for training the method: one colour (magnitude
# difference) and its error for every pair of bands

# Looping over the band list replaces 30 hand-written assignments; the columns
# are created in the same order as before (UG, UGERR, UR, URERR, ..., ZY, ZYERR).
bands = ["U", "G", "R", "I", "Z", "Y"]
for pos, first in enumerate(bands):
    for second in bands[pos + 1:]:
        colour = first + second  # e.g. "UG" holds U - G
        cat[colour] = cat[first] - cat[second]
        # colour error: the two magnitude errors added in quadrature
        cat[colour + "ERR"] = np.sqrt(cat[first + "ERR"]**2 + cat[second + "ERR"]**2)

In [None]:
# normalise the conditional density estimates across the redshift grid
z_grid = data["z_grid"]

cde_raw = data["cde_test"]  # un-normalised conditional density estimates

# np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; use
# whichever one the installed NumPy provides.
integrate = getattr(np, "trapezoid", None) or np.trapz
norm = integrate(cde_raw, z_grid, axis=-1)  # integral of each row over z_grid
norm[norm == 0] = 1  # leave all-zero rows untouched instead of dividing by zero

# keep the raw array under its own name so re-running the cell is idempotent
cde = cde_raw / norm[:, None]

In [None]:
# define the number of galaxies used to train (calibrate) the method and split
# the sample into disjoint calibration and test sets
SEED = 299792458

num_calib = 800
n_gal = len(cat)
num_test = n_gal - num_calib
if num_test <= 0:
    raise ValueError(
        f"need more than {num_calib} galaxies to leave a non-empty test set, got {n_gal}"
    )

rng = np.random.default_rng(SEED)
indices = rng.permutation(n_gal)  # random order of row positions
calib_idx = indices[:num_calib]
test_idx = indices[num_calib:]

cde_calib = cde[calib_idx]  # conditional density estimates: calibration set
cde_test = cde[test_idx]    # and test set

# positional (.iloc) selection, so the catalogue rows match the cde rows
# whatever index the DataFrame carries
cat_calib = cat.iloc[calib_idx]
cat_test = cat.iloc[test_idx]

# take the redshifts from the already-split frames instead of label-indexing
# the full column, so all three selections are positional
z_calib = cat_calib["SPECZ"].values
z_test = cat_test["SPECZ"].values

In [None]:
# catalogue columns passed to the method as features
features = [
    "I", "UG", "GR", "RI", "IZ", "ZY",
    "IZERR", "RIERR", "GRERR", "UGERR", "IERR", "ZYERR",
]

In [None]:
# wrap the calibration and test conditional density estimates in qp ensembles,
# both defined on the shared redshift grid
qp_ens_cde_calib = qp.Ensemble(qp.interp, data={"xvals": z_grid, "yvals": cde_calib})
qp_ens_cde_test = qp.Ensemble(qp.interp, data={"xvals": z_grid, "yvals": cde_test})

Initialisation of the ConditionPIT class

In [None]:
# feature arrays for the calibration and test galaxies
features_calib = cat_calib[features].values
features_test = cat_test[features].values

# arguments, in order: calibration/test CDEs, redshift grid, calibration/test
# spectroscopic redshifts, calibration/test features, calibration qp ensemble
cond_pit = ConditionPIT(
    cde_calib, cde_test, z_grid, z_calib, z_test,
    features_calib, features_test, qp_ens_cde_calib,
)

In [None]:
# train the method using the provided data; the options are collected in one
# place so they are easy to read and tweak
train_options = {
    "patience": 10,
    "n_epochs": 10,
    "lr": 0.001,
    "weight_decay": 0.01,
    "batch_size": 100,
    "frac_mlp_train": 0.9,
    "lr_decay": 0.95,
    "oversample": 50,
    "n_alpha": 201,
    "checkpt_path": "checkpoint_GPZ_wide_CDE_test.pt",
    "hidden_layers": [2, 2, 2],
}
cond_pit.train(**train_options)

In [None]:
# compute the local pit from the checkpoint written by the training step
evaluate_options = {
    "model_checkpt_path": 'checkpoint_GPZ_wide_CDE_test.pt',
    "model_hidden_layers": [2, 2, 2],
    "nn_type": 'monotonic',
    "batch_size": 100,
    "num_basis": 40,
    "num_cores": 1,
}
pit_local, pit_local_fit = cond_pit.evaluate(**evaluate_options)

In [None]:
# plot the local P-P plot diagnostics from the two arrays returned by
# cond_pit.evaluate (pit_local and pit_local_fit)
cond_pit.diagnostics(pit_local, pit_local_fit)