# producing $p(z | photometry)$ for ELAsTiCC

_Alex Malz (GCCL@RUB)_

The goal here is to generate mock photo-$z$ posteriors for host galaxies. 
Ideally, we want them to contain no assumptions not present in the $p(z, photometry)$ space from which they were drawn.
That's not really feasible. . .

TODO: explain why we can't do this

The next best thing is to aim for realistic complexity and to make assumptions as similar as possible to those of the underlying $p(z, photometry)$ model, by using [`pzflow`](https://github.com/jfcrenshaw/pzflow).

In [None]:
# import GCRCatalogs
# from GCRCatalogs import cosmodc2

In [None]:
import corner
import numpy as np
import pandas as pd

In [None]:
import pzflow
from pzflow import Flow
from pzflow.bijectors import Chain, ColorTransform, InvSoftplus, StandardScaler, RollingSplineCoupling, ShiftBounds
from pzflow.distributions import Uniform, Joint, Normal

In [None]:
import qp
# help(qp)

In [None]:
import rail
# from rail.creation import Creator, engines
from rail.creation.degradation import LSSTErrorModel

# awkwardly the rail dev branch is broken such that creators don't exist but degraders still do

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt

# Serif fonts throughout, with math rendered by matplotlib's built-in mathtext
# (text.usetex is off, so no LaTeX installation is involved).
mpl.rcParams.update({
    'text.usetex': False,
    'mathtext.rm': 'serif',
    'mathtext.fontset': 'dejavuserif',
    'font.family': 'serif',
    'font.serif': 'DejaVu Serif',
})
# Optional figure-export settings, currently disabled:
# mpl.rcParams['axes.titlesize'] = 16
# mpl.rcParams['axes.labelsize'] = 14
# mpl.rcParams['savefig.dpi'] = 250
# mpl.rcParams['savefig.format'] = 'pdf'
# mpl.rcParams['savefig.bbox'] = 'tight'

Let's pick one hostlib for now.

TODO: loop through hostlibs later

In [None]:
# Location of the single hostlib used for now.
hl_path = '/global/cfs/cdirs/desc-td/SN/SNANA/SURVEYS/LSST/ROOT/PLASTICC_DEV/HOSTLIB/TEMP_HOSTLIBS/SNIa_GHOST_PHOTOZ.HOSTLIB'
# skiprows=25 discards the first 25 lines of the file; with header=0 the next
# line (line 26 of the file) supplies the column names.
# NOTE(review): delimiter=' ' treats every single space as a separator, so this
# assumes fields are separated by exactly one space -- confirm against the file.
df = pd.read_csv(hl_path, skiprows=25, delimiter=' ', header=0)

In [None]:
df.columns

`pzflow` needs a grid upon which to evaluate redshift posteriors. 
We use a fine grid now but will compress it for the alert stream later.
And we can check what the redshift distribution of the hostlib is.

TODO: investigate the prevalence at $z \sim 3$ and maybe ask to re-run?

In [None]:
# 300 logarithmically spaced redshift values from 1e-3 to 3; this grid is
# reused below as the evaluation grid for the posteriors.
zgrid = np.logspace(-3., np.log10(3.), 300)
# The same grid doubles as the histogram bin edges, so bins are log-spaced too.
plt.hist(df['ZTRUE'], bins=zgrid);
plt.xlabel('z')
plt.ylabel('number of galaxies')
plt.title('hostlib redshift distribution')

John Franklin Crenshaw (UW) had a pre-trained normalizing flow trained on a representative set of $10^{6}$ LSST-DESC DC2 galaxies, so I'm using that for now.

TODO: explain things that would have been better but didn't work

In [None]:
# flow = Flow(file='../data_files/model_photo-zs_sharp10_splbin2_epoch30_flow.pkl')#'desc-dc2-K=16.pkl')#Flow(file='../data_files/pzflow_dc2small_nofilter_div1000.pkl')
flow = Flow(file='../data_files/model_photo-zs_sharp5_splbin8_epoch100_flow.pkl')

In [None]:
flow.conditional_columns

We trim and rename the hostlib data to match the parameters of the model.

In [None]:
# hl_df = df.rename(columns={'Y_obs':'y', 
#                    'r_obs':'r', 
#                    'u_obs':'u', 
#                    'g_obs':'g', 
#                    'z_obs':'z', 
#                    'i_obs':'i', 
#                    'ZTRUE':'redshift'})[['redshift', 'u', 'g', 'r', 'i', 'z', 'y']]
# Hostlib column name -> short name used from here on.
hostlib_to_short = {
    'ZTRUE': 'redshift',
    'u_obs': 'u',
    'g_obs': 'g',
    'r_obs': 'r',
    'i_obs': 'i',
    'z_obs': 'z',
    'Y_obs': 'y',
}
# Keep only the redshift and the six magnitudes, in this order.
kept_columns = ['redshift', 'u', 'g', 'r', 'i', 'z', 'y']
hl_df = df.rename(columns=hostlib_to_short)[kept_columns]

In [None]:
hl_df.columns

In [None]:
# Replace the six magnitudes by five adjacent-band colours plus the r-band
# magnitude. The resulting columns, in order, are:
#   redshift, u-g, g-r, r-i, i-z, z-y, r
# NOTE: hl_df is rebound here, so this cell expects the magnitude columns
# produced by the rename/trim cell and cannot be re-run on its own output.
bands = ['u', 'g', 'r', 'i', 'z', 'y']
colors = {
    bluer + '-' + redder: hl_df[bluer] - hl_df[redder]
    for bluer, redder in zip(bands, bands[1:])
}
hl_df = pd.DataFrame({'redshift': hl_df['redshift'], **colors, 'r': hl_df['r']})

In [None]:
hl_df.columns

In [None]:
# hl_named = hl_df.rename(columns={'mag_true_y_lsst':'y', 
#                    'mag_true_r_lsst':'r', 
#                    'mag_true_u_lsst':'u', 
#                    'mag_true_g_lsst':'g', 
#                    'mag_true_z_lsst':'z', 
#                    'mag_true_i_lsst':'i',
#                     'ZTRUE':'redshift'})

In [None]:
# hl_named.columns

Let's compare the distribution of data between the model and the hostlib.
Note how different they are! This is a very non-representative sample, so we expect pretty awful photo-$z$s.

In [None]:
nvis = 1000

In [None]:
# samples = flow.sample(nvis, seed=0)

# fig = plt.figure(figsize=(12,12))

# ranges = [(-0.1,2.4), (19.5,33), (19,32), (19,29), (19,29), (19,28), (19,28)]

# corner.corner(samples, fig=fig, color='r', bins=20, range=ranges, hist_bin_factor=2, data_kwargs={'ms':3}, contour_kwargs={'linewidths':2}, label='pz model')

# corner.corner(hl_df[:nvis], fig=fig, bins=20, range=ranges, hist_bin_factor=2, color='b', data_kwargs={'ms':3}, show_titles=True, label='hostlib');
# plt.legend()

In [None]:
# samples = flow.sample(10000, seed=0)
# print(samples.columns)

# for quality in ['redshift', 'u', 'g', 'r', 'i', 'z', 'y']:
#     plt.hist(samples[quality], bins=100, alpha=0.5, density=True, label='pz model samples');
#     plt.hist(hl_df[quality], bins=100, alpha=0.5, density=True, label='hostlib samples')
#     plt.title(quality)
#     plt.legend()
#     plt.show()

In [None]:
# flow.conditional_columns

In [None]:
# flow.info

In [None]:
# flow.latent

Now we can evaluate some posteriors and check that they look as expected.
They're way too narrow because there's no error model (and maybe the normalizing flow had too many knots).

TODO: use RAIL to convolve with error model before evaluating, or make a new flow with fewer knots

In [None]:
# Features the posterior is conditioned on: five colours and the r magnitude.
condition_columns = ['u-g', 'g-r', 'r-i', 'i-z', 'z-y', 'r']
# Only the first nvis hostlib rows are evaluated.
hostlib_subset = hl_df[condition_columns].iloc[:nvis]
flow_z = flow.posterior(hostlib_subset, column='redshift', grid=zgrid)
# flow_z = flow.posterior(hl_df[['mag_true_u_lsst-mag_true_g_lsst',
#  'mag_true_g_lsst-mag_true_r_lsst',
#  'mag_true_r_lsst-mag_true_i_lsst',
#  'mag_true_i_lsst-mag_true_z_lsst',
#  'mag_true_z_lsst-mag_true_y_lsst',
#  'mag_true_r_lsst']][:nvis], column='redshift', grid=zgrid)

In [None]:
pdfs = flow_z

# One panel for every 99th galaxy among the first nvis.
idx = np.arange(0, nvis, 99)
fig, axes = plt.subplots(1, len(idx), figsize=(2 * len(idx), 2), dpi=100)
for i, ax in zip(idx, axes):
    # pdfs[i] is a positional lookup, so fetch the true redshift by position
    # as well (.iloc) instead of by index label; the two only coincide while
    # hl_df keeps its default 0..N-1 index.
    true_z = hl_df['redshift'].iloc[i]
    ax.axvline(true_z, 0, 1, c="C3", label='True z')
    ax.plot(zgrid, pdfs[i])
    ax.set(xlabel="redshift",
           # xticks=[0,0.5,1,1.5,2],
           yticks=[])
# axes[0].legend()
axes[0].set(ylabel='$p(z)$')
plt.show()

The point estimates are really weird here and need to be investigated. . .

In [None]:
# Point estimate per galaxy: the grid redshift at which its posterior peaks.
z_true = hl_df['redshift'][:nvis]
z_mode = zgrid[np.argmax(flow_z, axis=1)]
plt.scatter(z_true, z_mode, s=1, c='k')
# One-to-one reference line.
plt.plot([0., 3.], [0., 3.], c='r')
plt.xlabel(r'$z_{true}$')
plt.ylabel(r'$z_{mode}$')

Anyway, let's cut to the important part, the compression of the posteriors using `qp`.

First, make a qp ensemble.

In [None]:
in_pdfs = qp.Ensemble(qp.interp, data=dict(xvals=zgrid, yvals=flow_z, check_input=True))

In [None]:
# One panel for every 99th galaxy among the first nvis.
idx = np.arange(0, nvis, 99)
fig, axes = plt.subplots(1, len(idx), figsize=(2 * len(idx), 2), dpi=100)
# Evaluate the whole ensemble on the grid once, rather than once per panel.
in_pdfs_eval = in_pdfs.pdf(zgrid)
for i, ax in zip(idx, axes):
    # Positional lookup (.iloc) to match the positional indexing of the PDFs.
    true_z = hl_df['redshift'].iloc[i]
    ax.axvline(true_z, 0, 1, c="C3", label='True z')
    ax.plot(zgrid, in_pdfs_eval[i])
    ax.set(xlabel="redshift",
           # xticks=[0,0.5,1,1.5,2],
           yticks=[])
    # Zoom in on low redshift.
    ax.set_xlim(0, 0.25)
# axes[0].legend()
axes[0].set(ylabel='$p(z)$')
plt.show()

Then choose quantile values.

In [None]:
# linspace(0, 1, 11) yields 0.0, 0.1, ..., 1.0; dropping the first element
# leaves ten values 0.1 ... 1.0, i.e. the 0 quantile is excluded while the
# 1.0 quantile is kept.
# NOTE(review): confirm that including 1.0 is what qp's quantile
# parameterization expects.
quants = np.linspace(0., 1., 11)[1:]
print(quants)

Convert the ensemble to quantiles.

In [None]:
out_pdfs = in_pdfs.convert_to(qp.quant_piecewise_gen, quants=quants)

In [None]:
out_pdfs_eval = out_pdfs.pdf(zgrid)

TODO: fix qp quantile reconstruction bug!

The issue is that the input PDFs aren't normalized, even after being converted to a `qp` ensemble.

In [None]:
# One panel for every 99th galaxy among the first nvis.
idx = np.arange(0, nvis, 99)
fig, axes = plt.subplots(1, len(idx), figsize=(2 * len(idx), 2), dpi=100)
for i, ax in zip(idx, axes):
    # out_pdfs_eval[i] is a positional lookup, so fetch the true redshift by
    # position as well (.iloc) instead of by index label.
    true_z = hl_df['redshift'].iloc[i]
    ax.axvline(true_z, 0, 1, c="C3", label='True z')
    ax.plot(zgrid, out_pdfs_eval[i])
    ax.set(xlabel="redshift",
           # xticks=[0,0.5,1,1.5,2],
           yticks=[])
    # Zoom in on low redshift.
    ax.set_xlim(0, 0.25)
# axes[0].legend()
axes[0].set(ylabel='$p(z)$')
plt.show()

In [None]:
# Dump the installed qp quantile implementation for inspection.
with open('/global/u2/a/aimalz/ve3_elasticc/lib/python3.9/site-packages/qp/quant_pdf.py', 'r', encoding='utf-8') as f:
    for line in f:
        # Each line already ends with a newline; end='' stops print from
        # adding a second one and double-spacing the listing.
        print(line, end='')

In [None]:
out_ppfs = out_pdfs.ppf(quants)

TODO: save these to file

In [None]:
# def quant_to_grid(qp_quant, )

In [None]:
# fig, axes = qp.plotting.plot_native(out_pdfs[1], xlim=(0, 3))

In [None]:
# zs = uncompressed * df['ZTRUE'].std() + df['ZTRUE'].mean()
# for quality in ['logmass', 'logSFRtot', 'mag_true_u_lsst', 'mag_true_g_lsst', 'mag_true_r_lsst', 'mag_true_i_lsst', 'mag_true_z_lsst', 'mag_true_y_lsst']:
#     data_scaled[quality] = (data[quality]-data[quality].mean())/data[quality].std()

# scratch below here, please ignore

why are the true posteriors so crazy compared to the true redshifts? maybe because of rescaling over all dimensions for the flow?

In [None]:
# cmap = plt.get_cmap('tab10')
# maxpdfs = 5
# for i in range(maxpdfs):
#     plt.plot(zgrid, flow_z[i], color=cmap(i/maxpdfs))
#     plt.vlines(hl_scaled['ZTRUE'][i], 0., max(flow_z[i]), color=cmap(i/maxpdfs))
# plt.xlabel(r'$z$')
# plt.ylabel(r'$p(z)$')
# plt.semilogx()
# plt.show()
# for i in range(maxpdfs):
#     plt.plot(zgrid, flow_z[i], color=cmap(i/maxpdfs))
#     plt.vlines(hl_scaled['ZTRUE'][i], 0., max(flow_z[i]), color=cmap(i/maxpdfs))
# plt.xlabel(r'$z$')
# plt.ylabel(r'$p(z)$')

In [None]:
# # flow_samps = flow.sample(1, conditions=hl_scaled[['mag_true_y_lsst',
# #  'mag_true_r_lsst',
# #  'mag_true_u_lsst',
# #  'mag_true_g_lsst',
# #  'mag_true_z_lsst',
# #  'mag_true_i_lsst',
# #  'logSFRtot',
# #  'logmass']][:nvis], seed=0)
# flow_samps = flow.sample(1, conditions=hl_named[['u', 'g', 'r', 'i', 'z', 'y']][:nvis], seed=0)
# plt.hist(flow_samps['redshift'], bins=zgrid, alpha=0.5, density=True, label='pz model samples');
# plt.hist(hl_df['ZTRUE'][:nvis], bins=zgrid, alpha=0.5, density=True, label='hostlib samples');
# plt.legend()

Why do these distributions still not even come close to matching? The flow should have coverage over the hostlib range but doesn't. The redshifts might be normalized somehow, too?

renormalize

Note DC2 SFRs are Msol/Gyr but hostlibs are log10 Msol/yr

In [None]:
# properties = {}
# properties['logmass'] = ( 6.6518884, 0.96087)
# properties['logSFRtot'] = ( 6.24535, 1.6989895)
# properties['mag_true_u_lsst'] = ( 30.86386, 3.0129747)
# properties['mag_true_g_lsst'] = ( 29.790226, 2.088292)
# properties['mag_true_r_lsst'] = ( 29.342756, 1.87558)
# properties['mag_true_i_lsst'] = ( 29.0863, 1.9085665)
# properties['mag_true_z_lsst'] = ( 28.870272, 1.9486268)
# properties['mag_true_y_lsst'] = ( 28.658136, 1.9420407)

In [None]:
# def make_normed(data, properties):
#     data_out = data.copy()
#     for quality in properties.keys():
#         data_out[quality] = (data[quality] - properties[quality][0]) / properties[quality][1]
#     return data_out

# def do_un_norm(data, properties):
#     data_out = data.copy()
#     for quality in properties.keys():
#         data_out[quality] = data[quality] * properties[quality][1] + properties[quality][0]
#     return data_out

In [None]:
# # data = hl_df.copy()
# # for quality in ['logmass', 'logSFRtot', 'mag_true_u_lsst', 'mag_true_g_lsst', 'mag_true_r_lsst', 'mag_true_i_lsst', 'mag_true_z_lsst', 'mag_true_y_lsst']:
# #     plt.hist(data[quality], bins=100)
# #     plt.title(quality+' pre-normalization')
# #     plt.show()
# hl_scaled = make_normed(hl_df, properties)
# # data.copy()
# # for quality in ['logmass', 'logSFRtot', 'mag_true_u_lsst', 'mag_true_g_lsst', 'mag_true_r_lsst', 'mag_true_i_lsst', 'mag_true_z_lsst', 'mag_true_y_lsst']:
# #     data_scaled[quality] = (data[quality]-properties[quality][0])/properties[quality][1]
# # for quality in ['redshift']:#['logmass', 'logSFRtot', 'mag_true_u_lsst', 'mag_true_g_lsst', 'mag_true_r_lsst', 'mag_true_i_lsst', 'mag_true_z_lsst', 'mag_true_y_lsst']:
# #     plt.hist(data_scaled[quality], bins=100)
# #     plt.title(quality+' post-normalization')
# #     plt.show()

make a NF from DC2? No, too slow!

In [None]:
# cosmo = GCRCatalogs.load_catalog("cosmoDC2_v1.1.4_small")
# quantities = ['redshift', 'mag_true_u_lsst', 'mag_true_g_lsst', 'mag_true_r_lsst', 'mag_true_i_lsst', 'mag_true_z_lsst', 'mag_true_y_lsst']#, 'stellar_mass', 'totalStarFormationRate']

# print("Reading CosmoDC2 small catalog")
# data = cosmo.get_quantities(quantities)
# print("Catalog read.")
# data = pd.DataFrame(data)
# # data['logSFRtot'] = onp.log10(data['totalStarFormationRate'])
# # data['logmass']   = onp.log10(data['stellar_mass'])
# # data.drop(columns=['totalStarFormationRate', 'stellar_mass'], inplace=True)

# # plt.figure(figsize=(10,7))
# # plt.plot(data['redshift'].sample(n=100000, random_state=1), (data['mag_true_g_lsst'] - data['mag_true_r_lsst']).sample(n=100000, random_state=1), 'o', ms=0.1)
# # plt.xlabel("DC2 Redshift")
# # plt.ylabel(r"$g-r$")
# # # plt.savefig("../plots/ogdc2_zvcolor_gr.png")
# # plt.clf()

In [None]:
# data_scaled[['mag_true_y_lsst',
#  'mag_true_r_lsst',
#  'mag_true_u_lsst',
#  'mag_true_g_lsst',
#  'mag_true_z_lsst',
#  'mag_true_i_lsst',
#  'logSFRtot',
#  'logmass']]

In [None]:
# test_samps = flow.sample(1, data_scaled[['mag_true_y_lsst',
#  'mag_true_r_lsst',
#  'mag_true_u_lsst',
#  'mag_true_g_lsst',
#  'mag_true_z_lsst',
#  'mag_true_i_lsst',
#  'logSFRtot',
#  'logmass']][:100])

In [None]:
# test_samps

In [None]:
# plt.hist(test_samps['redshift'] - flow_df['ZTRUE'][:100])

something has gone wrong here!