# modeling true $p(z | photometry)$ for ELAsTiCC

_Alex Malz (GCCL@RUB)_

In [None]:
import pzflow
from pzflow import Flow
from pzflow.bijectors import Chain, ColorTransform, InvSoftplus, StandardScaler, RollingSplineCoupling, ShiftBounds
from pzflow.distributions import Uniform, Joint, Normal

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

read in the normalizing flow made by `transient-host-sims/scripts/condition_pzflow.py`

In [None]:
# Load the saved normalizing flow from its .pkl file on disk.
flow_file = '../data_files/pzflow_dc2small_nofilter_div1000.pkl'
flow = Flow(file=flow_file)

In [None]:
# Inspect which columns the loaded flow models (displayed as the cell output).
flow.data_columns

In [None]:
# Inspect the flow's conditional columns; presumably these are the inputs that
# must be supplied when evaluating posteriors or sampling -- confirm in pzflow docs.
flow.conditional_columns

In [None]:
# Inspect the `info` attribute stored with the flow (displayed as the cell output).
flow.info

In [None]:
# Inspect the flow's `latent` attribute (displayed as the cell output).
flow.latent

pick one hostlib for now, loop through later

retrieve the conditional columns from hostlibs

In [None]:
# Path of the single HOSTLIB file used for now.
hl_path = '/global/cfs/cdirs/lsst/groups/SN/snana/SURVEYS/LSST/ROOT/PLASTICC_DEV/HOSTLIB/TEMP_HOSTLIBS/SNIa_GHOST_PHOTOZ.HOSTLIB'
# skiprows=25 drops the first 25 lines of the file; header=0 then takes the next
# line (file line 26) as the column names.
# NOTE(review): splitting on a single space assumes the table is strictly
# single-space separated -- confirm against the file.
df = pd.read_csv(hl_path, sep=' ', skiprows=25, header=0)

In [None]:
# List the column names parsed from the hostlib (displayed as the cell output).
df.columns

define redshift grid for pre-compression posterior evaluation

In [None]:
# Redshift grid: 300 logarithmically spaced points from 1e-3 to 3.
zgrid = np.logspace(-3., np.log10(3.), num=300)
# Histogram the hostlib's true redshifts using the grid points as bin edges.
# The trailing ';' suppresses the returned tuple in the notebook output.
plt.hist(df['ZTRUE'], bins=zgrid);

rename for pzflow

In [None]:
# Map hostlib column names onto the names used by the flow:
# '<band>_obs' -> 'mag_true_<band>_lsst' (band lower-cased, so 'Y_obs' becomes
# 'mag_true_y_lsst'), plus the two galaxy-property columns.
column_map = {f'{band}_obs': f'mag_true_{band.lower()}_lsst'
              for band in ('Y', 'r', 'u', 'g', 'z', 'i')}
column_map['LOG_SFR'] = 'logSFRtot'
column_map['LOGMASS'] = 'logmass'
# rename() returns a new frame; `df` keeps its original column names.
flow_df = df.rename(columns=column_map)

In [None]:
# Check the column names after renaming (displayed as the cell output).
flow_df.columns

renormalize

Note: DC2 SFRs are in $M_\odot$/Gyr, whereas the hostlibs store $\log_{10}$ SFR in $M_\odot$/yr.

In [None]:
# Per-column normalization constants as (shift, scale) pairs: each column is
# later transformed as (value - shift) / scale.
properties = {
    'logmass': (6.6518884, 0.96087),
    'logSFRtot': (6.24535, 1.6989895),
    'mag_true_u_lsst': (30.86386, 3.0129747),
    'mag_true_g_lsst': (29.790226, 2.088292),
    'mag_true_r_lsst': (29.342756, 1.87558),
    'mag_true_i_lsst': (29.0863, 1.9085665),
    'mag_true_z_lsst': (28.870272, 1.9486268),
    'mag_true_y_lsst': (28.658136, 1.9420407),
}

In [None]:
# Work on a copy so the renamed hostlib frame stays untouched.
data = flow_df.copy()
# One histogram per column that has normalization constants; `properties`
# holds exactly those column names, in this order.
for name in properties:
    plt.hist(data[name], bins=100)
    plt.title(f'{name} pre-normalization')
    plt.show()

In [None]:
# Standardize each column with its (shift, scale) pair from `properties`;
# columns without an entry are copied through unchanged.
# NOTE(review): no unit conversion is applied to logSFRtot here, although the
# notebook text says DC2 and hostlib SFRs use different units -- confirm the
# constants match the hostlib units.
data_scaled = data.copy()
for name, (shift, scale) in properties.items():
    data_scaled[name] = (data[name] - shift) / scale

In [None]:
# Same histograms as before, now for the rescaled columns.
for name in properties:
    plt.hist(data_scaled[name], bins=100)
    plt.title(f'{name} post-normalization')
    plt.show()

evaluate the posteriors

In [None]:
# Columns handed to the flow as inputs, in the order used throughout this notebook.
posterior_inputs = [
    'mag_true_y_lsst',
    'mag_true_r_lsst',
    'mag_true_u_lsst',
    'mag_true_g_lsst',
    'mag_true_z_lsst',
    'mag_true_i_lsst',
    'logSFRtot',
    'logmass',
]
# Evaluate the redshift posterior on `zgrid` for the first 100 rows only.
uncompressed = flow.posterior(
    inputs=data_scaled[posterior_inputs].iloc[:100],
    column='redshift',
    grid=zgrid,
)

In [None]:
# Overlay the first few posteriors with their true redshifts, once on a
# logarithmic redshift axis and once on a linear one.
cmap = plt.get_cmap('tab10')
maxpdfs = 5
for log_axis in (True, False):
    for i in range(maxpdfs):
        colour = cmap(i / maxpdfs)
        pdf = uncompressed[i]
        plt.plot(zgrid, pdf, color=colour)
        # Mark the true redshift with a vertical line reaching this PDF's peak.
        # The height previously used the undefined name `z_pdfs` (NameError).
        # Row i of `uncompressed` came from the i-th row of the inputs, so the
        # matching true redshift is looked up positionally.
        plt.vlines(df['ZTRUE'].iloc[i], 0., np.max(pdf), color=colour)
    plt.xlabel(r'$z$')
    plt.ylabel(r'$p(z)$')
    if log_axis:
        plt.semilogx()
    plt.show()

Why do the posteriors look so inconsistent with the true redshifts? Maybe because of the rescaling applied over all dimensions for the flow?

In [None]:
# zs = uncompressed * df['ZTRUE'].std() + df['ZTRUE'].mean()
# for quality in ['logmass', 'logSFRtot', 'mag_true_u_lsst', 'mag_true_g_lsst', 'mag_true_r_lsst', 'mag_true_i_lsst', 'mag_true_z_lsst', 'mag_true_y_lsst']:
#     data_scaled[quality] = (data[quality]-data[quality].mean())/data[quality].std()

# scratch below here

In [None]:
# Scratch: display the rescaled input columns that are passed to the flow.
data_scaled[['mag_true_y_lsst',
 'mag_true_r_lsst',
 'mag_true_u_lsst',
 'mag_true_g_lsst',
 'mag_true_z_lsst',
 'mag_true_i_lsst',
 'logSFRtot',
 'logmass']]

In [None]:
# Rescaled inputs for the first 100 rows, used as the conditions for sampling.
sample_conditions = data_scaled[[
    'mag_true_y_lsst',
    'mag_true_r_lsst',
    'mag_true_u_lsst',
    'mag_true_g_lsst',
    'mag_true_z_lsst',
    'mag_true_i_lsst',
    'logSFRtot',
    'logmass',
]][:100]
# First argument is the sample count (1); presumably that is one draw per
# condition row -- confirm against pzflow's Flow.sample documentation.
test_samps = flow.sample(1, sample_conditions)

In [None]:
# Display the sampled frame (shown as the cell output).
test_samps

In [None]:
# Sampled minus true redshift for the first 100 rows; the subtraction aligns
# the two series on their index labels.
z_residual = test_samps['redshift'] - flow_df['ZTRUE'][:100]
plt.hist(z_residual)

something has gone wrong here!

next steps: read into qp, convert to quantiles, save in file (or put into hostlib?)