# CosmoDC realizations
Jaime Ruiz Zapatero - LSST 2024 July Meeting

### What does this tutorial do?
The goal of this tutorial is to show you how to generate possible n(z) realizations of the lens and source photometric samples of Prat et al, 2022 (2212.09345).

In order to do so we will use the BPZ and FlexZBoost catalogs of photometric redshifts for the CosmoDC2 simulations generated by Sam Schmidt. These catalogs precede the RAIL tools but their output is equivalent.


## Dependencies

In [46]:
import os
import numpy as np
import pandas as pd
import random
import scipy
from scipy.interpolate import interp1d
import matplotlib.pyplot as plt
import h5py
%matplotlib inline

In [47]:
import GCRCatalogs
from GCR import GCRQuery

## Load catalogs

In [None]:
# Load CosmoDC2 together with its photo-z add-on catalog.
# BPZ
cat = GCRCatalogs.load_catalog('CosmoDC2_v1.1.4_image_with_photozs_v1')
# FlexZBoost
#cat = GCRCatalogs.load_catalog('CosmoDC2_v1.1.4_image_with_photozs_flexzboost_v1')
# return_iterator=True yields the catalog chunk by chunk ("tracks") instead of
# loading every galaxy_id / photoz_pdf at once.
photo_cat = cat.get_quantities(['galaxy_id', 'photoz_pdf'], return_iterator=True)
#basic_cuts = [GCRQuery('photoz_mask==1')]

# Assign to tomo bins and save
# `sample` has to exist before the output path is built; without this line a
# clean top-to-bottom run fails with a NameError. Keep the value in sync with
# the `sample` used when loading the binned catalog.
sample = "shear"
save_to = "/pscratch/sd/j/jaimerz/CosmoDC2_catalogs/{}_tracks".format(sample)
# Redshift grid on which every photoz_pdf is tabulated.
zgrid = cat.photoz_pdf_bin_centers

In [None]:
# Open the TXPipe binned catalog (Judit's run) for the chosen sample.
# The file is opened read-only and the handle is left open for later cells.
sample = "shear"
path_2_cat = "/global/cfs/cdirs/lsst/groups/WL/projects/star-challenge/cosmodc2/TXPipe-full-output/"
fname = "".join([path_2_cat, "binned_{}_catalog.hdf5".format(sample)])
hf = h5py.File(fname, mode="r")

## Reduce the tracks to something manageable

First we cross match the IDs in the BPZ and FlexZBoost catalogs to the lens and source samples of Prat et al, 2022 in each tomographic bin. This allows us to avoid having to worry about assigning objects to tomographic bins ourselves. Since the CosmoDC2 catalogs are so large, we first cross match each track in the catalog to a tomographic bin and then we combine all the files associated with the same tomographic bin into a single catalog.

In [None]:
def reduce_tracks(hf, photo_cat, save_to, n_bins=5):
    """Cross-match every photo-z track against the lens tomographic bins.

    For each chunk ("track") yielded by ``photo_cat`` the galaxies whose
    ``galaxy_id`` appears in ``hf["lens"]["bin_<b>"]["id"]`` are selected and
    their IDs and photo-z PDFs are written to
    ``<save_to>/lens_<b>_<track>.npz`` (keys ``ids`` and ``pdf``).

    Parameters
    ----------
    hf : mapping
        Binned catalog, indexed as ``hf["lens"]["bin_<b>"]["id"]``.
    photo_cat : iterable
        Yields mappings with ``'galaxy_id'`` and ``'photoz_pdf'`` entries that
        support boolean-mask indexing.
    save_to : str
        Existing directory that receives the per-track files.
    n_bins : int, optional
        Number of tomographic bins to match against (default 5).

    Notes
    -----
    A track is skipped when its bin-0 file already exists, so an interrupted
    run can be resumed. That file is therefore written last: its presence
    means every other bin of the track has been saved too.
    """
    # Read each bin's ID list once instead of once per track.
    bin_ids = [np.asarray(hf["lens"]["bin_{}".format(b)]["id"])
               for b in range(n_bins)]

    for i, photo_q in enumerate(photo_cat):
        if os.path.isfile(save_to+"/lens_0_{}.npz".format(i)):
            continue

        ids = photo_q['galaxy_id']
        pdfs = photo_q['photoz_pdf']
        selections = [np.isin(ids, ref_ids) for ref_ids in bin_ids]

        # Highest bin first, so the bin-0 "done" marker is written last.
        for b in reversed(range(n_bins)):
            sel = selections[b]
            np.savez(save_to+"/lens_{}_{}".format(b, i),
                     ids=ids[sel], pdf=list(pdfs[sel]))

        # Progress report: track index followed by the object count per bin.
        print(i, *[int(np.count_nonzero(sel)) for sel in selections])

In [None]:
# Write the per-track, per-bin cross-matched files into `save_to`.
reduce_tracks(hf=hf, photo_cat=photo_cat, save_to=save_to)

## Compose tracks into tomo bins

In [33]:
def combine_tracks(load_from, save_to):
    """Merge the per-track files of each lens bin into one file per bin.

    For every tomographic bin ``b`` in 0..4 the files
    ``<load_from>/lens_<b>_<track>.npz`` are read for track = 0, 1, 2, ...
    until the first missing track. Their ``ids`` and ``pdf`` entries are
    concatenated and saved to ``<save_to>/lens_<b>.npz`` under the keys
    ``ids`` and ``pdfs``. Bins whose output file already exists are skipped.

    Parameters
    ----------
    load_from : str
        Directory holding the per-track files.
    save_to : str
        Existing directory that receives the combined files.
    """
    for tomo_bin in range(5):
        tomo_path = save_to+"/lens_{}.npz".format(tomo_bin)
        if os.path.isfile(tomo_path):
            continue

        print(tomo_bin)
        id_chunks = []
        total_pdfs = []
        n_ids = 0
        track = 0
        candidate_file = load_from+"/lens_{}_{}.npz".format(tomo_bin, track)
        while os.path.isfile(candidate_file):
            # NOTE: allow_pickle=True can execute arbitrary code on load; only
            # point this at files produced by this pipeline.
            # The context manager closes the archive after each track.
            with np.load(candidate_file, allow_pickle=True) as data:
                ids = data['ids']
                pdfs = data['pdf']
            id_chunks.append(ids)
            total_pdfs.extend(pdfs)
            n_ids += len(ids)
            print((track, n_ids), end='\r', flush=True)
            track += 1
            candidate_file = load_from+"/lens_{}_{}.npz".format(tomo_bin, track)

        # One concatenation keeps the integer dtype of the galaxy IDs (growing
        # a float array with np.append would cast them) and avoids re-copying
        # the array per track. Empty chunks are dropped so they cannot change
        # the result dtype.
        filled = [chunk for chunk in id_chunks if chunk.size]
        total_ids = np.concatenate(filled) if filled else np.array([])
        np.savez(tomo_path, ids=total_ids, pdfs=total_pdfs)


In [None]:
# Per-track files are read from "<sample>_tracks"; the combined per-bin files
# are written to "<sample>".
load_from = "/pscratch/sd/j/jaimerz/CosmoDC2_catalogs/{}_tracks".format(sample)
save_to = "/pscratch/sd/j/jaimerz/CosmoDC2_catalogs/{}".format(sample)
combine_tracks(load_from=load_from, save_to=save_to)

## Make samples 

Once we have divided the BPZ and FlexZBoost catalogs into tomographic bins, we loop over each object in the catalogs and generate 1000 samples from each individual p(z). This effectively generates 1000 possible catalogs distributed according to the uncertainty in the photometric redshift of each object.

In [53]:
def make_samples(load_from, save_to,
                 normalize=False, n_samples=1_000):
    """Draw redshift samples from the p(z) of every object in each bin.

    For bins 0..4 the file ``<load_from>/<sample>_<i>.npz`` is read (key
    ``pdfs``). Each PDF is turned into a cumulative sum, inverted by linear
    interpolation against the module-level ``zgrid``, and evaluated at
    ``n_samples`` uniform deviates. The result, one row per object, is saved
    under the key ``samples`` to ``<save_to>/<sample>_<i>.npz``. Bins whose
    output file already exists are skipped.

    Parameters
    ----------
    load_from : str
        Directory with the per-bin PDF files.
    save_to : str
        Existing directory that receives the sample files.
    normalize : bool, optional
        If True, divide each PDF by its sum before accumulating it.
    n_samples : int, optional
        Number of uniform deviates, i.e. samples drawn per object.

    Notes
    -----
    Relies on the globals ``sample`` and ``zgrid``. The same uniform deviates
    are reused for every object and every bin.
    NOTE(review): with ``normalize=False`` the cumulative sum is only a proper
    CDF if each stored PDF already sums to one - confirm for the input files.
    """
    rs = np.random.uniform(size=n_samples)
    for i in range(5):
        tomo_name = "/{}_{}.npz".format(sample, i)
        candidate_fname = save_to+tomo_name
        if os.path.isfile(candidate_fname):
            continue

        print(i)
        tomo = np.load(load_from+tomo_name, allow_pickle=True)
        pdfs = np.array(tomo['pdfs'])
        print("Generate samples")
        photo_samples = []
        for j, pdf in enumerate(pdfs):
            # Progress counter, refreshed every 100 objects.
            if j % 100 == 0:
                print(j, end='\r', flush=True)
            if normalize:
                pdf = pdf/np.sum(pdf)
            cdf = np.cumsum(pdf)
            # Inverse CDF: cumulative probability -> redshift.
            inverse_cdf = interp1d(cdf, zgrid, fill_value="extrapolate")
            photo_samples.append(inverse_cdf(rs))
        np.savez(candidate_fname,
                 samples=np.array(photo_samples))

In [None]:
# Per-bin PDFs are read from "<sample>_pdfs"; samples go to "<sample>_samples".
# NOTE(review): combine_tracks is called above with save_to ".../<sample>" and
# writes files named "lens_<bin>.npz", whereas make_samples reads
# "<sample>_<bin>.npz" from ".../<sample>_pdfs" - confirm the files were
# moved/renamed accordingly.
load_from = "/pscratch/sd/j/jaimerz/CosmoDC2_catalogs/{}_pdfs".format(sample)
save_to = "/pscratch/sd/j/jaimerz/CosmoDC2_catalogs/{}_samples".format(sample)
make_samples(load_from=load_from, save_to=save_to)

1
Generate samples
4528700

## Make Histograms from samples

Finally we bin each catalog in the ensemble to generate redshift distributions for the populations as a whole; i.e. the n(z). This gives us an ensemble of 1000 possible n(z) for each sample that captures the photometric redshift uncertainty.

In [None]:
def make_hists(load_from, save_to, grids,
               normalize=False):
    """Histogram every realization of each bin into an n(z).

    For the i-th entry of ``grids`` the array ``samples`` is read from
    ``<load_from>/<sample>_<i>.npz``. Each of its columns is binned on 25
    equal-width bins spanning ``grid[0]``..``grid[1]``. The output file
    ``<save_to>/<sample>_<i>.npz`` stores the bin edges (``zgrid``), the bin
    centres (``znodes``) and the histograms (``photo_hists``, one column per
    realization). Bins whose output file already exists are skipped.

    Parameters
    ----------
    load_from : str
        Directory with the per-bin sample files.
    save_to : str
        Existing directory that receives the histogram files.
    grids : sequence
        One ``[z_min, z_max]`` pair per tomographic bin.
    normalize : bool, optional
        If True, divide each histogram by its total count.

    Notes
    -----
    Relies on the global ``sample`` to build the file names.
    """
    for i, grid in enumerate(grids):
        bin_file = "/{}_{}.npz".format(sample, i)
        candidate_fname = save_to+bin_file
        if os.path.isfile(candidate_fname):
            continue

        photo_samples = np.load(load_from+bin_file)["samples"]
        # 26 edges -> 25 bins; znodes are the bin midpoints.
        zbins = np.linspace(grid[0], grid[1], 26)
        znodes = 0.5*(zbins[1:]+zbins[:-1])

        per_realization = []
        # Each column of the samples array is one realization.
        for j, samples in enumerate(photo_samples.T):
            print(j, end='\r', flush=True)
            counts, _ = np.histogram(samples, bins=zbins, density=False)
            per_realization.append(counts)
        per_realization = np.array(per_realization)

        photo_hists = per_realization.T
        if normalize:
            photo_hists_norms = np.array([np.sum(hist) for hist in per_realization])
            photo_hists = photo_hists/photo_hists_norms
        np.savez(candidate_fname,
                 zgrid=zbins,
                 znodes=znodes,
                 photo_hists=photo_hists)

In [None]:
# Redshift range [z_min, z_max] of the histogram grid for each tomographic bin.
#grids_lens = [[0.0, 0.6],
#        [0.0, 1.0],
#        [0.3, 1.0],
#        [0.3, 1.3],
#        [0.5, 1.5]]

grids_shear = [[0.0, 2.0],
         [0.0, 2.0],
         [0.0, 2.0],
         [0.0, 2.0],
         [0.0, 3.0]]

# `grids` was never defined, so the call below raised a NameError. Use the
# shear grids, the only set that is not commented out; switch to grids_lens
# when processing the lens sample.
grids = grids_shear

load_from = "/pscratch/sd/j/jaimerz/CosmoDC2_catalogs/{}_samples".format(sample)
save_to = "/pscratch/sd/j/jaimerz/CosmoDC2_catalogs/{}_hists".format(sample)
make_hists(load_from, save_to, grids)

## Plotting

In [None]:
# Histogram files of the five tomographic bins, in bin order.
# NOTE(review): the directory is hard-coded to "lens_hists" while the file
# name uses `sample` - confirm this matches where make_hists wrote its output.
hist_files = [
    "/pscratch/sd/j/jaimerz/CosmoDC2_catalogs/lens_hists/{}_0.npz",
    "/pscratch/sd/j/jaimerz/CosmoDC2_catalogs/lens_hists/{}_1.npz",
    "/pscratch/sd/j/jaimerz/CosmoDC2_catalogs/lens_hists/{}_2.npz",
    "/pscratch/sd/j/jaimerz/CosmoDC2_catalogs/lens_hists/{}_3.npz",
    "/pscratch/sd/j/jaimerz/CosmoDC2_catalogs/lens_hists/{}_4.npz",
]
lens_0, lens_1, lens_2, lens_3, lens_4 = [
    np.load(template.format(sample), allow_pickle=True)
    for template in hist_files
]

# Label -> loaded archive, in the order the bins are plotted.
lenses = dict(zip(
    ['lens_0', 'lens_1', 'lens_2', 'lens_3', 'lens_4'],
    [lens_0, lens_1, lens_2, lens_3, lens_4],
))

In [None]:
# Plot every n(z) realization of each tomographic bin as a faint line, with
# the mean over realizations drawn on top in the same colour.
#fig = plt.figure(figsize=(15,8))
colors = ['skyblue', 'teal', 'blue', 'purple', 'darkviolet']
for i, (key, tomo) in enumerate(lenses.items()):
    znodes = tomo['znodes']
    photo_hists = tomo['photo_hists']
    # Ratio of histogram bin width to PDF grid spacing; only needed by the
    # disabled spectroscopic overlay below.
    photo_dz = np.diff(znodes).mean()
    spec_dz = np.diff(zgrid).mean()
    alpha = photo_dz/spec_dz
    #plt.plot(tomo['spec_znodes'], alpha*tomo['spec_hist'], 'k-')
    plt.plot(znodes, photo_hists, '-', color=colors[i], alpha=0.01)
    plt.plot(znodes, photo_hists.mean(axis=1), '-', color=colors[i], label=key)

#nzsum = np.sum(fz_df['pdf'])
#plt.plot(zgrid,nzsum,c='b',label=f"i<{magcut} sum p(z)")

plt.xlim([0.0, 3.0])
plt.title("CosmoDC2")
plt.xlabel("redshift", fontsize=18)
plt.ylabel("N(z)", fontsize=18)
plt.legend(loc='upper right', fontsize=16)
plt.show()