# Estimating the covariance on SkySim5000 and using it in a Gaussian likelihood

(derived from <a href='SkySim500_firstcheck.ipynb'>SkySim500_firstcheck</a> notebook)

Author : Michel Aguena

In [None]:
import numpy as np
import pyccl as ccl
import matplotlib.pyplot as plt
import scipy.integrate
import astropy.units as u
from astropy.table import Table
import GCRCatalogs

%matplotlib inline

In [None]:
# Load the 'skysim5000_v1.1.1_small' catalog through the GCRCatalogs reader.
skysim_cat = GCRCatalogs.load_catalog('skysim5000_v1.1.1_small')

In [None]:
# Cosmology attached to the catalog; its h, Om0, Ob0, sigma8 and n_s are reused below.
cosmo_ss  = skysim_cat.cosmology

In [None]:
# Display the catalog cosmology.
cosmo_ss

## Extract DM haloes from the catalog in a given mass and redshift range. 

In [None]:
%%time
# Selection cuts applied while reading the catalog
mmin_extract = 1.e12 # Msun (M_fof)
zmin_extract, zmax_extract = 0., 1.0

# Columns to read, and row filters (central objects above the mass cut, inside the redshift window)
columns = ['halo_mass','hostHaloMass','redshift','ra', 'dec', 'halo_id',
           'baseDC2/sod_halo_mass','baseDC2/sod_halo_radius']
selection = [f'halo_mass > {mmin_extract}','is_central==True',
             f'redshift>{zmin_extract}', f'redshift<{zmax_extract}']
dm_halos = Table(skysim_cat.get_quantities(columns, filters=selection))

# NB: SkySim5000 M200c masses are in units of Msun/h, so divide by h to get Msun
dm_halos['m200c'] = dm_halos['baseDC2/sod_halo_mass']/cosmo_ss.h

In [None]:
# Report how many halos survived the extraction cuts.
print(f'There are {len(dm_halos):,} halos in this mass (Mfof) and redshift range')

In [None]:
# Sky footprint of the extracted halos
fig, ax = plt.subplots()
ax.scatter(dm_halos['ra'], dm_halos['dec'], marker='.', s=0.001)
ax.set_xlabel('ra [deg]')
ax.set_ylabel('dec [deg]')

## Define a redshift and mass range for the comparison of data and prediction, and filter the data accordingly

In [None]:
# Keep only halos above the M200c threshold used for the data/prediction comparison
m200c_cut = dm_halos['m200c']>1e13
dm_halos = dm_halos[m200c_cut]
print(f'There are {len(dm_halos):,} halos with M200c > 1e13 Msun')

In [None]:
# Number of bins along log10(M200c) and redshift
n_mbins, n_zbins = 8, 5

# 2D histogram of the halos; nc_meas has shape (n_mbins, n_zbins)
logm200c, redshift = np.log10(dm_halos['m200c']), dm_halos['redshift']
nc_meas, logmass_bins, z_bins = np.histogram2d(
    logm200c, redshift, bins=(n_mbins, n_zbins))

# Bin centers = midpoints of consecutive edges
mid_logmass, mid_z = (0.5*(edges[:-1]+edges[1:])
                      for edges in (logmass_bins, z_bins))

In [None]:
# Inspect the redshift bin centers and edges.
mid_z, z_bins

Plot number counts

In [None]:
# Measured counts versus mass, one series per redshift bin (columns of nc_meas)
fig, ax = plt.subplots(figsize=(10,6))
z_labels = [f'z={z:.2f}' for z in mid_z]
ax.plot(10**mid_logmass, nc_meas, marker='*', ls='', label=z_labels)
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('M200,c [Msun]', size=12)
ax.set_ylabel('Number of halos', size=12)
ax.legend()

# Measure covariance of clusters

Assign a HEALPix pixel to each halo and inspect the per-pixel counts (looking for outliers) to decide which pixels will be used for the covariance computation

In [None]:
import healpy as hp

# HEALPix resolution used to split the footprint into sub-regions
nside = 64
# Pixel index of every halo (ra/dec passed as lon/lat in degrees via lonlat=True)
dm_halos['pixel'] = hp.ang2pix(nside, dm_halos['ra'], dm_halos['dec'], lonlat=True)


f, axes = plt.subplots(1, 2, figsize=(20, 6))
# Number of halos per pixel over the full sky (one integer-wide bin per pixel index)
map_ = np.histogram(dm_halos['pixel'], np.arange(hp.nside2npix(nside)+1))[0]
# Cast to float so that empty pixels can be flagged with NaN
map_ = np.array(map_, dtype=float)
map_[map_==0] = np.nan

#plot histogram of counts
axes[0].hist(map_)
axes[0].set_xlabel('# of clusters in pixels')

# plot map
# NOTE(review): hold=True presumably makes healpy draw into the current axes (the
# right-hand panel); the lat/lon window is hard-coded for this footprint.
hp.cartview(map_, latra=[-50, -30], lonra=[55, 75], 
            hold=True, cbar=False)

# Last axes of the figure, presumably the one created by healpy; re-enable its frame
hp_ax = f.axes[-1]
hp_ax.axis('on')

#ax.patch.set_alpha(0.)
hp_ax.set_xlabel('RA')
hp_ax.set_ylabel('DEC')
hp_ax.grid(color='.7')
hp_ax.set_xticklabels(-hp_ax.get_xticks()) # RA values in plot are inverted


plt.show()

Define pixels to be used in covariance computation

In [None]:
# Indices of pixels holding more than 200 halos (NaN-flagged empty pixels compare False)
pixels_for_cov = np.flatnonzero(map_>200)

Bin the data in mass, redshift and pixels

In [None]:
# Per-pixel (mass, redshift) counts, flattened mass-major; result has one row per
# (mass, z) bin and one column per pixel.
# NOTE(review): `pixeled_nc` is reassigned in the following cell with the
# (redshift, mass) ordering, so this version is overwritten.
logm200c_all = np.log10(dm_halos['m200c'])
counts_per_pixel = []
for pix in pixels_for_cov:
    in_pixel = dm_halos['pixel']==pix
    counts_2d = np.histogram2d(
        logm200c_all[in_pixel],
        dm_halos['redshift'][in_pixel],
        bins=(logmass_bins, z_bins)
    )[0]
    counts_per_pixel.append(counts_2d.flatten())
pixeled_nc = np.transpose(counts_per_pixel)

In [None]:
# Per-pixel (redshift, mass) counts, flattened redshift-major: row index = iz*n_mbins + im,
# one column per pixel. This is the ordering assumed when the covariance diagonal is
# reshaped to (n_zbins, n_mbins) further down.
logm200c_all = np.log10(dm_halos['m200c'])
counts_per_pixel = []
for pix in pixels_for_cov:
    in_pixel = dm_halos['pixel']==pix
    counts_2d = np.histogram2d(
        dm_halos['redshift'][in_pixel],
        logm200c_all[in_pixel],
        bins=(z_bins, logmass_bins)
    )[0]
    counts_per_pixel.append(counts_2d.flatten())
pixeled_nc = np.transpose(counts_per_pixel)

Compute covariance and correlation matrices
\begin{equation}
Corr_{ij} = \frac{Cov_{ij}}{\sqrt{Cov_{ii}Cov_{jj}}}
\end{equation}

In [None]:
# Ratio of the catalog area to one pixel area (assumes `sky_area` is in deg^2 — TODO confirm)
n_pix_in_footprint = skysim_cat.sky_area/hp.nside2pixarea(nside, degrees=True)
# Pixel-to-pixel covariance of the binned counts, rescaled to the full footprint
cov = np.cov(pixeled_nc)*n_pix_in_footprint

In [None]:
# Correlation matrix of the binned counts across pixels (rows of pixeled_nc are the variables).
corr = np.corrcoef(pixeled_nc)

See how strongly correlated the different mass bins are:

In [None]:
# Grid of n_zbins x n_zbins panels with no spacing; each panel shows the
# (n_mbins x n_mbins) mass-mass correlation block for one pair of redshift bins.
f, axes = plt.subplots(
    n_zbins, n_zbins, sharex=True, sharey=True,
    figsize=(10,10), gridspec_kw={'hspace': 0., 'wspace': 0.,
                                 'right':.9, 'top':.9})
# Rows are indexed from the bottom (axes[::-1]) so redshift bin i increases upwards.
# The block slicing assumes corr is ordered redshift-major (index = iz*n_mbins + im).
for i in range(n_zbins):
    for j in range(n_zbins):
        colors = axes[::-1][i][j].pcolor(
            corr[n_mbins*i:n_mbins*(i+1), n_mbins*j:n_mbins*(j+1)],
            vmin=-1, vmax=1, cmap='bwr')
# Label the bottom row with the redshift range of each column
for ax, z1, z2 in zip(axes[-1], z_bins, z_bins[1:]):
    ax.set_xticklabels([])
    ax.set_xlabel(f'Mass\n{z1:.2f}<z<{z2:.2f}')
# Label the first column (bottom to top) with the redshift range of each row
for ax, z1, z2 in zip(axes[::-1,0], z_bins, z_bins[1:]):
    ax.set_yticklabels([])
    ax.set_ylabel(f'Mass\n{z1:.2f}<z<{z2:.2f}')
# One tick at the center of every mass-bin cell
for ax in axes.flatten():
    ax.set_xticks(np.arange(n_mbins)+.5)
    ax.set_yticks(np.arange(n_mbins)+.5)
# Dedicated axes for the colorbar, to the right of the grid
cb_ax = plt.axes([.92, .1, .02, .8])
#colors = plt.pcolor(corr)
# All panels share vmin/vmax/cmap, so the last mappable is used for the colorbar
plt.colorbar(colors, cax=cb_ax, label='Corr')
#plt.xlabel('mass bin')
#plt.ylabel('mass bin')

Plot number counts with the square root of the covariance diagonal (i.e. the standard deviation) as error bars

In [None]:
# Measured counts with 1-sigma errors taken from the covariance diagonal
fig, ax = plt.subplots(figsize=(10,6))
shift = .005 # Add shift to see all points
# Diagonal errors arranged as (n_zbins, n_mbins), matching the rows of nc_meas.T
nc_err = np.sqrt(np.diag(cov)).reshape(n_zbins, n_mbins)
for i, (nc, err, z) in enumerate(zip(nc_meas.T, nc_err, mid_z)):
    ax.errorbar(10**(mid_logmass+i*shift), nc, err, ls='', capsize=3, label=f'z={z:.2f}')
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('M200,c [Msun]', size=12)
ax.set_ylabel('Number of halos', size=12)
ax.legend()

## Prediction using CCL and the Tinker08 and Bocquet16 mass functions

In [None]:
# Translate the SkySim cosmology into CCL parameters
omega_b = cosmo_ss.Ob0
ccl_params = dict(
    Omega_c=cosmo_ss.Om0-omega_b,  # cold dark matter = total matter minus baryons
    Omega_b=omega_b,
    h=cosmo_ss.h,
    sigma8=cosmo_ss.sigma8,
    n_s=cosmo_ss.n_s,
    Neff=3.04,  # NOTE(review): hard-coded rather than read from cosmo_ss — confirm
)
cosmo = ccl.Cosmology(**ccl_params)

print(cosmo)


### Differential comoving volume

In [None]:
def dV_over_dOmega_dz(z):
    """Differential comoving volume per unit solid angle and unit redshift.

    Evaluates (1+z)^2 * D_A(z)^2 * (c/H0) / E(z) with the module-level CCL
    cosmology `cosmo`. Presumably in Mpc^3/sr (it is compared against
    astropy's differential_comoving_volume in the next cell).
    """
    scale_factor = 1./(1. + z)
    d_ang = ccl.background.angular_diameter_distance(cosmo, scale_factor)
    e_of_z = ccl.background.h_over_h0(cosmo, scale_factor)
    # (1+z)^2 D_A^2 is the squared transverse comoving distance
    d_comoving_sq = ((1.+z)**2)*(d_ang**2)
    return d_comoving_sq*ccl.physical_constants.CLIGHT_HMPC/cosmo['h']/e_of_z

In [None]:
# Sanity check: compare the CCL-based differential comoving volume with the one
# computed by the catalog cosmology object at z=0.3 (the two values should agree).
dV_over_dOmega_dz(0.3), cosmo_ss.differential_comoving_volume(0.3).value 

### CCL mass functions

In [None]:
# Single M200c mass definition shared by both mass functions. The original cell
# bound `hmd_200c` twice; the second binding (MassDef200c) is the one that was in
# effect whenever either function was called, so it is the one kept here.
hmd_200c = ccl.halos.MassDef200c()

# Build the mass-function objects once. They are evaluated thousands of times
# inside the dblquad integrations, so re-instantiating them per call is wasted work.
_hmf_tinker08 = ccl.halos.MassFuncTinker08(cosmo, mass_def=hmd_200c)
_hmf_bocquet16 = ccl.halos.MassFuncBocquet16(cosmo, mass_def=hmd_200c)

def tinker08(logm, z):
    """Tinker08 halo mass function at log10-mass `logm` and redshift `z`.

    Evaluated with the module-level CCL cosmology `cosmo` at scale factor 1/(1+z).
    """
    mass = 10**(logm)
    nm = _hmf_tinker08.get_mass_function(cosmo, mass, 1./(1+z))
    return nm # dn/dlog10M

def bocquet16(logm, z):
    """Bocquet16 halo mass function at log10-mass `logm` and redshift `z`.

    Evaluated with the module-level CCL cosmology `cosmo` at scale factor 1/(1+z).
    """
    mass = 10**(logm)
    nm = _hmf_bocquet16.get_mass_function(cosmo, mass, 1./(1+z))
    return nm # dn/dlog10M

In [None]:
def integrand_tinker08(logm,z):
    """Tinker08 mass function times the differential comoving volume at (logm, z)."""
    dn_dlogm = tinker08(logm, z)
    dvol = dV_over_dOmega_dz(z)
    return dn_dlogm*dvol

def integrand_bocquet16(logm,z):
    """Bocquet16 mass function times the differential comoving volume at (logm, z)."""
    dn_dlogm = bocquet16(logm, z)
    dvol = dV_over_dOmega_dz(z)
    return dn_dlogm*dvol

### Solid angle of the `small` catalog = 50 deg2

Need to check if it is exactly 50 deg2 or if this has been rounded

In [None]:
# Catalog footprint converted with the factor (pi/180)^2, i.e. deg^2 -> steradian
# (assumes `sky_area` is given in deg^2 — TODO confirm).
DeltaOmega = skysim_cat.sky_area * np.pi**2/180**2

In [None]:
%%time
# Tinker08 prediction: integrate the mass function times the volume element over
# every (mass, redshift) bin returned by np.histogram2d.
# Result has shape (n_mbins, n_zbins), per unit solid angle.
def _t08_bin_integral(z_lo, z_hi, logm_lo, logm_hi):
    """Integral of integrand_tinker08 over one (redshift, log-mass) bin."""
    value, _abserr = scipy.integrate.dblquad(
        integrand_tinker08, z_lo, z_hi,
        lambda z: logm_lo, lambda z: logm_hi,
        epsabs=1.e-4, epsrel=1.e-4)
    return value

N_predicted_T08 = np.array([
    [_t08_bin_integral(z_lo, z_hi, logm_lo, logm_hi)
     for z_lo, z_hi in zip(z_bins[:-1], z_bins[1:])]
    for logm_lo, logm_hi in zip(logmass_bins[:-1], logmass_bins[1:])
])

In [None]:
%%time
# Bocquet16 prediction: integrate the mass function times the volume element over
# every (mass, redshift) bin returned by np.histogram2d.
# Result has shape (n_mbins, n_zbins), per unit solid angle.
def _b16_bin_integral(z_lo, z_hi, logm_lo, logm_hi):
    """Integral of integrand_bocquet16 over one (redshift, log-mass) bin."""
    value, _abserr = scipy.integrate.dblquad(
        integrand_bocquet16, z_lo, z_hi,
        lambda z: logm_lo, lambda z: logm_hi,
        epsabs=1.e-4, epsrel=1.e-4)
    return value

N_predicted_B16  = np.array([
    [_b16_bin_integral(z_lo, z_hi, logm_lo, logm_hi)
     for z_lo, z_hi in zip(z_bins[:-1], z_bins[1:])]
    for logm_lo, logm_hi in zip(logmass_bins[:-1], logmass_bins[1:])
])

## Plot measured versus predicted number of haloes

In [None]:
# One panel per redshift bin; the last panel of the 2x3 grid is reserved for the legend
# (this layout therefore assumes n_zbins <= 5).
f, axes = plt.subplots(2, 3, figsize=(10,6), sharex=True)

# 1-sigma errors from the covariance diagonal, arranged as (n_zbins, n_mbins)
nc_err = np.sqrt(np.diag(cov)).reshape(n_zbins, n_mbins)

for i in range(n_zbins):
    ax = axes.flatten()[i]
    ax.errorbar(
        10**mid_logmass, nc_meas[:,i],
        nc_err[i],
        ls='', capsize=3,
        label=f'SkySim5000, {skysim_cat.sky_area:.2f} deg2 field')
    # Predictions are per steradian: scale by the footprint solid angle
    ax.plot(10**mid_logmass, N_predicted_T08[:,i]*DeltaOmega, label='T08, CCL')
    ax.plot(10**mid_logmass, N_predicted_B16[:,i]*DeltaOmega, label='B16, CCL')
    ax.set_xscale('log')
    ax.set_yscale('log')
for ax in axes[-1,:]:
    # Raw string: '\o' is not a valid Python escape sequence
    ax.set_xlabel(r'$M_{200c}$ [M$_\odot$]', size=14)
for ax in axes[:,0]:
    ax.set_ylabel('Number of haloes', size=14)

# Collect legend entries through the public API. errorbar() attaches its label to
# the container (its child artists are labelled '_nolegend_'), so scanning
# ax.lines/ax.collections for labels would silently drop the SkySim5000 entry.
leg_handles, leg_labels = axes[0][0].get_legend_handles_labels()
axes[-1][-1].legend(leg_handles, leg_labels)
axes[-1][-1].axis('off')

# Gaussian likelihood

\begin{equation}
\mathcal{L} = \frac{1}{\sqrt{det(2\pi Cov)}}\exp{\left[-\frac{1}{2}(Obs-Theo)Cov^{-1}(Obs-Theo)^T\right]}
\end{equation}

In [None]:
def lnlike(obs, theo, cov):
    """Gaussian log-likelihood, up to the constant -0.5*N*ln(2*pi).

    Parameters
    ----------
    obs, theo : array_like, shape (N,)
        Observed and predicted data vectors, in the same bin ordering as `cov`.
    cov : array_like, shape (N, N)
        Covariance matrix of the data vector.

    Returns
    -------
    float
        -0.5 * (obs-theo) Cov^{-1} (obs-theo)^T - 0.5 * ln(det(Cov)).
        NaN if the determinant of `cov` is negative.

    Raises
    ------
    numpy.linalg.LinAlgError
        If `cov` is singular.
    """
    diff = np.asarray(obs, dtype=float) - np.asarray(theo, dtype=float)
    # Solve Cov x = diff instead of forming the explicit inverse (better conditioned)
    chi2 = np.dot(diff, np.linalg.solve(cov, diff))
    # slogdet avoids the overflow/underflow of det() for large matrices, where
    # log(det) would otherwise evaluate to +/-inf
    sign, logdet = np.linalg.slogdet(cov)
    if sign <= 0:
        # log of a non-positive determinant is undefined
        logdet = np.nan
    return -0.5*chi2 - 0.5*logdet

In [None]:
# `cov` is estimated from `pixeled_nc`, whose bins are flattened redshift-major
# (index = iz*n_mbins + im). nc_meas and N_predicted_* have shape (n_mbins, n_zbins),
# so they must be transposed before flattening to share that ordering; a plain
# .flatten() would be mass-major and mismatch the covariance.
obs_vec = nc_meas.T.flatten()
theo_T08 = N_predicted_T08.T.flatten()*DeltaOmega
theo_B16 = N_predicted_B16.T.flatten()*DeltaOmega
# Covariance with the off-diagonal terms dropped
cov_diag = np.diag(np.diag(cov))

print(f'''
Using full covariance
ln(like)[T08]: {lnlike(obs_vec, theo_T08, cov)}
ln(like)[B16]: {lnlike(obs_vec, theo_B16, cov)}

Using only diagonal
ln(like)[T08]: {lnlike(obs_vec, theo_T08, cov_diag)}
ln(like)[B16]: {lnlike(obs_vec, theo_B16, cov_diag)}
''')