In [None]:
import os
from pprint import pprint
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from IPython.display import Image
import ceci
import h5py
import yaml

Make sure to change the path in the next cell so that it points to your own TXPipe directory. See the examples for IN2P3 and NERSC below.

In [None]:
# Location of the local TXPipe checkout. Edit this to match your own setup and
# leave exactly one `my_txpipe_dir` assignment uncommented.

# user specific paths -- IN2P3 example
# my_txpipe_dir = "/pbs/home/m/mricci/throng_mricci/desc/TXPipe"
my_txpipe_dir = "/pbs/throng/lsst/users/ccombet/TXPipe"

# user specific paths -- NERSC example
# my_txpipe_dir = "/pscratch/sd/a/avestruz/TXPipe"

# Work from the TXPipe directory: the data, config and pipeline paths used in
# the rest of this notebook are written relative to it.
os.chdir(my_txpipe_dir)

# NOTE(review): imported only after the chdir above -- presumably so that the
# package is picked up from the checkout; confirm before moving this import
# into the import cell at the top.
import txpipe

# Let's start working with the 1deg2 data file on Jupyter

First we will do some runs on the 1 deg^2 example data set with around 80k galaxies. This is small enough that we can do it all in jupyter.

The data set, which is based on CosmoDC2, contains pre-computed photo-z and a RedMapper cluster catalog for the field.

In [None]:
# Stage class used throughout this section.
binning_stage_cls = txpipe.extensions.CLClusterBinningRedshiftRichness

print("Options for this pipeline and their defaults (this may be override by config file):")
print(binning_stage_cls.config_options)

# Inputs, outputs and configuration handed to the stage, gathered in one place.
stage_kwargs = dict(
    # Initial cluster catalog - RAs, Decs, richness, redshift, etc.
    cluster_catalog="./data/example/inputs/cluster_catalog.hdf5",
    # File this stage writes its output to
    cluster_catalog_tomography="./data/example/cluster_catalog_tomography.hdf5",
    # All the options for this stage live in this file; they can be overridden here.
    config="examples/cosmodc2/config-1deg2-CL.yml",
)

pip_stage = binning_stage_cls.make_stage(**stage_kwargs)

In [None]:
# Execute the stage in this kernel on the inputs configured above, then finalize it.
# NOTE(review): finalize() presumably puts the output file in its final place
# before it is read further down -- confirm against the ceci stage API.
pip_stage.run()
pip_stage.finalize()

In [None]:
# Show the configuration held by the stage object after the run.
print("Actual options used for this pipeline (as defined in config file or default):")
print(pip_stage.config)

In [None]:
# Path of the input cluster catalog, as stored in the stage configuration.
pip_stage.config['cluster_catalog']

In [None]:
# Path of the binned output file, as stored in the stage configuration.
pip_stage.config['cluster_catalog_tomography']

## Open cluster catalog input and compare to binning outputs

### Open cluster catalog input 

In [None]:
# Take the input catalog path from the stage configuration rather than
# repeating the literal path given to make_stage().
filename_in = pip_stage.config['cluster_catalog']
print(filename_in)

In [None]:
# Open the input catalog read-only. The handle is kept open on purpose:
# `dset_in`, created from it below, is read again in the plotting cells.
f_in = h5py.File(filename_in, "r")

In [None]:
# List the top-level entries of the input file.
print(f_in.keys())

In [None]:
# The 'clusters' entry holds the catalog columns read below
# (e.g. 'redshift' and 'richness').
dset_in = f_in['clusters']

In [None]:
# Names of the members stored under 'clusters'.
cols = list(dset_in)
print(cols)

In [None]:
# Richness versus redshift for every entry of the input catalog (log-scaled y axis).
cluster_redshift = dset_in['redshift'][()]
cluster_richness = dset_in['richness'][()]
plt.semilogy(cluster_redshift, cluster_richness, '.', alpha=1)

plt.xlabel('redshift')
plt.ylabel('richness')

### Open binning output

In [None]:
# Take the output path from the stage configuration instead of rebuilding it by hand.
filename_out = pip_stage.config['cluster_catalog_tomography']
print (filename_out)

In [None]:
# Open the binning output read-only. The handle is kept open because
# `dset_out`, created from it below, is read again in later cells.
f_out = h5py.File(filename_out, "r")

In [None]:
# List the top-level entries of the output file.
print(f_out.keys())

In [None]:
# 'provenance' entry of the output file; not read again in the cells of this section.
dat_out = f_out['provenance']
# 'cluster_bin' holds one sub-group per bin, named 'bin_zbin_<i>_richbin_<j>'
# (see how it is indexed in the cells below).
dset_out = f_out['cluster_bin']

In [None]:
# Names of the per-bin groups written by the stage.
print(dset_out.keys())

In [None]:
# For each output bin, print its name, the attributes stored on its group and
# the number of entries in its 'redshift' column.
# A plain loop is used because the prints are pure side effects: there is no
# list worth building, and no trailing `;` is needed to hide it.
for bin_name in dset_out.keys():
    bin_group = dset_out[bin_name]
    print (bin_name, dict(bin_group.attrs), bin_group['redshift'][:].size)

In [None]:
# Members stored for a single bin, using the first bin as an example.
print (list(dset_out['bin_zbin_0_richbin_0']))

In [None]:
# Cross-check the number of groups in the output against the number of bins
# implied by the configured edges (N edges delimit N - 1 bins).
n_output_bins = len(dset_out.keys())
n_redshift_bins = len(pip_stage.config.zedge) - 1
n_richness_bins = len(pip_stage.config.richedge) - 1

print('The file contains', n_output_bins, 'keys corresponding to',
      n_redshift_bins, ' redshift bins times',
      n_richness_bins, 'richness bins')

### Compare the two 

In [None]:
# Overlay the binned output on the input catalog in the redshift-richness
# plane, to check that every bin lands inside the edges it is named after.
z_edges = pip_stage.config.zedge
rich_edges = pip_stage.config.richedge

# plot data from input catalog
plt.semilogy(dset_in['redshift'][()], dset_in['richness'][()], 'k.', alpha=1)
plt.xlabel('redshift')
plt.ylabel('richness')

# plot bin limits as defined in the config file
for z_edge in z_edges:
    plt.axvline(z_edge, linestyle='dashed', color='black')
for rich_edge in rich_edges:
    plt.axhline(rich_edge, linestyle='dotted', color='black')

# overplot data from output file to make sure the bins are ordered correctly
markers = ['s', 'o', 'D', 'P', '^']

for i in range(len(z_edges) - 1):
    for j in range(len(rich_edges) - 1):
        # Build the group name once and reuse it for the lookup and the label.
        bin_name = 'bin_zbin_' + str(i) + '_richbin_' + str(j)
        bin_group = dset_out[bin_name]
        plt.scatter(
            bin_group['redshift'][:],
            bin_group['richness'][:],
            # One marker per richness bin; wrap around the list so that a
            # configuration with more richness bins than markers does not
            # raise IndexError.
            marker=markers[j % len(markers)],
            label=bin_name,
        )

# Draw the legend once, after every bin has been plotted; the trailing `;`
# keeps the Legend repr out of the cell output.
plt.legend(fontsize='x-small');

# Now let's do the same using the pipeline approach

Here we will use the 20deg2 files, but the 1deg2 files can be used as well (just replace `20deg2` with `1deg2` in the file names).

### Launching a pipeline

Let's have a look at the submission script for this pipeline:
- to work at CCin2p3 we can use: `examples/cosmodc2/Cluster_pipelines/20deg2-in2p3.sub`
- to work at NERSC we can use: `examples/cosmodc2/Cluster_pipelines/20deg2-nersc.sub`

In [None]:
# Print the IN2P3 submission script (path relative to the TXPipe directory).
! cat examples/cosmodc2/Cluster_pipelines/20deg2-in2p3.sub

In [None]:
# Print the NERSC submission script (path relative to the TXPipe directory).
! cat examples/cosmodc2/Cluster_pipelines/20deg2-nersc.sub

**The command below will submit a job and run the pipeline**



> **In a terminal, navigate to your TXPipe directory on IN2P3 and run**:
> ```
> sbatch examples/cosmodc2/Cluster_pipelines/20deg2-in2p3.sub
> ```


> **If you are at NERSC, you will instead run**:
> ```
> sbatch examples/cosmodc2/Cluster_pipelines/20deg2-nersc.sub
> ```

## Dry-run the pipeline and produce a flowchart plot

The pipeline file used here has all the other stages commented out, so that the flowchart only highlights the `CLClusterBinning` stage.
The config file is the same as before. The pipeline file is written for IN2P3, but this does not matter because it is only a dry-run.

In [None]:
# Read the appropriate pipeline configuration, and ask for a flow-chart.
pipeline_file = "examples/cosmodc2/Cluster_pipelines/CLClusterBinning-20deg2-CL.yml"
flowchart_file = "CLClusterBinning.png"


pipeline_config = ceci.Pipeline.build_config(
    pipeline_file,
    flow_chart=flowchart_file,
    dry_run=True
)

# Run the flow-chart pipeline
ceci.run_pipeline(pipeline_config)

In [None]:
# Display the image file that was requested as `flow_chart` in the dry run above.
Image(flowchart_file)

## Look at the results

### Open the corresponding pipeline file to load correct input/output file names

In [None]:
# Parse the pipeline YAML file into plain Python objects; its 'inputs',
# 'output_dir' and 'config' entries are read in the cells below.
with open(pipeline_file, 'r') as file:
    pipeline_content = yaml.safe_load(file)


In [None]:
# open input cluster catalog
# The path comes from the 'inputs' section of the pipeline file. This rebinds
# `filename_in`, `f_in` and `dset_in` from the first part of the notebook, so
# from here on they refer to the pipeline's input catalog.
filename_in = pipeline_content['inputs']['cluster_catalog']
print(filename_in)
f_in = h5py.File(filename_in, "r")
dset_in = f_in['clusters']

In [None]:
# Open the binning output found in the pipeline's output directory.
# This rebinds `filename_out`, `f_out`, `dat_out` and `dset_out` from the
# first part of the notebook.
filename_out = f"{pipeline_content['output_dir']}/cluster_catalog_tomography.hdf5"
print (filename_out)
f_out = h5py.File(filename_out, "r")
dat_out, dset_out = f_out['provenance'], f_out['cluster_bin']

### Open the corresponding config file to load config parameters

In [None]:
# `pipeline_content` has already been parsed from YAML, so its 'config' entry
# is used directly as the path of the stage configuration file. Passing it
# through yaml.safe_load() again would re-parse the path itself as a YAML
# document, which can alter or reject paths containing YAML-significant
# characters (e.g. ': ' or ' #').
with open(pipeline_content['config'], 'r') as file:
    config_content = yaml.safe_load(file)

In [None]:
# Keep only the section for the CLClusterBinningRedshiftRichness stage; its
# 'zedge' and 'richedge' entries give the bin edges used in the plot below.
config_bin_info = config_content['CLClusterBinningRedshiftRichness']

### Plot results

In [None]:
# Overlay the pipeline's binned output on its input catalog in the
# redshift-richness plane, to check that every bin lands inside the edges it
# is named after.
z_edges = config_bin_info['zedge']
rich_edges = config_bin_info['richedge']

# plot data from input catalog
plt.semilogy(dset_in['redshift'][()], dset_in['richness'][()], 'k.', alpha=1)
plt.xlabel('redshift')
plt.ylabel('richness')

# plot bin limits as defined in the config file
for z_edge in z_edges:
    plt.axvline(z_edge, linestyle='dashed', color='black')
for rich_edge in rich_edges:
    plt.axhline(rich_edge, linestyle='dotted', color='black')

# overplot data from output file to make sure the bins are ordered correctly
markers = ['s', 'o', 'D', 'P', '^']

for i in range(len(z_edges) - 1):
    for j in range(len(rich_edges) - 1):
        # Build the group name once and reuse it for the lookup and the label.
        bin_name = 'bin_zbin_' + str(i) + '_richbin_' + str(j)
        bin_group = dset_out[bin_name]
        plt.scatter(
            bin_group['redshift'][:],
            bin_group['richness'][:],
            # One marker per richness bin; wrap around the list so that a
            # configuration with more richness bins than markers does not
            # raise IndexError.
            marker=markers[j % len(markers)],
            label=bin_name,
        )

# Draw the legend once, after every bin has been plotted; the trailing `;`
# keeps the Legend repr out of the cell output.
plt.legend(fontsize='x-small');