## **06.a MagLim sample**
#### Authors: **Amanda Farias (afariassantos2@gmail.com), Iago Lopes (iagolops2012@gmail.com)**,
#### Creation date: **09/09/2024**,  
#### Last Verified to Run: **11/19/2024** (by @iagolops)

The objective of this notebook is to optimize the MagLim sample based on our previous results. To do so, we look at several metrics for different parameters of the selector. For more details about the definition of MagLim, visit [this paper](https://arxiv.org/abs/2011.03411).
$~$
##### Logistics: This notebook is intended to be run through the Jupyter Lab NERSC interface available in __[Jupyter nersc](https://jupyter.nersc.gov/)__ .

<div class="alert alert-block alert-danger">
<b>Attention:</b> For the final notebook, we will use the test dataset from notebook 2; for now, let's use the output previously evaluated by Iago. </div>

### Importing packages used in this notebook

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import tables_io
import qp
import matplotlib.pyplot as plt

from scipy.interpolate import UnivariateSpline
from tqdm import tqdm


from rail.core.data import (
    TableHandle,
    PqHandle,
    ModelHandle,
    QPHandle,
    DataHandle,
    Hdf5Handle,
    QPOrTableHandle
)

from rail.estimation.algos.naive_stack import NaiveStackSummarizer
from rail.estimation.algos.true_nz import TrueNZHistogrammer

from rail.evaluation.dist_to_dist_evaluator import DistToDistEvaluator
from rail.evaluation.dist_to_point_evaluator import DistToPointEvaluator
from rail.evaluation.point_to_point_evaluator import PointToPointEvaluator
from rail.evaluation.single_evaluator import SingleEvaluator
from rail.core.stage import RailStage
from Metrics import plot_old_valid, plot_metrics, plot_scatter, plot_pit_qq

from Plots import (
    plot_position,
    plot_errors,
    plot_color_color,
    plot_mag_color,
    plot_mag_histogram,
    plot_color_color_red,
    plot_z_dist
)


In [None]:
# Install a global "ignore" filter so that no Python warning is printed by
# the cells executed after this one.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Handle to the data store exposed by RailStage; the photo-z file is read through it.
DS = RailStage.data_store
# Enable `allow_overwrite` on the data store's class — presumably so that
# re-running a cell can register the same tag again without raising
# (NOTE(review): confirm against the RAIL documentation).
DS.__class__.allow_overwrite = True

In [None]:
# Root directory under which the input files used by this notebook are looked up.
# NOTE(review): this is a user-specific absolute path; adjust it when running
# the notebook from another account or machine.
path = '/pscratch/sd/i/iago'

## Loading results

In [None]:
# ---------------------------------------------------------------------------
# Photometric redshifts
# ---------------------------------------------------------------------------

# Read the estimator output (per-galaxy p(z)) through the data store.
result = DS.read_file(
    'pdfs_data',
    QPHandle,
    f'{path}/roman_rubin_samples/results_y1/output_estimate_a_roman_fzb_y1_10sig.hdf5',
)

# The handle returns its payload when called; fetch it once and reuse it.
ensemble = result()

# Upper end of the redshift grid: last entry of the first row of 'xvals'
# stored in the ensemble metadata.
z_max = ensemble.build_tables()['meta']['xvals'][0][-1]

# Redshift grid on which the mode of each p(z) is evaluated.
zgrid = np.linspace(0, z_max, num=301)

# 'zmode' ancillary column, flattened to 1-D (despite the name `pdfs`,
# this holds the stored point estimates, not the PDFs themselves).
pdfs = ensemble.ancil['zmode'].flatten()

# Point estimate used in the rest of the notebook: the mode of each p(z)
# on `zgrid` (the variable keeps its historical name `mean`).
mean = ensemble.mode(zgrid)

# Flatten the per-galaxy rows of `mean` into a single 1-D array of photo-z.
zphot = np.array([z_value for row in mean for z_value in row])

# ---------------------------------------------------------------------------
# True redshifts
# ---------------------------------------------------------------------------

# Space-separated test catalogue; the photo-z is attached as a new column.
test_catalog_file = f'{path}/roman_rubin_samples/roman_rubin_y1_a_test_10sig.csv'
catalog = pd.read_csv(test_catalog_file, sep=' ')
catalog['zphot'] = zphot
ztrue = catalog['redshift']

## Getting error of full sample

In [None]:
# Photo-z scatter of the full (uncut) sample, averaged over photo-z bins.
# For each bin, sigma_68 is taken as the width between the 16th and 84th
# percentiles of (z_phot - z_true) / (1 + z_true).
sigmas = []
z_bins = np.linspace(0, 2, 20)

# Iterate over consecutive (lower, upper) edge pairs. The previous version
# started at index 0 and used z_bins[index-1], so its first pass paired
# z_bins[0] with z_bins[-1] (an always-empty bin) and the last bin was never
# evaluated. Both edges are kept inclusive, as in the rest of the notebook.
for z_lo, z_hi in zip(z_bins[:-1], z_bins[1:]):
    in_bin = (zphot >= z_lo) & (zphot <= z_hi)
    zph = zphot[in_bin]
    ztr = ztrue[in_bin]
    if len(zph) > 0:
        ez = (zph - ztr) / (1 + ztr)
        sigma_68 = np.quantile(ez, 0.84) - np.quantile(ez, 0.16)
        sigmas.append(sigma_68)
    else:
        # Empty bins are marked NaN so that nanmean below skips them.
        sigmas.append(np.nan)

# Reference value used later to compare against the MagLim sub-samples.
mean_full = np.nanmean(sigmas)

## Understanding MagLim cuts
The objective here is to look at the redshift–magnitude plane to check how the selection $i < a\,z_{phot} + b$ changes when we vary the parameters $a$ and $b$.

In [None]:
# Slopes (a) and intercepts (b) of the candidate cuts mag_i = a * z_phot + b
# that are overlaid on the map below.
a_values = np.arange(2.5, 3.5, 0.5)
b_values = np.arange(18.0, 19.5, 0.5)

# Signed, normalised photo-z residual of every galaxy.
z_diff_ratio = (catalog['zphot'] - catalog['redshift']) / (1 + catalog['redshift'])

fig, ax = plt.subplots(figsize=(10, 6))

# Each hexagonal cell is coloured by the MEAN of the signed residual of the
# galaxies it contains (reduce_C_function=np.mean); the colour scale is
# clipped to [0, 0.03].
hb = ax.hexbin(catalog['zphot'], catalog['mag_i_lsst'], C=z_diff_ratio,
               gridsize=400, cmap='inferno', reduce_C_function=np.mean, vmin=0, vmax=0.03)

cbar = fig.colorbar(hb, ax=ax)
# The mapped quantity is the per-cell mean residual, not a scatter, so the
# colour bar is labelled accordingly (it previously read sigma_z).
cbar.set_label(r'$\langle \Delta z \rangle$', fontsize=22)

# Photo-z range over which the straight-line cuts are drawn.
x_vals = np.linspace(catalog['zphot'].min(), catalog['zphot'].max(), 100)

for a in a_values:
    for b in b_values:
        y_vals = a * x_vals + b
        ax.plot(x_vals, y_vals, label=f'a={a}, b={b}', linewidth=1, ls='--')

ax.set_xlabel(r'$z_{phot}$', fontsize=22)
ax.set_ylabel('mag i', fontsize=20)
ax.tick_params(axis='both', which='major', labelsize=18)
ax.set_xlim(0, 3)
ax.set_ylim(17.5, 24)

ax.legend(fontsize=20, loc='lower right')
ax.set_title('MagLim cuts', fontsize=22)

fig.tight_layout()
plt.show()

## Optimizing MagLim sample

In [None]:
# Grid of selector parameters to explore: slope `a` and intercept `b` of the
# cut mag_i < a * z_phot + b.
a_vec = np.arange(3, 7, 0.5)
b_vec = np.arange(17, 20, 0.5)

# One filtered copy of the catalogue per (a, b) pair, keyed by a label that
# encodes both parameters.
samples = {
    f'a({slope}) b({intercept})': catalog[
        catalog['mag_i_lsst'] < slope * catalog['zphot'] + intercept
    ]
    for slope in a_vec
    for intercept in b_vec
}


In [None]:
# Scatter value assigned to redshift bins that contain no galaxies. Because
# it enters the per-sample mean, samples that leave bins empty end up with a
# larger mean scatter (presumably an intentional penalty — confirm).
EMPTY_BIN_SIGMA = 0.6
# Samples whose mean sigma_68 is below this value are shown in the last figure.
MEAN_SIGMA_CUT = 0.022

z_bins = np.linspace(0, 1.2, 13)


def _binned_sigma68(data, bin_edges, empty_value=EMPTY_BIN_SIGMA):
    """Compute sigma_68 of the photo-z residual in consecutive photo-z bins.

    Parameters
    ----------
    data : table with 'zphot' and 'redshift' columns
        Sample to evaluate.
    bin_edges : sequence of float
        Bin edges in photo-z; both edges of every bin are inclusive.
    empty_value : float
        Value stored for bins that contain no galaxies.

    Returns
    -------
    (upper_edges, sigmas) : tuple of lists
        Upper edge of every bin and the matching sigma_68, where sigma_68 is
        the difference between the 84th and 16th percentiles of
        (z_phot - z_true) / (1 + z_true).
    """
    upper_edges = []
    sigmas = []
    for z_lo, z_hi in zip(bin_edges[:-1], bin_edges[1:]):
        in_bin = (data['zphot'] >= z_lo) & (data['zphot'] <= z_hi)
        zph = data['zphot'][in_bin]
        ztr = data['redshift'][in_bin]
        if len(zph) > 0:
            ez = (zph - ztr) / (1 + ztr)
            sigmas.append(np.quantile(ez, 0.84) - np.quantile(ez, 0.16))
        else:
            sigmas.append(empty_value)
        upper_edges.append(z_hi)
    return upper_edges, sigmas


# Per-sample summaries, in the iteration order of `samples`.
mean_sigmas = []
sample_sizes = []
ratio = []
# Binned results are cached so the last figure does not recompute them.
sigma_by_sample = {}

for key, data in samples.items():
    valid_bins, sigmas = _binned_sigma68(data, z_bins)
    sigma_by_sample[key] = (valid_bins, sigmas)

    # Mean sigma_68 over the bins of this sample.
    mean_sigma_68 = np.nanmean(sigmas)
    mean_sigmas.append(mean_sigma_68)

    # Sample size as a percentage of the full catalogue.
    size_frac = len(data['zphot']) / len(catalog['zphot'])
    sample_sizes.append(size_frac * 100)
    # Size fraction weighted by the scatter of the full sample relative to
    # the scatter of this sample.
    ratio.append(size_frac * mean_full / mean_sigma_68)

sample_labels = list(samples.keys())
tick_positions = np.arange(len(sample_labels))

# First figure: bars for the mean sigma_68, line for the sample size.
fig, ax1 = plt.subplots(figsize=(12, 6))
ax1.bar(sample_labels, mean_sigmas, color='skyblue')
ax1.axhline(mean_full, color='black', linestyle='--', label=r'Mean $\sigma_z/(1+z)$ for full sample')
ax1.set_xlabel('Samples', fontsize=12)
ax1.set_ylabel(r'Mean $\sigma_z/(1+z)$', fontsize=12, color='blue')
ax1.set_title(r'Mean $\sigma_z/(1+z)$ and Size for Each Sample', fontsize=14)
ax1.set_xticks(tick_positions)
ax1.set_xticklabels(sample_labels, rotation=90)
ax1.set_ylim(0, 0.1)
ax1.legend()
ax1.grid(True)

ax2 = ax1.twinx()
ax2.set_ylabel('Sample Size (%)', fontsize=12, color='red')
ax2.plot(sample_labels, sample_sizes, 'o-', color='red', label='Sample Size (%)')
ax2.set_ylim(0, max(sample_sizes) * 1.1)
plt.tight_layout()
plt.show()

# Second figure: the size-to-scatter ratio computed above for every sample.
fig, ax1 = plt.subplots(figsize=(12, 6))
ax1.bar(sample_labels, ratio, color='orange')

ax1.set_xticks(tick_positions)
ax1.set_xticklabels(sample_labels, rotation=90)
ax1.set_ylim(0, 1)
# The denominator is the normalised scatter sigma_z/(1+z), matching the
# axis labels used in the other figures.
ax1.set_title(r'Ratio $\frac{size}{\sigma_z/(1+z)}$', fontsize=16)
plt.tight_layout()
plt.show()

# Third figure: sigma_68 versus photo-z for the samples whose mean sigma_68
# is below MEAN_SIGMA_CUT.
filtered_samples = [key for key, mean_sigma in zip(sample_labels, mean_sigmas) if mean_sigma < MEAN_SIGMA_CUT]
plt.figure(figsize=(10, 7))
for key in filtered_samples:
    valid_bins, sigmas = sigma_by_sample[key]
    plt.scatter(valid_bins, sigmas, s=20, label=key, alpha=0.6)

plt.ylim(0, 0.15)
plt.axhline(0.02, color='black', linestyle='--')
plt.grid(True)
plt.xlabel(r'$z_{phot}$', fontsize=16)
plt.ylabel(r'$\sigma_z / (1 + z)$', fontsize=16)
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
plt.title(r'Distribution of $\sigma_{68}$ vs $z_{phot}$ for Selected Samples')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()


## Plotting all samples
In this segment, we will compute the following metrics: $\Delta z $, $\sigma_{68}$, $out_{3 \sigma}$, $out_{2 \sigma}$ and FR$_e$.

##### Definitions:
- $\Delta z$: The normalized difference between photometric and spectroscopic redshifts.

$\hspace{10cm} \Delta z = \frac{z_{phot} - z_{spec}}{1 + z_{spec}}$ 
<br>


- $\sigma_{68}$: The maximum absolute value of $\Delta z$ for the 68% of galaxies with the smallest $\Delta z$ values in bin $i$.

$\hspace{10cm} \sigma_{68} = max_{i \in U} \bigl\{ \big|\frac{z_{phot}^i - z_{spec}^i}{1 + z_{spec}^i} \big| \bigl\}$

where U is the set of the 68% of galaxies that have the smallest value of $|z_{phot} − z_{spec} |/(1+z_{spec})$ 
<br>

##### Outlier Metrics:
- $out_{3\sigma}$: The percentage of galaxies with $\Delta z$ greater than $3\sigma$.

- $out_{2\sigma}$: The percentage of galaxies with $\Delta z$ greater than $2\sigma$.




In [None]:
%%time

def _sigma68_in_bins(table, bin_edges, empty_value=0.6):
    """Return sigma_68 of the photo-z residual for every photo-z bin.

    Parameters
    ----------
    table : table with 'zphot' and 'redshift' columns
        Sample to evaluate.
    bin_edges : sequence of float
        Bin edges in photo-z; both edges of every bin are inclusive.
    empty_value : float
        Value stored for bins that contain no galaxies.

    Returns
    -------
    list of float
        One value per bin: the difference between the 84th and 16th
        percentiles of (z_phot - z_true) / (1 + z_true).
    """
    values = []
    for z_lo, z_hi in zip(bin_edges[:-1], bin_edges[1:]):
        in_bin = (table['zphot'] >= z_lo) & (table['zphot'] <= z_hi)
        zph = table['zphot'][in_bin]
        ztr = table['redshift'][in_bin]
        if len(zph) > 0:
            ez = (zph - ztr) / (1 + ztr)
            values.append(np.quantile(ez, 0.84) - np.quantile(ez, 0.16))
        else:
            values.append(empty_value)
    return values


# Fine photo-z binning shared by the histogram and the scatter panel.
z_bins = np.arange(0.1, 1.2, 0.02)
# Every sigma value belongs to a whole bin, so it is drawn at the bin centre;
# with where='mid' the step then spans exactly the bin shown in the histogram.
z_centres = 0.5 * (z_bins[:-1] + z_bins[1:])

# The flux-limited (full catalogue) curve does not depend on the sample, so
# it is computed once here rather than on every loop iteration.
sigmas = _sigma68_in_bins(catalog, z_bins)

for sample in samples.keys():
    maglim = samples[sample]
    percentage = np.round(len(maglim['zphot']) / len(catalog['zphot']) * 100, 2)

    print(f'Sample size: {len(maglim["zphot"])}')
    print(f'Full size: {len(catalog["zphot"])}')
    print(f'Percentage: {percentage}%')

    plot_old_valid(maglim['zphot'], maglim['redshift'], code='FlexZBoost',z_max=1.2,
                   title=r'Point Estimates Scatter and KDE $N(z)$ ' + fr'{sample}')

    # sigma_68 per bin for the current MagLim selection.
    sigmas_mag = _sigma68_in_bins(maglim, z_bins)

    plt.figure(figsize=(15, 8))

    gs = plt.GridSpec(2, 1, height_ratios=[1.5, 1], hspace=0)

    # Top panel: photo-z distribution of the full and the selected sample.
    ax1 = plt.subplot(gs[0])
    ax1.hist(catalog['zphot'], bins=z_bins, histtype='stepfilled', lw=2, label="Flux-limited", color='blue', alpha=0.4)
    ax1.hist(maglim['zphot'], bins=z_bins, histtype='stepfilled', lw=2, label="MagLim", color='green', linestyle='--')
    ax1.set_yscale('log')
    ax1.set_ylabel(r'$n(z)$', fontsize=24)
    ax1.set_xlim(0.2, 1.5)
    ax1.set_ylim(5e1, 3e5)
    ax1.xaxis.set_tick_params(labelsize=18)
    ax1.yaxis.set_tick_params(labelsize=18)
    ax1.legend(fontsize=18,loc=1)

    # Bottom panel: sigma_68 per bin for both samples on the same x axis.
    ax2 = plt.subplot(gs[1], sharex=ax1)
    ax2.step(z_centres, sigmas, where='mid', color='blue', lw=2, label='Flux-limited')
    ax2.step(z_centres, sigmas_mag, where='mid', color='green', lw=2, label='MagLim', linestyle='--')
    ax2.set_xlabel(r'$z_{phot}$', fontsize=24)
    ax2.set_ylabel(r'$\sigma_{z}/(1+z)$', fontsize=24)
    ax2.set_xlim(0.2, 1.5)
    ax2.set_ylim(0, 0.08)
    ax2.xaxis.set_tick_params(labelsize=18)
    ax2.yaxis.set_tick_params(labelsize=18)
    ax2.legend(fontsize=18,loc=2)
    plt.suptitle(fr'Sample size and $\sigma_z$ scatter {sample}', fontsize=24)
    plt.show()

    # Point-estimate metrics of the current selection, from the project's
    # plotting helper.
    plot_metrics(maglim['redshift'], maglim['zphot'], maximum=1.2, xlim=1.2, ylim=[-0.005, 0.005],
                 initial=0, title=f'Point estimate metrics {sample}')