## **03.a Training FlexZBoost**
#### Authors: **Amanda Farias (afariassantos2@gmail.com), Iago Lopes (iagolops2012@gmail.com), Bruno Moraes (bruno.a.l.moraes@gmail.com)**
#### Creation date: **09/10/2024**,  
#### Last Verified to Run: **11/19/2024** (by @iago)


The objective of this notebook is to train FlexZBoost on the data selected in the previous notebook, "02a_roman_rubin_analysis.ipynb". More details about FlexZBoost can be found at https://arxiv.org/abs/1704.08095.

In [None]:
import pandas as pd
import h5py
import numpy as np
import matplotlib.pyplot as plt
from Plots import plot_position
from rail.core.stage import RailStage
from rail.core.data import PqHandle, TableHandle
from rail.estimation.algos.flexzboost import FlexZBoostInformer
from rail.core.data import Hdf5Handle
from scipy.spatial import Delaunay
from rail.estimation.algos.flexzboost import FlexZBoostEstimator

In [None]:
def create_output_file(outfile, xdf, num_rows):
    """Write the first ``num_rows`` rows of ``xdf`` to a new HDF5 file.

    Each column is stored as a dataset inside a top-level ``photometry``
    group; ``galaxy_id`` is written first, followed by every other key.

    Parameters
    ----------
    outfile : str
        Path of the HDF5 file to create (opened in "w" mode, so an existing
        file at that path is replaced).
    xdf : mapping of column name -> sliceable column
        Must expose ``keys()`` and contain a ``'galaxy_id'`` entry
        (e.g. a pandas DataFrame).
    num_rows : int
        Number of leading rows of each column to write.
    """
    # The context manager guarantees the file is closed even when one of the
    # dataset writes raises, instead of leaking an open handle.
    with h5py.File(outfile, "w") as outf:
        group = outf.create_group('photometry')
        group['galaxy_id'] = xdf['galaxy_id'][:num_rows]
        for key in xdf.keys():
            if key != 'galaxy_id':
                group[f'{key}'] = xdf[key][:num_rows]

<div class="alert alert-block alert-warning">
<b>ATTENTION:</b> Set <code>nersc_name</code> in the next cell to your own NERSC username. The notebook reads from and writes to your NERSC scratch directory, so it will not work correctly without this change.
</div> 

In [None]:
# NERSC username; used further down to build the scratch directory `path`.
nersc_name = 'iago'

Loading the data

In [None]:
# Grab the RAIL data store and switch on its class-level allow_overwrite flag.
DS = RailStage.data_store
DS.__class__.allow_overwrite = True

# Base directory: /pscratch/sd/<first letter of the user name>/<user name>
path = f"/pscratch/sd/{nersc_name[0]}/{nersc_name}"
# When True, the train/test CSV files are written under `path` later on.
save_file = True

In [None]:
# Load the space-separated catalog from the scratch directory
# (presumably written by notebook 02a -- confirm).
catalog = pd.read_csv(f"{path}/roman_rubin.csv", sep=" ")

In [None]:
# Values defined in notebook 02a_roman_rubin_analysis
sigma = 10

# Selection cuts per supported sigma:
#   sigma -> (i-band magnitude limit, maximum i-band magnitude error)
# The error cut follows 2.5*log10(1 + 1/sigma).
selection_cuts = {
    10: (23.7, 2.5*np.log10(1+0.1)),
    5: (24.7, 2.5*np.log10(1+0.2)),
}

if sigma not in selection_cuts:
    # Fail here with a clear message instead of a NameError on `i_cut` below.
    raise ValueError(
        f"Unsupported sigma={sigma!r}; expected one of {sorted(selection_cuts)}"
    )
i_cut, err_cut = selection_cuts[sigma]

# Keep only objects that pass both the magnitude and the magnitude-error cut.
catalog = catalog[
    (catalog["mag_i_lsst"] < i_cut) & (catalog["mag_err_i_lsst"] < err_cut)
]
catalog

## Training
For the training/test split, we assign the pixels by hand, choosing ones that are not next to each other.

In [None]:
# Distinct pixel ids present in the catalog.
all_pix = list(set(catalog["pix"]))

# One scatter series per pixel, with the pixel id printed at its mean position.
for pix in all_pix:
    in_pix = catalog[catalog['pix'] == pix]
    ra, dec = in_pix['ra'], in_pix['dec']

    plt.scatter(ra, dec, s=10, label=pix)
    plt.text(ra.mean(), dec.mean(), str(pix), fontsize=12, ha='center', va='center')

plt.xlabel('RA')
plt.ylabel('Dec')
plt.show()

In [None]:
# Hand-picked pixels reserved for the test set.
test_pix = [
    9921, 9922, 9923, 9924, 9925,
    10177, 10178, 10179, 10180, 10181,
    10429, 10430, 10431, 10432,
    10665, 10666, 10667, 10668,
]

# Every remaining pixel goes to training (order of `all_pix` is kept).
held_out = set(test_pix)
train_pix = [pix for pix in all_pix if pix not in held_out]

In [None]:
# Split the catalog pixel by pixel, following the order of `all_pix`.
# The per-pixel pieces are collected in lists and concatenated once: growing a
# DataFrame with pd.concat inside the loop re-copies all accumulated rows on
# every iteration.
train_parts = []
test_parts = []

for pix in all_pix:
    pix_rows = catalog[catalog["pix"] == pix]
    if pix in train_pix:
        train_parts.append(pix_rows)
    else:
        test_parts.append(pix_rows)

# pd.concat rejects an empty list, so fall back to an empty frame in that case.
train = pd.concat(train_parts) if train_parts else pd.DataFrame([])
test = pd.concat(test_parts) if test_parts else pd.DataFrame([])

In [None]:
fig, ax = plt.subplots(figsize=(10, 8))

# Sky positions of both samples, drawn with tiny markers.
for sample, colour in ((train, '#007f5c'), (test, '#d9b44a')):
    ax.scatter(sample['ra'], sample['dec'], s=1, color=colour, alpha=0.6, edgecolor='none')

ax.set_xlabel('Right Ascension (RA)', fontsize=14)
ax.set_ylabel('Declination (Dec)', fontsize=14)
ax.set_title('Train and Test', fontsize=16)

# Empty series with large markers so the legend entries are readable
# (the data points themselves use s=1).
for colour, label in (('#007f5c', 'Train/Validation'), ('#d9b44a', 'Test')):
    ax.scatter([], [], s=70, color=colour, label=label)

ax.legend(loc='lower right', fontsize=12)
ax.grid(True, linestyle='--', alpha=0.5)

# Axis limits are derived from the training sample only.
ax.set_xlim(train['ra'].min()-1, train['ra'].max()+2)
ax.set_ylim(train['dec'].min()-1, train['dec'].max()+2)
plt.show()


In [None]:
# Subsample both sets to reduce the size of the data (fixed seed for reproducibility).
train_cut = train.sample(int(0.2*len(train)), random_state=42)
# The test sample size is taken relative to the test set itself. Sizing it from
# len(train) can request more rows than `test` holds, in which case sample()
# raises because it draws without replacement.
test_cut = test.sample(int(0.7*len(test)), random_state=42)

In [None]:
if save_file:  # flag set near the beginning of the notebook
    # Space-separated CSV copies of the subsampled test and train sets.
    test_csv = f"{path}/roman_rubin_y1_a_test_{sigma}sig.csv"
    train_csv = f"{path}/roman_rubin_y1_a_train_{sigma}sig.csv"

    test_cut.to_csv(test_csv, sep=" ", index=False)
    train_cut.to_csv(train_csv, sep=" ", index=False)

In [None]:
### converting all the data to a hdf5 in RAIL's format
def convert_lists_to_arrays(d):
    """Return a new dict in which every value of ``d`` is wrapped in ``np.array``."""
    converted = {}
    for name, values in d.items():
        converted[name] = np.array(values)
    return converted


# Output locations for the two HDF5 files.
trainFile = f"{path}/roman_rubin_y1_a_train_{sigma}sig.hdf5"
testFile = f"{path}/roman_rubin_y1_a_test_{sigma}sig.hdf5"

# DataFrame -> dict of column lists -> dict of NumPy arrays.
tmp_dict_train = train_cut.to_dict(orient='list')
array_dict_train = convert_lists_to_arrays(tmp_dict_train)

tmp_dict_test = test_cut.to_dict(orient='list')
array_dict_test = convert_lists_to_arrays(tmp_dict_test)

# NOTE: from here on `train` / `test` refer to the subsampled data, rebuilt
# from the arrays with a fresh default index.
train = pd.DataFrame(array_dict_train)
test = pd.DataFrame(array_dict_test)

# Write every row of each frame.
create_output_file(trainFile, train, len(train['redshift']))
create_output_file(testFile, test, len(test['redshift']))

In [None]:
# Read the two HDF5 files written above through the data store, as TableHandle
# objects under the keys "training_data" and "test_data".
training_data = DS.read_file("training_data", TableHandle, trainFile)
test_data = DS.read_file("test_data", TableHandle, testFile)

In [None]:
#### parameters for FZB

# Redshift range of the training sample. The vectorised Series reductions skip
# NaN values, whereas builtin max()/min() give order-dependent results with NaN.
z_max = float(train["redshift"].max())
z_min = float(train["redshift"].min())
print(z_max)

bands = [
    "mag_u_lsst", "mag_g_lsst", "mag_r_lsst",
    "mag_i_lsst", "mag_z_lsst", "mag_y_lsst",
]

# Build the photometry table once; it does not change between bands.
df = pd.DataFrame(training_data.data['photometry'])

# Per-band limit: the largest magnitude below 99, rounded to two decimals.
limits = []
for band in bands:
    filtered_df = df[df[band] < 99] # we can't count 99 values
    if filtered_df.empty:
        # Without this guard the limit would silently become NaN.
        raise ValueError(f"No values below 99 in band {band!r}; cannot derive a limit")
    limits.append(float(np.round(filtered_df[band].max(), 2)))

print(f"Limits: {limits}")

In [None]:
############################ FlexzBoost configurations #########################################

basis_system = "Fourier"
reg = "squarederror"
max_basis = 35
depth = 8

################################################################################################

# Name of the model file, defined once and reused in the configuration below.
fz_modelfile = (
    f"train_a_roman_fzb_y1_{sigma}sig.pkl"
)

# Keyword arguments passed to FlexZBoostInformer.make_stage.
fz_dict = dict(
    zmin=z_min, zmax=z_max,
    nzbins=301, trainfrac=0.5,
    bumpmin=0.02, bumpmax=0.35, nbump=20, 
    sharpmin=0.7, sharpmax=2.1, nsharp=15,
    retrain_full=True, nondetect_val=99.0,
    max_basis=max_basis, basis_system=basis_system,
    bands=[
        "mag_u_lsst", "mag_g_lsst", "mag_r_lsst",
        "mag_i_lsst", "mag_z_lsst", "mag_y_lsst",
    ],
    # One error column per band, in the same u, g, r, i, z, y order as `bands`
    # (the third entry used to repeat the z-band error instead of the r-band one).
    err_bands=[
        "mag_err_u_lsst", "mag_err_g_lsst", "mag_err_r_lsst",
        "mag_err_i_lsst", "mag_err_z_lsst", "mag_err_y_lsst",
    ],
    mag_limits={
        "mag_u_lsst": limits[0], "mag_g_lsst": limits[1], "mag_r_lsst": limits[2], 
        "mag_i_lsst": limits[3], "mag_z_lsst": limits[4], "mag_y_lsst": limits[5],
    },
    hdf5_groupname="photometry",
    model=fz_modelfile,
    regression_params={"max_depth": depth, "objective": f"reg:{reg}"},
)# qp_representation = 'flexzboost'

In [None]:
# Build the FlexZBoost informer stage from the `fz_dict` configuration;
# the stage name carries the sigma tag.
inform_pzflex = FlexZBoostInformer.make_stage(
    name=f"inform_fzboost_a_y1_{sigma}sig", **fz_dict
)

<div class="alert alert-block alert-danger">
    <strong>Warning!</strong> It takes approximately 30 min to train the model.
</div>

In [None]:
%%time
# Run the informer on the training handle (the slow step flagged in the warning above).
# Presumably the trained model is written to the `model` file named in fz_dict -- confirm.
inform_pzflex.inform(training_data)