### This notebook shows an example of doing some data preparation and using sklearn to do a regression

This needs a bit of work to match the column names to the input dataset

The usual imports

In [None]:
import os
import tables_io
import numpy as np
import matplotlib.pyplot as plt
from macss import plotting_functions
from macss import utility_functions


#### Change this to match the correct location

In [None]:
# Resolve the home directory portably: expanduser uses $HOME when it is set and
# otherwise falls back to the platform's own lookup (password database on Unix,
# USERPROFILE on Windows) instead of raising KeyError.
HOME = os.path.expanduser('~')
# Base directory for the macss files; adjust this if they live elsewhere.
pz_dir = os.path.join(HOME, 'macss')

Read a test file (in this case a Roman / Rubin open universe sim)

In [None]:
# Load the HDF5 test catalogue found under <pz_dir>/data.
# NOTE(review): the container type returned by tables_io.read is not visible here;
# the next cell calls .keys() on it, so it is at least dict-like.
d = tables_io.read(f"{pz_dir}/data/roman_rubin_9925.hdf5")

In [None]:
# List the keys of the loaded table (presumably the column names - TODO confirm),
# which is useful for checking them against the names requested further down.
d.keys()

Split it in half into training and test sets

In [None]:
# Interleaved 50/50 split: even-indexed rows go to training, odd-indexed rows to test.
# The slice stop is None rather than -1 so that the final row of the table is kept;
# with stop=-1 the last row was always excluded from whichever half it belonged to.
train = tables_io.sliceObj(d, slice(0, None, 2))
test = tables_io.sliceObj(d, slice(1, None, 2))

Set up a regression algorithm

In [None]:
from sklearn.ensemble import HistGradientBoostingRegressor

In [None]:
# Gradient-boosted regressor with up to 5000 boosting iterations.
# random_state is fixed so repeated runs give the same model: the estimator draws
# random numbers for the binning subsample and for the train/validation split that
# is used when early stopping is enabled.
reg = HistGradientBoostingRegressor(max_iter=5000, random_state=42)

Extract targets (spec-z redshifts) and features (colors and magnitudes) from the data

In [None]:
# Build (targets, features) for both halves with the same column template and band
# list, so the training and test sets are guaranteed to be prepared identically.
(train_targets, train_features), (test_targets, test_features) = (
    utility_functions.prepare_data_total_mag_and_colors(subset, 'LSST_obs_{band}', 'ugrizy')
    for subset in (train, test)
)

### Do some data preparation, scaling the inputs, so that they are mapped onto the same ranges

In [None]:
from sklearn import preprocessing

In [None]:
# Create the scaler, then learn its parameters from the training features only
# (fit returns the fitted scaler itself).
scaler = preprocessing.StandardScaler()
scaler = scaler.fit(train_features)

In [None]:
# Preview the standardised training features; the result is displayed but not stored.
scaler.transform(train_features)

In [None]:
# Standardise the training features, then cap every value to the range [-5, 5]
# so that extreme outliers cannot dominate the later steps.
scaled = np.clip(scaler.transform(train_features), -5, 5)

#### Ok, let's apply a principal component analysis (PCA) decomposition to this data

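A PCA is basically a method to come up with a new set of basis vectors for the data,
chosen to capture the correlations between the different inputs: the transformed features are mutually uncorrelated and ordered by how much of the variance they explain.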

To see more about the PCA you can look here:
https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Keep as many components as there are input features (the last axis of `scaled`),
# i.e. change basis without reducing dimensionality, and do not whiten the output.
n_features = scaled.shape[-1]
pca = PCA(n_components=n_features, whiten=False)

In [None]:
# `scaled` was already clipped to [-5, 5] when it was built, so clipping it again is
# a no-op; fit on it directly, which is also the exact array passed to pca.transform.
pca.fit(scaled)

In [None]:
# Fraction of the total variance explained by each fitted principal component.
pca.explained_variance_ratio_

In [None]:
# Project the scaled training features onto the fitted principal components.
pca_out = pca.transform(scaled)

#### Ok, let's make some feature plots...

We can look at the scaled inputs and compare them to the output of the PCA

In [None]:
# Histogram the scaled (standardised and clipped) input features.
# NOTE(review): the panel layout is decided inside plot_feature_histograms - not visible here.
fig = plotting_functions.plot_feature_histograms(scaled)

In [None]:
# Same histogram helper, now applied to the PCA-transformed features for comparison.
fig = plotting_functions.plot_feature_histograms(pca_out)

In [None]:
# Compare the input features with the PCA output.
# NOTE(review): this passes the raw `train_features`, whereas the PCA was fitted on
# `scaled` - confirm whether the scaled array was intended here.
_ = plotting_functions.plot_pca_hist2d(train_features, pca_out)

In [None]:
# Plot the raw (unscaled) training features against the training targets.
_ = plotting_functions.plot_feature_target_hist2d(train_features, train_targets)

In [None]:
# Same plot as for the raw features, but with the PCA components as the features.
_ = plotting_functions.plot_feature_target_hist2d(pca_out, train_targets)

Run an example regression

In [None]:
# Run the regression with `reg` and collect predictions for the test features.
# NOTE(review): this uses the raw `train_features`/`test_features`, not the scaled or
# PCA-transformed arrays built above - presumably intentional as a baseline; confirm.
preds = utility_functions.run_regression(reg, train_features, train_targets, test_features)

In [None]:
# Compare the true test targets with the regression predictions.
_ = plotting_functions.plot_true_predict_fancy(test_targets, preds)