Here we train a model on protoDC2 data via TPZ (http://matias-ck.com/mlz/tpz.html) to be used in the TXPipe framework. 

In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
import os
import pandas as pd

from sklearn.model_selection import train_test_split
from mlz.ml_codes import *

In [5]:
## Locate the protoDC2 catalogue on disk
path = "/global/projecta/projectdirs/lsst/groups/PZ/PhotoZDC2/protoDC2v5"
infile = "protodc2_v5_ugrizy_witherrs.h5"
fullpath = os.path.join(path, infile)

## Read the whole catalogue into memory (call df.info() to inspect its columns)
df = pd.read_hdf(fullpath)

## Colours: magnitude differences between adjacent bands
u_g = df['scatmag_u'] - df['scatmag_g']
g_r = df['scatmag_g'] - df['scatmag_r']
r_i = df['scatmag_r'] - df['scatmag_i']
i_z = df['scatmag_i'] - df['scatmag_z']
z_y = df['scatmag_z'] - df['scatmag_y']

## Build the feature table column by column. The column order matters:
## redshift first, then the six magnitudes, then the five colours.
input_df = pd.DataFrame()

for column, source in (('zs', 'redshift'),
                       ('u', 'scatmag_u'),
                       ('g', 'scatmag_g'),
                       ('r', 'scatmag_r'),
                       ('i', 'scatmag_i'),
                       ('z', 'scatmag_z'),
                       ('y', 'scatmag_y')):
    input_df[column] = df[source]

for column, colour in (('u-g', u_g),
                       ('g-r', g_r),
                       ('r-i', r_i),
                       ('i-z', i_z),
                       ('z-y', z_y)):
    input_df[column] = colour

In [7]:
## Feature columns in the order in which they appear in the training matrix:
## the six magnitudes followed by the five adjacent-band colours.
feature_columns = ['u', 'g', 'r', 'i', 'z', 'y',
                   'u-g', 'g-r', 'r-i', 'i-z', 'z-y']

## Map every feature name to its own column index (0..10). Deriving the map
## from the ordered list guarantees one unique index per feature; the previous
## hand-written literal repeated the 'z' key (so 'y' was missing) and gave
## four of the colours the same index.
d = {name: {'ind': ind} for ind, name in enumerate(feature_columns)}

## Names saved alongside the trained trees, listed in the same order as
## feature_columns (presumably the column names the consumer of the model
## expects -- TODO confirm against TXPipe).
features = ['mag_u_lsst', 'mag_g_lsst', 'mag_r_lsst', 'mag_i_lsst', 'mag_z_lsst', 'mag_y_lsst',
            'mag_u_lsst-mag_g_lsst', 'mag_g_lsst-mag_r_lsst', 'mag_r_lsst-mag_i_lsst',
            'mag_i_lsst-mag_z_lsst', 'mag_z_lsst-mag_y_lsst']

In [8]:
## Hold out 40% of the table as a test sample; the fixed seed makes the split repeatable
seed = 123
train, test = train_test_split(input_df, random_state=seed, test_size=0.4)

## Training features (magnitudes and colours) and training target (redshift)
train_cols_mags = train.loc[:, 'u':'z-y']
train_z = train.loc[:, 'zs']

## Write both samples as plain-text tables into the MLZ test directory
for out_name, sample in (('../repos/MLZ/mlz/test/protoDC2.test', test),
                         ('../repos/MLZ/mlz/test/protoDC2.train', train)):
    np.savetxt(out_name, sample.values)

print(train_cols_mags.shape, train_z.shape)

(10972023, 11) (10972023,)


In [None]:
## Create a forest from a set of decision trees
n_trees = 10

## Train on every 100th object only, and build that sub-sample once instead of
## once per tree. `.values` replaces `.as_matrix()`, which was removed in
## pandas 1.0.
train_X = train_cols_mags.values[::100]
train_Y = train_z.values[::100]

Trees = [TPZ.Rtree(train_X, train_Y, minleaf=30, mstar=3, dict_dim=d, forest='yes')
         for _ in range(n_trees)]

## Save the feature names and the trees as an explicit two-element object array.
## Passing the bare list [features, Trees] makes NumPy build a ragged array,
## which recent NumPy versions reject; the explicit form also keeps the saved
## shape at (2,) whatever the number of trees.
payload = np.empty(2, dtype=object)
payload[0] = features
payload[1] = Trees
save('../repos/MLZ/mlz/test/protoDC2_trees_109721.npy', payload)

## Number of training objects actually used
print(len(train_X))