# Pz estimation from images 

First, this notebook shows where to get the data and how to preprocess it.

Then, it presents the results obtained with the different convolutional models explored:
- CNN
- ResNet
- DenseNet



In [None]:
import tensorflow as tf

# Check whether TensorFlow can see the first GPU device
gpu_name = tf.test.gpu_device_name()
if gpu_name == '/device:GPU:0':
    print('SUCCESS: Found GPU: {}'.format(gpu_name))
else:
    print('WARNING: GPU device not found.')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#tools contains different useful functions (plot results, calculate metrics, etc.)
from tools import *

# 1. Data preprocessing

The main data file is called 'download'. It contains a collection of galaxy images together with a catalogue of features (including the redshift). 
The two other files, 'img_30k.npy' and 'z_30k.npy', are just a subset extracted from the full 'download' dataset for easier manipulation.  

In [None]:
# data is stored in the following repo
%ls /global/cfs/cdirs/lsst/groups/PZ/valentin_image_data_temp

In [None]:
# Load the full dataset from the shared directory.
# NOTE(review): the later use of `data.files` and `data[...]` suggests this file is an
# .npz-style archive even though its name has no extension — confirm.
data = np.load('/global/cfs/cdirs/lsst/groups/PZ/valentin_image_data_temp/download')

In [None]:
type(data)

In [None]:
data.files

### Labels

'labels' is a large catalogue containing multiple features for each galaxy image.

In [None]:
# Build a DataFrame from the first 30k catalogue entries and preview it
labels_subset = data["labels"][:30000]
cat = pd.DataFrame(labels_subset)
cat.head()

In [None]:
z = cat.z

In [None]:
cat.z.min(), cat.z.max()

In [None]:
plt.hist(cat.z, bins=100);

Redshifts are relatively low in this dataset: 0 < z < 0.7

### Cube

'cube' contains all galaxy images

In [None]:
images = data['cube'][:30000]

In [None]:
images.shape

In [None]:
# imshow renders colour images from 3 channels, so only the first 3 of the
# 5 channels are kept for this visual check
first_image_3ch = images[0, :, :, :3]
plt.imshow(first_image_3ch)
plt.show();

In [None]:
# Check the pixel-value distribution of each channel (one histogram per band)
band_names = ['g', 'r', 'i', 'z', 'y']
for band_idx in range(len(band_names)):
    band_pixels = images[..., band_idx].flatten()
    plt.hist(band_pixels, 100, label=band_names[band_idx], alpha=0.5)
    plt.yscale('log');
    plt.legend()

The data need to be standardized.
The following preprocessing is the one proposed by Francois Lanusse [here](https://github.com/EiffL/Tutorials/blob/master/PhotozCNN/photoz_inference_training_solution.ipynb).

In [None]:
# Estimate the noise standard deviation of each band with a MAD-based estimator;
# the per-band values are stored in `scaling` and used for range compression
from astropy.stats import mad_std

band_names = ['g', 'r', 'i', 'z', 'y']
scaling = []
for band_idx, band in enumerate(band_names):
    band_pixels = images[..., band_idx].flatten()
    plt.hist(band_pixels, 100, label=band, alpha=0.5, range=[-1,1]);
    sigma = mad_std(band_pixels)
    scaling.append(sigma)
    # Mark +sigma and -sigma with the same colour as the band's histogram
    for edge in (sigma, -sigma):
        plt.axvline(edge, color='C%d' % band_idx, alpha=0.5)
    plt.legend()

In [None]:
# Distribution of each band after dividing it by its estimated noise standard deviation
band_names = ['g', 'r', 'i', 'z', 'y']
for band_idx, band in enumerate(band_names):
    rescaled_pixels = images[..., band_idx].flatten() / scaling[band_idx]
    plt.hist(rescaled_pixels, 100, label=band, alpha=0.5, range=[-10,10]);
    plt.legend()

In [None]:
def preprocessing(image, band_scaling=None):
    """Apply per-band range compression to an image or a batch of images.

    Each band is divided by its scale and by 3, then passed through ``arcsinh``.

    Parameters
    ----------
    image : array-like
        Pixel values; the last axis must broadcast against ``band_scaling``.
    band_scaling : sequence of float, optional
        One scale per band. When omitted, the notebook-level ``scaling`` list
        is used, which keeps existing calls unchanged.

    Returns
    -------
    numpy.ndarray
        Compressed pixel values, with the broadcast shape of the inputs.
    """
    if band_scaling is None:
        # Fall back to the per-band noise estimates computed earlier in the notebook.
        band_scaling = scaling
    band_scaling = np.asarray(band_scaling)
    return np.arcsinh(image / band_scaling / 3.)

In [None]:
prepro_img = preprocessing(images)

In [None]:
# Per-band histogram of a single preprocessed image (index 1000).
# The slice is flattened with NumPy, like the other histogram cells, instead of
# being routed through a TensorFlow reshape.
example_image = prepro_img[1000]
for i, b in enumerate(['g', 'r', 'i', 'z', 'y']):
    plt.hist(example_image[:, :, i].flatten(), 100, label=b, alpha=0.5)
    plt.yscale('log');
plt.legend();

### Train val test split 

In [None]:
# Split into train / validation / test sets, using the same boundaries for
# the images and for the redshifts
train_end, val_end = 15000, 20000

img_train, z_train = prepro_img[:train_end, ...], z[:train_end]
img_val, z_val = prepro_img[train_end:val_end, ...], z[train_end:val_end]
img_test, z_test = prepro_img[val_end:, ...], z[val_end:]

In [None]:
img_train.shape

# 2. CNN model

In [None]:
import tensorflow.keras as tfk

def create_model(input_shape=(64, 64, 5)):
    """Build and compile the baseline CNN used to regress the redshift.

    The network is a stack of five Conv2D + BatchNormalization stages followed
    by a Flatten layer and a 512 -> 256 -> 1 dense head.

    Parameters
    ----------
    input_shape : tuple of int, optional
        Shape of one input image, passed to the first convolution.
        Defaults to ``(64, 64, 5)``.

    Returns
    -------
    The compiled Sequential model (Adam optimizer, mean-squared-error loss)
    with a single-unit output.
    """
    model = tfk.models.Sequential()

    # (filters, kernel_size, strides) of each convolutional stage; every stage
    # uses 'same' padding and an ELU activation, followed by batch normalization.
    conv_stages = [(32, 5, 2), (64, 3, 1), (128, 3, 2), (256, 3, 2), (512, 3, 2)]
    for stage_idx, (filters, kernel_size, strides) in enumerate(conv_stages):
        # Only the first layer declares the input shape.
        first_layer_kwargs = {'input_shape': input_shape} if stage_idx == 0 else {}
        model.add(tfk.layers.Conv2D(filters, kernel_size=kernel_size, strides=strides,
                                    padding='same', activation='elu',
                                    **first_layer_kwargs))
        model.add(tfk.layers.BatchNormalization())

    # TODO: replace Flatten with global pooling, potentially less overfitting
    model.add(tfk.layers.Flatten())
    for units in (512, 256):
        model.add(tfk.layers.Dense(units))
        model.add(tfk.layers.Activation('relu'))
    model.add(tfk.layers.Dense(1))

    # The loss is given by its string identifier rather than taken from the
    # `metrics` namespace.
    model.compile(optimizer='adam',  # learning rate will be set by LearningRateScheduler
                  loss='mse')
    return model

In [None]:
model = create_model()

In [None]:
model.summary()

In [None]:
# Learning-rate schedule: lr(epoch) = LEARNING_RATE * LEARNING_RATE_EXP_DECAY**epoch
# (a decay factor of 1 keeps the learning rate constant)
LEARNING_RATE = 0.001
LEARNING_RATE_EXP_DECAY = 1
lr_decay = tfk.callbacks.LearningRateScheduler(
    schedule=lambda epoch: LEARNING_RATE * LEARNING_RATE_EXP_DECAY**epoch)

# TensorBoard tracking
#tb_callback = tf.keras.callbacks.TensorBoard('./logs/CNN', update_freq='batch')

# Early stopping: stop after 5 epochs without val_loss improvement and
# restore the weights of the best epoch
early_stop = tfk.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    verbose=1,
    restore_best_weights=True,
)

history_cnn = model.fit(
    x=img_train,
    y=z_train,
    batch_size=64,
    epochs=50,
    steps_per_epoch=len(img_train)//64,
    validation_data=(img_val, z_val),
    callbacks=[lr_decay, early_stop],
)

In [None]:
# Get the prediction
preds = model.predict(img_test)

In [None]:
# Metrics results on the test set.
# NOTE(review): `metrics` and `print_metrics` are not defined in this notebook;
# presumably they come from `from tools import *` — confirm.
# squeeze() drops the size-1 axis of the model output before it is compared to z_test.
dz, pred_bias, smad, out_frac = metrics(z_test, preds.squeeze())
print_metrics(pred_bias, smad, out_frac)

In [None]:
history_plot(history_cnn, 'CNN Model Loss')

In [None]:
plot_results(z_test, preds.squeeze(), pred_bias, out_frac, smad, 'CNN')

# 3. ResNet

In [None]:
from tensorflow.keras.applications import ResNet50

def resnet_model(input_shape=(64, 64, 5)):
    """Build and compile a ResNet50-based redshift regressor.

    A randomly initialised ResNet50 backbone (no classification top, global
    average pooling) is followed by a 512 -> 256 -> 1 dense head.

    Parameters
    ----------
    input_shape : tuple of int, optional
        Shape of one input image, passed to the backbone.
        Defaults to ``(64, 64, 5)``.

    Returns
    -------
    The compiled Sequential model (Adam optimizer, mean-squared-error loss)
    with a single-unit output.
    """
    # weights=None: the backbone is trained from scratch.
    backbone = ResNet50(include_top=False,
                        pooling='avg',
                        input_shape=input_shape,
                        weights=None)

    model = tfk.models.Sequential()
    model.add(backbone)

    # Kept from the original architecture; with pooling='avg' the backbone
    # output is already 2D, so this layer does not change the data.
    model.add(tfk.layers.Flatten())
    for units in (512, 256):
        model.add(tfk.layers.Dense(units))
        model.add(tfk.layers.Activation('relu'))
    model.add(tfk.layers.Dense(1))

    # The loss is given by its string identifier rather than taken from the
    # `metrics` namespace.
    model.compile(optimizer='adam',  # learning rate will be set by LearningRateScheduler
                  loss='mse')
    return model

In [None]:
model_resnet = resnet_model()

model_resnet.summary()

In [None]:
%%time
# Learning rate schedule
LEARNING_RATE=0.001
LEARNING_RATE_EXP_DECAY=0.9
lr_decay = tfk.callbacks.LearningRateScheduler(
    lambda epoch: LEARNING_RATE * LEARNING_RATE_EXP_DECAY**epoch,
    verbose=True)

# Tensoboard tracking
#tb_callback = tf.keras.callbacks.TensorBoard('./logs/ResNet', update_freq='batch')

# Early_stopping
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=1, 
                                              restore_best_weights=True)

history_resnet = model_resnet.fit(x = img_train, 
          y = z_train,
          batch_size = 64,
          validation_data=(img_val, z_val),
          steps_per_epoch=len(img_train)//64,
          epochs=50,
          callbacks=[lr_decay])

In [None]:
# Get the prediction
resnet_preds = model_resnet.predict(img_test).squeeze()

In [None]:
# Metrics results on the test set for the ResNet predictions.
# NOTE(review): `metrics` and `print_metrics` are not defined in this notebook;
# presumably they come from `from tools import *` — confirm.
dz, pred_bias, smad, out_frac = metrics(z_test, resnet_preds)
print_metrics(pred_bias, smad, out_frac)

In [None]:
history_plot(history_resnet, 'ResNet Model Loss')

In [None]:
plot_results(z_test, resnet_preds, pred_bias, out_frac, smad, 'ResNet')

# 4. DenseNet

In [None]:
from tensorflow.keras.applications import DenseNet121

def densenet_model(input_shape=(64, 64, 5)):
    """Build and compile a DenseNet121-based redshift regressor.

    A randomly initialised DenseNet121 backbone (no classification top, global
    average pooling) is followed by a 512 -> 256 -> 1 dense head.

    Parameters
    ----------
    input_shape : tuple of int, optional
        Shape of one input image, passed to the backbone.
        Defaults to ``(64, 64, 5)``.

    Returns
    -------
    The compiled Sequential model (Adam optimizer, mean-squared-error loss)
    with a single-unit output.
    """
    # weights=None: the backbone is trained from scratch.
    backbone = DenseNet121(include_top=False,
                           pooling='avg',
                           input_shape=input_shape,
                           weights=None)

    model = tfk.models.Sequential()
    model.add(backbone)

    # Kept from the original architecture; with pooling='avg' the backbone
    # output is already 2D, so this layer does not change the data.
    model.add(tfk.layers.Flatten())
    for units in (512, 256):
        model.add(tfk.layers.Dense(units))
        model.add(tfk.layers.Activation('relu'))
    model.add(tfk.layers.Dense(1))

    # The loss is given by its string identifier rather than taken from the
    # `metrics` namespace.
    model.compile(optimizer='adam',  # learning rate will be set by LearningRateScheduler
                  loss='mse')
    return model

In [None]:
model_densenet = densenet_model()

model_densenet.summary()

In [None]:
%%time
# Learning rate schedule
LEARNING_RATE=0.001
LEARNING_RATE_EXP_DECAY=0.9
lr_decay = tfk.callbacks.LearningRateScheduler(
    lambda epoch: LEARNING_RATE * LEARNING_RATE_EXP_DECAY**epoch,
    verbose=True)

# Tensoboard tracking
#tb_callback = tf.keras.callbacks.TensorBoard('./logs/DenseNet', update_freq='batch')


# Early_stopping
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, verbose=1, 
                                              restore_best_weights=True)

history_dense = model_densenet.fit(x = img_train, 
          y = z_train,
          batch_size = 64,
          validation_data=(img_val, z_val),
          steps_per_epoch=len(img_train)//64,
          epochs=50,
          callbacks=[lr_decay, early_stop])

In [None]:
# Get the prediction
densenet_preds = model_densenet.predict(img_test).squeeze()

In [None]:
# Metrics results on the test set for the DenseNet predictions.
# NOTE(review): `metrics` and `print_metrics` are not defined in this notebook;
# presumably they come from `from tools import *` — confirm.
dz, pred_bias, smad, out_frac = metrics(z_test, densenet_preds)
print_metrics(pred_bias, smad, out_frac)

In [None]:
history_plot(history_dense, 'DenseNet Model Loss')

In [None]:
plot_results(z_test, densenet_preds, pred_bias, out_frac, smad, 'DenseNet')