In [1]:
import os
import numpy as np
import xarray as xr
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, minmax_scale

import holodecml.ml_utils as ml


In [2]:
# NetCDF histogram files for the training / validation / test splits
# (only `matt_train` is opened in the cells visible in this notebook).
matt_train = "/glade/scratch/mhayman/holodec/holodec-ml-data/histogram/histogram_training_data_5000count20200819T091551.nc"
matt_valid = "/glade/scratch/mhayman/holodec/holodec-ml-data/histogram/histogram_validation_data_5000count20200819T091551.nc"
matt_test = "/glade/scratch/mhayman/holodec/holodec-ml-data/histogram/histogram_test_data_5000count20200819T091551.nc"


# Directories for the "z" and "z_realimag" data variants.
# NOTE(review): neither is referenced again in the cells visible here.
path_data_z = "/glade/p/cisl/aiml/ai4ess_hackathon/holodec/ft_rad_bidis_z/"
path_data_z_realimag = "/glade/p/cisl/aiml/ai4ess_hackathon/holodec/ft_rad_bidis_z_realimag/"

# `batch_size` is the chunk size along 'hologram_number' when datasets are opened;
# `input_variable` / `label_variable` name the dataset variables read as inputs and outputs.
batch_size = 256
input_variable = "input_image"
label_variable = "histogram"

In [3]:
def calc_bins(var, data_path=None):
    """Return histogram bin centers for the requested variable.

    Parameters
    ----------
    var : str
        ``'z'`` derives the bin edges from the min/max of the ``z`` variable
        stored in the NetCDF file at ``data_path`` (padded by 1000 on each
        side and split into steps of 1/20 of that range). Any other value
        uses the fixed edges ``0, 5, ..., 195``.
    data_path : str, optional
        NetCDF file read for the ``'z'`` case. When omitted, the path that
        was previously hard-coded in this function is used.

    Returns
    -------
    numpy.ndarray
        Midpoints between consecutive bin edges.
    """
    if var == 'z':
        if data_path is None:
            data_path = '/glade/p/cisl/aiml/ai4ess_hackathon/holodec/synthetic_holograms_50-100particle_bidisperse_training.nc'
        # Only the 'z' bins depend on the data, so the file is opened (and
        # closed again) here instead of on every call.
        with xr.open_dataset(data_path) as ds:
            values = ds[var].values
        lower = round(values.min() - 1000)
        upper = round(values.max() + 1000)
        step = int((upper - lower) / 20)
        edges = np.arange(lower, upper, step)
    else:
        edges = np.arange(0, 200, 5)
    # Center of each bin = left edge + half the bin width.
    centers = 0.5 * np.diff(edges) + edges[:-1]
    return centers

def ranked_probability_score(y_true, y_pred):
    """Score predicted histograms against true ones via their cumulative sums.

    Both inputs are accumulated along axis 1; the squared difference of the
    two cumulative arrays is averaged over all elements and the mean is then
    divided by ``y_true.shape[1] - 1``.
    """
    cumulative_true = np.cumsum(y_true, axis=1)
    cumulative_pred = np.cumsum(y_pred, axis=1)
    squared_gap = (cumulative_true - cumulative_pred) ** 2
    n_intervals = y_true.shape[1] - 1
    return np.mean(squared_gap) / n_intervals


In [4]:
# Load the training split from `matt_train`, scale inputs and labels with
# ml.MinMaxScalerX, and print the shapes before and after scaling.
with xr.open_dataset(matt_train, chunks={'hologram_number': batch_size}) as ds:
    print("Loading TRAINING dataset")

    # Reorder so 'hologram_number' comes first and 'input_channels' last.
    input_dims = ds[input_variable].dims
    if len(input_dims) == 4:
        train_inputs_matt = ds[input_variable].transpose('hologram_number','xsize','ysize','input_channels')
    elif len(input_dims) == 3:
        train_inputs_matt = ds[input_variable].transpose('hologram_number','rsize','input_channels')
    else:
        # Fail loudly: without this branch a re-run would silently reuse a stale
        # `train_inputs_matt` left in the kernel, or raise a confusing NameError.
        raise ValueError(
            f"Expected {input_variable!r} to have 3 or 4 dimensions, got {input_dims}"
        )
    print(f"\ttrain_inputs_matt.shape:{train_inputs_matt.shape}")

    input_scaler_matt = ml.MinMaxScalerX(train_inputs_matt)
    train_inputs_matt = input_scaler_matt.fit_transform(train_inputs_matt)
    print(f"\ttrain_inputs_matt.shape:{train_inputs_matt.shape}")

    train_outputs_matt = ds[label_variable]
    print(f"\ttrain_outputs_matt.shape:{train_outputs_matt.shape}")

    output_scaler_matt = ml.MinMaxScalerX(train_outputs_matt)
    train_outputs_matt = output_scaler_matt.fit_transform(train_outputs_matt)
    print(f"\ttrain_outputs_matt.shape:{train_outputs_matt.shape}")
    

Loading TRAINING dataset
	train_inputs_matt.shape:(5000, 600, 1)
	train_inputs_matt.shape:(5000, 600, 1)
	train_outputs_matt.shape:(5000, 19, 1)
	train_outputs_matt.shape:(5000, 19, 1)


In [5]:
path_data = "/glade/p/cisl/aiml/ai4ess_hackathon/holodec/ft_rad_bidis/"

# Pick the first file in the directory whose name mentions each split.
fns = os.listdir(path_data)
fn_train = [x for x in fns if 'training' in x][0]
fn_valid = [x for x in fns if 'validation' in x][0]
fn_test = [x for x in fns if 'test' in x][0]
print(fns)


def _load_scaled_split(path, title, prefix):
    """Open one split, scale its inputs and labels, and report shapes.

    Parameters
    ----------
    path : str
        NetCDF file to open.
    title : str
        Split name used in the "Loading ... dataset" message.
    prefix : str
        Variable-name prefix used in the printed shape lines.

    Returns
    -------
    tuple
        ``(inputs, outputs, input_scaler, output_scaler)`` where the arrays
        are the results of ``fit_transform`` on scalers built from this split.

    Raises
    ------
    ValueError
        If the input variable has neither 3 nor 4 dimensions.
    """
    with xr.open_dataset(path, chunks={'hologram_number': batch_size}) as ds:
        print(f"Loading {title} dataset")

        # Reorder so 'hologram_number' comes first and 'input_channels' last.
        input_dims = ds[input_variable].dims
        if len(input_dims) == 4:
            inputs = ds[input_variable].transpose('hologram_number','xsize','ysize','input_channels')
        elif len(input_dims) == 3:
            inputs = ds[input_variable].transpose('hologram_number','rsize','input_channels')
        else:
            # Previously an unexpected rank left the variable unassigned.
            raise ValueError(
                f"Expected {input_variable!r} to have 3 or 4 dimensions, got {input_dims}"
            )
        print(f"\t{prefix}_inputs.shape:{inputs.shape}")

        in_scaler = ml.MinMaxScalerX(inputs)
        inputs = in_scaler.fit_transform(inputs)
        print(f"\t{prefix}_inputs.shape:{inputs.shape}")

        outputs = ds[label_variable]
        print(f"\t{prefix}_outputs.shape:{outputs.shape}")

        out_scaler = ml.MinMaxScalerX(outputs)
        outputs = out_scaler.fit_transform(outputs)
        print(f"\t{prefix}_outputs.shape:{outputs.shape}")

    return inputs, outputs, in_scaler, out_scaler


train_inputs, train_outputs, input_scaler, output_scaler = _load_scaled_split(
    path_data + fn_train, "TRAINING", "train")

# NOTE(review): as in the original cell, the validation split builds its own
# scalers and they replace the training `input_scaler` / `output_scaler`.
# Confirm this matches how the saved predictions were produced.
valid_inputs, valid_outputs, input_scaler, output_scaler = _load_scaled_split(
    path_data + fn_valid, "VALIDATION", "valid")


['histogram_validation_data_5000count20201009T085713.nc', 'histogram_test_data_5000count20201009T085713.nc', 'histogram_training_data_5000count20201009T085713.nc']
Loading TRAINING dataset
	train_inputs.shape:(5000, 600, 1)
	train_inputs.shape:(5000, 600, 1)
	train_outputs.shape:(5000, 39, 1)
	train_outputs.shape:(5000, 39, 1)
Loading VALIDATION dataset
	valid_inputs.shape:(1000, 600, 1)
	valid_inputs.shape:(1000, 600, 1)
	valid_outputs.shape:(1000, 39, 1)
	valid_outputs.shape:(1000, 39, 1)


In [6]:
# Check whether the two training sets hold identical scaled inputs and labels.
for ours, matts in ((train_inputs, train_inputs_matt),
                    (train_outputs, train_outputs_matt)):
    print(np.array_equal(ours.values, matts.values))

True

In [8]:
# Result directories for the d, z and z_realimag runs; the plotting cell reads
# loss and prediction CSV files from one of these.
models = ["/glade/p/cisl/aiml/ggantos/holodec/ft_rad_bidis/d/",
          "/glade/p/cisl/aiml/ggantos/holodec/ft_rad_bidis/z/",
          "/glade/p/cisl/aiml/ggantos/holodec/ft_rad_bidis/z_realimag/"]
models

['/glade/p/cisl/aiml/ggantos/holodec/ft_rad_bidis/d/',
 '/glade/p/cisl/aiml/ggantos/holodec/ft_rad_bidis/z/',
 '/glade/p/cisl/aiml/ggantos/holodec/ft_rad_bidis/z_realimag/']

In [None]:
# Index of the validation sample shown in the middle panel.
valid_index = 11

model = models[0]
model_name = "diameter"

# Alternative settings for the z histogram:
# var = 'z'
# valid_outputs = np.squeeze(data_loaded['z_valid_outputs'].values)
# bins = calc_bins(var)
var = 'd'
# NOTE(review): `data_loaded` is not defined in any cell visible above, so this
# line raises NameError on a fresh kernel unless it is created elsewhere — confirm.
valid_outputs = np.squeeze(data_loaded['base_valid_outputs'].values)
bins = calc_bins(var)[:-1]

bin_size = bins[1] - bins[0]

# Saved predictions and loss curves for the selected model directory.
train_outputs_pred = np.genfromtxt(os.path.join(model, "train_outputs_pred.csv"))
valid_outputs_pred = np.genfromtxt(os.path.join(model, "valid_outputs_pred.csv"))

loss = np.genfromtxt(os.path.join(model, "loss.csv"))
val_loss = np.genfromtxt(os.path.join(model, "val_loss.csv"))

fig, axes = plt.subplots(1, 3, figsize=(16, 4))

# Left panel: training and validation loss per epoch.
axes[0].plot(loss)
axes[0].plot(val_loss)
axes[0].set_title(f'{model_name} loss')
axes[0].set_ylabel('loss')
axes[0].set_xlabel('epoch')
axes[0].legend(['training', 'validation'], loc='upper left')

# Middle panel: predicted vs. true histogram for one validation sample.
axes[1].bar(bins / 1000, valid_outputs_pred[valid_index], bin_size / 1000, color='red', label="Predicted")
axes[1].bar(bins / 1000, valid_outputs[valid_index], bin_size / 1000, edgecolor='blue', facecolor="none", lw=3, label="True")
axes[1].set_ylim(0, 1)
axes[1].set_xlabel(f"{var}-axis particle position (mm)")
axes[1].set_ylabel(f"relative {var} distribution")
axes[1].legend(loc="best")

# Right panel: predicted vs. true histogram averaged over all validation samples.
axes[2].bar(bins / 1000, valid_outputs_pred.mean(axis=0), bin_size / 1000, color='red')
axes[2].bar(bins / 1000, valid_outputs.mean(axis=0), bin_size / 1000, edgecolor='blue', facecolor="none", lw=3)
axes[2].set_xlabel(f"{var} location (mm)")
axes[2].set_ylabel(f"Mean {var} Distribution")

# Skill score relative to a uniform histogram used as the reference forecast.
rps_nn = ranked_probability_score(valid_outputs, valid_outputs_pred)
rps_climo = ranked_probability_score(valid_outputs, np.ones(valid_outputs_pred.shape) / valid_outputs_pred.shape[1])
print(f"RPS_nn: {rps_nn:0.3f}", f"RPS_climo: {rps_climo:0.3f}")
rpss = 1 - rps_nn / rps_climo
print(f"RPSS: {rpss:0.3f}")
plt.show()