In [2]:
import os

import h5py
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [3]:
def prepare_data(NN_data_path: str, output_var: str, train_prop: float, val_prop: float, test_prop: float) -> dict:
    """
    Prepares the input and output data for a neural network model in Tensorflow.
    The operations performed on the data are:
    dropping the identifier column ("atmosphere_file") from the input data,
    splitting the data into training, validation, and test sets,
    min-max scaling the features with a scaler fitted on the training set only,
    converting the data into float32 'tf.Tensor's,
    adding a trailing dimension to the input tensors so that they can be fed into convolutional layers,
    selecting the requested output variable from the output array.

    Args:
    NN_data_path -- folder where "inputs.csv" and "outputs_subset.h5" are located
                    (a trailing path separator is optional)
    output_var -- name of the output variable that you want data from (case-insensitive)
    train_prop -- proportion of the data to be used for training
    val_prop -- proportion of the data to be used for validation
    test_prop -- proportion of the data to be used for testing

    Returns:
    data_dict -- a dict containing the following data (with <output_var> replaced
                 by the lower-cased name that was requested):
                 scaled_X_train,
                 scaled_X_val,
                 scaled_X_test,
                 Y_train_<output_var>,
                 Y_val_<output_var>,
                 Y_test_<output_var>.

    Raises:
    AssertionError -- if the proportions do not sum to 1 or output_var is not a known output.

    Notes:
    - train_prop + val_prop + test_prop must be equal to 1 (checked with a small
      tolerance, because sums such as 0.7 + 0.2 + 0.1 are not exactly 1.0 in
      floating point).
    - options for output_var include: tdir_down, tdif_down, tdir_up, tdif_up, spherical_albedo, edir, edif, path_rad.
    - The scaler is fitted on the training features only, so the training features
      lie in [0, 1] while validation / test features can fall slightly outside
      that range.
    - The outputs are indexed as Y[:, :, k]; this assumes the HDF5 dataset
      "output_data" has the output variables along its third axis, in the order
      listed above -- TODO confirm against the code that wrote outputs_subset.h5.
    """
    # Single source of truth for the valid names AND their position along the
    # last indexed axis of the output array.
    output_cols_in_order = ("tdir_down", "tdif_down", "tdir_up", "tdif_up",
                            "spherical_albedo", "edir", "edif", "path_rad")

    output_var = output_var.lower()
    # Exact '== 1' rejects legitimate splits (0.7 + 0.2 + 0.1 == 0.9999999999999999),
    # so compare against 1 with a tight absolute tolerance instead.
    assert np.isclose(train_prop + val_prop + test_prop, 1.0, rtol=0.0, atol=1e-9), "train_prop + val_prop + test_prop must be equal to 1."
    assert output_var in output_cols_in_order, "output not found."

    # Inputs
    # os.path.join makes the trailing separator on NN_data_path optional.
    X = pd.read_csv(os.path.join(NN_data_path, "inputs.csv"), index_col=0)
    X = X.drop("atmosphere_file", axis=1)    # Drop the identifier col

    def load_HDF5(folder_path: str, file_name: str):
        """Read the whole "output_data" dataset of an HDF5 file into memory."""
        with h5py.File(name=os.path.join(folder_path, file_name), mode="r") as hf:
            data = hf["output_data"][:]
        return data

    # Outputs
    Y = load_HDF5(NN_data_path, "outputs_subset.h5")

    ###### Calculate the train, validation, and test sizes #####
    n = X.shape[0]    # number of samples
    num_train_samples = int(n * train_prop)
    num_val_samples = int(n * val_prop)
    # The test set takes whatever is left so that every sample is used exactly once.
    num_test_samples = n - num_train_samples - num_val_samples

    ###### Data splits ######
    # (X, Y) ==> (train | temp)
    # temp ==> (validation | test)
    X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, train_size=num_train_samples, random_state=42)
    X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=num_test_samples, random_state=42)

    ##### Scale the features for numerical stability #####
    # Fit on the training split only to avoid leaking validation / test statistics.
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(X_train)

    def to_conv_input(features):
        """Scale, cast to a float32 tensor, and append a trailing axis for convolution."""
        scaled = tf.cast(scaler.transform(features), dtype=tf.float32)
        return tf.expand_dims(scaled, axis=-1)

    # A dict to store the returned data
    data_dict = {
        "scaled_X_train": to_conv_input(X_train),
        "scaled_X_val": to_conv_input(X_val),
        "scaled_X_test": to_conv_input(X_test)
    }

    ##### Select the requested output #####
    output_idx = output_cols_in_order.index(output_var)
    for split_name, Y_split in (("train", Y_train), ("val", Y_val), ("test", Y_test)):
        Y_tensor = tf.cast(Y_split, dtype=tf.float32)
        data_dict[f"Y_{split_name}_{output_var}"] = Y_tensor[:, :, output_idx]

    return data_dict

<h3>Convert to .py script</h3>

In [None]:
# Export this notebook as a plain Python script (prepare_NN_data.py) so that
# prepare_data can be imported from other notebooks / scripts.
!jupyter nbconvert --to script prepare_NN_data.ipynb

[NbConvertApp] Converting notebook prepare_NN_data.ipynb to script
[NbConvertApp] Writing 8261 bytes to prepare_NN_data.py


- A Jupyter notebook must be converted to a Python script (a '.py' file) before it can be imported directly into another notebook or script.
- Note that the conversion does not replace this notebook: the '.ipynb' file is left as it is, and a copy of its code is written to a separate '.py' file.