In [1]:
import numpy as np
import pandas as pd
import os
import h5py

In [3]:
# Load a single libradtran output file. It is used only to obtain the column
# names (and their order) of the output data, not for its values.
output1 = pd.read_json("/fmi/projappl/project_2004400/jamin/data/libradtran_data/NN_data/output_params/afglms_Q1_0_sza25_vza1.0_phi0_phi00_alt0_tau0.05.json")

In [4]:
# Column names of the sample output file as a plain list, in file order.
output_cols = output1.columns.values.tolist()
output_cols

['wavelength',
 'rho0',
 'rho1',
 'rho2',
 'tdir_down',
 'tdif_down',
 'tdir_up',
 'tdif_up',
 'spherical_albedo',
 'edir',
 'edif',
 'path_rad',
 'albedo1',
 'albedo2']

In [4]:
def gen_outputs_full(output_folder_path):
    """
    Stacks every libradtran output file in a folder into one 3-D tensor.

    Files are read in sorted file-name order, so slice ``i`` of the result holds
    the contents of the i-th name in ``sorted(os.listdir(output_folder_path))``.
    The tensor shape is inferred from the data instead of being hard-coded; for
    the full libradtran data set it is (1728, 235002, 14).

    Args:
        output_folder_path: path to the folder containing the libradtran output
            files (JSON). A trailing path separator is optional.

    Returns:
        output_tensor: NumPy float array of shape (n_files, n_rows, n_cols).

    Raises:
        ValueError: if the folder contains no files, or if a file's table shape
            differs from the shape of the first file.
    """
    output_files_list = sorted(os.listdir(output_folder_path))
    if not output_files_list:
        raise ValueError(f"No output files found in {output_folder_path!r}.")

    Y = None
    for i, file in enumerate(output_files_list):
        # os.path.join works whether or not the folder path ends with a separator.
        full_path = os.path.join(output_folder_path, file)
        M = pd.read_json(full_path).values
        if Y is None:
            # Allocate once, sized from the first file: (files, rows, columns).
            Y = np.zeros((len(output_files_list),) + M.shape)
        elif M.shape != Y.shape[1:]:
            # Fail loudly instead of letting NumPy broadcast a mismatched table.
            raise ValueError(
                f"File {file!r} has shape {M.shape}, expected {Y.shape[1:]}."
            )
        Y[i, :, :] = M
    return Y

In [6]:
# Read every libradtran output file in the folder into a single tensor.
output_folder_path = "/fmi/projappl/project_2004400/jamin/data/libradtran_data/NN_data/output_params/"
outputs_full = gen_outputs_full(output_folder_path)

In [9]:
# Sanity check: the axes are (file, row within file, output column).
outputs_full.shape

(1728, 235002, 14)

In [5]:
def save_HDF5(Y, save_loc, file_name):
    """
    Saves the given NumPy array into the desired location in HDF5 format.

    The array is written to a dataset named "output_data". An existing file at
    the target path is overwritten (the file is opened in "w" mode).

    Args:
        Y: NumPy array
        save_loc: the path to the desired saving location (directory); a
            trailing path separator is optional
        file_name: name of the saved file

    Returns:
        None.
    """
    # os.path.join inserts the separator only when save_loc lacks one, so
    # "/data/out" and "/data/out/" both resolve to the same file.
    full_path = os.path.join(save_loc, file_name)
    with h5py.File(full_path, "w") as hf:
        hf.create_dataset("output_data", data=Y)

In [11]:
# Persist the full output tensor as HDF5 so later cells can reload it
# instead of re-reading the individual JSON files.
output_save_loc = "/fmi/projappl/project_2004400/jamin/data/libradtran_data/NN_data/"
output_file_name = "outputs_full.h5"
save_HDF5(Y=outputs_full, save_loc=output_save_loc, file_name=output_file_name)

In [6]:
def load_HDF5(file_path):
    """
    Reads the "output_data" dataset of an .h5 file fully into memory.

    Args:
        file_path: path to the .h5 file holding the outputs

    Returns:
        data: a NumPy array with the complete contents of the dataset.
    """
    with h5py.File(file_path, "r") as h5_file:
        # Slicing with [:] copies the dataset out of the file, so the returned
        # array stays valid after the file is closed.
        return h5_file["output_data"][:]

In [7]:
# Location of the HDF5 file holding the full output tensor.
outputs_path = "/fmi/projappl/project_2004400/jamin/data/libradtran_data/NN_data/outputs_full.h5"

In [8]:
# Full output tensor, read back from the HDF5 file.
Y = load_HDF5(outputs_path)

In [9]:
# Sanity check: the axes are (file, row within file, output column).
Y.shape

(1728, 235002, 14)

<h3>Dealing with missing values</h3>

In [10]:
def columns_w_nans(Y, col_names=None):
    """
    Reports which columns other than spherical_albedo contain nans.

    Each offending column name is printed once, in the order it is first
    encountered, no matter how many matrices it appears in. If no column other
    than spherical_albedo contains nans, a single summary line is printed.

    Args:
        Y: a NumPy array (tensor) whose last axis indexes the output columns
        col_names: names of the columns along the last axis of Y. Defaults to
            the notebook-level ``output_cols`` list when omitted.

    Returns:
        None.

    Raises:
        ValueError: if the number of column names differs from Y.shape[2].

    Prints:
        Columns (other than spherical_albedo) that contain nan values.
    """
    if col_names is None:
        col_names = output_cols
    col_names = list(col_names)
    if len(col_names) != Y.shape[2]:
        raise ValueError(
            f"Got {len(col_names)} column names for {Y.shape[2]} columns."
        )

    reported = []
    # Work one matrix at a time so the temporary boolean mask stays small.
    for i in range(Y.shape[0]):
        nan_in_col = pd.isna(Y[i]).any(axis=0)
        for name, has_nan in zip(col_names, nan_in_col):
            if has_nan and name != "spherical_albedo" and name not in reported:
                print(name)
                reported.append(name)
    if not reported:
        print("Only spherical albedo contains nans.")

In [11]:
# Report any column other than spherical_albedo that contains nans.
columns_w_nans(Y)

Only spherical albedo contains nans.


Most likely, the reason that spherical_albedo contains nans is that it is computed as follows:

\begin{align*}
    \text{spherical albedo} = \frac{\text{albedo1} \, * \, (\text{rho2} - \text{rho0}) \, * \, \text{albedo2} \, * \, (\text{rho1} - \text{rho0})}{\text{albedo2} \, * \, \text{albedo1} \, * \, (\text{rho2} - \text{rho1})}.
\end{align*}

After some investigation, it looks like in the rows where $\, \text{spherical_albedo} = \text{nan}, \,$ the values of $\, \text{rho2} \,$ and $\, \text{rho1} \,$ are equal to each other, which means that the spherical albedo calculation divides by zero. To deal with this, I am going to alter the data such that 

    
``` python
if (rho1 == rho2) or (rho1 == rho0) or (rho2 == rho0):
    spherical_albedo = 0
```

Hopefully after this, the nans are gone.

<h3>Replace the missing values</h3>

In [12]:
def replace_nans(Y):
    """
    Sets spherical_albedo to 0 in every row where
    (rho1 == rho2) or (rho1 == rho0) or (rho2 == rho0).

    The value is overwritten whenever the condition holds, whether or not it
    was nan beforehand. Rows in which all three rho values differ are left
    untouched, including any nan they may contain.

    Args:
        Y: a NumPy array (tensor) whose last axis holds the output columns

    Returns:
        Y: the same array object that was passed in, modified in place
           (no copy is made).

    Notes:
        - rho0 is located in the 2nd col (index position 1)
        - rho1 is located in the 3rd col (index position 2)
        - rho2 is located in the 4th col (index position 3)
        - spherical_albedo is located in the 9th col (index position 8)
        - The matrices are modified inplace.
    """
    RHO0_COL, RHO1_COL, RHO2_COL, SPHERICAL_ALBEDO_COL = 1, 2, 3, 8

    rho0 = Y[..., RHO0_COL]
    rho1 = Y[..., RHO1_COL]
    rho2 = Y[..., RHO2_COL]

    # One vectorized comparison replaces the Python-level loop over every row.
    # nan == nan is False here, exactly as in the scalar comparison.
    degenerate = (rho1 == rho2) | (rho1 == rho0) | (rho2 == rho0)

    # Boolean mask over the leading axes + integer column index writes into Y itself.
    Y[degenerate, SPHERICAL_ALBEDO_COL] = 0
    return Y

In [13]:
# NOTE: replace_nans modifies its argument in place and returns that same
# array, so Y_modified and Y refer to the same (now modified) data.
Y_modified = replace_nans(Y)

In [14]:
def check_nans(Y):
    """
    Reports which matrices of the given NumPy array contain nans.

    One line is printed per matrix (first-axis slice) that holds at least one
    nan; if there is none, a single confirmation line is printed instead.

    Args:
        Y: a NumPy array

    Returns:
        None.
    """
    found_any = False
    for i in range(len(Y)):
        if not np.isnan(Y[i]).any():
            continue
        found_any = True
        print(f"Matrix with index {i} contains missing values.")
    if found_any:
        return
    print("The given tensor does not contain any missing values.")

In [15]:
# Verify that no nans remain after the replacement step.
check_nans(Y_modified)

The given tensor does not contain any missing values.


In [16]:
# Map each libradtran output variable name to its column index position,
# taken from the column order of the sample output file.
output_vars_col_idx = {col_name: idx for idx, col_name in enumerate(output1.columns)}
output_vars_col_idx

{'wavelength': 0,
 'rho0': 1,
 'rho1': 2,
 'rho2': 3,
 'tdir_down': 4,
 'tdif_down': 5,
 'tdir_up': 6,
 'tdif_up': 7,
 'spherical_albedo': 8,
 'edir': 9,
 'edif': 10,
 'path_rad': 11,
 'albedo1': 12,
 'albedo2': 13}

In [17]:
def create_subset(Y):
    """
    Selects the columns of the output variables that are to be predicted.

    The variables of interest, in the order they appear in the result, are:
    tdir_down, tdif_down, tdir_up, tdif_up, spherical_albedo, edir, edif, path_rad.

    Args:
        Y: the full output data (tensor) containing all the libradtran output variables

    Returns:
        Y_subset: subset of the full output data containing only the output variables of interest.
    """
    variables_of_interest = (
        "tdir_down",
        "tdif_down",
        "tdir_up",
        "tdif_up",
        "spherical_albedo",
        "edir",
        "edif",
        "path_rad",
    )

    # Translate the variable names into their column positions
    column_positions = [output_vars_col_idx[name] for name in variables_of_interest]

    # Indexing the last axis with a list picks exactly those columns, in this order
    return Y[:, :, column_positions]

In [18]:
# Keep only the output variables of interest (see create_subset).
Y_subset = create_subset(Y_modified)

In [19]:
# Sanity check: the last axis should now hold the 8 selected variables.
Y_subset.shape

(1728, 235002, 8)

In [20]:
# Save the reduced tensor as its own HDF5 file in the NN_data folder.
save_HDF5(Y=Y_subset,
          save_loc="/fmi/projappl/project_2004400/jamin/data/libradtran_data/NN_data/",
          file_name="outputs_subset.h5")