In [1]:
import configparser
import glob
import os
import re

import numpy as np
import pandas as pd

In [2]:
def extract_vals(full_ini_path, key):
    """
    Extract one value from the input string stored in a run-configuration .ini file.

    The input string is read from the ``input_str`` option of the ``main_str``
    section. Two keys are handled specially:

    * ``"tau"`` is looked up as ``aerosol_modify tau set <number>``.
    * ``"atmosphere_file"`` returns the last ``/``-separated component of the
      path that follows ``atmosphere_file``.

    Every other key (and either special key whose dedicated pattern did not
    match) is looked up as ``<key> <number>``.

    Args:
        full_ini_path: a full path to the .ini file that you wish to extract values from
        key: the variable name in the .ini file input string that you wish to extract

    Returns:
        A float for numeric keys, or a str (the file name) for "atmosphere_file".

    Raises:
        KeyError: if the ``main_str`` section or its ``input_str`` option is
            missing. This is also what happens for an unreadable path, because
            ``ConfigParser.read`` skips files it cannot open.
        ValueError: if no value for ``key`` is found in the input string.
    """
    # Optional sign, digits with an optional fraction, optional exponent.
    # A plain ``[\d.]+`` would stop at the "e" of "1e-3" and silently yield 1.0.
    number = r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?'

    # Initialize the parser and read in the .ini file
    config = configparser.ConfigParser()
    config.read(full_ini_path)

    # Extract the input string from the main_str section
    input_str = config["main_str"]["input_str"]

    if key == "tau":
        tau_match = re.search(rf'\baerosol_modify tau set ({number})', input_str)
        if tau_match:
            return float(tau_match.group(1))

    elif key == "atmosphere_file":
        atmosphere_match = re.search(r'\batmosphere_file\s+(\S+)', input_str)
        if atmosphere_match:
            # Only the file name is of interest, not the directory it lives in
            return atmosphere_match.group(1).split("/")[-1]

    # Generic "<key> <number>" lookup; the key is escaped so that regex
    # metacharacters in it are matched literally.
    match = re.search(rf'\b{re.escape(key)}\b\s+({number})', input_str)

    if match is None:
        raise ValueError(f"Invalid key: {key} not found")
    return float(match.group(1))

In [3]:
def construct_matrix(
    full_ini_path,
    mls_folder_path="/fmi/projappl/project_2001985/jamin/data/libradtran_data/mls_files/",
):
    """
    Constructs an input matrix (a single row) for the NN from one run config.

    The values sza, altitude and tau are extracted from the .ini file.
    The seasonal atmospheric profile named in the .ini file is used as an
    identifier, and its temperature values are appended as extra columns
    whose names are the corresponding pressure levels.

    Args:
        full_ini_path: a full path to the .ini file that you wish to extract values from
        mls_folder_path: folder that contains the atmospheric profile files.
            Defaults to the project's mls_files folder; a trailing slash is optional.

    Returns:
        M: A Pandas DataFrame with one row. The columns are
           "atmosphere_file", "sza", "altitude", "tau", followed by one
           column per pressure level of the profile.
    """
    sza = extract_vals(full_ini_path, key="sza")
    altitude = extract_vals(full_ini_path, key="altitude")
    tau = extract_vals(full_ini_path, key="tau")
    atmosphere_file = extract_vals(full_ini_path, key="atmosphere_file")

    # Read in the atmospheric profile file as a NumPy array.
    # genfromtxt returns a 1-D array for a single-row file, so force 2-D
    # to keep the column indexing below valid.
    profile_path = os.path.join(mls_folder_path, atmosphere_file)
    mls_arr = np.atleast_2d(np.genfromtxt(profile_path, dtype="float"))

    # The third column contains the temperature profile
    T_profile = mls_arr[:, 2]

    # The second column contains the pressure levels (these are used as column names).
    # Converting to a Python float first keeps the label format, e.g. "2e-05".
    pressure_labels = [str(float(lvl)) for lvl in mls_arr[:, 1]]

    # Create a dataframe (row vector) of temperatures where the pressure levels are the column names
    df_T = pd.DataFrame([T_profile], columns=pressure_labels)

    # Create a dataframe (row vector) of the variables of interest
    df_vars = pd.DataFrame({"atmosphere_file": [atmosphere_file],
                            "sza": [sza],
                            "altitude": [altitude],
                            "tau": [tau]})

    # Concatenate the two dataframes to form one bigger dataframe (row vector)
    M = pd.concat([df_vars, df_T], axis=1)

    return M

In [4]:
def gen_unique(ini_folder_path: str):
    """
    Generates a larger matrix by calling the construct_matrix function once per run config.

    Notice that when we ran libradtran, we varied only few of the parameters.
    Every run generated 5 different run configuration files (.ini files).
    We don't need to select them all. This function filters the .ini files
    in a way that only the unique run configurations get chosen
    (the files whose name matches "*conf_obj1*").

    Args:
        ini_folder_path: path to the folder containing the run configs
            (a trailing slash is optional)

    Returns:
        X: A Pandas DataFrame with one row per selected run config, ordered by
           sorted file path. An empty DataFrame if no file matches.
    """
    # These files contain the unique run configurations that we need
    filtered_ini_paths = sorted(glob.glob(os.path.join(ini_folder_path, "*conf_obj1*")))

    # Build every row first and concatenate once: growing a DataFrame inside
    # the loop re-copies all previous rows on each iteration.
    rows = [construct_matrix(ini_path) for ini_path in filtered_ini_paths]

    # pd.concat raises on an empty list, so keep the "no files" case explicit
    if not rows:
        return pd.DataFrame()

    X = pd.concat(rows, axis=0, ignore_index=True)

    return X

In [5]:
# Folder that holds the run-configuration .ini files
ini_folder_path = "/fmi/projappl/project_2001985/jamin/data/libradtran_data/NN_data/ini_files/"
# Build the input matrix: one row per run config selected by gen_unique
X = gen_unique(ini_folder_path)

In [6]:
# Sanity check: (number of selected run configs, number of columns)
X.shape

(1728, 54)

In [7]:
# Preview the first rows of the input matrix
X.head()

Unnamed: 0,atmosphere_file,sza,altitude,tau,2e-05,4e-05,6e-05,0.00012,0.00026,0.00062,...,324.0,372.0,426.0,487.0,554.0,628.0,710.0,802.0,902.0,1013.0
0,afglms_Q1_0,25.0,0.0,0.05,248.7303,248.7303,248.7303,248.7303,248.7303,248.7303,...,215.0723,220.1957,225.84,232.0706,237.8445,243.4155,248.4294,252.3007,253.4411,249.7135
1,afglms_Q1_0,25.0,0.0,0.15,248.7303,248.7303,248.7303,248.7303,248.7303,248.7303,...,215.0723,220.1957,225.84,232.0706,237.8445,243.4155,248.4294,252.3007,253.4411,249.7135
2,afglms_Q1_0,25.0,0.0,0.3,248.7303,248.7303,248.7303,248.7303,248.7303,248.7303,...,215.0723,220.1957,225.84,232.0706,237.8445,243.4155,248.4294,252.3007,253.4411,249.7135
3,afglms_Q1_0,25.0,1.5,0.05,248.7303,248.7303,248.7303,248.7303,248.7303,248.7303,...,215.0723,220.1957,225.84,232.0706,237.8445,243.4155,248.4294,252.3007,253.4411,249.7135
4,afglms_Q1_0,25.0,1.5,0.15,248.7303,248.7303,248.7303,248.7303,248.7303,248.7303,...,215.0723,220.1957,225.84,232.0706,237.8445,243.4155,248.4294,252.3007,253.4411,249.7135


In [8]:
# Column dtypes and non-null counts of the input matrix
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 54 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   atmosphere_file  1728 non-null   object 
 1   sza              1728 non-null   float64
 2   altitude         1728 non-null   float64
 3   tau              1728 non-null   float64
 4   2e-05            1728 non-null   float64
 5   4e-05            1728 non-null   float64
 6   6e-05            1728 non-null   float64
 7   0.00012          1728 non-null   float64
 8   0.00026          1728 non-null   float64
 9   0.00062          1728 non-null   float64
 10  0.00164          1728 non-null   float64
 11  0.00448          1728 non-null   float64
 12  0.012            1728 non-null   float64
 13  0.03             1728 non-null   float64
 14  0.067            1728 non-null   float64
 15  0.139            1728 non-null   float64
 16  0.272            1728 non-null   float64
 17  0.515         

In [9]:
def save_csv(df, save_loc, file_name):
    """
    Saves the given Pandas DataFrame into a csv file.

    The DataFrame index is written as the first column (the to_csv default).

    Args:
        df: the Pandas DataFrame to be saved
        save_loc: path to the folder where to save the csv file
            (a trailing slash is optional)
        file_name: name of the csv file that is to be saved

    Returns:
        None.
    """
    # os.path.join inserts the separator when save_loc has no trailing slash;
    # plain string concatenation would silently write next to the folder instead.
    df.to_csv(os.path.join(save_loc, file_name))

In [10]:
# Write the input matrix X to inputs.csv in the NN_data folder
input_save_loc = "/fmi/projappl/project_2001985/jamin/data/libradtran_data/NN_data/"
input_file_name = "inputs.csv"
save_csv(df=X, save_loc=input_save_loc, file_name=input_file_name)

In [11]:
def load_csv(folder_path, file_name):
    """
    Loads a given csv file.

    The first column of the file is used as the DataFrame index.

    Args:
        folder_path: the path to the folder containing the csv file
            (a trailing slash is optional)
        file_name: name of the csv file

    Returns:
        df: the csv file in a Pandas DataFrame.
    """
    # os.path.join inserts the separator when folder_path has no trailing slash
    df = pd.read_csv(os.path.join(folder_path, file_name), index_col=0)
    return df

In [12]:
# Read inputs.csv back from the NN_data folder into df
csv_folder_path = "/fmi/projappl/project_2001985/jamin/data/libradtran_data/NN_data/"
csv_file_name = "inputs.csv"
df = load_csv(csv_folder_path, csv_file_name)

In [13]:
# Test that the inputs saved and loaded correctly by comparing the in-memory
# frame X with the frame df read back from the csv file:
#   1) element-wise equality of the two DataFrames,
#   2) equality of the underlying value arrays,
#   3) equality of the column labels.
print((X == df).all().all())
print((X.values == df.values).all())
print((X.columns == df.columns).all())

True
True
True
