In [None]:
### Import the package
import os

import numpy as np
import pandas as pd
import xarray as xr

import storypy as sp # check if this is used for something else
# import esmvaltool
# from esmvaltool.diag_scripts.shared import run_diagnostic, get_cfg, group_metadata
# from esmvaltool.diag_scripts.shared._base import _get_input_data_files

In [None]:
# NOTE: the three triple-quoted blocks in this cell are bare string
# expressions, not executable code; nothing below is actually run. They keep
# the ESMValTool-based workflow around for reference only.
'''
To use the esmvaltool configuration file, we need to import the esmvaltool package.
This package is not available in the current environment.
The following code is an example of how to use the esmvaltool configuration file.
'''

# Disabled helper: it relies on get_cfg and _get_input_data_files, whose
# imports are commented out in the import cell of this file.
'''
def parse_config(file):
    """Parse the settings file."""
    config = get_cfg(file)           
    config['input_data'] = _get_input_data_files(config)
    return config
'''

# Disabled driver call.
# NOTE(review): before re-enabling this, remove the stray trailing quote after
# the parse_config(...) call (it would be a syntax error as live code), and
# note that the `cs` alias is not imported anywhere in the visible part of
# this file (storypy is imported as `sp`).
'''
# Load the configuration file
config= parse_config('/climca/people/ralawode/esmvaltool_output/full_storyline_analysis_complete_20240923_140137/run/multiple_regression_indices/multiple_regresion/settings.yml')'

cs.main_esmval(config, user_config)
'''

This configuration dictionary is used to set up the processing parameters for the climate data analysis tool. Each key in the dictionary configures a specific aspect of the data processing pipeline, from specifying file directories to setting analysis periods and regions.

## Keys and Descriptions
- data_dir
    - Type: String
    - Description:
        - The directory path where the source CMIP6 netCDF files are stored.
    - Example: '/climca/data/cmip6-ng'

- work_dir
    - Type: String
    - Description:
        - The directory path where intermediate processing outputs and combined netCDF files are written.
    - Example: '/climca/people/storylinetool/test_user/work_dir'

- plot_dir
    - Type: String
    - Description:
        - The directory path where the generated plots (e.g., time series plots) will be saved.
    - Example: '/climca/people/storylinetool/test_user/plot_dir'

- var_name
    - Type: List of strings
    - Description:
        - A list of variable names to be processed. These variable names correspond to the variables available in the dataset (e.g., precipitation (pr), sea level pressure (psl)).
    - Example: ['pr', 'psl']

- exp_name
    - Type: String
    - Description:
        - The experiment (scenario) name used to select which scenario files are processed (e.g., an SSP climate change scenario identifier).
    - Example: 'ssp585'

- freq
    - Type: String
    - Description:
        - The frequency of the data, such as monthly ('mon').
    - Example: 'mon'

- grid
    - Type: String
    - Description:
        - The grid resolution of the dataset (e.g., 'g025' for a 2.5° grid).
    - Example: 'g025'

- region_method
    - Type: String
    - Description:
        - The method to define the region for spatial analysis. For example, using a bounding box method ('box').
    - Example: 'box'

- period1
    - Type: List of strings
    - Description:
        - The first time period used for the baseline climatology (e.g., historical period). The list contains the start and end years as strings.
    - Example: ['1950', '1979']

- period2
    - Type: List of strings
    - Description:
        - The second time period used for future projections. This list also contains the start and end years as strings.
    - Example: ['2070', '2099']

- region_id
    - Type: Integer
    - Description:
        - An identifier for the region being analyzed. This can be used to apply region-specific processing or look up additional metadata.
    - Example: 18

- season
    - Type: List of integers (or tuple)
    - Description:
        - The months representing the season for which the analysis is performed. This can be provided as a list or a tuple.
    - Example: [11, 12, 1, 2, 3]
    - Note: You can also supply a tuple, e.g., (12, 1, 2).

- region_extents
    - Type: List of tuples
    - Description:
        - A list of tuples, where each tuple defines the geographical bounding box for a region. The tuple values represent (latitude_min, latitude_max, longitude_min, longitude_max).
    - Example: [(30, 45, -10, 40), (45, 55, 5, 20)]

In [None]:
# Processing parameters for the direct (non-ESMValTool) workflow.
user_config = dict(
    data_dir='/climca/data/cmip6-ng',
    work_dir='/climca/people/storylinetool/test_user/work_dir',
    plot_dir='/climca/people/storylinetool/test_user/plot_dir',
    var_name=['pr', 'psl'],
    exp_name='ssp585',
    freq='mon',
    grid='g025',
    region_method='box',
    period1=['1950', '1979'],
    period2=['2070', '2099'],
    region_id=18,
    season=[11, 12, 1, 2, 3],
    region_extents=[(30, 45, -10, 40), (45, 55, 5, 20)],
    # titles=["Region A", "Region B"]
)

# Run the diagnostics:
# --------------------
# - Be sure to have created the work_dir and plot_dir
# - Running the line below will save the output NetCDF file in the work_dir
#   and the plots in the plot_dir.
#
# NOTE(review): this call previously went through an alias `cs` that is never
# defined in this file (NameError). The package is imported as `sp`, so the
# call is routed through that alias; confirm that storypy exposes
# `main_direct` at package level.
sp.main_direct(user_config)

In [None]:
def run_regression(preproc, user_config, regressor_csv_path):
    """
    Run spatial multiple linear regression (MLR) using a preprocessed NetCDF dataset and regressors CSV.

    Parameters:
        preproc (str): Path to the preprocessed NetCDF file. The dataset must
            provide a 'model' coordinate.
        user_config (dict): Configuration dictionary. Keys read here:
            - "work_dir" (required): base directory; results go to
              '<work_dir>/regression_output'.
            - "target" (optional, default "pr"): name of the dataset variable
              to regress.
        regressor_csv_path (str): Path to the CSV file containing regressors
            data. Its first column is used as the index (model names).

    Returns:
        None. Output is produced by ``spatial_MLR.perform_regression``.

    Raises:
        KeyError: If "work_dir" is missing from ``user_config`` or the target
            variable is not present in the dataset.
        ValueError: If the dataset and the regressors share no model names.

    This function:
      1. Loads regressors from a CSV file and cleans their index.
      2. Opens the preprocessed NetCDF file and cleans its model names.
      3. Finds common models between the dataset and the regressors.
      4. Subsets the dataset based on common models.
      5. Aligns the regressors DataFrame to the common models.
      6. Prepares the regressor names by inserting 'MEM' at the beginning.
      7. Instantiates spatial_MLR, sets up regression data, and performs the regression.
      8. Saves the regression output to the specified work directory.
    """
    # Single source of truth for the target variable; a missing "target" key
    # falls back to precipitation instead of raising KeyError.
    target_var = user_config.get("target", "pr")

    regressors = pd.read_csv(regressor_csv_path, index_col=0)
    # Normalise to stripped strings so the labels compare equal to the
    # dataset's (equally normalised) model names.
    regressors.index = regressors.index.astype(str).str.strip()

    # The context manager closes the file handle once the regression is done;
    # all uses of the (possibly lazily loaded) data stay inside the block.
    with xr.open_dataset(preproc) as ds:
        # Ensure the model coordinate is a string and stripped of any
        # whitespace, and actually write the cleaned labels back.
        model_names = pd.Index(ds["model"].values.astype(str)).str.strip()
        ds_clean = ds.assign_coords(model=(ds["model"].dims, model_names))

        # Keep the first entry per model so each label is unique.
        ds_unique = ds_clean.groupby("model").first()
        common_models = list(
            regressors.index.intersection(ds_unique["model"].values)
        )
        print("Common models:", common_models)
        if not common_models:
            raise ValueError(
                "No common models between the dataset "
                f"({preproc}) and the regressors ({regressor_csv_path})."
            )

        # Subset and reindex so dataset and regressors share the same order.
        ds_subset = ds_unique.sel(model=common_models).reindex(model=common_models)
        target = ds_subset[target_var]
        regressors_aligned = regressors.loc[common_models]

        regressor_names = regressors_aligned.columns.insert(0, "MEM")

        # Note: spatial_MLR should be defined/imported from your module.
        mlr = spatial_MLR()
        mlr.regression_data(target, regressors_aligned, regressor_names)

        output_path = os.path.join(user_config["work_dir"], "regression_output")
        os.makedirs(output_path, exist_ok=True)
        # NOTE(review): changing the process working directory is a global
        # side effect; kept because perform_regression may write relative
        # paths — confirm and drop if it only uses output_path.
        os.chdir(output_path)
        # Pass the configured target rather than a hard-coded 'pr'
        # (presumably used to label the output — verify against spatial_MLR).
        mlr.perform_regression(output_path, target_var)

Alternatively, users can start from this point and load the preprocessed data that we stored in storypy.

In [None]:
# Minimal configuration for the regression step.
user_config = dict(
    work_dir='/climca/people/storylinetool/test_user/work_dir',
    var_name=['pr'],
    # run_regression looks up the variable to regress under the "target" key;
    # without it the call fails with a KeyError.
    target='pr',
)

# Replace with the path to the preprocessed NetCDF file.
preproc_file = '/path/to/preprocessed_file.nc'
regressor_csv = '/climca/people/ralawode/esmvaltool_output/zappa_shepherd_CMIP6_20241209_124052/work/storyline_analysis/remote_drivers/remote_drivers/scaled_standardized_drivers.csv'

run_regression(preproc_file, user_config, regressor_csv)