Preprocessing running mean fix (#92)
* add preprocessing patch

* change defaults

* fix max time
JoranAngevaare committed Jul 24, 2023
1 parent c799f16 commit fa7cee6
Showing 8 changed files with 82 additions and 35 deletions.
2 changes: 1 addition & 1 deletion bin/oet_plot
@@ -81,7 +81,7 @@ def main(log, args):
extra_opt=args.extra_opt,
methods=args.methods,
)
- except ValueError as e:
+ except Exception as e:
write_error(args.err_file, f'{args.path} | {e}')
raise e

33 changes: 20 additions & 13 deletions optim_esm_tools/analyze/cmip_handler.py
@@ -5,7 +5,7 @@
import typing as ty
from warnings import warn

- from .globals import _FOLDER_FMT
+ from .globals import _FOLDER_FMT, _DEFAULT_MAX_TIME
import optim_esm_tools as oet
from optim_esm_tools.analyze import tipping_criteria

@@ -21,16 +21,18 @@ def add_conditions_to_ds(
Args:
ds (xr.Dataset): input dataset
- calculate_conditions (ty.Tuple[tipping_criteria._Condition], optional): Calculate the results of these tipping conditions. Defaults to None.
- condition_kwargs (ty.Mapping, optional): kwargs for the tipping conditions. Defaults to None.
+ calculate_conditions (ty.Tuple[tipping_criteria._Condition], optional): Calculate the
+     results of these tipping conditions. Defaults to None.
+ condition_kwargs (ty.Mapping, optional): kwargs for the tipping conditions. Defaults to
+     None.
variable_of_interest (ty.Tuple[str], optional): Variables to handle. Defaults to ('tas',).
_ma_window (int, optional): Moving average window (assumed to be years). Defaults to 10.
Raises:
ValueError: If there are multiple tipping conditions with the same short_description
Returns:
- xr.Dataset: The fully initiallized dataset
+ xr.Dataset: The fully initialized dataset
"""
_ma_window = _ma_window or oet.config.config['analyze']['moving_average_years']
if calculate_conditions is None:
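A minimal usage sketch of the signature documented above (illustrative only: ds is assumed to be an already-loaded, pre-processed CMIP xr.Dataset containing a 'tas' variable; the window value is an assumption, None would pick up the config default):

    from optim_esm_tools.analyze.cmip_handler import add_conditions_to_ds

    # ds is assumed to be an already-loaded, pre-processed CMIP dataset with 'tas'
    ds = add_conditions_to_ds(
        ds,
        calculate_conditions=None,      # None falls back to the default tipping conditions
        variable_of_interest=('tas',),
        _ma_window=10,                  # moving-average window in years
    )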
@@ -68,7 +70,7 @@ def read_ds(
def read_ds(
base: str,
variable_of_interest: ty.Tuple[str] = None,
- max_time: ty.Optional[ty.Tuple[int, int, int]] = (2100, 1, 1),
+ max_time: ty.Optional[ty.Tuple[int, int, int]] = _DEFAULT_MAX_TIME,
min_time: ty.Optional[ty.Tuple[int, int, int]] = None,
apply_transform: bool = True,
pre_process: bool = True,
@@ -84,19 +86,24 @@ def read_ds(
Args:
base (str): Folder to load the data from
variable_of_interest (ty.Tuple[str], optional): Variables to handle. Defaults to ('tas',).
- max_time (ty.Optional[ty.Tuple[int, int, int]], optional): Defines time range in which to load data. Defaults to (2100, 1, 1).
- min_time (ty.Optional[ty.Tuple[int, int, int]], optional): Defines time range in which to load data. Defaults to None.
- apply_transform: (bool, optional): Apply analysis specific postprocessing algoritms. Defaults to True.
- pre_process (bool, optional): Should be true, this pre-processing of the data is required later on. Defaults to True.
- area_query_kwargs (ty.Mapping, optional): additionall keyword arguments for searching.
+ max_time (ty.Optional[ty.Tuple[int, int, int]], optional): Defines time range in which to
+     load data. Defaults to (2100, 12, 31).
+ min_time (ty.Optional[ty.Tuple[int, int, int]], optional): Defines time range in which to
+     load data. Defaults to None.
+ apply_transform (bool, optional): Apply analysis specific postprocessing algorithms.
+     Defaults to True.
+ pre_process (bool, optional): Should be true, this pre-processing of the data is required
+     later on. Defaults to True.
+ area_query_kwargs (ty.Mapping, optional): additional keyword arguments for searching.
strict (bool, optional): raise errors on loading, if any. Defaults to True.
load (bool, optional): apply dataset.load to dataset directly. Defaults to False.
_ma_window (int, optional): Moving average window (assumed to be years). Defaults to 10.
- _cache (bool, optional): cache the dataset with it's extra fields to alow faster (re)loading. Defaults to True.
+ _cache (bool, optional): cache the dataset with its extra fields to allow faster
+     (re)loading. Defaults to True.
_file_name (str, optional): name to match. Defaults to configs settings.
kwargs:
- any kwargs are passed onto transfor_ds.
+ any kwargs are passed onto transform_ds.
Returns:
xr.Dataset: An xarray dataset with the appropriate variables
@@ -149,7 +156,7 @@ def read_ds(
)
else:
log.warning(
- f'Not preprocessing file is dangerous, dimentions may differ wildly!'
+ f'Not preprocessing file is dangerous, dimensions may differ wildly!'
)
data_set = oet.analyze.io.load_glob(data_path, load=load)
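For context, a hedged sketch of calling read_ds with the new default (the data folder below is hypothetical; max_time=(2100, 12, 31) spells out what _DEFAULT_MAX_TIME resolves to):

    from optim_esm_tools.analyze.cmip_handler import read_ds

    # Hypothetical folder following the configured folder_fmt layout
    data_folder = '/data/CMIP6/ScenarioMIP/NCC/NorESM2-MM/ssp585/r1i1p1f1/Amon/tas/gn/v20191108'
    ds = read_ds(
        data_folder,
        max_time=(2100, 12, 31),  # explicit, identical to the new config-driven default
        pre_process=True,         # required for later analysis steps, see the warning above
    )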

6 changes: 3 additions & 3 deletions optim_esm_tools/analyze/find_matches.py
@@ -60,14 +60,14 @@ def find_matches(
"""
if grid is not None:
get_logger().warning(
- f'grid argument for find_matches is depricated, use grid_label'
+ f'grid argument for find_matches is deprecated, use grid_label'
)
grid_label = grid
if max_versions is None:
max_versions = int(9e9)
if max_members is None:
max_members = int(9e9)
- variantes = sorted(
+ variants = sorted(
glob.glob(
os.path.join(
base,
@@ -85,7 +85,7 @@
key=_variant_label_id_and_version,
)
seen = dict()
- for candidate in variantes:
+ for candidate in variants:
folders = candidate.split(os.sep)
group = folders[-7]
member = folders[-5]
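The negative indices follow the folder_fmt convention configured in optim_esm_conf.ini (activity_id institution_id source_id experiment_id variant_label domain variable_id grid_label version); an illustrative decomposition with a made-up path:

    # Made-up candidate path; mirrors candidate.split(os.sep) above
    candidate = '/base/ScenarioMIP/NCC/NorESM2-MM/ssp585/r1i1p1f1/Amon/tas/gn/v20191108'
    folders = candidate.split('/')
    group = folders[-7]   # source_id, here 'NorESM2-MM'
    member = folders[-5]  # variant_label, here 'r1i1p1f1'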
1 change: 1 addition & 0 deletions optim_esm_tools/analyze/globals.py
@@ -3,3 +3,4 @@
_SECONDS_TO_YEAR = int(config['constants']['seconds_to_year'])
_FOLDER_FMT = config['CMIP_files']['folder_fmt'].split()
_CMIP_HANDLER_VERSION = config['versions']['cmip_handler']
+ _DEFAULT_MAX_TIME = tuple([int(s) for s in config['analyze']['max_time'].split()])
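The new constant turns the space-separated config string into a tuple of ints; a standalone sketch of that parsing (the literal mirrors the max_time entry added to optim_esm_conf.ini in this commit):

    max_time_raw = '2100 12 31'  # what config['analyze']['max_time'] returns
    default_max_time = tuple(int(s) for s in max_time_raw.split())
    assert default_max_time == (2100, 12, 31)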
51 changes: 44 additions & 7 deletions optim_esm_tools/analyze/pre_process.py
@@ -2,6 +2,8 @@
from optim_esm_tools.config import config, get_logger
from optim_esm_tools.analyze.xarray_tools import _native_date_fmt
from optim_esm_tools.analyze.io import load_glob
+ from optim_esm_tools.analyze.globals import _DEFAULT_MAX_TIME
+ import numpy as np
import os
import typing as ty

@@ -10,7 +12,7 @@
def pre_process(
source: str,
target_grid: str = None,
- max_time: ty.Optional[ty.Tuple[int, int, int]] = (2100, 1, 1),
+ max_time: ty.Optional[ty.Tuple[int, int, int]] = _DEFAULT_MAX_TIME,
min_time: ty.Optional[ty.Tuple[int, int, int]] = None,
save_as: str = None,
clean_up: bool = True,
@@ -26,9 +28,12 @@ def pre_process(
Args:
source (str): path of file to parse
- target_grid (str, optional): Grid specification (like n64, n90 etc.). Defaults to None and is taken from config.
- max_time (ty.Optional[ty.Tuple[int, int, int]], optional): Defines time range in which to load data. Defaults to (2100, 1, 1).
- min_time (ty.Optional[ty.Tuple[int, int, int]], optional): Defines time range in which to load data. Defaults to None.
+ target_grid (str, optional): Grid specification (like n64, n90 etc.). Defaults to None and
+     is taken from config.
+ max_time (ty.Optional[ty.Tuple[int, int, int]], optional): Defines time range in which to
+     load data. Defaults to (2100, 12, 31).
+ min_time (ty.Optional[ty.Tuple[int, int, int]], optional): Defines time range in which to
+     load data. Defaults to None.
save_as (str, optional): path where to store the pre-processed folder. Defaults to None.
clean_up (bool, optional): delete intermediate files. Defaults to True.
_ma_window (int, optional): moving average window (assumed 10 years). Defaults to None.
@@ -44,7 +49,7 @@ def pre_process(

_remove_bad_vars(source)
variable_id = variable_id or _read_variable_id(source)
- max_time = max_time or (9999, 1, 1)  # unreasonably far away
+ max_time = max_time or (9999, 12, 31)  # unreasonably far away
min_time = min_time or (0, 1, 1) # unreasonably long ago
target_grid = target_grid or config['analyze']['regrid_to']
_ma_window = _ma_window or config['analyze']['moving_average_years']
@@ -90,8 +95,14 @@ def pre_process(
os.remove(f_tmp)

cdo_int.runmean(_ma_window, input=f_regrid, output=f_tmp)
-
- cdo_int.chname(f'{var},{var_rm}', input=f_tmp, output=f_rm)
+ _run_mean_patch(
+     f_start=f_regrid,
+     f_rm=f_tmp,
+     f_out=f_rm,
+     ma_window=_ma_window,
+     var_name=var,
+     var_rm_name=var_rm,
+ )
os.remove(f_tmp)

cdo_int.detrend(input=f_rm, output=f_tmp)
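A hedged sketch of invoking the patched pre_process (both paths are hypothetical; leaving target_grid as None takes the regrid_to entry from the config):

    from optim_esm_tools.analyze.pre_process import pre_process

    pre_process(
        source='/data/raw/tas_Amon_NorESM2-MM_ssp585.nc',  # hypothetical input file
        save_as='/data/processed/tas_preprocessed.nc',     # hypothetical output path
        max_time=(2100, 12, 31),                           # identical to the new default
    )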
@@ -138,6 +149,32 @@ def _check_time_range(path, max_time, min_time, ma_window):
raise NoDataInTimeRangeError(message)


+ def _run_mean_patch(f_start, f_rm, f_out, ma_window, var_name, var_rm_name):
+     """
+     Patch the running mean file: since cdo's runmean decreases the length of the file by
+     ma_window - 1, merging two files of different durations results in bad data.
+     As a solution, we take the original file (f_start) and use its shape (like the number
+     of timestamps) and fill those timestamps where the value of the running mean is
+     defined. Everything else is set to NaN.
+     """
+     ds_base = load_glob(f_start)
+     ds_rm = load_glob(f_rm)
+     ds_out = ds_base.copy()
+
+     # Replace timestamps with the CDO computed running mean
+     data = np.zeros(ds_base[var_name].shape, ds_base[var_name].dtype)
+     data[:] = np.nan
+     ma_window = int(ma_window)
+     data[ma_window // 2 : 1 - ma_window // 2] = ds_rm[var_name].values
+     ds_out[var_name].data = data
+
+     # Patch variables and save
+     ds_out = ds_out.rename({var_name: var_rm_name})
+     ds_out.attrs = ds_rm.attrs
+     ds_out.to_netcdf(f_out)
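A numpy-only check of the index arithmetic above, independent of cdo: with ma_window = 10 and 20 timestamps, runmean returns 20 - 10 + 1 = 11 values, which land on indices 5 through 15 of the NaN-padded array:

    import numpy as np

    n, ma_window = 20, 10
    rm_values = np.arange(n - ma_window + 1, dtype=float)  # stand-in for the cdo runmean output
    data = np.full(n, np.nan)
    data[ma_window // 2 : 1 - ma_window // 2] = rm_values  # fills indices 5..15
    assert np.isnan(data[:5]).all() and np.isnan(data[16:]).all()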


class NoDataInTimeRangeError(Exception):
pass

15 changes: 7 additions & 8 deletions optim_esm_tools/analyze/region_finding.py
@@ -46,10 +46,10 @@ def mask_xr_ds(data_set, da_mask, masked_dims=None, drop=False):


def _drop_by_mask(data_set, masked_dims, ds_start, da_mask):
"""Drop values with masked_dims dimentions.
"""Drop values with masked_dims dimensions.
Unfortunately, data_set.where(da_mask, drop=True) sometimes leads to bad results,
for example for time_bnds (time, bnds) being dropped by (lon, lat). So we have to do
- some funny bookkeeping of which datavars we can drop with data_set.where.
+ some funny bookkeeping of which data vars we can drop with data_set.where.
"""

dropped = []
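A toy illustration of the pitfall the docstring describes (variable names are made up): a (lat, lon) mask may only go through the where/drop step for variables that carry both dimensions, otherwise time_bnds-like variables get mangled.

    import numpy as np
    import xarray as xr

    ds = xr.Dataset(
        dict(
            tas=(('time', 'lat', 'lon'), np.random.rand(3, 2, 2)),
            time_bnds=(('time', 'bnds'), np.zeros((3, 2))),
        )
    )
    masked_dims = ('lat', 'lon')

    # Only variables having all masked_dims are safe to drop via data_set.where
    droppable = [k for k, v in ds.data_vars.items() if all(d in v.dims for d in masked_dims)]
    assert droppable == ['tas']  # time_bnds must stay out of the where/drop step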
@@ -68,7 +68,7 @@ def _drop_by_mask(data_set, masked_dims, ds_start, da_mask):


def _mask_xr_ds(data_set, masked_dims, ds_start, da_mask):
"""Rebuild data_set for each variabled that has all masked_dims"""
"""Rebuild data_set for each variable that has all masked_dims"""
for k, data_array in data_set.data_vars.items():
if all(dim in list(data_array.dims) for dim in masked_dims):
# First dim is time?
@@ -257,7 +257,8 @@ def store_mask(self, mask, m_i, store_masks=True):
store_in_dir = os.path.join(save_in, 'masks')
os.makedirs(store_in_dir, exist_ok=True)
# Mask twice, "mask" is a np.ndarray, whereas ds.where needs a xr.DataArray.
- # While we could make this more efficient (and only use the second step), the first step does only take ~10 ms
+ # While we could make this more efficient (and only use the second step), the first step
+ # does only take ~10 ms
ds_masked = mask_xr_ds(self.data_set.copy(), mask)
ds_masked = mask_xr_ds(ds_masked, ~ds_masked['cell_area'].isnull(), drop=True)
ds_masked.to_netcdf(
@@ -280,7 +281,7 @@ def make_mask_figures(self, masks):

class MaxRegion(RegionExtractor):
def get_masks(self) -> dict:
"""Get mask for max of ii and iii and a box arround that"""
"""Get mask for max of ii and iii and a box around that"""

def _val(label):
return self.data_set[label].values
Expand Down Expand Up @@ -651,8 +652,6 @@ def find_historical(
query_updates=None,
search_kw=None,
):
- from optim_esm_tools.config import config
-
base = base_from_path(
self.data_set.attrs['path'], look_back_extra=look_back_extra
)
@@ -772,7 +771,7 @@ def get_masks(
arr = self.data_set[lab].values
arr_historical = historical_ds[lab].values

- # If arr_historical is 0, the devision is going to get a nan assigned,
+ # If arr_historical is 0, the division is going to get a nan assigned,
# despite this being the most interesting region (no historical
# changes, only in the scenario's)!
mask_no_std = arr_historical == 0
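One way to keep those zero-std historical cells from silently turning into NaN is to guard the division explicitly; a sketch with synthetic arrays (the real handling of mask_no_std continues beyond this hunk):

    import numpy as np

    arr = np.array([2.0, 4.0, 6.0])
    arr_historical = np.array([1.0, 0.0, 2.0])

    mask_no_std = arr_historical == 0
    # Fill flagged cells with inf so the 'no historical change' case stands out
    ratio = np.divide(arr, arr_historical, out=np.full_like(arr, np.inf), where=~mask_no_std)
    assert np.isinf(ratio[1]) and ratio[0] == 2.0 and ratio[2] == 3.0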
4 changes: 2 additions & 2 deletions optim_esm_tools/analyze/tipping_criteria.py
@@ -167,8 +167,8 @@ def running_mean_diff(
rename_to (str, optional): . Defaults to 'long_name'.
unit (str, optional): . Defaults to 'absolute'.
apply_abs (bool, optional): . Defaults to True.
- _t_0_date (ty.Optional[tuple], optional): . Defaults to (2015, 1, 1).
- _t_1_date (ty.Optional[tuple], optional): . Defaults to (2100, 1, 1).
+ _t_0_date (ty.Optional[tuple], optional): . Defaults to None.
+ _t_1_date (ty.Optional[tuple], optional): . Defaults to None.
Raises:
ValueError: when no timestamps are not none?
5 changes: 4 additions & 1 deletion optim_esm_tools/optim_esm_conf.ini
@@ -6,7 +6,7 @@ seconds_to_year = 31557600


[versions]
- cmip_handler = 0.4.2
+ cmip_handler = 0.4.3


[display]
@@ -27,6 +27,9 @@ cartopy_projection = PlateCarree
clustering_fudge_factor = 1.1
clustering_min_neighbors = 8

+ # Split data sets on this year for nominal analyses
+ max_time=2100 12 31


[CMIP_files]
folder_fmt = activity_id institution_id source_id experiment_id variant_label domain variable_id grid_label version
