Preprocessing running mean fix (#92)
* add preprocessing patch

* change defaults

* fix max time
JoranAngevaare committed Jul 24, 2023
1 parent c799f16 commit fa7cee6
Showing 8 changed files with 82 additions and 35 deletions.
2 changes: 1 addition & 1 deletion bin/oet_plot
@@ -81,7 +81,7 @@ def main(log, args):
extra_opt=args.extra_opt,
methods=args.methods,
)
- except ValueError as e:
+ except Exception as e:
write_error(args.err_file, f'{args.path} | {e}')
raise e

33 changes: 20 additions & 13 deletions optim_esm_tools/analyze/cmip_handler.py
@@ -5,7 +5,7 @@
import typing as ty
from warnings import warn

- from .globals import _FOLDER_FMT
+ from .globals import _FOLDER_FMT, _DEFAULT_MAX_TIME
import optim_esm_tools as oet
from optim_esm_tools.analyze import tipping_criteria

@@ -21,16 +21,18 @@ def add_conditions_to_ds(
Args:
ds (xr.Dataset): input dataset
- calculate_conditions (ty.Tuple[tipping_criteria._Condition], optional): Calculate the results of these tipping conditions. Defaults to None.
- condition_kwargs (ty.Mapping, optional): kwargs for the tipping conditions. Defaults to None.
+ calculate_conditions (ty.Tuple[tipping_criteria._Condition], optional): Calculate the
+     results of these tipping conditions. Defaults to None.
+ condition_kwargs (ty.Mapping, optional): kwargs for the tipping conditions. Defaults to
+     None.
variable_of_interest (ty.Tuple[str], optional): Variables to handle. Defaults to ('tas',).
_ma_window (int, optional): Moving average window (assumed to be years). Defaults to 10.
Raises:
ValueError: If there are multiple tipping conditions with the same short_description
Returns:
- xr.Dataset: The fully initiallized dataset
+ xr.Dataset: The fully initialized dataset
"""
_ma_window = _ma_window or oet.config.config['analyze']['moving_average_years']
if calculate_conditions is None:
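A minimal usage sketch of the signature documented above (illustrative only: ds is assumed to be an already-loaded, pre-processed CMIP xr.Dataset containing a 'tas' variable; the window value is an assumption, None would pick up the config default):

    from optim_esm_tools.analyze.cmip_handler import add_conditions_to_ds

    # ds is assumed to be an already-loaded, pre-processed CMIP dataset with 'tas'
    ds = add_conditions_to_ds(
        ds,
        calculate_conditions=None,      # None falls back to the default tipping conditions
        variable_of_interest=('tas',),
        _ma_window=10,                  # moving-average window in years
    )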
@@ -68,7 +70,7 @@ def read_ds(
def read_ds(
base: str,
variable_of_interest: ty.Tuple[str] = None,
- max_time: ty.Optional[ty.Tuple[int, int, int]] = (2100, 1, 1),
+ max_time: ty.Optional[ty.Tuple[int, int, int]] = _DEFAULT_MAX_TIME,
min_time: ty.Optional[ty.Tuple[int, int, int]] = None,
apply_transform: bool = True,
pre_process: bool = True,
@@ -84,19 +86,24 @@ def read_ds(
Args:
base (str): Folder to load the data from
variable_of_interest (ty.Tuple[str], optional): Variables to handle. Defaults to ('tas',).
- max_time (ty.Optional[ty.Tuple[int, int, int]], optional): Defines time range in which to load data. Defaults to (2100, 1, 1).
- min_time (ty.Optional[ty.Tuple[int, int, int]], optional): Defines time range in which to load data. Defaults to None.
- apply_transform: (bool, optional): Apply analysis specific postprocessing algoritms. Defaults to True.
- pre_process (bool, optional): Should be true, this pre-processing of the data is required later on. Defaults to True.
- area_query_kwargs (ty.Mapping, optional): additionall keyword arguments for searching.
+ max_time (ty.Optional[ty.Tuple[int, int, int]], optional): Defines time range in which to
+     load data. Defaults to (2100, 12, 31).
+ min_time (ty.Optional[ty.Tuple[int, int, int]], optional): Defines time range in which to
+     load data. Defaults to None.
+ apply_transform (bool, optional): Apply analysis specific postprocessing algorithms.
+     Defaults to True.
+ pre_process (bool, optional): Should be true, this pre-processing of the data is required
+     later on. Defaults to True.
+ area_query_kwargs (ty.Mapping, optional): additional keyword arguments for searching.
strict (bool, optional): raise errors on loading, if any. Defaults to True.
load (bool, optional): apply dataset.load to dataset directly. Defaults to False.
_ma_window (int, optional): Moving average window (assumed to be years). Defaults to 10.
- _cache (bool, optional): cache the dataset with it's extra fields to alow faster (re)loading. Defaults to True.
+ _cache (bool, optional): cache the dataset with its extra fields to allow faster
+     (re)loading. Defaults to True.
_file_name (str, optional): name to match. Defaults to configs settings.
kwargs:
- any kwargs are passed onto transfor_ds.
+ any kwargs are passed onto transform_ds.
Returns:
xr.Dataset: An xarray dataset with the appropriate variables
@@ -149,7 +156,7 @@ def read_ds(
)
else:
log.warning(
- f'Not preprocessing file is dangerous, dimentions may differ wildly!'
+ f'Not preprocessing file is dangerous, dimensions may differ wildly!'
)
data_set = oet.analyze.io.load_glob(data_path, load=load)
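For context, a hedged sketch of calling read_ds with the new default (the data folder below is hypothetical; max_time=(2100, 12, 31) spells out what _DEFAULT_MAX_TIME resolves to):

    from optim_esm_tools.analyze.cmip_handler import read_ds

    # Hypothetical folder following the configured folder_fmt layout
    data_folder = '/data/CMIP6/ScenarioMIP/NCC/NorESM2-MM/ssp585/r1i1p1f1/Amon/tas/gn/v20191108'
    ds = read_ds(
        data_folder,
        max_time=(2100, 12, 31),  # explicit, identical to the new config-driven default
        pre_process=True,         # required for later analysis steps, see the warning above
    )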

6 changes: 3 additions & 3 deletions optim_esm_tools/analyze/find_matches.py
@@ -60,14 +60,14 @@ def find_matches(
"""
if grid is not None:
get_logger().warning(
- f'grid argument for find_matches is depricated, use grid_label'
+ f'grid argument for find_matches is deprecated, use grid_label'
)
grid_label = grid
if max_versions is None:
max_versions = int(9e9)
if max_members is None:
max_members = int(9e9)
- variantes = sorted(
+ variants = sorted(
glob.glob(
os.path.join(
base,
@@ -85,7 +85,7 @@
key=_variant_label_id_and_version,
)
seen = dict()
- for candidate in variantes:
+ for candidate in variants:
folders = candidate.split(os.sep)
group = folders[-7]
member = folders[-5]
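The negative indices follow the folder_fmt convention configured in optim_esm_conf.ini (activity_id institution_id source_id experiment_id variant_label domain variable_id grid_label version); an illustrative decomposition with a made-up path:

    # Made-up candidate path; mirrors candidate.split(os.sep) above
    candidate = '/base/ScenarioMIP/NCC/NorESM2-MM/ssp585/r1i1p1f1/Amon/tas/gn/v20191108'
    folders = candidate.split('/')
    group = folders[-7]   # source_id, here 'NorESM2-MM'
    member = folders[-5]  # variant_label, here 'r1i1p1f1'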
1 change: 1 addition & 0 deletions optim_esm_tools/analyze/globals.py
@@ -3,3 +3,4 @@
_SECONDS_TO_YEAR = int(config['constants']['seconds_to_year'])
_FOLDER_FMT = config['CMIP_files']['folder_fmt'].split()
_CMIP_HANDLER_VERSION = config['versions']['cmip_handler']
+ _DEFAULT_MAX_TIME = tuple([int(s) for s in config['analyze']['max_time'].split()])
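The new constant turns the space-separated config string into a tuple of ints; a standalone sketch of that parsing (the literal mirrors the max_time entry added to optim_esm_conf.ini in this commit):

    max_time_raw = '2100 12 31'  # what config['analyze']['max_time'] returns
    default_max_time = tuple(int(s) for s in max_time_raw.split())
    assert default_max_time == (2100, 12, 31)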
51 changes: 44 additions & 7 deletions optim_esm_tools/analyze/pre_process.py
@@ -2,6 +2,8 @@
from optim_esm_tools.config import config, get_logger
from optim_esm_tools.analyze.xarray_tools import _native_date_fmt
from optim_esm_tools.analyze.io import load_glob
+ from optim_esm_tools.analyze.globals import _DEFAULT_MAX_TIME
+ import numpy as np
import os
import typing as ty

@@ -10,7 +12,7 @@
def pre_process(
source: str,
target_grid: str = None,
- max_time: ty.Optional[ty.Tuple[int, int, int]] = (2100, 1, 1),
+ max_time: ty.Optional[ty.Tuple[int, int, int]] = _DEFAULT_MAX_TIME,
min_time: ty.Optional[ty.Tuple[int, int, int]] = None,
save_as: str = None,
clean_up: bool = True,
@@ -26,9 +28,12 @@ def pre_process(
Args:
source (str): path of file to parse
- target_grid (str, optional): Grid specification (like n64, n90 etc.). Defaults to None and is taken from config.
- max_time (ty.Optional[ty.Tuple[int, int, int]], optional): Defines time range in which to load data. Defaults to (2100, 1, 1).
- min_time (ty.Optional[ty.Tuple[int, int, int]], optional): Defines time range in which to load data. Defaults to None.
+ target_grid (str, optional): Grid specification (like n64, n90 etc.). Defaults to None and
+     is taken from config.
+ max_time (ty.Optional[ty.Tuple[int, int, int]], optional): Defines time range in which to
+     load data. Defaults to (2100, 12, 31).
+ min_time (ty.Optional[ty.Tuple[int, int, int]], optional): Defines time range in which to
+     load data. Defaults to None.
save_as (str, optional): path where to store the pre-processed folder. Defaults to None.
clean_up (bool, optional): delete intermediate files. Defaults to True.
_ma_window (int, optional): moving average window (assumed 10 years). Defaults to None.
@@ -44,7 +49,7 @@ def pre_process(

_remove_bad_vars(source)
variable_id = variable_id or _read_variable_id(source)
- max_time = max_time or (9999, 1, 1)  # unreasonably far away
+ max_time = max_time or (9999, 12, 31)  # unreasonably far away
min_time = min_time or (0, 1, 1) # unreasonably long ago
target_grid = target_grid or config['analyze']['regrid_to']
_ma_window = _ma_window or config['analyze']['moving_average_years']
@@ -90,8 +95,14 @@ def pre_process(
os.remove(f_tmp)

cdo_int.runmean(_ma_window, input=f_regrid, output=f_tmp)
-
- cdo_int.chname(f'{var},{var_rm}', input=f_tmp, output=f_rm)
+ _run_mean_patch(
+     f_start=f_regrid,
+     f_rm=f_tmp,
+     f_out=f_rm,
+     ma_window=_ma_window,
+     var_name=var,
+     var_rm_name=var_rm,
+ )
os.remove(f_tmp)

cdo_int.detrend(input=f_rm, output=f_tmp)
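A hedged sketch of invoking the patched pre_process (both paths are hypothetical; leaving target_grid as None takes the regrid_to entry from the config):

    from optim_esm_tools.analyze.pre_process import pre_process

    pre_process(
        source='/data/raw/tas_Amon_NorESM2-MM_ssp585.nc',  # hypothetical input file
        save_as='/data/processed/tas_preprocessed.nc',     # hypothetical output path
        max_time=(2100, 12, 31),                           # identical to the new default
    )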
@@ -138,6 +149,32 @@ def _check_time_range(path, max_time, min_time, ma_window):
raise NoDataInTimeRangeError(message)


+ def _run_mean_patch(f_start, f_rm, f_out, ma_window, var_name, var_rm_name):
+     """
+     Patch the running mean file: since cdo's runmean decreases the length of the file by
+     ma_window - 1, merging two files of different durations results in bad data.
+     As a solution, we take the original file (f_start) and use its shape (like the number
+     of timestamps) and fill those timestamps where the value of the running mean is
+     defined. Everything else is set to NaN.
+     """
+     ds_base = load_glob(f_start)
+     ds_rm = load_glob(f_rm)
+     ds_out = ds_base.copy()
+
+     # Replace timestamps with the CDO computed running mean
+     data = np.zeros(ds_base[var_name].shape, ds_base[var_name].dtype)
+     data[:] = np.nan
+     ma_window = int(ma_window)
+     data[ma_window // 2 : 1 - ma_window // 2] = ds_rm[var_name].values
+     ds_out[var_name].data = data
+
+     # Patch variables and save
+     ds_out = ds_out.rename({var_name: var_rm_name})
+     ds_out.attrs = ds_rm.attrs
+     ds_out.to_netcdf(f_out)
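A numpy-only check of the index arithmetic above, independent of cdo: with ma_window = 10 and 20 timestamps, runmean returns 20 - 10 + 1 = 11 values, which land on indices 5 through 15 of the NaN-padded array:

    import numpy as np

    n, ma_window = 20, 10
    rm_values = np.arange(n - ma_window + 1, dtype=float)  # stand-in for the cdo runmean output
    data = np.full(n, np.nan)
    data[ma_window // 2 : 1 - ma_window // 2] = rm_values  # fills indices 5..15
    assert np.isnan(data[:5]).all() and np.isnan(data[16:]).all()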


class NoDataInTimeRangeError(Exception):
pass

15 changes: 7 additions & 8 deletions optim_esm_tools/analyze/region_finding.py
@@ -46,10 +46,10 @@ def mask_xr_ds(data_set, da_mask, masked_dims=None, drop=False):


def _drop_by_mask(data_set, masked_dims, ds_start, da_mask):
"""Drop values with masked_dims dimentions.
"""Drop values with masked_dims dimensions.
Unfortunately, data_set.where(da_mask, drop=True) sometimes leads to bad results,
for example for time_bnds (time, bnds) being dropped by (lon, lat). So we have to do
- some funny bookkeeping of which datavars we can drop with data_set.where.
+ some funny bookkeeping of which data vars we can drop with data_set.where.
"""

dropped = []
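A toy illustration of the pitfall the docstring describes (variable names are made up): a (lat, lon) mask may only go through the where/drop step for variables that carry both dimensions, otherwise time_bnds-like variables get mangled.

    import numpy as np
    import xarray as xr

    ds = xr.Dataset(
        dict(
            tas=(('time', 'lat', 'lon'), np.random.rand(3, 2, 2)),
            time_bnds=(('time', 'bnds'), np.zeros((3, 2))),
        )
    )
    masked_dims = ('lat', 'lon')

    # Only variables having all masked_dims are safe to drop via data_set.where
    droppable = [k for k, v in ds.data_vars.items() if all(d in v.dims for d in masked_dims)]
    assert droppable == ['tas']  # time_bnds must stay out of the where/drop step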
@@ -68,7 +68,7 @@ def _drop_by_mask(data_set, masked_dims, ds_start, da_mask):


def _mask_xr_ds(data_set, masked_dims, ds_start, da_mask):
"""Rebuild data_set for each variabled that has all masked_dims"""
"""Rebuild data_set for each variable that has all masked_dims"""
for k, data_array in data_set.data_vars.items():
if all(dim in list(data_array.dims) for dim in masked_dims):
# First dim is time?
@@ -257,7 +257,8 @@ def store_mask(self, mask, m_i, store_masks=True):
store_in_dir = os.path.join(save_in, 'masks')
os.makedirs(store_in_dir, exist_ok=True)
# Mask twice, "mask" is a np.ndarray, whereas ds.where needs a xr.DataArray.
- # While we could make this more efficient (and only use the second step), the first step does only take ~10 ms
+ # While we could make this more efficient (and only use the second step), the first step
+ # does only take ~10 ms
ds_masked = mask_xr_ds(self.data_set.copy(), mask)
ds_masked = mask_xr_ds(ds_masked, ~ds_masked['cell_area'].isnull(), drop=True)
ds_masked.to_netcdf(
@@ -280,7 +281,7 @@ def make_mask_figures(self, masks):

class MaxRegion(RegionExtractor):
def get_masks(self) -> dict:
"""Get mask for max of ii and iii and a box arround that"""
"""Get mask for max of ii and iii and a box around that"""

def _val(label):
return self.data_set[label].values
Expand Down Expand Up @@ -651,8 +652,6 @@ def find_historical(
query_updates=None,
search_kw=None,
):
- from optim_esm_tools.config import config
-
base = base_from_path(
self.data_set.attrs['path'], look_back_extra=look_back_extra
)
@@ -772,7 +771,7 @@ def get_masks(
arr = self.data_set[lab].values
arr_historical = historical_ds[lab].values

- # If arr_historical is 0, the devision is going to get a nan assigned,
+ # If arr_historical is 0, the division is going to get a nan assigned,
# despite this being the most interesting region (no historical
# changes, only in the scenario's)!
mask_no_std = arr_historical == 0
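One way to keep those zero-std historical cells from silently turning into NaN is to guard the division explicitly; a sketch with synthetic arrays (the real handling of mask_no_std continues beyond this hunk):

    import numpy as np

    arr = np.array([2.0, 4.0, 6.0])
    arr_historical = np.array([1.0, 0.0, 2.0])

    mask_no_std = arr_historical == 0
    # Fill flagged cells with inf so the 'no historical change' case stands out
    ratio = np.divide(arr, arr_historical, out=np.full_like(arr, np.inf), where=~mask_no_std)
    assert np.isinf(ratio[1]) and ratio[0] == 2.0 and ratio[2] == 3.0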
4 changes: 2 additions & 2 deletions optim_esm_tools/analyze/tipping_criteria.py
@@ -167,8 +167,8 @@ def running_mean_diff(
rename_to (str, optional): . Defaults to 'long_name'.
unit (str, optional): . Defaults to 'absolute'.
apply_abs (bool, optional): . Defaults to True.
- _t_0_date (ty.Optional[tuple], optional): . Defaults to (2015, 1, 1).
- _t_1_date (ty.Optional[tuple], optional): . Defaults to (2100, 1, 1).
+ _t_0_date (ty.Optional[tuple], optional): . Defaults to None.
+ _t_1_date (ty.Optional[tuple], optional): . Defaults to None.
Raises:
ValueError: when no timestamps are not none?
5 changes: 4 additions & 1 deletion optim_esm_tools/optim_esm_conf.ini
@@ -6,7 +6,7 @@ seconds_to_year = 31557600


[versions]
- cmip_handler = 0.4.2
+ cmip_handler = 0.4.3


[display]
@@ -27,6 +27,9 @@ cartopy_projection = PlateCarree
clustering_fudge_factor = 1.1
clustering_min_neighbors = 8

+ # Split data sets on this year for nominal analyses
+ max_time=2100 12 31


[CMIP_files]
folder_fmt = activity_id institution_id source_id experiment_id variant_label domain variable_id grid_label version
