Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

xarray wrapper to draw auto-regression samples #322

Merged
merged 8 commits into from Oct 27, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 5 additions & 3 deletions CHANGELOG.rst
Expand Up @@ -30,12 +30,14 @@ New Features
of the standard deviation (
`#306 <https://github.com/MESMER-group/mesmer/issues/306>`_
`#318 <https://github.com/MESMER-group/mesmer/pull/318>`_). By `Mathias Hauser`_.
- Add ``mesmer.stats.auto_regression._draw_auto_regression_correlated_np``: to draw samples of an
auto regression model (
- Add ``_draw_auto_regression_correlated`` and ``_draw_auto_regression_correlated``: to draw samples of a
mathause marked this conversation as resolved.
Show resolved Hide resolved
(spatially-)correlated and uncorrelated auto regression model (
`#322 <https://github.com/MESMER-group/mesmer/pull/322>`_,
`#161 <https://github.com/MESMER-group/mesmer/pull/161>`_ and
`#313 <https://github.com/MESMER-group/mesmer/pull/313>`_).
By `Mathias Hauser`_.
- Extract function to select the order of the auto regressive model: ``mesmer.stats.auto_regression._select_ar_order_xr``
- Add ``mesmer.stats.auto_regression._select_ar_order_xr`` to select the order of an
auto regressive model
(`#176 <https://github.com/MESMER-group/mesmer/pull/176>`_).
By `Mathias Hauser`_.

Expand Down
31 changes: 20 additions & 11 deletions mesmer/create_emulations/create_emus_gv.py
Expand Up @@ -6,11 +6,10 @@
Functions to create global variability emulations with MESMER.
"""


import numpy as np
import xarray as xr

from mesmer.io.save_mesmer_bundle import save_mesmer_data
from mesmer.stats.auto_regression import _draw_auto_regression_correlated_np
from mesmer.stats.auto_regression import _draw_auto_regression_uncorrelated


def create_emus_gv(params_gv, preds_gv, cfg, save_emus=True):
Expand Down Expand Up @@ -166,15 +165,25 @@ def create_emus_gv_AR(params_gv, nr_emus_v, nr_ts_emus_v, seed):
# only use the selected coeffs
ar_coefs = ar_coefs[:AR_order_sel]

emus_gv = _draw_auto_regression_correlated_np(
intercept=ar_int,
# reshape to n_coefs x n_cells (cell was squeezed in train_gv.train_gv_AR)
coeffs=ar_coefs[:, np.newaxis],
covariance=AR_std_innovs**2, # pass the (co-)variance!
n_samples=nr_emus_v,
n_ts=nr_ts_emus_v,
# create intermediate ar_params Dataset
# the variables are 1D (except coeffs)
intercept = xr.DataArray(ar_int)
coeffs = xr.DataArray(ar_coefs, dims="lags")
variance = xr.DataArray(AR_std_innovs**2)

ar_params = xr.Dataset(
{"intercept": intercept, "coeffs": coeffs, "variance": variance}
)

emus_gv = _draw_auto_regression_uncorrelated(
ar_params,
time=nr_ts_emus_v,
realisation=nr_emus_v,
seed=seed,
buffer=buffer,
)

return emus_gv.squeeze()
# get back the old order of the emus
emus_gv = emus_gv.values.transpose()

return emus_gv
30 changes: 21 additions & 9 deletions mesmer/create_emulations/create_emus_lv.py
Expand Up @@ -8,11 +8,12 @@


import numpy as np
import xarray as xr

import mesmer.stats
from mesmer.create_emulations.utils import _gather_lr_params, _gather_lr_preds
from mesmer.io.save_mesmer_bundle import save_mesmer_data
from mesmer.stats.auto_regression import _draw_auto_regression_correlated_np
from mesmer.stats.auto_regression import _draw_auto_regression_correlated


def create_emus_lv(params_lv, preds_lv, cfg, save_emus=True, submethod=""):
Expand Down Expand Up @@ -184,18 +185,29 @@ def create_emus_lv_AR1_sci(emus_lv, params_lv, preds_lv, cfg):
if len(emus_lv[scen]) == 0:
emus_lv[scen][targ] = 0

emus_ar = _draw_auto_regression_correlated_np(
intercept=params_lv["AR1_int"][targ],
# reshape to n_coefs x n_cells
# (coeffs was squeezed in train_lv.train_lv_AR1_sci)
coeffs=params_lv["AR1_coef"][targ][np.newaxis, :],
covariance=params_lv["loc_ecov_AR1_innovs"][targ],
n_samples=nr_emus_v,
n_ts=nr_ts_emus_stoch_v,
# create intermediate ar_params & covariance Dataset / DataArray
intercept = xr.DataArray(params_lv["AR1_int"][targ], dims="gridpoint")

dims = ("lags", "gridpoint")
coeffs = xr.DataArray(params_lv["AR1_coef"][targ][np.newaxis, :], dims=dims)

ar_params = xr.Dataset({"intercept": intercept, "coeffs": coeffs})

dims = ("gridpoint_i", "gridpoint_j")
covariance = xr.DataArray(params_lv["loc_ecov_AR1_innovs"][targ], dims=dims)

emus_ar = _draw_auto_regression_correlated(
ar_params,
covariance,
time=nr_ts_emus_stoch_v,
realisation=nr_emus_v,
seed=seed,
buffer=20,
)

# get back the old order of the emus
emus_ar = emus_ar.values.transpose((2, 0, 1))

emus_lv[scen][targ] += emus_ar.squeeze()
return emus_lv

Expand Down
251 changes: 250 additions & 1 deletion mesmer/stats/auto_regression.py
@@ -1,6 +1,9 @@
import numpy as np
import pandas as pd
import xarray as xr

from mesmer.core.utils import _check_dataarray_form, _check_dataset_form


def _select_ar_order_xr(data, dim, maxlag, ic="bic"):
"""Select the order of an autoregressive process - xarray wrapper
Expand Down Expand Up @@ -79,6 +82,252 @@ def _select_ar_order_np(data, maxlag, ic="bic"):
return selected_ar_order


def _get_size_and_coord_dict(coords_or_size, dim, name):

if isinstance(coords_or_size, int):
size, coord_dict = coords_or_size, {}

return size, coord_dict

# TODO: use public xr.Index when the minimum xarray version is v2023.08.0~26
mathause marked this conversation as resolved.
Show resolved Hide resolved
xr_Index = xr.core.indexes.Index

if not isinstance(coords_or_size, (xr.DataArray, xr_Index, pd.Index)):
raise TypeError(
f"expected '{name}' to be an `int`, pandas or xarray Index or a `DataArray`"
f" got {type(coords_or_size)}"
)

if coords_or_size.ndim != 1:
raise ValueError(f"Coords must be 1D but has {coords_or_size.ndim} dimensions")

size = coords_or_size.size
coord_dict = {dim: np.asarray(coords_or_size)}

return size, coord_dict


def _draw_auto_regression_uncorrelated(
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@yquilcaille can you read the docstrings and let me know if this makes sense?

I added a leading underscore to the name as I am not yet sure I'll expose the functions as is, but that can be changed.

ar_params,
*,
time,
realisation,
seed,
buffer,
time_dim="time",
realisation_dim="realisation",
):

mathause marked this conversation as resolved.
Show resolved Hide resolved
"""draw time series of an auto regression process

Parameters
----------
ar_params : Dataset
Dataset containing the estimated parameters of the AR process. Must contain the
following DataArray objects:

- intercept
- coeffs
- variance

time : int | DataArray | Index
Defines the number of auto-correlated samples to draw and possibly its coordinates.

- ``int``: defines the number of time steps to draw
- ``DataArray`` or ``Index``: defines the coordinates and its length the number
of samples along the time dimension to draw.

realisation : int | DataArray
Defines the number of uncorrelated samples to draw and possibly its coordinates.
See ``time`` for details.

seed : int
Seed used to initialize the pseudo-random number generator.

buffer : int
Buffer to initialize the autoregressive process (ensures that start at 0 does
not influence overall result).

Returns
-------
out : DataArray
Drawn realizations of the specified autoregressive process. The array has shape
n_time x n_coeffs x n_realisations.

Notes
-----
The number of (spatially-)correlated samples is defined by the size of ``ar_params``
(``n_coeffs``, i.e. the number of gridpoints) and ``covariance`` (which must be
equal).

mathause marked this conversation as resolved.
Show resolved Hide resolved
"""

# check the input
_check_dataset_form(
ar_params, "ar_params", required_vars=("intercept", "coeffs", "variance")
)

if (
ar_params.intercept.ndim != 0
or ar_params.coeffs.ndim != 1
or ar_params.variance.ndim != 0
):
raise ValueError(
"``_draw_auto_regression_uncorrelated`` can currently only handle single points"
)

# _draw_ar_corr_xr_internal expects 2D arrays
ar_params = ar_params.expand_dims("__gridpoint__")

result = _draw_ar_corr_xr_internal(
intercept=ar_params.intercept,
coeffs=ar_params.coeffs,
covariance=ar_params.variance,
time=time,
realisation=realisation,
seed=seed,
buffer=buffer,
time_dim=time_dim,
realisation_dim=realisation_dim,
)

# remove the "__gridpoint__" dim again
result = result.squeeze(dim="__gridpoint__", drop=True)

return result


def _draw_auto_regression_correlated(
ar_params,
covariance,
*,
time,
realisation,
seed,
buffer,
time_dim="time",
realisation_dim="realisation",
):
"""
draw time series of an auto regression process with spatially-correlated innovations

Parameters
----------
ar_params : Dataset
Dataset containing the estimated parameters of the AR process. Must contain the
following DataArray objects:

- intercept
- coeffs

covariance : DataArray
The (co-)variance array. Must be symmetric and positive-semidefinite.

time : int | DataArray | Index
Defines the number of auto-correlated samples to draw and possibly its coordinates.

- ``int``: defines the number of time steps to draw
- ``DataArray`` or ``Index``: defines the coordinates and its length the number
of samples along the time dimension to draw.

realisation : int | DataArray
Defines the number of uncorrelated samples to draw and possibly its coordinates.
See ``time`` for details.

seed : int
Seed used to initialize the pseudo-random number generator.

buffer : int
Buffer to initialize the autoregressive process (ensures that start at 0 does
not influence overall result).

Returns
-------
out : DataArray
Drawn realizations of the specified autoregressive process. The array has shape
n_time x n_coeffs x n_realisations.

Notes
-----
The number of (spatially-)correlated samples is defined by the size of ``ar_params``
(``n_coeffs``, i.e. the number of gridpoints) and ``covariance`` (which must be
equal).

"""

# check the input
_check_dataset_form(ar_params, "ar_params", required_vars=("intercept", "coeffs"))
_check_dataarray_form(ar_params.intercept, "intercept", ndim=1)

(dim,), size = ar_params.intercept.dims, ar_params.intercept.size
_check_dataarray_form(
ar_params.coeffs, "coeffs", ndim=2, required_dims=("lags", dim)
)
_check_dataarray_form(covariance, "covariance", ndim=2, shape=(size, size))

result = _draw_ar_corr_xr_internal(
intercept=ar_params.intercept,
coeffs=ar_params.coeffs,
covariance=covariance,
time=time,
realisation=realisation,
seed=seed,
buffer=buffer,
time_dim=time_dim,
realisation_dim=realisation_dim,
)

return result


def _draw_ar_corr_xr_internal(
intercept,
coeffs,
covariance,
*,
time,
realisation,
seed,
buffer,
time_dim="time",
realisation_dim="realisation",
):

# get the size and coords of the new dimensions
n_ts, time_coords = _get_size_and_coord_dict(time, time_dim, "time")
n_realisations, realisation_coords = _get_size_and_coord_dict(
realisation, realisation_dim, "realisation"
)

# the dimension name of the gridpoints
(gridpoint_dim,) = set(intercept.dims)
# make sure non-dimension coords are properly caught
gridpoint_coords = dict(coeffs[gridpoint_dim].coords)

out = _draw_auto_regression_correlated_np(
intercept=intercept.values,
coeffs=coeffs.transpose(..., gridpoint_dim).values,
covariance=covariance.values,
n_samples=n_realisations,
n_ts=n_ts,
seed=seed,
buffer=buffer,
)

dims = (realisation_dim, time_dim, gridpoint_dim)

# TODO: use dict union once requiring py3.9+
# coords = gridpoint_coords | time_coords | realisation_coords
coords = {**time_coords, **realisation_coords, **gridpoint_coords}

out = xr.DataArray(out, dims=dims, coords=coords)

# for consistency we transpose to time x gridpoint x realisation
out = out.transpose(time_dim, gridpoint_dim, realisation_dim)

return out


def _draw_auto_regression_correlated_np(
*, intercept, coeffs, covariance, n_samples, n_ts, seed, buffer
):
Expand Down Expand Up @@ -132,7 +381,7 @@ def _draw_auto_regression_correlated_np(
# arbitrary lags? no, see: https://github.com/MESMER-group/mesmer/issues/164
ar_lags = np.arange(1, ar_order + 1, dtype=int)

# ensure reproducibility (TODO: clarify approach to this, see #35)
# ensure reproducibility (TODO: https://github.com/MESMER-group/mesmer/issues/35)
np.random.seed(seed)

# NOTE: 'innovations' is the error or noise term.
Expand Down