add _fit_auto_regression functions #139

Merged (8 commits, Apr 7, 2022)
Changes from 2 commits
4 changes: 3 additions & 1 deletion CHANGELOG.rst
@@ -13,7 +13,9 @@ New Features
- Add ``mesmer.core.linear_regression``: xarray wrapper for ``mesmer.core._linear_regression``.
(`#123 <https://github.com/MESMER-group/mesmer/pull/123>`_).
By `Mathias Hauser <https://github.com/mathause>`_.

- Add ``mesmer.core.auto_regression._fit_auto_regression_xr``: xarray wrapper to fit an
auto regression model (`#139 <https://github.com/MESMER-group/mesmer/pull/139>`_).
By `Mathias Hauser <https://github.com/mathause>`_.

Breaking changes
^^^^^^^^^^^^^^^^
44 changes: 22 additions & 22 deletions mesmer/calibrate_mesmer/train_gv.py
@@ -12,8 +12,11 @@
import joblib
import numpy as np
import statsmodels.api as sm
import xarray as xr
from packaging.version import Version
from statsmodels.tsa.ar_model import AutoReg, ar_select_order
from statsmodels.tsa.ar_model import ar_select_order

from mesmer.core.auto_regression import _fit_auto_regression_xr


def train_gv(gv, targ, esm, cfg, save_params=True, **kwargs):
@@ -193,29 +196,26 @@ def train_gv_AR(params_gv, gv, max_lag, sel_crit):
AR_order_sel = int(np.percentile(AR_order_scens_tmp, **pct_kwargs))

# determine the AR params for the selected AR order
params_gv["AR_int"] = 0
params_gv["AR_coefs"] = np.zeros(AR_order_sel)
params_gv["AR_order_sel"] = AR_order_sel
params_gv["AR_std_innovs"] = 0

res = list()
Member Author: Rename to result? res might stand for residuals.

Collaborator: Yep, although if the function is small enough it will be clear from the context.

for scen_idx, scen in enumerate(gv.keys()):
nr_runs = gv[scen].shape[0]
AR_order_runs_tmp = np.zeros(nr_runs)
AR_int_tmp = 0
AR_coefs_tmp = np.zeros(AR_order_sel)
AR_std_innovs_tmp = 0
data = gv[scen]
Collaborator: If we split out a function for this inner loop, we might be able to make the pattern look more like linear regression does.

Collaborator: Although I guess it's almost as small as it can be already; the inner function would have to be called something like _fit_auto_regression_with_mean_over_runs, I guess.

(Side note: what does 'run' mean here? Is that an ensemble member?)

Member Author: I think this is definitely a good idea. However, I am still not sure what our "outer" data structure should be (I have a prototype of a DataList but I am not entirely convinced).

run is an ensemble member. I just followed Lea's terminology, but I agree we should standardize this stuff...
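A minimal sketch of the helper suggested above, assuming each scenario is a 2D numpy array with shape (run, time); the name _fit_auto_regression_with_mean_over_runs and this layout follow the comment, not the final PR:

import xarray as xr

from mesmer.core.auto_regression import _fit_auto_regression_xr


def _fit_auto_regression_with_mean_over_runs(data, lags):
    # data: one scenario as a 2D numpy array, shape (run, time)
    data = xr.DataArray(data, dims=("run", "time"))

    # fit an AR model per run, then average the parameters over the runs
    params = _fit_auto_regression_xr(data, dim="time", lags=lags)
    return params.mean("run")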


for run in np.arange(nr_runs):
AR_model_tmp = AutoReg(
gv[scen][run], lags=AR_order_sel, old_names=False
).fit()
AR_int_tmp += AR_model_tmp.params[0] / nr_runs
AR_coefs_tmp += AR_model_tmp.params[1:] / nr_runs
AR_std_innovs_tmp += np.sqrt(AR_model_tmp.sigma2) / nr_runs

params_gv["AR_int"] += AR_int_tmp / nr_scens
params_gv["AR_coefs"] += AR_coefs_tmp / nr_scens
params_gv["AR_std_innovs"] += AR_std_innovs_tmp / nr_scens
# create temporary DataArray
data = xr.DataArray(data, dims=("run", "time"))

params = _fit_auto_regression_xr(data, dim="time", lags=AR_order_sel)
params = params.mean("run")

res.append(params)

res = xr.concat(res, dim="scen")
res = res.mean("scen")

# TODO: remove np.float64(...) (only here so the tests pass)
params_gv["AR_order_sel"] = AR_order_sel
params_gv["AR_int"] = np.float64(res.trend.values)
params_gv["AR_coefs"] = res.coeffs.values.squeeze()
params_gv["AR_std_innovs"] = np.float64(res.standard_deviation.values)

# check if fitted AR process is stationary
# (highly unlikely this test will ever fail but better safe than sorry)
47 changes: 22 additions & 25 deletions mesmer/calibrate_mesmer/train_lv.py
@@ -11,8 +11,10 @@

import joblib
import numpy as np
import xarray as xr
from scipy.stats import multivariate_normal
from statsmodels.tsa.ar_model import AutoReg

from mesmer.core.auto_regression import _fit_auto_regression_xr

from .train_utils import train_l_prepare_X_y_wgteq

@@ -217,38 +219,33 @@ def train_lv_AR1_sci(params_lv, targs, y, wgt_scen_eq, aux, cfg):
# easier to loop over individ runs / scenarios
targ_names = list(targs.keys())
scenarios_tr = list(targs[targ_names[0]].keys())
nr_scens = len(scenarios_tr)

# fit parameters for each target individually
for t, targ_name in enumerate(targ_names):
targ = targs[targ_name]
nr_gps = y.shape[1]
y_targ = y[:, :, t]

# AR(1)
params_lv["AR1_int"][targ_name] = np.zeros(nr_gps)
params_lv["AR1_coef"][targ_name] = np.zeros(nr_gps)
params_lv["AR1_std_innovs"][targ_name] = np.zeros(nr_gps)

res = list()
for scen in scenarios_tr:
nr_runs, nr_ts, nr_gps = targ[scen].shape
AR1_int_runs = np.zeros(nr_gps)
AR1_coef_runs = np.zeros(nr_gps)
AR1_std_innovs_runs = np.zeros(nr_gps)

for run in np.arange(nr_runs):
for gp in np.arange(nr_gps):
AR1_model = AutoReg(
targ[scen][run, :, gp], lags=1, old_names=False
).fit()
AR1_int_runs[gp] += AR1_model.params[0] / nr_runs
AR1_coef_runs[gp] += AR1_model.params[1] / nr_runs
# sqrt of variance = standard deviation
AR1_std_innovs_runs[gp] += np.sqrt(AR1_model.sigma2) / nr_runs

params_lv["AR1_int"][targ_name] += AR1_int_runs / nr_scens
params_lv["AR1_coef"][targ_name] += AR1_coef_runs / nr_scens
params_lv["AR1_std_innovs"][targ_name] += AR1_std_innovs_runs / nr_scens
data = targ[scen]

nr_runs, nr_ts, nr_gps = data.shape

# create temporary DataArray
data = xr.DataArray(data, dims=("run", "time", "cell"))

params = _fit_auto_regression_xr(data, dim="time", lags=1)
params = params.mean("run")

res.append(params)

res = xr.concat(res, dim="scen")
res = res.mean("scen")

params_lv["AR1_int"][targ_name] = res.trend.values
params_lv["AR1_coef"][targ_name] = res.coeffs.values.squeeze()
params_lv["AR1_std_innovs"][targ_name] = res.standard_deviation.values

# determine localization radius, empirical cov matrix, and localized ecov matrix
(
77 changes: 77 additions & 0 deletions mesmer/core/auto_regression.py
@@ -0,0 +1,77 @@
import numpy as np
import xarray as xr


def _fit_auto_regression_xr(data, dim, lags):
"""
fit an auto regression - xarray wrapper

Parameters
----------
data : xr.DataArray
A ``xr.DataArray`` to estimate the auto regression over.
dim : str
    Dimension along which to fit the auto regression.
lags : int
The number of lags to include in the model.

Returns
-------
:obj:`xr.Dataset`
Dataset containing the estimated parameters of the ``trend``, the AR ``coeffs``
and the ``standard_deviation`` of the residuals.
"""

if not isinstance(data, xr.DataArray):
raise TypeError(f"Expected a `xr.DataArray`, got {type(data)}")

trend, coeffs, std = xr.apply_ufunc(
_fit_auto_regression_np,
data,
input_core_dims=[[dim]],
output_core_dims=((), ("lags",), ()),
vectorize=True,
output_dtypes=[float, float, float],
kwargs={"lags": lags},
)

# TODO: are the names appropriate?
data_vars = {"trend": trend, "coeffs": coeffs, "standard_deviation": std}

# TODO: add coords for lags?
return xr.Dataset(data_vars)


def _fit_auto_regression_np(data, lags):
"""
fit an auto regression - numpy wrapper

Parameters
----------
data : np.array
A numpy array to estimate the auto regression over. Must be 1D.
lags : int
The number of lags to include in the model.

Returns
-------
trend : :obj:`np.array`
    Trend part of the fitted AR model.
coeffs : :obj:`np.array`
    Coefficients of the AR model. Will have as many entries as ``lags``.
std : :obj:`np.array`
Standard deviation of the residuals.
"""

from statsmodels.tsa.ar_model import AutoReg
Collaborator: Why here and not wrapped in try/except at the top level? Is that an xarray pattern?

Member Author: I did this for the linear regression because the sklearn class has the same name as ours (LinearRegression), so I just followed the same pattern.


AR_model = AutoReg(data, lags=lags, old_names=False)
AR_result = AR_model.fit()

trend = AR_result.params[0]
coeffs = AR_result.params[1:]

# sqrt of variance = standard deviation
std = np.sqrt(AR_result.sigma2)

return trend, coeffs, std
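
A minimal usage sketch of the new xarray wrapper on synthetic data (the DataArray and dimension names below are illustrative only):

import numpy as np
import xarray as xr

from mesmer.core.auto_regression import _fit_auto_regression_xr

# synthetic data: 3 cells with 100 time steps of white noise
data = xr.DataArray(np.random.randn(3, 100), dims=("cell", "time"))

# fit an AR(1) model along "time", independently for each cell
params = _fit_auto_regression_xr(data, dim="time", lags=1)

# params is a Dataset with "trend" (cell,), "coeffs" (cell, lags)
# and "standard_deviation" (cell,)
print(params)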
6 changes: 6 additions & 0 deletions mesmer/core/utils.py
@@ -71,6 +71,7 @@ def _check_dataarray_form(
*,
ndim: int = None,
required_dims: Union[str, Set[str]] = set(),
shape=None,
):
"""check if a dataset conforms to some conditions

@@ -82,6 +83,8 @@
Number of required dimensions
required_dims: str, set of str, optional
Names of dims that are required for obj
shape : tuple of ints, default: None
Required shape. Ignored if None.

Raises
------
@@ -101,3 +104,6 @@
if required_dims - set(obj.dims):
missing_dims = " ,".join(required_dims - set(obj.dims))
raise ValueError(f"{name} is missing the required dims: {missing_dims}")

if shape is not None and obj.shape != shape:
raise ValueError(f"{name} has wrong shape - expected {shape}, got {obj.shape}")
92 changes: 92 additions & 0 deletions tests/integration/test_auto_regression.py
@@ -0,0 +1,92 @@
from unittest import mock

import numpy as np
import pytest
import xarray as xr

import mesmer.core.auto_regression
from mesmer.core.utils import _check_dataarray_form, _check_dataset_form

from .utils import trend_data_1D, trend_data_2D


@pytest.mark.parametrize("obj", [xr.Dataset(), None])
def test_auto_regression_xr_errors(obj):

with pytest.raises(TypeError, match="Expected a `xr.DataArray`"):
mesmer.core.auto_regression._fit_auto_regression_xr(obj, "dim", lags=1)


@pytest.mark.parametrize("lags", [1, 2])
def test_auto_regression_xr_1D(lags):

data = trend_data_1D()
res = mesmer.core.auto_regression._fit_auto_regression_xr(data, "time", lags=lags)

_check_dataset_form(
res,
"_fit_auto_regression_result",
required_vars=["trend", "coeffs", "standard_deviation"],
)

_check_dataarray_form(res.trend, "trend", ndim=0, shape=())
_check_dataarray_form(
res.coeffs, "coeffs", ndim=1, required_dims={"lags"}, shape=(lags,)
)
_check_dataarray_form(
res.standard_deviation, "standard_deviation", ndim=0, shape=()
)


@pytest.mark.parametrize("lags", [1, 2])
def test_auto_regression_xr_2D(lags):

data = trend_data_2D()
res = mesmer.core.auto_regression._fit_auto_regression_xr(data, "time", lags=lags)

(n_cells,) = data.cells.shape

_check_dataset_form(
res,
"_fit_auto_regression_result",
required_vars=["trend", "coeffs", "standard_deviation"],
)

_check_dataarray_form(res.trend, "trend", ndim=1, shape=(n_cells,))
_check_dataarray_form(
res.coeffs,
"coeffs",
ndim=2,
required_dims={"cells", "lags"},
shape=(n_cells, lags),
)
_check_dataarray_form(
res.standard_deviation, "standard_deviation", ndim=1, shape=(n_cells,)
)


@pytest.mark.parametrize("lags", [1, 2])
def test_auto_regression_np(lags):

data = np.array([0, 1, 3.14])

mock_auto_regressor = mock.Mock()
mock_auto_regressor.params = np.array([0.1, 0.25])
mock_auto_regressor.sigma2 = 3.14

with mock.patch(
"statsmodels.tsa.ar_model.AutoReg"
) as mocked_auto_regression, mock.patch(
"statsmodels.tsa.ar_model.AutoRegResults"
) as mocked_auto_regression_result:

mocked_auto_regression.return_value = mocked_auto_regression_result
mocked_auto_regression_result.return_value = mock_auto_regressor

mesmer.core.auto_regression._fit_auto_regression_np(data, lags=lags)

mocked_auto_regression.assert_called_once()
mocked_auto_regression.assert_called_with(data, lags=lags, old_names=False)

mocked_auto_regression_result.fit.assert_called_once()
mocked_auto_regression_result.fit.assert_called_with()
15 changes: 15 additions & 0 deletions tests/integration/test_utils.py
@@ -109,3 +109,18 @@ def test_check_dataarray_form_required_dims(required_dims):
mesmer.core.utils._check_dataarray_form(da, required_dims="y")
mesmer.core.utils._check_dataarray_form(da, required_dims=["x", "y"])
mesmer.core.utils._check_dataarray_form(da, required_dims={"x", "y"})


def test_check_dataarray_form_shape():

da = xr.DataArray(np.ones((2, 2)), dims=("x", "y"))

for shape in ((), (1,), (1, 2), (2, 1), (1, 2, 3)):
with pytest.raises(ValueError, match="obj has wrong shape"):
mesmer.core.utils._check_dataarray_form(da, shape=shape)

with pytest.raises(ValueError, match="test has wrong shape"):
mesmer.core.utils._check_dataarray_form(da, name="test", shape=())

# no error
mesmer.core.utils._check_dataarray_form(da, shape=(2, 2))