add _fit_auto_regression functions #139

Merged (8 commits, Apr 7, 2022)
Changes from 2 commits
4 changes: 3 additions & 1 deletion CHANGELOG.rst
@@ -13,7 +13,9 @@ New Features
- Add ``mesmer.core.linear_regression``: xarray wrapper for ``mesmer.core._linear_regression``.
(`#123 <https://github.com/MESMER-group/mesmer/pull/123>`_).
By `Mathias Hauser <https://github.com/mathause>`_.

- Add ``mesmer.core.auto_regression._fit_auto_regression_xr``: xarray wrapper to fit an
auto regression model (`#139 <https://github.com/MESMER-group/mesmer/pull/139>`_).
By `Mathias Hauser <https://github.com/mathause>`_.

Breaking changes
^^^^^^^^^^^^^^^^
44 changes: 22 additions & 22 deletions mesmer/calibrate_mesmer/train_gv.py
@@ -12,8 +12,11 @@
import joblib
import numpy as np
import statsmodels.api as sm
import xarray as xr
from packaging.version import Version
from statsmodels.tsa.ar_model import AutoReg, ar_select_order
from statsmodels.tsa.ar_model import ar_select_order

from mesmer.core.auto_regression import _fit_auto_regression_xr


def train_gv(gv, targ, esm, cfg, save_params=True, **kwargs):
@@ -193,29 +196,26 @@ def train_gv_AR(params_gv, gv, max_lag, sel_crit):
AR_order_sel = int(np.percentile(AR_order_scens_tmp, **pct_kwargs))

# determine the AR params for the selected AR order
params_gv["AR_int"] = 0
params_gv["AR_coefs"] = np.zeros(AR_order_sel)
params_gv["AR_order_sel"] = AR_order_sel
params_gv["AR_std_innovs"] = 0

res = list()
Member Author: Rename to result? res might stand for residuals.

Collaborator: Yep, although if the function is small enough it will be clear from the context.

for scen_idx, scen in enumerate(gv.keys()):
nr_runs = gv[scen].shape[0]
AR_order_runs_tmp = np.zeros(nr_runs)
AR_int_tmp = 0
AR_coefs_tmp = np.zeros(AR_order_sel)
AR_std_innovs_tmp = 0
data = gv[scen]
Collaborator: If we split out a function for this inner loop, we might be able to make the pattern look more like linear regression does.

Collaborator: Although I guess it's almost as small as it can be already; the inner function would have to be called something like _fit_auto_regression_with_mean_over_runs, I guess.

(Side note: what does 'run' mean here? Is that an ensemble member?)

Member Author: I think this is definitely a good idea. However, I am still not sure what our "outer" data structure should be (I have a prototype of a DataList but I am not entirely convinced).

run is an ensemble member. I just followed Lea's terminology, but I agree we should standardize this stuff...
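A minimal sketch of the helper suggested above, assuming each scenario is a 2D numpy array with shape (run, time); the name _fit_auto_regression_with_mean_over_runs and this layout follow the comment, not the final PR:

import xarray as xr

from mesmer.core.auto_regression import _fit_auto_regression_xr


def _fit_auto_regression_with_mean_over_runs(data, lags):
    # data: one scenario as a 2D numpy array, shape (run, time)
    data = xr.DataArray(data, dims=("run", "time"))

    # fit an AR model per run, then average the parameters over the runs
    params = _fit_auto_regression_xr(data, dim="time", lags=lags)
    return params.mean("run")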


for run in np.arange(nr_runs):
AR_model_tmp = AutoReg(
gv[scen][run], lags=AR_order_sel, old_names=False
).fit()
AR_int_tmp += AR_model_tmp.params[0] / nr_runs
AR_coefs_tmp += AR_model_tmp.params[1:] / nr_runs
AR_std_innovs_tmp += np.sqrt(AR_model_tmp.sigma2) / nr_runs

params_gv["AR_int"] += AR_int_tmp / nr_scens
params_gv["AR_coefs"] += AR_coefs_tmp / nr_scens
params_gv["AR_std_innovs"] += AR_std_innovs_tmp / nr_scens
# create temporary DataArray
data = xr.DataArray(data, dims=("run", "time"))

params = _fit_auto_regression_xr(data, dim="time", lags=AR_order_sel)
params = params.mean("run")

res.append(params)

res = xr.concat(res, dim="scen")
res = res.mean("scen")

# TODO: remove np.float64(...) (only here so the tests pass)
params_gv["AR_order_sel"] = AR_order_sel
params_gv["AR_int"] = np.float64(res.trend.values)
params_gv["AR_coefs"] = res.coeffs.values.squeeze()
params_gv["AR_std_innovs"] = np.float64(res.standard_deviation.values)

# check if fitted AR process is stationary
# (highly unlikely this test will ever fail but better safe than sorry)
47 changes: 22 additions & 25 deletions mesmer/calibrate_mesmer/train_lv.py
@@ -11,8 +11,10 @@

import joblib
import numpy as np
import xarray as xr
from scipy.stats import multivariate_normal
from statsmodels.tsa.ar_model import AutoReg

from mesmer.core.auto_regression import _fit_auto_regression_xr

from .train_utils import train_l_prepare_X_y_wgteq

@@ -217,38 +219,33 @@ def train_lv_AR1_sci(params_lv, targs, y, wgt_scen_eq, aux, cfg):
# easier to loop over individ runs / scenarios
targ_names = list(targs.keys())
scenarios_tr = list(targs[targ_names[0]].keys())
nr_scens = len(scenarios_tr)

# fit parameters for each target individually
for t, targ_name in enumerate(targ_names):
targ = targs[targ_name]
nr_gps = y.shape[1]
y_targ = y[:, :, t]

# AR(1)
params_lv["AR1_int"][targ_name] = np.zeros(nr_gps)
params_lv["AR1_coef"][targ_name] = np.zeros(nr_gps)
params_lv["AR1_std_innovs"][targ_name] = np.zeros(nr_gps)

res = list()
for scen in scenarios_tr:
nr_runs, nr_ts, nr_gps = targ[scen].shape
AR1_int_runs = np.zeros(nr_gps)
AR1_coef_runs = np.zeros(nr_gps)
AR1_std_innovs_runs = np.zeros(nr_gps)

for run in np.arange(nr_runs):
for gp in np.arange(nr_gps):
AR1_model = AutoReg(
targ[scen][run, :, gp], lags=1, old_names=False
).fit()
AR1_int_runs[gp] += AR1_model.params[0] / nr_runs
AR1_coef_runs[gp] += AR1_model.params[1] / nr_runs
# sqrt of variance = standard deviation
AR1_std_innovs_runs[gp] += np.sqrt(AR1_model.sigma2) / nr_runs

params_lv["AR1_int"][targ_name] += AR1_int_runs / nr_scens
params_lv["AR1_coef"][targ_name] += AR1_coef_runs / nr_scens
params_lv["AR1_std_innovs"][targ_name] += AR1_std_innovs_runs / nr_scens
data = targ[scen]

nr_runs, nr_ts, nr_gps = data.shape

# create temporary DataArray
data = xr.DataArray(data, dims=("run", "time", "cell"))

params = _fit_auto_regression_xr(data, dim="time", lags=1)
params = params.mean("run")

res.append(params)

res = xr.concat(res, dim="scen")
res = res.mean("scen")

params_lv["AR1_int"][targ_name] = res.trend.values
params_lv["AR1_coef"][targ_name] = res.coeffs.values.squeeze()
params_lv["AR1_std_innovs"][targ_name] = res.standard_deviation.values

# determine localization radius, empirical cov matrix, and localized ecov matrix
(
77 changes: 77 additions & 0 deletions mesmer/core/auto_regression.py
@@ -0,0 +1,77 @@
import numpy as np
import xarray as xr


def _fit_auto_regression_xr(data, dim, lags):
"""
fit an auto regression - xarray wrapper

Parameters
----------
data : xr.DataArray
A ``xr.DataArray`` to estimate the auto regression over.
dim : str
    Dimension along which to fit the auto regression.
lags : int
The number of lags to include in the model.

Returns
-------
:obj:`xr.Dataset`
Dataset containing the estimated parameters of the ``trend``, the AR ``coeffs``
and the ``standard_deviation`` of the residuals.
"""

if not isinstance(data, xr.DataArray):
raise TypeError(f"Expected a `xr.DataArray`, got {type(data)}")

trend, coeffs, std = xr.apply_ufunc(
_fit_auto_regression_np,
data,
input_core_dims=[[dim]],
output_core_dims=((), ("lags",), ()),
vectorize=True,
output_dtypes=[float, float, float],
kwargs={"lags": lags},
)

# TODO: are the names appropriate?
data_vars = {"trend": trend, "coeffs": coeffs, "standard_deviation": std}

# TODO: add coords for lags?
return xr.Dataset(data_vars)


def _fit_auto_regression_np(data, lags):
"""
fit an auto regression - numpy wrapper

Parameters
----------
data : np.array
A numpy array to estimate the auto regression over. Must be 1D.
lags : int
The number of lags to include in the model.

Returns
-------
trend : :obj:`np.array`
    Trend part of the fitted AR model.
coeffs : :obj:`np.array`
    Coefficients of the AR model. Will have as many entries as ``lags``.
std : :obj:`np.array`
Standard deviation of the residuals.
"""

from statsmodels.tsa.ar_model import AutoReg
Collaborator: Why here and not wrapped in try/except at the top level? Is that an xarray pattern?

Member Author: I did this for the linear regression because the sklearn class has the same name as ours (LinearRegression), so I just followed the same pattern.


AR_model = AutoReg(data, lags=lags, old_names=False)
AR_result = AR_model.fit()

trend = AR_result.params[0]
coeffs = AR_result.params[1:]

# sqrt of variance = standard deviation
std = np.sqrt(AR_result.sigma2)

return trend, coeffs, std
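
A minimal usage sketch of the new xarray wrapper on synthetic data (the DataArray and dimension names below are illustrative only):

import numpy as np
import xarray as xr

from mesmer.core.auto_regression import _fit_auto_regression_xr

# synthetic data: 3 cells with 100 time steps of white noise
data = xr.DataArray(np.random.randn(3, 100), dims=("cell", "time"))

# fit an AR(1) model along "time", independently for each cell
params = _fit_auto_regression_xr(data, dim="time", lags=1)

# params is a Dataset with "trend" (cell,), "coeffs" (cell, lags)
# and "standard_deviation" (cell,)
print(params)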
6 changes: 6 additions & 0 deletions mesmer/core/utils.py
@@ -71,6 +71,7 @@ def _check_dataarray_form(
*,
ndim: int = None,
required_dims: Union[str, Set[str]] = set(),
shape=None,
):
"""check if a dataset conforms to some conditions

@@ -82,6 +83,8 @@
Number of required dimensions
required_dims: str, set of str, optional
Names of dims that are required for obj
shape : tuple of ints, default: None
Required shape. Ignored if None.

Raises
------
@@ -101,3 +104,6 @@
if required_dims - set(obj.dims):
missing_dims = " ,".join(required_dims - set(obj.dims))
raise ValueError(f"{name} is missing the required dims: {missing_dims}")

if shape is not None and obj.shape != shape:
raise ValueError(f"{name} has wrong shape - expected {shape}, got {obj.shape}")
92 changes: 92 additions & 0 deletions tests/integration/test_auto_regression.py
@@ -0,0 +1,92 @@
from unittest import mock

import numpy as np
import pytest
import xarray as xr

import mesmer.core.auto_regression
from mesmer.core.utils import _check_dataarray_form, _check_dataset_form

from .utils import trend_data_1D, trend_data_2D


@pytest.mark.parametrize("obj", [xr.Dataset(), None])
def test_auto_regression_xr_errors(obj):

with pytest.raises(TypeError, match="Expected a `xr.DataArray`"):
mesmer.core.auto_regression._fit_auto_regression_xr(obj, "dim", lags=1)


@pytest.mark.parametrize("lags", [1, 2])
def test_auto_regression_xr_1D(lags):

data = trend_data_1D()
res = mesmer.core.auto_regression._fit_auto_regression_xr(data, "time", lags=lags)

_check_dataset_form(
res,
"_fit_auto_regression_result",
required_vars=["trend", "coeffs", "standard_deviation"],
)

_check_dataarray_form(res.trend, "trend", ndim=0, shape=())
_check_dataarray_form(
res.coeffs, "coeffs", ndim=1, required_dims={"lags"}, shape=(lags,)
)
_check_dataarray_form(
res.standard_deviation, "standard_deviation", ndim=0, shape=()
)


@pytest.mark.parametrize("lags", [1, 2])
def test_auto_regression_xr_2D(lags):

data = trend_data_2D()
res = mesmer.core.auto_regression._fit_auto_regression_xr(data, "time", lags=lags)

(n_cells,) = data.cells.shape

_check_dataset_form(
res,
"_fit_auto_regression_result",
required_vars=["trend", "coeffs", "standard_deviation"],
)

_check_dataarray_form(res.trend, "trend", ndim=1, shape=(n_cells,))
_check_dataarray_form(
res.coeffs,
"coeffs",
ndim=2,
required_dims={"cells", "lags"},
shape=(n_cells, lags),
)
_check_dataarray_form(
res.standard_deviation, "standard_deviation", ndim=1, shape=(n_cells,)
)


@pytest.mark.parametrize("lags", [1, 2])
def test_auto_regression_np(lags):

data = np.array([0, 1, 3.14])

mock_auto_regressor = mock.Mock()
mock_auto_regressor.params = np.array([0.1, 0.25])
mock_auto_regressor.sigma2 = 3.14

with mock.patch(
"statsmodels.tsa.ar_model.AutoReg"
) as mocked_auto_regression, mock.patch(
"statsmodels.tsa.ar_model.AutoRegResults"
) as mocked_auto_regression_result:

mocked_auto_regression.return_value = mocked_auto_regression_result
mocked_auto_regression_result.return_value = mock_auto_regressor

mesmer.core.auto_regression._fit_auto_regression_np(data, lags=lags)

mocked_auto_regression.assert_called_once()
mocked_auto_regression.assert_called_with(data, lags=lags, old_names=False)

mocked_auto_regression_result.fit.assert_called_once()
mocked_auto_regression_result.fit.assert_called_with()
15 changes: 15 additions & 0 deletions tests/integration/test_utils.py
@@ -109,3 +109,18 @@ def test_check_dataarray_form_required_dims(required_dims):
mesmer.core.utils._check_dataarray_form(da, required_dims="y")
mesmer.core.utils._check_dataarray_form(da, required_dims=["x", "y"])
mesmer.core.utils._check_dataarray_form(da, required_dims={"x", "y"})


def test_check_dataarray_form_shape():

da = xr.DataArray(np.ones((2, 2)), dims=("x", "y"))

for shape in ((), (1,), (1, 2), (2, 1), (1, 2, 3)):
with pytest.raises(ValueError, match="obj has wrong shape"):
mesmer.core.utils._check_dataarray_form(da, shape=shape)

with pytest.raises(ValueError, match="test has wrong shape"):
mesmer.core.utils._check_dataarray_form(da, name="test", shape=())

# no error
mesmer.core.utils._check_dataarray_form(da, shape=(2, 2))