Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

extract select_ar_order #176

Merged
merged 9 commits into from Jul 12, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.rst
Expand Up @@ -23,6 +23,9 @@ New Features
- Add ``mesmer.core.auto_regression._draw_auto_regression_correlated_np``: to draw samples of an
auto regression model (`#161 <https://github.com/MESMER-group/mesmer/pull/161>`_).
By `Mathias Hauser <https://github.com/mathause>`_.
- Extract function to select the order of the auto regressive model: ``mesmer.core.auto_regression._select_ar_order_xr``
(`#176 <https://github.com/MESMER-group/mesmer/pull/176>`_).
By `Mathias Hauser <https://github.com/mathause>`_.

Breaking changes
^^^^^^^^^^^^^^^^
Expand Down
1 change: 1 addition & 0 deletions docs/source/api.rst
Expand Up @@ -22,6 +22,7 @@ Statistical core functions
~core.linear_regression.LinearRegression.residuals
~core.linear_regression.LinearRegression.to_netcdf
~core.linear_regression.LinearRegression.from_netcdf
~core.auto_regression._select_ar_order_xr
~core.auto_regression._fit_auto_regression_xr
~core.auto_regression._draw_auto_regression_correlated_np

Expand Down
45 changes: 19 additions & 26 deletions mesmer/calibrate_mesmer/train_gv.py
Expand Up @@ -14,9 +14,8 @@
import statsmodels.api as sm
import xarray as xr
from packaging.version import Version
from statsmodels.tsa.ar_model import ar_select_order

from mesmer.core.auto_regression import _fit_auto_regression_xr
from mesmer.core.auto_regression import _fit_auto_regression_xr, _select_ar_order_xr


def train_gv(gv, targ, esm, cfg, save_params=True, **kwargs):
Expand Down Expand Up @@ -167,33 +166,27 @@ def train_gv_AR(params_gv, gv, max_lag, sel_crit):
params_gv["max_lag"] = max_lag
params_gv["sel_crit"] = sel_crit

if Version(xr.__version__) >= Version("2022.03.0"):
method = "method"
else:
method = "interpolation"

# select the AR Order
nr_scens = len(gv.keys())
AR_order_scens_tmp = np.zeros(nr_scens)
AR_order_scen = list()
for scen in gv.keys():

# np.percentile renamed the keyword in numpy v1.22
if Version(np.__version__) >= Version("1.22.0"):
pct_kwargs = {"q": 50, "method": "nearest"}
else:
pct_kwargs = {"q": 50, "interpolation": "nearest"}
# create temporary DataArray
data = xr.DataArray(gv[scen], dims=["run", "time"])

for scen_idx, scen in enumerate(gv.keys()):
nr_runs = gv[scen].shape[0]
AR_order_runs_tmp = np.zeros(nr_runs)

for run in np.arange(nr_runs):
run_ar_lags = ar_select_order(
gv[scen][run], maxlag=max_lag, ic=sel_crit, old_names=False
).ar_lags
# if order > 0 is selected,add selected order to vector
if len(run_ar_lags) > 0:
AR_order_runs_tmp[run] = run_ar_lags[-1]

# interpolation is not a good way to go here because it could lead to an AR
# order that wasn't chosen by run -> avoid it by just taking nearest
AR_order_scens_tmp[scen_idx] = np.percentile(AR_order_runs_tmp, **pct_kwargs)

AR_order_sel = int(np.percentile(AR_order_scens_tmp, **pct_kwargs))
AR_order = _select_ar_order_xr(data, dim="time", maxlag=max_lag, ic=sel_crit)

# median over all ensemble members ("nearest" ensures an 'existing' lag is selected)
AR_order = AR_order.quantile(q=0.5, **{method: "nearest"})
AR_order_scen.append(AR_order)

# median over all scenarios
AR_order_scen = xr.concat(AR_order_scen, dim="scen")
AR_order_sel = int(AR_order.quantile(q=0.5, **{method: "nearest"}).item())

# determine the AR params for the selected AR order
params_scen = list()
Expand Down
2 changes: 1 addition & 1 deletion mesmer/core/__init__.py
@@ -1,2 +1,2 @@
# flake8: noqa
from . import linear_regression
from . import auto_regression, linear_regression
76 changes: 76 additions & 0 deletions mesmer/core/auto_regression.py
Expand Up @@ -2,6 +2,82 @@
import xarray as xr


def _select_ar_order_xr(data, dim, maxlag, ic="bic"):
mathause marked this conversation as resolved.
Show resolved Hide resolved
"""Select the order of an autoregressive process - xarray wrapper

Parameters
----------
data : DataArray
A ``xr.DataArray`` to estimate the auto regression order.
dim : str
Dimension along which to determine the order.
maxlag : int
The maximum lag to consider.
ic : {'aic', 'hqic', 'bic'}, default 'bic'
The information criterion to use in the selection.

Returns
-------
selected_ar_order : DataArray
Array indicating the selected order with the same size as the input but ``dim``
removed.

Notes
-----
Only full models can be selected along one dimension.
"""

selected_ar_order = xr.apply_ufunc(
_select_ar_order_np,
data,
input_core_dims=[[dim]],
output_core_dims=((),),
vectorize=True,
output_dtypes=[float],
kwargs={"maxlag": maxlag, "ic": ic},
)

# remove zeros
selected_ar_order.data[selected_ar_order.data == 0] = np.NaN

Comment on lines +40 to +42
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# remove zeros
selected_ar_order.data[selected_ar_order.data == 0] = np.NaN

I think that does not actually happen.

selected_ar_order.name = "selected_ar_order"

return selected_ar_order


def _select_ar_order_np(data, maxlag, ic="bic"):
"""Select the order of an autoregressive AR(p) process - numpy wrapper

Parameters
----------
data : array_like
A numpy array to estimate the auto regression order. Must be 1D.
maxlag : int
The maximum lag to consider.
ic : {'aic', 'hqic', 'bic'}, default 'bic'
The information criterion to use in the selection.

Returns
-------
selected_ar_order : int
The selected order.

Notes
-----
Thin wrapper around ``statsmodels.tsa.ar_model.ar_select_order``. Only full models
can be selected.
"""

from statsmodels.tsa.ar_model import ar_select_order
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't it be better to have the import of "ar_select_order" just after the import of xarray? To have it once and for all?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are right it is more common to have the imports at the beginning of the file. However, I will probably keep it here to avoid confusion because the name would be similar to select_ar_order. Tangentially relevant: #177

Just as a side remark - imports are cached in python so there is no time penalty to have it in a function as compared to the top.


ar_lags = ar_select_order(data, maxlag=maxlag, ic=ic, old_names=False).ar_lags

# None is returned if no lag is selected
selected_ar_order = np.NaN if ar_lags is None else ar_lags[-1]

return selected_ar_order


def _draw_auto_regression_correlated_np(
*, intercept, coefs, covariance, n_samples, n_ts, seed, buffer
):
Expand Down
66 changes: 65 additions & 1 deletion tests/integration/test_auto_regression.py
Expand Up @@ -7,7 +7,71 @@
import mesmer.core.auto_regression
from mesmer.core.utils import _check_dataarray_form, _check_dataset_form

from .utils import trend_data_1D, trend_data_2D
from .utils import trend_data_1D, trend_data_2D, trend_data_3D


def test_select_ar_order_xr_1d():

data = trend_data_1D()

result = mesmer.core.auto_regression._select_ar_order_xr(data, "time", 4)

_check_dataarray_form(result, "selected_ar_order", ndim=0, shape=())


@pytest.mark.parametrize("n_lon", [1, 2])
@pytest.mark.parametrize("n_lat", [3, 4])
def test_select_ar_order_xr_3d(n_lon, n_lat):

data = trend_data_3D(n_lat=n_lat, n_lon=n_lon)

result = mesmer.core.auto_regression._select_ar_order_xr(data, "time", 1)

_check_dataarray_form(
result,
"selected_ar_order",
ndim=2,
required_dims={"lon", "lat"},
shape=(n_lat, n_lon),
)


def test_select_ar_order_xr_dim():

data = trend_data_3D(n_timesteps=4, n_lon=5)
result = mesmer.core.auto_regression._select_ar_order_xr(data, "lon", 1)

_check_dataarray_form(
result, "selected_ar_order", ndim=2, required_dims={"time", "lat"}, shape=(4, 3)
)


def test_select_ar_order_xr():

data = trend_data_2D()

result = mesmer.core.auto_regression._select_ar_order_xr(data, "time", 4)

coords = data.drop_vars("time").coords

expected = xr.DataArray([4.0, 2.0, 2.0, 3.0, 4.0, 2.0], dims="cells", coords=coords)

xr.testing.assert_equal(result, expected)


def test_select_ar_order_np():

rng = np.random.default_rng(seed=0)
data = rng.normal(size=100)

result = mesmer.core.auto_regression._select_ar_order_np(data, 2)
assert np.isnan(result)

result = mesmer.core.auto_regression._select_ar_order_np(data[:10], 2)
assert result == 2

with pytest.raises(ValueError):
mesmer.core.auto_regression._select_ar_order_np(data[:6], 5)


@pytest.mark.parametrize("ar_order", [1, 8])
Expand Down
15 changes: 15 additions & 0 deletions tests/integration/utils.py
Expand Up @@ -59,3 +59,18 @@ def trend_data_2D(n_timesteps=30, n_lat=3, n_lon=2, intercept=0, slope=1, scale=
}

return xr.DataArray(data, dims=("cells", "time"), coords=coords)


def trend_data_3D(n_timesteps=30, n_lat=3, n_lon=2, intercept=0, slope=1, scale=1):

data = trend_data_2D(
n_timesteps=n_timesteps,
n_lat=n_lat,
n_lon=n_lon,
intercept=intercept,
slope=slope,
scale=scale,
)

# reshape to 3D (time x lat x lon)
return data.set_index(cells=("lat", "lon")).unstack("cells")