MESMER-group · mathause · Sep 5, 2022 · Sep 2, 2022 · Sep 2, 2022 · znicholls
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -26,6 +26,9 @@ New Features
 - Extract function to select the order of the auto regressive model: ``mesmer.core.auto_regression._select_ar_order_xr``
   (`#176 <https://github.com/MESMER-group/mesmer/pull/176>`_).
   By `Mathias Hauser <https://github.com/mathause>`_.
+- Refactor the LOWESS smooting for xarray objects: :py:func:`mesmer.core.smooting.lowess`.
+  (`#193 <https://github.com/MESMER-group/mesmer/pull/193>`_).
+  By `Mathias Hauser <https://github.com/mathause>`_.
 
 Breaking changes
 ^^^^^^^^^^^^^^^^

diff --git a/docs/source/api.rst b/docs/source/api.rst
@@ -25,6 +25,7 @@ Statistical core functions
    ~core.auto_regression._select_ar_order_xr
    ~core.auto_regression._fit_auto_regression_xr
    ~core.auto_regression._draw_auto_regression_correlated_np
+   ~core.smooting.lowess
 
 Computation
 -----------

diff --git a/mesmer/calibrate_mesmer/train_gt.py b/mesmer/calibrate_mesmer/train_gt.py
@@ -12,9 +12,9 @@
 import joblib
 import numpy as np
 import xarray as xr
-from statsmodels.nonparametric.smoothers_lowess import lowess
 
 from mesmer.core.linear_regression import LinearRegression
+from mesmer.core.smoothing import lowess
 from mesmer.io import load_strat_aod
 
 
@@ -163,14 +163,14 @@ def train_gt(var, targ, esm, time, cfg, save_params=True):
     return params_gt
 
 
-def train_gt_ic_LOWESS(var):
+def train_gt_ic_LOWESS(data):
     """
     Derive smooth global trend of variable from single ESM ic ensemble with LOWESS
     smoother.
 
     Parameters
     ----------
-    var : np.ndarray
+    data : np.ndarray
         2d array (run, time) of globally-averaged time series
 
     Returns
@@ -181,22 +181,21 @@ def train_gt_ic_LOWESS(var):
         fraction of the data used when estimating each y-value
     """
 
-    # number time steps
-    nr_ts = var.shape[1]
+    data = xr.DataArray(data, dims=("ensemble", "time"))
 
     # average across all runs to get a first smoothing
-    av_var = np.mean(var, axis=0)
+    data = data.mean("ensemble")
+
+    dim = "time"
 
     # apply lowess smoother to further smooth the Tglob time series
     # rather arbitrarily chosen value that gives a smooth enough trend,
-    frac_lowess = 50 / nr_ts
+    frac = 50 / data.sizes[dim]
 
     # open to changes but if much smaller, var trend ends up very wiggly
     frac_lowess_name = "50/nr_ts"
 
-    gt_lowess = lowess(
-        av_var, np.arange(nr_ts), return_sorted=False, frac=frac_lowess, it=0
-    )
+    gt_lowess = lowess(data, dim=dim, frac=frac)
 
     return gt_lowess, frac_lowess_name
 

diff --git a/mesmer/core/smoothing.py b/mesmer/core/smoothing.py
@@ -0,0 +1,49 @@
+import numpy as np
+import xarray as xr
+
+from mesmer.core.utils import _check_dataarray_form
+
+
+def lowess(data, dim, *, frac, use_coords_as_x=False, it=0):
+    """LOWESS (Locally Weighted Scatterplot Smoothing) for xarray objects
+
+    Parameters
+    ----------
+    data : xr.DataArray
+        Data to smooth (y-values).
+    dim : str
+        Dimension along which to smooth (x-dimension)
+    frac : float
+        Between 0 and 1. The fraction of the data used when estimating each y-value.
+    use_coords_as_x : boolean, default: False
+        If True uses ``data[dim]`` as x-values else uses ``np.arange(data[dim.size)``
+        (useful if ``dim`` are time coordinates).
+    it : int, default: 0
+        The number of residual-based reweightings to perform.
+    """
+
+    from statsmodels.nonparametric.smoothers_lowess import lowess
+
+    if not isinstance(dim, str):
+        raise ValueError("Can only pass a single dimension.")
+
+    coords = data[dim]
+    _check_dataarray_form(coords, name=dim, ndim=1)
+
+    if use_coords_as_x:
+        x = coords
+    else:
+        x = xr.ones_like(coords)
+        x.data = np.arange(coords.size)
+
+    out = xr.apply_ufunc(
+        lowess,
+        data,
+        x,
+        input_core_dims=[[dim], [dim]],
+        output_core_dims=[[dim]],
+        vectorize=True,
+        kwargs={"frac": frac, "it": it, "return_sorted": False},
+    )
+
+    return out
diff --git a/tests/integration/test_smoothing.py b/tests/integration/test_smoothing.py
@@ -0,0 +1,61 @@
+import pytest
+import xarray as xr
+from statsmodels.nonparametric.smoothers_lowess import lowess
+
+import mesmer.core.smoothing
+from mesmer.core.utils import _check_dataarray_form
+
+from .utils import trend_data_1D, trend_data_2D
+
+
+def test_lowess_errors():
+    data = trend_data_2D()
+
+    with pytest.raises(ValueError, match="Can only pass a single dimension."):
+        mesmer.core.smoothing.lowess(data, ("lat", "lon"), frac=0.3)
+
+    with pytest.raises(ValueError, match="data should be 1-dimensional"):
+        mesmer.core.smoothing.lowess(data.to_dataset(), "data", frac=0.3)
+
+
+@pytest.mark.parametrize("it", [0, 3])
+@pytest.mark.parametrize("frac", [0.3, 0.5])
+def test_lowess(it, frac):
+
+    data = trend_data_1D()
+
+    result = mesmer.core.smoothing.lowess(data, "time", frac=frac, it=it)
+
+    expected = lowess(
+        data.values, data.time.values, frac=frac, it=it, return_sorted=False
+    )
+    expected = xr.DataArray(expected, dims="time", coords={"time": data.time})
+
+    xr.testing.assert_allclose(result, expected)
+
+
+def test_lowess_dataset():
+
+    data = trend_data_1D()
+
+    result = mesmer.core.smoothing.lowess(data.to_dataset(), "time", frac=0.3)
+
+    expected = lowess(
+        data.values, data.time.values, frac=0.3, it=0, return_sorted=False
+    )
+    expected = xr.DataArray(
+        expected, dims="time", coords={"time": data.time}, name="data"
+    )
+    expected = expected.to_dataset()
+
+    xr.testing.assert_allclose(result, expected)
+
+
+def test_lowess_2D():
+    data = trend_data_2D()
+
+    result = mesmer.core.smoothing.lowess(data, "time", frac=0.3)
+
+    _check_dataarray_form(
+        result, "result", ndim=2, required_dims=("time", "cells"), shape=data.shape
+    )
diff --git a/tests/integration/utils.py b/tests/integration/utils.py
@@ -38,7 +38,7 @@ def trend_data_1D(n_timesteps=30, intercept=0, slope=1, scale=1):
 
     data = intercept + slope * time + scatter
 
-    return xr.DataArray(data, dims=("time"), coords={"time": time})
+    return xr.DataArray(data, dims=("time"), coords={"time": time}, name="data")
 
 
 def trend_data_2D(n_timesteps=30, n_lat=3, n_lon=2, intercept=0, slope=1, scale=1):
@@ -58,7 +58,7 @@ def trend_data_2D(n_timesteps=30, n_lat=3, n_lon=2, intercept=0, slope=1, scale=
         "lat": ("cells", LAT.flatten()),
     }
 
-    return xr.DataArray(data, dims=("cells", "time"), coords=coords)
+    return xr.DataArray(data, dims=("cells", "time"), coords=coords, name="data")
 
 
 def trend_data_3D(n_timesteps=30, n_lat=3, n_lon=2, intercept=0, slope=1, scale=1):