From 456d3bf29a20a3283deb36384ad0e12d92c61345 Mon Sep 17 00:00:00 2001
From: "J.R. Angevaare" <joran.angevaare@gmail.com>
Date: Fri, 8 Sep 2023 09:19:11 +0000
Subject: [PATCH] add a merge method to deal with offset time series

---
 optim_esm_tools/analyze/combine_variables.py |  22 +++-
 test/test_combine_variables.py               | 112 +++++++++++++++++++
 2 files changed, 133 insertions(+), 1 deletion(-)

diff --git a/optim_esm_tools/analyze/combine_variables.py b/optim_esm_tools/analyze/combine_variables.py
index 1b5e0a3..70018bc 100644
--- a/optim_esm_tools/analyze/combine_variables.py
+++ b/optim_esm_tools/analyze/combine_variables.py
@@ -88,7 +88,7 @@ def _squash_variables(self) -> ty.Mapping:
         return new_ds
 
     @staticmethod
-    def _merge_squash(new_ds) -> xr.Dataset:
+    def _merge_squash(new_ds: dict) -> xr.Dataset:
         try:
             new_ds = xr.Dataset(**new_ds)
         except TypeError as e:  # pragma: no cover
@@ -104,6 +104,26 @@ def _merge_squash(new_ds) -> xr.Dataset:
                     new_ds[k].attrs = v.attrs
                 else:
                     new_ds[k] = v
+        dt = new_ds['time'].values[-1].year - new_ds['time'].values[0].year
+        if len(new_ds['time']) > dt + 1:  # more time steps than years; allow off by one
+            new_ds = VariableMerger._fix_wong_merge(new_ds)
+        return new_ds
+
+    @staticmethod
+    def _fix_wong_merge(ds):
+        """Return a new dataset with the time axis of ``ds`` cut to half length.
+
+        The time coordinate keeps every second step of ``ds``. Each variable with
+        a 'time' dimension keeps its odd samples when ``values[10]`` is NaN and
+        its even samples otherwise; variables without 'time' are left as is."""
+        half = len(ds['time']) // 2
+        new_ds = ds.copy().isel(time=slice(None, half))
+        new_ds['time'] = ds['time'][::2]
+        for name, array in ds.data_vars.items():
+            if 'time' not in array.dims:
+                continue
+            first = 1 if np.isnan(array.values[10]) else 0
+            new_ds[name].data = array[first::2]
         return new_ds
 
     def make_fig(self, ds=None, fig_kw=None, add_histograms=False):
diff --git a/test/test_combine_variables.py b/test/test_combine_variables.py
index 0c59222..ff153d5 100644
--- a/test/test_combine_variables.py
+++ b/test/test_combine_variables.py
@@ -1,8 +1,15 @@
 import os
 import tempfile
+from unittest import main
 from unittest import TestCase
 
+import cftime
+import numpy as np
+import pytest
+import xarray as xr
+
 import optim_esm_tools as oet
+from optim_esm_tools.analyze.combine_variables import VariableMerger
 
 
 class TestCombineVariables(TestCase):
@@ -42,3 +49,108 @@ def test_merge_three(self):
 
     def test_merge_w_hist(self):
         self.test_merge_two(add_histograms=True)
+
+
+from hypothesis import given
+from hypothesis import strategies as st
+
+import xarray as xr
+import numpy as np
+import cftime
+
+
+class TestVariableMerger(TestCase):
+    """Tests for `VariableMerger` on a synthetic dataset. This unittest was
+    written using the help of CHATGPT, with a fair amount of optimization."""
+
+    def create_dummy_dataset(self, length, nx=5, ny=20):
+        """Build a synthetic (time, lat, lon) dataset for the merger.
+
+        :param length: number of daily time steps, all in January 2000; the
+            10-sample running mean needs at least 10, the offset dates at most 27
+        :param nx: number of longitude points
+        :param ny: number of latitude points
+        :return: dataset holding three variables with their running means, a
+            mask, a cell area and one variable built on a time axis 4 days later
+        """
+        dims = ('time', 'lat', 'lon')
+        time_values = [cftime.DatetimeNoLeap(2000, 1, i + 1) for i in range(length)]
+        lat_values = np.linspace(-90, 90, ny)
+        lon_values = np.linspace(0, 360, nx)
+        # Keep the order of the np.random calls: the test seeds the global RNG.
+        variable1 = (dims, np.arange(length * ny * nx).reshape(length, ny, nx))
+        variable2 = (dims, np.random.rand(length, ny, nx))
+        variable3 = (dims, np.random.randint(0, 2, size=(length, ny, nx)))
+
+        # Create global mask as a boolean array
+        global_mask = (('lat', 'lon'), np.random.choice([True, False], size=(ny, nx)))
+        cell_area = (('lat', 'lon'), np.arange(ny * nx).reshape(ny, nx))
+
+        # Create variables with offset time values
+        offset_time1 = [cftime.DatetimeNoLeap(2000, 1, i + 5) for i in range(length)]
+        da_off = xr.DataArray(
+            data=np.random.rand(length, ny, nx),
+            dims=dims,
+            coords={
+                'time': offset_time1,
+                'lat': lat_values,
+                'lon': lon_values,
+            },
+        )
+        dummy_data = {
+            'variable1': variable1,
+            'variable2': variable2,
+            'variable3': variable3,
+            'global_mask': global_mask,
+            'cell_area': cell_area,
+        }
+
+        coords = {
+            'time': time_values,
+            'lat': lat_values,
+            'lon': lon_values,
+        }
+
+        dataset = xr.Dataset(data_vars=dummy_data, coords=coords)
+
+        dataset.attrs['variables'] = list(
+            set(dummy_data) - {'global_mask', 'cell_area'} | {'offset_variable1'},
+        )
+
+        # Add a running mean with 10 samples to each variable while considering the new dimensions
+        for var_name in dummy_data:
+            if var_name in ['cell_area', 'global_mask']:
+                continue
+            rm = np.zeros_like(dataset[var_name].values, dtype=np.float16)
+            rm[:] = np.nan
+            for lat_idx in range(ny):
+                for lon_idx in range(nx):
+                    running_mean = np.convolve(
+                        dataset[var_name][:, lat_idx, lon_idx],
+                        np.ones(10) / 10,
+                        mode='valid',
+                    )
+                    # 'valid' mode gives length - 9 samples; the edges stay NaN
+                    rm[5:-4, lat_idx, lon_idx] = running_mean
+            dataset[var_name + '_run_mean_10'] = (('time', 'lat', 'lon'), rm)
+        dataset['offset_variable1'] = da_off
+        dataset['offset_variable1_run_mean_10'] = da_off
+
+        return dataset
+
+    @given(
+        dummy_dataset_length=st.integers(min_value=11, max_value=20),
+        random_seed=st.integers(min_value=1, max_value=1000),
+    )
+    def test_combine_masks(self, dummy_dataset_length, random_seed):
+        """Check `data_set`, `mask_paths` and `merge_method` of a new merger."""
+        np.random.seed(random_seed)
+        dummy_dataset = self.create_dummy_dataset(dummy_dataset_length)
+        merger = VariableMerger(data_set=dummy_dataset)
+        assert merger.data_set.equals(dummy_dataset)
+        assert merger.mask_paths is None
+        assert merger.merge_method == 'logical_or'
+
+
+if __name__ == '__main__':  # run the tests when executed as a script
+    main()