Improve subsample across Coreg subclasses and pipelines (#436)
* Add subsample to subclass init

* Progress on subsampling in Coreg classes and pipelines

* Finalize subsampling functionality

* Add tests and minor fixes

* Add test for get_subsample

* Linting

* Fix other tests

* Get geoutils from git to check everything passes

* Linting

* Add exception in generate pip from conda script when pulling from repo directly

* Change geoutils version required in advance

* First commit for Erik's comments

* Finalize Erik's comments

* Linting
rhugonnet committed Sep 26, 2023
1 parent a5635a5 commit ead2f1b
Showing 14 changed files with 542 additions and 206 deletions.
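
For orientation before the diffs: a rough usage sketch of the reworked subsample argument, pieced together from the test changes below. The example DEM paths and the exact default values are assumptions, not part of this commit; treat this as an illustration rather than the canonical API.

import xdem

# Example DEMs (assumed to come from xdem's bundled Longyearbyen example data)
ref = xdem.DEM(xdem.examples.get_path("longyearbyen_ref_dem"))
tba = xdem.DEM(xdem.examples.get_path("longyearbyen_tba_dem"))

# subsample can now be set at instantiation of any Coreg subclass...
vshift = xdem.coreg.VerticalShift(subsample=0.5)  # values <= 1 are a fraction of valid pixels

# ...or overridden at fit time (a UserWarning is raised when both are non-default)
vshift.fit(ref, tba, subsample=10000, random_state=42)  # values > 1 are a pixel count

# Pipelines accept the same argument per step, or once at fit time for all steps
pipe = xdem.coreg.VerticalShift(subsample=200) + xdem.coreg.Deramp(subsample=5000)
pipe.fit(ref, tba, subsample=1000)  # overrides both steps, with a warning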
9 changes: 8 additions & 1 deletion .github/scripts/generate_pip_deps_from_conda.py
@@ -98,7 +98,14 @@ def generate_pip_from_conda(conda_path: pathlib.Path, pip_path: pathlib.Path, co
if conda_dep:
pip_deps.append(conda_dep)
elif isinstance(dep, dict) and len(dep) == 1 and "pip" in dep:
pip_deps.extend(dep["pip"])
# If pulled directly from GitHub (temporary CI passing),
# such as git+https://github.com/GlacioHack/geoutils.git,
# rename to the package repo name
dep_pips = dep["pip"]
for dep_pip in dep_pips:
if "+" in dep_pip and dep_pip.split("+")[0] == "git":
dep_pip = dep_pip.split("/")[-1].split(".git")[0]
pip_deps.append(dep_pip)
else:
raise ValueError(f"Unexpected dependency {dep}")
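
A standalone sketch of what the added loop does; the input list here is illustrative and not taken from the script:

# A conda "pip:" section mixing a PyPI name and a direct git URL
pip_section = ["numpy", "git+https://github.com/GlacioHack/geoutils.git"]

pip_deps = []
for dep_pip in pip_section:
    if "+" in dep_pip and dep_pip.split("+")[0] == "git":
        # "git+https://github.com/GlacioHack/geoutils.git" -> "geoutils"
        dep_pip = dep_pip.split("/")[-1].split(".git")[0]
    pip_deps.append(dep_pip)

print(pip_deps)  # ['numpy', 'geoutils']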

4 changes: 2 additions & 2 deletions dev-environment.yml
@@ -15,7 +15,7 @@ dependencies:
- tqdm
- scikit-image
- scikit-gstat>=1.0
- geoutils=0.0.13
- geoutils=0.0.15

# Development-specific, to mirror manually in setup.cfg [options.extras_require].
- pip
@@ -52,4 +52,4 @@ dependencies:
- noisyopt

# To run CI against latest GeoUtils
# - git+https://github.com/GlacioHack/GeoUtils.git
# - git+https://github.com/GlacioHack/geoutils.git
4 changes: 2 additions & 2 deletions environment.yml
@@ -15,8 +15,8 @@ dependencies:
- tqdm
- scikit-image
- scikit-gstat>=1.0
- geoutils=0.0.13
- geoutils=0.0.15
- pip

# - pip:
# - git+https://github.com/GlacioHack/GeoUtils.git
# - git+https://github.com/GlacioHack/geoutils.git
2 changes: 1 addition & 1 deletion requirements.txt
@@ -13,7 +13,7 @@ scipy
tqdm
scikit-image
scikit-gstat>=1.0
geoutils==0.0.13
geoutils==0.0.15
pip
setuptools>=42
setuptools_scm[toml]>=6.2
39 changes: 21 additions & 18 deletions tests/test_coreg/test_affine.py
@@ -139,16 +139,14 @@ def test_coreg_example(self, verbose: bool = False) -> None:

# Run co-registration
nuth_kaab = xdem.coreg.NuthKaab()
nuth_kaab.fit(self.ref, self.tba, inlier_mask=self.inlier_mask, verbose=verbose)
nuth_kaab.fit(self.ref, self.tba, inlier_mask=self.inlier_mask, verbose=verbose, random_state=42)

# Check the output metadata is always the same
assert nuth_kaab._meta["offset_east_px"] == pytest.approx(-0.46255704521968716)
assert nuth_kaab._meta["offset_north_px"] == pytest.approx(-0.13618536563846081)
assert nuth_kaab._meta["vshift"] == pytest.approx(-1.9815309753424906)
assert nuth_kaab._meta["offset_east_px"] == pytest.approx(-0.45061858808956284)
assert nuth_kaab._meta["offset_north_px"] == pytest.approx(-0.14225262689582596)
assert nuth_kaab._meta["vshift"] == pytest.approx(-1.987523471566405)

def test_gradientdescending(
self, downsampling: int = 10000, samples: int = 5000, inlier_mask: bool = True, verbose: bool = False
) -> None:
def test_gradientdescending(self, subsample: int = 10000, inlier_mask: bool = True, verbose: bool = False) -> None:
"""
Test the co-registration outputs performed on the example are always the same. This overlaps with the test in
test_examples.py, but helps identify from where differences arise.
@@ -159,9 +157,14 @@ def test_gradientdescending(
inlier_mask = self.inlier_mask

# Run co-registration
gds = xdem.coreg.GradientDescending(downsampling=downsampling)
gds = xdem.coreg.GradientDescending(subsample=subsample)
gds.fit_pts(
self.ref.to_points().ds, self.tba, inlier_mask=inlier_mask, verbose=verbose, samples=samples, z_name="b1"
self.ref.to_points().ds,
self.tba,
inlier_mask=inlier_mask,
verbose=verbose,
subsample=subsample,
z_name="b1",
)
assert gds._meta["offset_east_px"] == pytest.approx(-0.496000, rel=1e-1, abs=0.1)
assert gds._meta["offset_north_px"] == pytest.approx(-0.1875, rel=1e-1, abs=0.1)
@@ -170,7 +173,7 @@ def test_coreg_example_shift(self, shift_px, coreg_class, points_or_raster, verb
@pytest.mark.parametrize("shift_px", [(1, 1), (2, 2)]) # type: ignore
@pytest.mark.parametrize("coreg_class", [coreg.NuthKaab, coreg.GradientDescending, coreg.ICP]) # type: ignore
@pytest.mark.parametrize("points_or_raster", ["raster", "points"])
def test_coreg_example_shift(self, shift_px, coreg_class, points_or_raster, verbose=False, downsampling=5000):
def test_coreg_example_shift(self, shift_px, coreg_class, points_or_raster, verbose=False, subsample=5000):
"""
For comparison of coreg algorithms:
Shift a ref_dem on purpose, e.g. shift_px = (1,1), and then applying coreg to shift it back.
@@ -182,12 +185,12 @@ def test_coreg_example_shift(self, shift_px, coreg_class, points_or_raster, verb
shifted_ref = self.ref.copy()
shifted_ref.shift(shift_px[0] * res, shift_px[1] * res)

shifted_ref_points = shifted_ref.to_points(as_array=False, subset=downsampling, pixel_offset="center").ds
shifted_ref_points = shifted_ref.to_points(as_array=False, subset=subsample, pixel_offset="center").ds
shifted_ref_points["E"] = shifted_ref_points.geometry.x
shifted_ref_points["N"] = shifted_ref_points.geometry.y
shifted_ref_points.rename(columns={"b1": "z"}, inplace=True)

kwargs = {} if coreg_class.__name__ != "GradientDescending" else {"downsampling": downsampling}
kwargs = {} if coreg_class.__name__ != "GradientDescending" else {"subsample": subsample}

coreg_obj = coreg_class(**kwargs)

@@ -265,28 +268,28 @@ def test_nuth_kaab(self) -> None:
# Check that the z shift is close to the original vertical shift.
assert abs((transformed_points[0, 2] - self.points[0, 2]) + vshift) < 0.1

def test_deramping(self) -> None:
def test_tilt(self) -> None:
warnings.simplefilter("error")

# Try a 1st degree deramping.
deramp = coreg.Tilt()
tilt = coreg.Tilt()

# Fit the data
deramp.fit(**self.fit_params)
tilt.fit(**self.fit_params)

# Apply the deramping to a DEM
deramped_dem = deramp.apply(self.tba)
tilted_dem = tilt.apply(self.tba)

# Get the periglacial offset after deramping
periglacial_offset = (self.ref - deramped_dem)[self.inlier_mask]
periglacial_offset = (self.ref - tilted_dem)[self.inlier_mask]
# Get the periglacial offset before deramping
pre_offset = (self.ref - self.tba)[self.inlier_mask]

# Check that the error improved
assert np.abs(np.mean(periglacial_offset)) < np.abs(np.mean(pre_offset))

# Check that the mean periglacial offset is low
assert np.abs(np.mean(periglacial_offset)) < 1
assert np.abs(np.mean(periglacial_offset)) < 0.01

def test_icp_opencv(self) -> None:
warnings.simplefilter("error")
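
Before moving on to the base-class tests, a short sketch of the renamed keyword for point co-registration, following the calls in test_gradientdescending above. The example data paths are an assumption, and the inlier mask and verbose flags used in the test are omitted for brevity.

import xdem

ref = xdem.DEM(xdem.examples.get_path("longyearbyen_ref_dem"))
tba = xdem.DEM(xdem.examples.get_path("longyearbyen_tba_dem"))
ref_points = ref.to_points().ds  # point cloud with elevations in column "b1"

# Previously: GradientDescending(downsampling=...) and fit_pts(..., samples=...)
# Now a single subsample argument covers both
gds = xdem.coreg.GradientDescending(subsample=10000)
gds.fit_pts(ref_points, tba, subsample=10000, z_name="b1")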
178 changes: 128 additions & 50 deletions tests/test_coreg/test_base.py
@@ -2,6 +2,7 @@

from __future__ import annotations

import inspect
import re
import warnings
from typing import Any, Callable
@@ -106,66 +107,143 @@ def test_ij_xy(self, i: int = 10, j: int = 20) -> None:
assert i == pytest.approx(10)
assert j == pytest.approx(20)

def test_subsample(self) -> None:
warnings.simplefilter("error")

# Test subsampled vertical shift correction
vshift_sub = coreg.VerticalShift()

# Fit the vertical shift using 50% of the unmasked data using a fraction
vshift_sub.fit(**self.fit_params, subsample=0.5)
# Do the same but specify the pixel count instead.
# They are not perfectly equal (np.count_nonzero(self.mask) // 2 would be exact)
# But this would just repeat the subsample code, so that makes little sense to test.
vshift_sub.fit(**self.fit_params, subsample=self.tba.data.size // 2)

# Do full vertical shift corr to compare
vshift_full = coreg.VerticalShift()
vshift_full.fit(**self.fit_params)

# Check that the estimated vertical shifts are similar
assert abs(vshift_sub._meta["vshift"] - vshift_full._meta["vshift"]) < 0.1

# Test NuthKaab with subsampling
nuthkaab_full = coreg.NuthKaab()
nuthkaab_sub = coreg.NuthKaab()
@pytest.mark.parametrize("subsample", [10, 10000, 0.5, 1]) # type: ignore
def test_get_subsample_on_valid_mask(self, subsample: float | int) -> None:
"""Test the subsampling function called by all subclasses"""

# Define a valid mask
width = height = 50
np.random.seed(42)
valid_mask = np.random.randint(low=0, high=2, size=(width, height), dtype=bool)

# Define a class with a subsample and random_state in the metadata
coreg = Coreg(meta={"subsample": subsample, "random_state": 42})
subsample_mask = coreg._get_subsample_on_valid_mask(valid_mask=valid_mask)

# Check that it returns a same-shaped array that is boolean
assert np.shape(valid_mask) == np.shape(subsample_mask)
assert subsample_mask.dtype == bool
# Check that the subsampled values are all within valid values
assert all(valid_mask[subsample_mask])
# Check that the number of subsampled value is coherent, or the maximum possible
if subsample <= 1:
# If value lower than 1, fraction of valid pixels
subsample_val: float | int = int(subsample * np.count_nonzero(valid_mask))
else:
# Otherwise the number of pixels
subsample_val = subsample
assert np.count_nonzero(subsample_mask) == min(subsample_val, np.count_nonzero(valid_mask))

# Measure the start and stop time to get the duration
# start_time = time.time()
nuthkaab_full.fit(**self.fit_params)
# icp_full_duration = time.time() - start_time
# TODO: Activate NuthKaab once subsampling there is made consistent
all_coregs = [
coreg.VerticalShift,
# coreg.NuthKaab,
coreg.ICP,
coreg.Deramp,
coreg.TerrainBias,
coreg.DirectionalBias,
]

# Do the same with 50% subsampling
# start_time = time.time()
nuthkaab_sub.fit(**self.fit_params, subsample=0.5)
# icp_sub_duration = time.time() - start_time
@pytest.mark.parametrize("coreg", all_coregs) # type: ignore
def test_subsample(self, coreg: Callable) -> None: # type: ignore
warnings.simplefilter("error")

# Make sure that the subsampling increased performance
# Temporarily add a fallback assertion that if it's slower, it shouldn't be much slower (2021-05-17).
# This doesn't work with GitHub's CI, but it works locally. I'm disabling this for now (2021-05-20).
# assert icp_full_duration > icp_sub_duration or (abs(icp_full_duration - icp_sub_duration) < 1)
# Check that default value is set properly
coreg_full = coreg()
argspec = inspect.getfullargspec(coreg)
assert coreg_full._meta["subsample"] == argspec.defaults[argspec.args.index("subsample") - 1] # type: ignore

# Calculate the difference in the full vs. subsampled matrices
matrix_diff = np.abs(nuthkaab_full.to_matrix() - nuthkaab_sub.to_matrix())
# Check that the x/y/z differences do not exceed 30cm
assert np.count_nonzero(matrix_diff > 0.5) == 0
# But can be overridden during fit
coreg_full.fit(**self.fit_params, subsample=10000, random_state=42)
assert coreg_full._meta["subsample"] == 10000

# Test subsampled deramping
deramp_sub = coreg.Tilt()
# Test subsampled vertical shift correction
coreg_sub = coreg(subsample=0.1)
assert coreg_sub._meta["subsample"] == 0.1

# Fit the bias using 50% of the unmasked data using a fraction
deramp_sub.fit(**self.fit_params, subsample=0.5)
# Fit the vertical shift using 10% of the unmasked data using a fraction
coreg_sub.fit(**self.fit_params, random_state=42)
# Do the same but specify the pixel count instead.
# They are not perfectly equal (np.count_nonzero(self.mask) // 2 would be exact)
# But this would just repeat the subsample code, so that makes little sense to test.
deramp_sub.fit(**self.fit_params, subsample=self.tba.data.size // 2)
coreg_sub = coreg(subsample=self.tba.data.size // 10)
assert coreg_sub._meta["subsample"] == self.tba.data.size // 10
coreg_sub.fit(**self.fit_params, random_state=42)

# Add a few performance checks
coreg_name = coreg.__name__
if coreg_name == "VerticalShift":
# Check that the estimated vertical shifts are similar
assert abs(coreg_sub._meta["vshift"] - coreg_full._meta["vshift"]) < 0.1

elif coreg_name == "NuthKaab":
# Calculate the difference in the full vs. subsampled matrices
matrix_diff = np.abs(coreg_full.to_matrix() - coreg_sub.to_matrix())
# Check that the x/y/z differences do not exceed 30cm
assert np.count_nonzero(matrix_diff > 0.5) == 0

elif coreg_name == "Tilt":
# Check that the estimated biases are similar
assert coreg_sub._meta["coefficients"] == pytest.approx(coreg_full._meta["coefficients"], rel=1e-1)

def test_subsample__pipeline(self) -> None:
"""Test that the subsample argument works as intended for pipelines"""

# Check definition during instantiation
pipe = coreg.VerticalShift(subsample=200) + coreg.Deramp(subsample=5000)

# Check the arguments are properly defined
assert pipe.pipeline[0]._meta["subsample"] == 200
assert pipe.pipeline[1]._meta["subsample"] == 5000

# Check definition during fit
pipe = coreg.VerticalShift() + coreg.Deramp()
pipe.fit(**self.fit_params, subsample=1000)
assert pipe.pipeline[0]._meta["subsample"] == 1000
assert pipe.pipeline[1]._meta["subsample"] == 1000

def test_subsample__errors(self) -> None:
"""Check proper errors are raised when using the subsample argument"""

# A warning should be raised when overriding with fit if non-default parameter was passed during instantiation
vshift = coreg.VerticalShift(subsample=100)

with pytest.warns(
UserWarning,
match=re.escape(
"Subsample argument passed to fit() will override non-default "
"subsample value defined at instantiation. To silence this "
"warning: only define 'subsample' in either fit(subsample=...) "
"or instantiation e.g. VerticalShift(subsample=...)."
),
):
vshift.fit(**self.fit_params, subsample=1000)

# Do full bias corr to compare
deramp_full = coreg.Tilt()
deramp_full.fit(**self.fit_params)
# Same for a pipeline
pipe = coreg.VerticalShift(subsample=200) + coreg.Deramp()
with pytest.warns(
UserWarning,
match=re.escape(
"Subsample argument passed to fit() will override non-default "
"subsample values defined for individual steps of the pipeline. "
"To silence this warning: only define 'subsample' in either "
"fit(subsample=...) or instantiation e.g., VerticalShift(subsample=...)."
),
):
pipe.fit(**self.fit_params, subsample=1000)

# Check that the estimated biases are similar
assert deramp_sub._meta["coefficients"] == pytest.approx(deramp_full._meta["coefficients"], rel=1e-1)
# Same for a blockwise co-registration
block = coreg.BlockwiseCoreg(coreg.VerticalShift(subsample=200), subdivision=4)
with pytest.warns(
UserWarning,
match=re.escape(
"Subsample argument passed to fit() will override non-default subsample "
"values defined in the step within the blockwise method. To silence this "
"warning: only define 'subsample' in either fit(subsample=...) or "
"instantiation e.g., VerticalShift(subsample=...)."
),
):
block.fit(**self.fit_params, subsample=1000)

def test_coreg_raster_and_ndarray_args(self) -> None:

@@ -423,7 +501,7 @@ class TestCoregPipeline:
inlier_mask=inlier_mask,
transform=ref.transform,
crs=ref.crs,
verbose=False,
verbose=True,
)
# Create some 3D coordinates with Z coordinates being 0 to try the apply_pts functions.
points = np.array([[1, 2, 3, 4], [1, 2, 3, 4], [0, 0, 0, 0]], dtype="float64").T
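
To make the fraction-versus-count convention checked by test_get_subsample_on_valid_mask explicit, a small standalone sketch; the helper name expected_subsample_size is mine, not part of xdem.

from __future__ import annotations

import numpy as np

def expected_subsample_size(subsample: float | int, valid_mask: np.ndarray) -> int:
    # subsample <= 1 is read as a fraction of the valid pixels,
    # larger values as a pixel count; both are capped at the valid-pixel count
    n_valid = np.count_nonzero(valid_mask)
    if subsample <= 1:
        return min(int(subsample * n_valid), n_valid)
    return min(int(subsample), n_valid)

valid_mask = np.random.default_rng(42).integers(0, 2, size=(50, 50)).astype(bool)
print(expected_subsample_size(0.5, valid_mask))    # half of the valid pixels
print(expected_subsample_size(10000, valid_mask))  # capped at the number of valid pixels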
4 changes: 2 additions & 2 deletions tests/test_coreg/test_biascorr.py
@@ -39,7 +39,7 @@ class TestBiasCorr:
reference_dem=ref,
dem_to_be_aligned=tba,
inlier_mask=inlier_mask,
verbose=False,
verbose=True,
)
# Create some 3D coordinates with Z coordinates being 0 to try the apply_pts functions.
points = np.array([[1, 2, 3, 4], [1, 2, 3, 4], [0, 0, 0, 0]], dtype="float64").T
@@ -144,7 +144,7 @@ def test_biascorr__errors(self) -> None:
scipy.optimize.curve_fit,
],
) # type: ignore
def test_biascorr__fit_1d(self, fit_func, fit_optimizer) -> None:
def test_biascorr__fit_1d(self, fit_func, fit_optimizer, capsys) -> None:
"""Test the _fit_func and apply_func methods of BiasCorr for the fit case (called by all its subclasses)."""

# Create a bias correction object
10 changes: 5 additions & 5 deletions tests/test_examples.py
@@ -34,11 +34,11 @@ class TestExamples:
ddem,
np.array(
[
-4.669189453125000000e-02,
-7.413940429687500000e-01,
1.499481201171875000e-01,
1.095550537109375000e00,
-5.904846191406250000e00,
-0.22833252,
-0.82458496,
0.18843079,
1.0004578,
-5.755249,
],
dtype=np.float32,
),
