From 864fabd13ad0cfd9979ccbcaea1143dac8991a58 Mon Sep 17 00:00:00 2001 From: rhugonne Date: Fri, 13 Aug 2021 15:21:15 +0200 Subject: [PATCH 01/21] fix deramp residual and optimizer --- xdem/coreg.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/xdem/coreg.py b/xdem/coreg.py index 983ae6ef..02614465 100644 --- a/xdem/coreg.py +++ b/xdem/coreg.py @@ -1033,7 +1033,8 @@ def poly2d(x_coordinates: np.ndarray, y_coordinates: np.ndarray, return estimated_values # type: ignore def residuals(coefs: np.ndarray, x_coords: np.ndarray, y_coords: np.ndarray, targets: np.ndarray): - return np.median(np.abs(targets - poly2d(x_coords, y_coords, coefs))) + res = targets - poly2d(x_coords, y_coords, coefs) + return res[np.isfinite(res)] if verbose: print("Estimating deramp function...") @@ -1055,15 +1056,14 @@ def residuals(coefs: np.ndarray, x_coords: np.ndarray, y_coords: np.ndarray, tar ddem = ddem[indices] # Optimize polynomial parameters - coefs = scipy.optimize.fmin( + coefs = scipy.optimize.leastsq( func=residuals, x0=np.zeros(shape=((self.degree + 1) * (self.degree + 2) // 2)), - args=(x_coords, y_coords, ddem), - disp=verbose + args=(x_coords, y_coords, ddem) ) - self._meta["coefficients"] = coefs - self._meta["func"] = lambda x, y: poly2d(x, y, coefs) + self._meta["coefficients"] = coefs[0] + self._meta["func"] = lambda x, y: poly2d(x, y, coefs[0]) def _apply_func(self, dem: np.ndarray, transform: rio.transform.Affine) -> np.ndarray: """Apply the deramp function to a DEM.""" From 9863231785be4bf1cd50078c7a03f920ecb861ea Mon Sep 17 00:00:00 2001 From: rhugonne Date: Thu, 18 Aug 2022 22:59:24 +0200 Subject: [PATCH 02/21] Add wrapper neff function --- xdem/spatialstats.py | 83 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 81 insertions(+), 2 deletions(-) diff --git a/xdem/spatialstats.py b/xdem/spatialstats.py index ecb83e93..33a6dfb7 100644 --- a/xdem/spatialstats.py +++ b/xdem/spatialstats.py @@ -17,6 +17,7 @@ from numba import njit import numpy as np import pandas as pd +import geopandas as gpd from scipy import integrate from scipy.optimize import curve_fit from scipy.spatial.distance import pdist, squareform @@ -25,6 +26,7 @@ from scipy.stats import binned_statistic, binned_statistic_2d, binned_statistic_dd from geoutils.spatial_tools import subsample_raster, get_array_and_mask from geoutils.georaster import RasterType, Raster +from geoutils.geovector import VectorType, Vector with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=DeprecationWarning) @@ -1259,7 +1261,7 @@ def hcov_sum(h): return neff -def neff_exact(coords: np.ndarray, errors: np.ndarray, params_variogram_model: pd.DataFrame, vectorized=True) -> float: +def neff_exact(coords: np.ndarray, errors: np.ndarray, params_variogram_model: pd.DataFrame, vectorized: bool = True) -> float: """ Exact number of effective samples derived from a double sum of covariance with euclidean coordinates based on the provided variogram parameters. This method works for any shape of area. @@ -1270,6 +1272,7 @@ def neff_exact(coords: np.ndarray, errors: np.ndarray, params_variogram_model: p (e.g., ["spherical", "matern"]), "range" for the correlation ranges (e.g., [2, 100]), "psill" for the partial sills (e.g., [0.8, 0.2]) and "smooth" for the smoothness parameter if it exists for this model (e.g., [None, 0.2]). + :param vectorized: Perform the vectorized calculation (used for testing). 
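+
+    A minimal illustrative sketch (the grid, errors and variogram parameters below are made up for this example):
+
+    >>> import numpy as np, pandas as pd
+    >>> xx, yy = np.meshgrid(np.arange(10), np.arange(10))
+    >>> coords = np.column_stack((xx.ravel(), yy.ravel()))
+    >>> errors = np.ones(100)
+    >>> params = pd.DataFrame(data={'model': ['spherical'], 'range': [5], 'psill': [1.0]})
+    >>> neff = neff_exact(coords=coords, errors=errors, params_variogram_model=params)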
:return: Number of effective samples """ @@ -1318,7 +1321,7 @@ def neff_exact(coords: np.ndarray, errors: np.ndarray, params_variogram_model: p return neff def neff_hugonnet_approx(coords: np.ndarray, errors: np.ndarray, params_variogram_model: pd.DataFrame, subsample: int = 1000, - vectorized=True, random_state: None | np.random.RandomState | np.random.Generator | int = None) -> float: + vectorized: bool = True, random_state: None | np.random.RandomState | np.random.Generator | int = None) -> float: """ Approximated number of effective samples derived from a double sum of covariance subsetted on one of the two sums, based on euclidean coordinates with the provided variogram parameters. This method works for any shape of area. @@ -1394,6 +1397,82 @@ def neff_hugonnet_approx(coords: np.ndarray, errors: np.ndarray, params_variogra return neff +def number_effective_samples(area: float | int | VectorType | gpd.GeoDataFrame, params_variogram_model: pd.DataFrame, + rasterize_resolution: RasterType | float = None, **kwargs) -> float: + """ + Compute the number of effective samples, i.e. the number of uncorrelated samples, in an area accounting for spatial + correlations described by a sum of variogram models. + + This function wraps two methods: + + - A discretized integration method that provides the exact estimate for any shape of area using a double sum of + covariance. By default, this method is approximated using Equation 18 of Hugonnet et al. (2022), https://doi.org/10.1109/jstars.2022.3188922 + to decrease computing times while preserving a good approximation. + + - A continuous integration method that provides a conservative (i.e., slightly overestimated) value for a disk + area shape, based on a generalization of the approach of Rolstad et al. (2009), http://dx.doi.org/10.3189/002214309789470950. + + By default, if a numeric value is passed for an area, the continuous method is used considering a disk shape. If a + vector is passed, the discretized method is computed on that shape. + + :param area: Area of interest either as a numeric value of surface in the same unit as the variogram ranges (will + assume a circular shape), or as a vector (shapefile) of the area + :param params_variogram_model: Dataframe of variogram models to sum with three to four columns, "model" for the model types + (e.g., ["spherical", "matern"]), "range" for the correlation ranges (e.g., [2, 100]), "psill" for the partial + sills (e.g., [0.8, 0.2]) and "smooth" for the smoothness parameter if it exists for this model (e.g., + [None, 0.2]). + :param rasterize_resolution: Resolution to rasterize the area if passed as a vector. Can be a float value or a Raster. + :param kwargs: Keyword argument to pass to the `neff_hugonnet_approx` function. 
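+
+    A minimal usage sketch for the numeric-area case (the area value and variogram parameters are illustrative only):
+
+    >>> import pandas as pd
+    >>> params = pd.DataFrame(data={'model': ['spherical'], 'range': [200.], 'psill': [1.]})
+    >>> neff = number_effective_samples(area=1e6, params_variogram_model=params)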
+ :return: Number of effective samples + """ + + # Check input for variogram parameters + _check_validity_params_variogram(params_variogram_model=params_variogram_model) + + # If area is numeric, run the continuous circular approximation + if isinstance(area, (float, int, np.floating, np.integer)): + neff = neff_circular_approx_numerical(area=area, params_variogram_model=params_variogram_model) + + # Otherwise, run the discrete sum of covariance + elif isinstance(area, (Vector, gpd.GeoDataFrame)): + + # If the input is a geopandas dataframe, put into a Vector object + if isinstance(area, gpd.GeoDataFrame): + V = Vector(area) + else: + V = area + + # Rasterize with numeric resolution or Raster metadata + if isinstance(rasterize_resolution, (float, int, np.floating, np.integer)): + + # We only need relative mask and coordinates, not absolute + mask = V.create_mask(xres=rasterize_resolution) + x = rasterize_resolution * np.arange(0, mask.shape[0]) + y = rasterize_resolution * np.arange(0, mask.shape[1]) + coords = np.meshgrid(x, y) + coords_on_mask = coords[:, mask] + + elif isinstance(rasterize_resolution, Raster): + + # With a Raster we can get the coordinates directly + mask = V.create_mask(rst=rasterize_resolution) + coords = rasterize_resolution.coords() + coords_on_mask = coords[:, mask] + + else: + ValueError('The rasterize resolution must be a float, integer or Raster subclass.') + + # Here we don't care about heteroscedasticity, so all errors are standardized to one + errors_on_mask = np.ones(len(coords_on_mask)) + + neff = neff_hugonnet_approx(coords=coords_on_mask, errors=errors_on_mask, + params_variogram_model=params_variogram_model, **kwargs) + + else: + ValueError('Area must be a float, integer, Vector subclass or geopandas dataframe.') + + return neff + def _std_err_finite(std: float, neff_tot: float, neff: float) -> float: """ From 044c312cca6678a733a253c9d445902632d1fc4a Mon Sep 17 00:00:00 2001 From: rhugonne Date: Fri, 19 Aug 2022 16:58:32 +0200 Subject: [PATCH 03/21] Finalize neff wrapper function and add tests --- tests/test_spatialstats.py | 72 ++++++++++++++++++++++++++++++++++---- xdem/spatialstats.py | 29 ++++++++++----- 2 files changed, 85 insertions(+), 16 deletions(-) diff --git a/tests/test_spatialstats.py b/tests/test_spatialstats.py index 28cfd6d0..a2e6bae9 100644 --- a/tests/test_spatialstats.py +++ b/tests/test_spatialstats.py @@ -8,6 +8,7 @@ import skgstat import geoutils as gu +from geoutils import Raster, Vector import numpy as np import pandas as pd import pytest @@ -21,13 +22,13 @@ PLOT = False -def load_ref_and_diff() -> tuple[gu.georaster.Raster, gu.georaster.Raster, np.ndarray]: +def load_ref_and_diff() -> tuple[Raster, Raster, np.ndarray]: """Load example files to try coregistration methods with.""" - reference_raster = gu.georaster.Raster(examples.get_path("longyearbyen_ref_dem")) - glacier_mask = gu.geovector.Vector(examples.get_path("longyearbyen_glacier_outlines")) + reference_raster = Raster(examples.get_path("longyearbyen_ref_dem")) + glacier_mask = Vector(examples.get_path("longyearbyen_glacier_outlines")) - ddem = gu.georaster.Raster(examples.get_path('longyearbyen_ddem')) + ddem = Raster(examples.get_path('longyearbyen_ddem')) mask = glacier_mask.create_mask(ddem) return reference_raster, ddem, mask @@ -351,8 +352,7 @@ def test_neff_exact_and_approx_hugonnet(self): # Generate a gridded dataset with varying errors associated to each pixel shape = (15, 15) - np.random.seed(42) - errors = np.abs(np.random.normal(0, 1, size=shape)) + errors = 
np.ones(shape) # Coordinates x = np.arange(0, shape[0]) @@ -372,7 +372,8 @@ def test_neff_exact_and_approx_hugonnet(self): t1 = time.time() # Check that the non-vectorized version gives the same result - neff_exact_nv = xdem.spatialstats.neff_exact(coords=coords, errors=errors, params_variogram_model=params_variogram_model, vectorized=False) + neff_exact_nv = xdem.spatialstats.neff_exact(coords=coords, errors=errors, + params_variogram_model=params_variogram_model, vectorized=False) t2 = time.time() assert neff_exact == pytest.approx(neff_exact_nv, rel=0.001) @@ -397,6 +398,63 @@ def test_neff_exact_and_approx_hugonnet(self): # Check that the approximation is about the same as the original estimate within 10% assert neff_approx == pytest.approx(neff_exact, rel=0.1) + def test_number_effective_samples(self): + """Test that the wrapper function for neff functions behaves correctly and that output values are robust""" + + # The function should return the same result as neff_circular_approx_numerical when using a numerical area + area = 10000 + params_variogram_model = pd.DataFrame(data={'model':['spherical', 'gaussian'], 'range':[300, 3000], 'psill':[0.5, 0.5]}) + + neff1 = xdem.spatialstats.neff_circular_approx_numerical(area=area, params_variogram_model=params_variogram_model) + neff2 = xdem.spatialstats.number_effective_samples(area=area, params_variogram_model=params_variogram_model) + + assert neff1 == pytest.approx(neff2, rel=0.0001) + + # The function should return the same results as neff_hugonnet_approx when using a shape area + # First, get the vector area and compute with the wrapper function + res = 100. + outlines = Vector(xdem.examples.get_path("longyearbyen_glacier_outlines")) + outlines_medals = Vector(outlines.ds[outlines.ds['NAME']=='Medalsbreen']) + neff1 = xdem.spatialstats.number_effective_samples(area=outlines_medals, params_variogram_model=params_variogram_model, + rasterize_resolution=res, random_state=42) + # Second, get coordinates manually and compute with the neff_approx_hugonnet function + mask = outlines_medals.create_mask(xres=res) + x = res * np.arange(0, mask.shape[0]) + y = res * np.arange(0, mask.shape[1]) + coords = np.array(np.meshgrid(y, x)) + coords_on_mask = coords[:, mask].T + errors_on_mask = np.ones(len(coords_on_mask)) + neff2 = xdem.spatialstats.neff_hugonnet_approx(coords=coords_on_mask, errors=errors_on_mask, + params_variogram_model=params_variogram_model, random_state=42) + # We can test the match between values accurately thanks to the random_state + assert neff1 == pytest.approx(neff2, rel=0.00001) + + # Check that using a Raster as input for the resolution works + raster = Raster(examples.get_path("longyearbyen_ref_dem")) + neff3 = xdem.spatialstats.number_effective_samples(area=outlines_medals, + params_variogram_model=params_variogram_model, + rasterize_resolution=raster, random_state=42) + # The value should be nearly the same within 1% (the discretization grid is different so affects a tiny bit the result) + assert neff3 == pytest.approx(neff2, rel=0.01) + + # Check that the number of effective samples matches that of the circular approximation within 20% + area_medals = np.sum(outlines_medals.ds.area.values) + neff4 = xdem.spatialstats.number_effective_samples(area=area_medals, params_variogram_model=params_variogram_model) + assert neff4 == pytest.approx(neff2, rel=0.2) + # The circular approximation is always conservative, so should yield a smaller value + assert neff4 < neff2 + + # Check that errors are correctly raised + with 
pytest.warns(UserWarning, match='Resolution for vector rasterization is not defined and thus set at 20% ' + 'of the shortest correlation range, which might result in large memory usage.'): + xdem.spatialstats.number_effective_samples(area=outlines_medals, params_variogram_model=params_variogram_model) + with pytest.raises(ValueError, match='Area must be a float, integer, Vector subclass or geopandas dataframe.'): + xdem.spatialstats.number_effective_samples(area='not supported', params_variogram_model=params_variogram_model) + with pytest.raises(ValueError, match='The rasterize resolution must be a float, integer or Raster subclass.'): + xdem.spatialstats.number_effective_samples(area=10, params_variogram_model=params_variogram_model, + rasterize_resolution=(10, 10)) + + class TestSubSampling: def test_circular_masking(self): diff --git a/xdem/spatialstats.py b/xdem/spatialstats.py index 33a6dfb7..bf79ec99 100644 --- a/xdem/spatialstats.py +++ b/xdem/spatialstats.py @@ -1266,7 +1266,7 @@ def neff_exact(coords: np.ndarray, errors: np.ndarray, params_variogram_model: p Exact number of effective samples derived from a double sum of covariance with euclidean coordinates based on the provided variogram parameters. This method works for any shape of area. - :param coords: Center coordinates with size (N,) for each spatial support (typically, pixel) + :param coords: Center coordinates with size (N,2) for each spatial support (typically, pixel) :param errors: Errors at the coordinates with size (N,) for each spatial support (typically, pixel) :param params_variogram_model: Dataframe of variogram models to sum with three to four columns, "model" for the model types (e.g., ["spherical", "matern"]), "range" for the correlation ranges (e.g., [2, 100]), "psill" for the partial @@ -1327,7 +1327,7 @@ def neff_hugonnet_approx(coords: np.ndarray, errors: np.ndarray, params_variogra based on euclidean coordinates with the provided variogram parameters. This method works for any shape of area. See Hugonnet et al. (2022), https://doi.org/10.1109/jstars.2022.3188922, in particular Supplementary Fig. S16. - :param coords: Center coordinates with size (N,) for each spatial support (typically, pixel) + :param coords: Center coordinates with size (N,2) for each spatial support (typically, pixel) :param errors: Errors at the coordinates with size (N,) for each spatial support (typically, pixel) :param params_variogram_model: Dataframe of variogram models to sum with three to four columns, "model" for the model types (e.g., ["spherical", "matern"]), "range" for the correlation ranges (e.g., [2, 100]), "psill" for the partial @@ -1358,8 +1358,11 @@ def neff_hugonnet_approx(coords: np.ndarray, errors: np.ndarray, params_variogra n = len(coords) pds = pdist(coords) + # At maximum, the number of subsamples has to be equal to number of points + subsample = min(subsample, n) + # Get random subset of points for one of the sums - rand_points = rnd.choice(n, size=min(subsample, n), replace=False) + rand_points = rnd.choice(n, size=subsample, replace=False) # Now we compute the double covariance sum # Either using for-loop-version @@ -1413,7 +1416,9 @@ def number_effective_samples(area: float | int | VectorType | gpd.GeoDataFrame, area shape, based on a generalization of the approach of Rolstad et al. (2009), http://dx.doi.org/10.3189/002214309789470950. By default, if a numeric value is passed for an area, the continuous method is used considering a disk shape. 
If a - vector is passed, the discretized method is computed on that shape. + vector is passed, the discretized method is computed on that shape. If the discretized method is used, a resolution + for rasterization is generally expected, otherwise is arbitrarily chosen as a fifth of the shortest correlation + range to ensure a sufficiently fine grid for propagation of the shortest range. :param area: Area of interest either as a numeric value of surface in the same unit as the variogram ranges (will assume a circular shape), or as a vector (shapefile) of the area @@ -1423,6 +1428,7 @@ def number_effective_samples(area: float | int | VectorType | gpd.GeoDataFrame, [None, 0.2]). :param rasterize_resolution: Resolution to rasterize the area if passed as a vector. Can be a float value or a Raster. :param kwargs: Keyword argument to pass to the `neff_hugonnet_approx` function. + :return: Number of effective samples """ @@ -1442,6 +1448,11 @@ def number_effective_samples(area: float | int | VectorType | gpd.GeoDataFrame, else: V = area + if rasterize_resolution is None: + rasterize_resolution = np.min(params_variogram_model['range'].values)/5. + warnings.warn('Resolution for vector rasterization is not defined and thus set at 20% of the shortest ' + 'correlation range, which might result in large memory usage.') + # Rasterize with numeric resolution or Raster metadata if isinstance(rasterize_resolution, (float, int, np.floating, np.integer)): @@ -1449,15 +1460,15 @@ def number_effective_samples(area: float | int | VectorType | gpd.GeoDataFrame, mask = V.create_mask(xres=rasterize_resolution) x = rasterize_resolution * np.arange(0, mask.shape[0]) y = rasterize_resolution * np.arange(0, mask.shape[1]) - coords = np.meshgrid(x, y) - coords_on_mask = coords[:, mask] + coords = np.array(np.meshgrid(y, x)) + coords_on_mask = coords[:, mask].T elif isinstance(rasterize_resolution, Raster): # With a Raster we can get the coordinates directly - mask = V.create_mask(rst=rasterize_resolution) - coords = rasterize_resolution.coords() - coords_on_mask = coords[:, mask] + mask = V.create_mask(rst=rasterize_resolution).squeeze() + coords = np.array(rasterize_resolution.coords()) + coords_on_mask = coords[:, mask].T else: ValueError('The rasterize resolution must be a float, integer or Raster subclass.') From eb5b4ee03517062fa4ec369ca3a0a1d1f9517979 Mon Sep 17 00:00:00 2001 From: rhugonne Date: Fri, 19 Aug 2022 23:22:29 +0200 Subject: [PATCH 04/21] Fix tests --- tests/test_spatialstats.py | 20 ++++++++++---------- xdem/spatialstats.py | 4 ++-- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/test_spatialstats.py b/tests/test_spatialstats.py index a2e6bae9..4030f731 100644 --- a/tests/test_spatialstats.py +++ b/tests/test_spatialstats.py @@ -414,11 +414,11 @@ def test_number_effective_samples(self): # First, get the vector area and compute with the wrapper function res = 100. 
outlines = Vector(xdem.examples.get_path("longyearbyen_glacier_outlines")) - outlines_medals = Vector(outlines.ds[outlines.ds['NAME']=='Medalsbreen']) - neff1 = xdem.spatialstats.number_effective_samples(area=outlines_medals, params_variogram_model=params_variogram_model, + outlines_brom = Vector(outlines.ds[outlines.ds['NAME']=='Brombreen']) + neff1 = xdem.spatialstats.number_effective_samples(area=outlines_brom, params_variogram_model=params_variogram_model, rasterize_resolution=res, random_state=42) # Second, get coordinates manually and compute with the neff_approx_hugonnet function - mask = outlines_medals.create_mask(xres=res) + mask = outlines_brom.create_mask(xres=res) x = res * np.arange(0, mask.shape[0]) y = res * np.arange(0, mask.shape[1]) coords = np.array(np.meshgrid(y, x)) @@ -431,15 +431,15 @@ def test_number_effective_samples(self): # Check that using a Raster as input for the resolution works raster = Raster(examples.get_path("longyearbyen_ref_dem")) - neff3 = xdem.spatialstats.number_effective_samples(area=outlines_medals, + neff3 = xdem.spatialstats.number_effective_samples(area=outlines_brom, params_variogram_model=params_variogram_model, rasterize_resolution=raster, random_state=42) - # The value should be nearly the same within 1% (the discretization grid is different so affects a tiny bit the result) - assert neff3 == pytest.approx(neff2, rel=0.01) + # The value should be nearly the same within 2% (the discretization grid is different so affects a tiny bit the result) + assert neff3 == pytest.approx(neff2, rel=0.02) # Check that the number of effective samples matches that of the circular approximation within 20% - area_medals = np.sum(outlines_medals.ds.area.values) - neff4 = xdem.spatialstats.number_effective_samples(area=area_medals, params_variogram_model=params_variogram_model) + area_brom = np.sum(outlines_brom.ds.area.values) + neff4 = xdem.spatialstats.number_effective_samples(area=area_brom, params_variogram_model=params_variogram_model) assert neff4 == pytest.approx(neff2, rel=0.2) # The circular approximation is always conservative, so should yield a smaller value assert neff4 < neff2 @@ -447,11 +447,11 @@ def test_number_effective_samples(self): # Check that errors are correctly raised with pytest.warns(UserWarning, match='Resolution for vector rasterization is not defined and thus set at 20% ' 'of the shortest correlation range, which might result in large memory usage.'): - xdem.spatialstats.number_effective_samples(area=outlines_medals, params_variogram_model=params_variogram_model) + xdem.spatialstats.number_effective_samples(area=outlines_brom, params_variogram_model=params_variogram_model) with pytest.raises(ValueError, match='Area must be a float, integer, Vector subclass or geopandas dataframe.'): xdem.spatialstats.number_effective_samples(area='not supported', params_variogram_model=params_variogram_model) with pytest.raises(ValueError, match='The rasterize resolution must be a float, integer or Raster subclass.'): - xdem.spatialstats.number_effective_samples(area=10, params_variogram_model=params_variogram_model, + xdem.spatialstats.number_effective_samples(area=outlines_brom, params_variogram_model=params_variogram_model, rasterize_resolution=(10, 10)) diff --git a/xdem/spatialstats.py b/xdem/spatialstats.py index bf79ec99..b6df8c65 100644 --- a/xdem/spatialstats.py +++ b/xdem/spatialstats.py @@ -1471,7 +1471,7 @@ def number_effective_samples(area: float | int | VectorType | gpd.GeoDataFrame, coords_on_mask = coords[:, mask].T else: - 
ValueError('The rasterize resolution must be a float, integer or Raster subclass.') + raise ValueError('The rasterize resolution must be a float, integer or Raster subclass.') # Here we don't care about heteroscedasticity, so all errors are standardized to one errors_on_mask = np.ones(len(coords_on_mask)) @@ -1480,7 +1480,7 @@ def number_effective_samples(area: float | int | VectorType | gpd.GeoDataFrame, params_variogram_model=params_variogram_model, **kwargs) else: - ValueError('Area must be a float, integer, Vector subclass or geopandas dataframe.') + raise ValueError('Area must be a float, integer, Vector subclass or geopandas dataframe.') return neff From d4bec4c0d5c0a6c7d5272307f68d515c7e97f6c1 Mon Sep 17 00:00:00 2001 From: rhugonne Date: Sat, 20 Aug 2022 22:04:13 +0200 Subject: [PATCH 05/21] Add pipeline error functions --- xdem/spatialstats.py | 350 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 344 insertions(+), 6 deletions(-) diff --git a/xdem/spatialstats.py b/xdem/spatialstats.py index b6df8c65..7138def1 100644 --- a/xdem/spatialstats.py +++ b/xdem/spatialstats.py @@ -8,7 +8,7 @@ import warnings from functools import partial -from typing import Callable, Union, Iterable, Optional, Sequence, Any +from typing import Callable, Union, Iterable, Optional, Sequence, Any, overload import itertools import matplotlib @@ -164,7 +164,7 @@ def nd_binning(values: np.ndarray, list_var: Iterable[np.ndarray], list_var_name return df_concat -def interp_nd_binning(df: pd.DataFrame, list_var_names: Union[str,list[str]], statistic : Union[str, Callable[[np.ndarray],float]] = nmad, +def interp_nd_binning(df: pd.DataFrame, list_var_names: Union[str, Iterable[str]], statistic : Union[str, Callable[[np.ndarray],float]] = nmad, min_count: Optional[int] = 100) -> Callable[[tuple[np.ndarray, ...]], np.ndarray]: """ Estimate an interpolant function for an N-dimensional binning. Preferably based on the output of nd_binning. @@ -246,7 +246,7 @@ def interp_nd_binning(df: pd.DataFrame, list_var_names: Union[str,list[str]], st # Remove statistic values calculated with a sample count under the minimum count if min_count is not None: - df_sub.loc[df_sub['count'] < min_count,statistic_name] = np.nan + df_sub.loc[df_sub['count'] < min_count, statistic_name] = np.nan values = df_sub[statistic_name].values ind_valid = np.isfinite(values) @@ -282,10 +282,204 @@ def interp_nd_binning(df: pd.DataFrame, list_var_names: Union[str,list[str]], st return interp_fun +def two_step_standardization(dvalues: np.ndarray, list_var: Iterable[np.ndarray], + unscaled_error_fun: Callable[[tuple[np.ndarray, ...]], np.ndarray], + spread_statistic: Callable = nmad, + fac_spread_outliers: float | None = 7 + ) -> tuple[np.ndarray, Callable[[tuple[np.ndarray, ...]], np.ndarray]]: + """ + Standardize the proxy differenced values using the modelled heteroscedasticity, re-scaled to the spread statistic, + and generate the final standardization function. + + :param dvalues: Proxy values array of size (N,) + :param list_var: List of size (L) of explanatory variables array of size (N,) + :param unscaled_error_fun: Function of the spread with explanatory variables not yet re-scaled + :param spread_statistic: Statistic to be computed for the spread; defaults to nmad + :param fac_spread_outliers: Exclude outliers outside this spread after standardizing; pass None to ignore. 
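+
+    An illustrative sketch with a constant, hypothetical unscaled error function (any function of the explanatory
+    variables works, e.g. the interpolant returned by `interp_nd_binning`):
+
+    >>> import numpy as np
+    >>> rng = np.random.default_rng(42)
+    >>> dh = rng.normal(0, 2, size=1000)
+    >>> slope = rng.uniform(0, 40, size=1000)
+    >>> zscores, err_fun = two_step_standardization(dvalues=dh, list_var=[slope],
+    ...                                             unscaled_error_fun=lambda t: 2 * np.ones_like(t[0]))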
+
+    :return: Standardized values array of size (N,), Function to destandardize
+    """
+
+    # Standardize a first time with the function
+    zscores = dvalues / unscaled_error_fun(tuple(list_var))
+
+    # Set large outliers that might have been created by the standardization to NaN, central tendency should already be
+    # around zero so only need to take the absolute value
+    if fac_spread_outliers is not None:
+        zscores[np.abs(zscores) > fac_spread_outliers * spread_statistic(zscores)] = np.nan
+
+    # Re-compute the spread statistic to re-standardize, as dividing by the function will not necessarily bring the
+    # z-score exactly equal to one due to approximations during N-D binning and interpolation, and due to the outlier
+    # filtering
+    zscore_nmad = spread_statistic(zscores)
+
+    # Re-standardize
+    zscores /= zscore_nmad
+
+    # Define the exact function for de-standardization to pass as output
+    def error_fun(*args):
+        return zscore_nmad * unscaled_error_fun(*args)
+
+    return zscores, error_fun
+
+
+def estimate_model_heteroscedasticity(dvalues: np.ndarray, list_var: Iterable[np.ndarray], list_var_names: Iterable[str],
+                                      spread_statistic: Callable = nmad,
+                                      list_var_bins: Optional[Union[int,Iterable[Iterable]]] = None,
+                                      min_count: Optional[int] = 100,
+                                      fac_spread_outliers: float | None = 7
+                                      ) -> tuple[pd.DataFrame, Callable[[tuple[np.ndarray, ...]], np.ndarray]]:
+    """
+    Estimate and model the heteroscedasticity (i.e., variability in error) according to a list of explanatory variables
+    from a proxy of differenced values (e.g., elevation differences), ideally compared to a source of higher
+    precision.
+
+    This function performs N-D data binning with the list of explanatory variables for a spread statistic, then
+    performs N-D interpolation on this statistic, and scales the output with a two-step standardization to return an
+    error function of the explanatory variables.
+
+    The functions used are `nd_binning`, `interp_nd_binning` and `two_step_standardization`.
+
+    :param dvalues: Proxy values array of size (N,)
+    :param list_var: List of size (L) of explanatory variables array of size (N,)
+    :param list_var_names: List of size (L) of names of the explanatory variables
+    :param spread_statistic: Statistic to be computed for the spread; defaults to nmad
+    :param list_var_bins: Count of size (1), or list of size (L) of counts or custom bin edges for the explanatory variables; defaults to 10 bins
+    :param min_count: Minimum number of samples to be used as a valid statistic (replaced by nodata)
+    :param fac_spread_outliers: Exclude outliers outside this spread after standardizing; pass None to ignore.
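+
+    A minimal usage sketch on synthetic data (one explanatory variable; all values are illustrative only):
+
+    >>> import numpy as np
+    >>> rng = np.random.default_rng(42)
+    >>> slope = rng.uniform(0, 40, size=2000)
+    >>> dh = rng.normal(0, 0.1 + 0.05 * slope)  # Spread of the differences grows with slope
+    >>> df, err_fun = estimate_model_heteroscedasticity(dvalues=dh, list_var=[slope], list_var_names=['slope'])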
+ + :return: Dataframe of binned spread statistic with explanatory variables, Error function with explanatory variables + """ + + # Perform N-D binning with the differenced values computing the spread statistic + df = nd_binning(values=dvalues, list_var=list_var, list_var_names=list_var_names, statistics=[spread_statistic], + list_var_bins=list_var_bins) + + # Perform N-D linear interpolation for the spread statistic + fun = interp_nd_binning(df, list_var_names=list_var_names, statistic=spread_statistic.__name__, min_count=min_count) + + # Get the final function based on a two-step standardization + final_fun = two_step_standardization(dvalues=dvalues, list_var=list_var, unscaled_error_fun=fun, + spread_statistic=spread_statistic, fac_spread_outliers=fac_spread_outliers)[1] + + return df, final_fun + + +@overload +def infer_heteroscedasticy_from_stable(dvalues: np.ndarray , list_var: list[np.ndarray | RasterType], + stable_mask: np.ndarray | VectorType | gpd.GeoDataFrame, + list_var_names: Iterable[str], + spread_statistic: Callable, + list_var_bins: Optional[Union[int,Iterable[Iterable]]], + min_count: Optional[int], + factor_spread_exclude_outliers: float | None, + ) -> tuple[np.ndarray, + pd.DataFrame, + Callable[[tuple[np.ndarray, ...]], np.ndarray]]: ... + +@overload +def infer_heteroscedasticy_from_stable(dvalues: RasterType , list_var: list[np.ndarray | RasterType], + stable_mask: np.ndarray | VectorType | gpd.GeoDataFrame, + list_var_names: Iterable[str], + spread_statistic: Callable, + list_var_bins: Optional[Union[int,Iterable[Iterable]]], + min_count: Optional[int], + factor_spread_exclude_outliers: float | None, + ) -> tuple[RasterType, + pd.DataFrame, + Callable[[tuple[np.ndarray, ...]], np.ndarray]]: ... + +def infer_heteroscedasticy_from_stable(dvalues: np.ndarray | RasterType, list_var: list[np.ndarray | RasterType], + stable_mask: np.ndarray | VectorType | gpd.GeoDataFrame, + list_var_names: Iterable[str] = None, + spread_statistic: Callable = nmad, + list_var_bins: Optional[Union[int,Iterable[Iterable]]] = None, + min_count: Optional[int] = 100, + fac_spread_outliers: float | None = 7, + ) -> tuple[np.ndarray | RasterType, + pd.DataFrame, + Callable[[tuple[np.ndarray, ...]], np.ndarray]]: + """ + Infer heteroscedasticity from differenced values on stable terrain and a list of explanatory variables. + + This function returns an error map, a dataframe of spread values and the error function with explanatory variables. + It is a convenience wrapper for `estimate_model_heteroscedasticity` to work on either Raster or array, compute the + stable mask and return an error map. + + :param dvalues: Proxy values as array or Raster + :param list_var: List of size (L) of explanatory variables as array or Raster of same shape as dvalues + :param stable_mask: Vector shapefile of stable terrain (if dvalues is Raster), or boolean array of same shape as dvalues + :param list_var_names: List of size (L) of names of the explanatory variables, otherwise named var1, var2, etc. + :param spread_statistic: Statistic to be computed for the spread; defaults to nmad + :param list_var_bins: Count of size (1), or list of size (L) of counts or custom bin edges for the explanatory variables; defaults to 10 bins + :param min_count: Minimum number of samples to be used as a valid statistic (replaced by nodata) + :param fac_spread_outliers: Exclude outliers outside this spread after standardizing; pass None to ignore. 
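+
+    A hedged workflow sketch (the file names below are hypothetical placeholders):
+
+    >>> dh = Raster('dh.tif')
+    >>> slope = Raster('slope.tif')
+    >>> stable = Vector('stable_terrain.shp')
+    >>> err_map, df, err_fun = infer_heteroscedasticy_from_stable(dvalues=dh, list_var=[slope],
+    ...                                                           stable_mask=stable, list_var_names=['slope'])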
+
+    :return: Inferred error map (array or Raster, same as input proxy values),
+        Dataframe of binned spread statistic with explanatory variables,
+        Error function with explanatory variables
+    """
+
+    # Check inputs
+    if not isinstance(dvalues, (Raster, np.ndarray)):
+        raise ValueError('The dvalues must be a Raster or NumPy array.')
+    if not isinstance(stable_mask, (np.ndarray, Vector, gpd.GeoDataFrame)):
+        raise ValueError('The stable mask must be a Vector, GeoDataFrame or NumPy array.')
+
+    # Check that input stable mask can only be a georeferenced vector if the proxy values are a Raster to project onto
+    if isinstance(dvalues, Raster) and isinstance(stable_mask, (Vector, gpd.GeoDataFrame)):
+        raise ValueError('The stable mask can only passed as a Vector or GeoDataFrame if the input dvalues is a Raster.')
+
+    # Create placeholder variable names if those don't exist
+    if list_var_names is None:
+        list_var_names = ['var'+str(i) for i in range(len(list_var))]
+
+    # Get the arrays for proxy values and explanatory variables
+    if isinstance(dvalues, Raster):
+        dvalues_arr = get_array_and_mask(dvalues)[0]
+    else:
+        dvalues_arr = dvalues
+    list_var_arr = [get_array_and_mask(var)[0] if isinstance(var, Raster) else var
+                    for var in list_var]
+
+    # If the stable mask is not an array, create it
+    if not isinstance(stable_mask, np.ndarray):
+
+        # If the stable mask is a geopandas dataframe, wrap it in a Vector object
+        if isinstance(stable_mask, gpd.GeoDataFrame):
+            stable_vector = Vector(stable_mask)
+        else:
+            stable_vector = stable_mask
+
+        # Create the mask
+        stable_mask_arr = stable_vector.create_mask(dvalues)
+    # If the mask is already an array, just pass it
+    else:
+        stable_mask_arr = stable_mask
+
+    # Get the subsets on stable terrain
+    dvalues_stable_arr = dvalues_arr[stable_mask_arr]
+    list_var_stable_arr = [var_arr[stable_mask_arr] for var_arr in list_var_arr]
+
+    # Estimate and model the heteroscedasticity using only stable terrain
+    df, fun = estimate_model_heteroscedasticity(dvalues=dvalues_stable_arr, list_var=list_var_stable_arr,
+                                                list_var_names=list_var_names, spread_statistic=spread_statistic,
+                                                list_var_bins=list_var_bins, min_count=min_count,
+                                                fac_spread_outliers=fac_spread_outliers)
+
+    # Use the standardization function to get the error array for the entire input array (not only stable)
+    error = fun(tuple(list_var_arr))
+
+    # Return the right type, depending on dvalues input
+    if isinstance(dvalues, Raster):
+        return dvalues.copy(new_array=error)
+    else:
+        return error
+
+
 def _create_circular_mask(shape: Union[int, Sequence[int]], center: Optional[list[float]] = None,
                           radius: Optional[float] = None) -> np.ndarray:
     """
-    Create circular mask on a raster, defaults to the center of the array and it's half width
+    Create circular mask on a raster, defaults to the center of the array and its half width
 
     :param shape: shape of array
     :param center: center
@@ -959,9 +1153,9 @@ def rho(h):
 
     return rho
 
-def fit_sum_model_variogram(list_models: list[str] | list[Callable], empirical_variogram: pd.DataFrame,
+def fit_sum_model_variogram(list_models: list[str | Callable], empirical_variogram: pd.DataFrame,
                             bounds: list[tuple[float, float]] = None,
-                            p0: list[float] = None) -> tuple[Callable, pd.DataFrame]:
+                            p0: list[float] = None) -> tuple[Callable[[np.ndarray], np.ndarray], pd.DataFrame]:
     """
     Fit a sum of variogram models to an empirical variogram, with weighted least-squares based on sampling errors.
    Preferably used with the empirical variogram dataframe returned by the `sample_empirical_variogram` function.
@@ -1069,6 +1263,150 @@ def variogram_sum(h, *args):
 
     return variogram_sum_fit, df_params
 
+def estimate_model_spatial_correlation(dvalues: Union[np.ndarray, RasterType], list_models: list[str | Callable],
+                                       estimator = 'dowd', gsd: float = None, coords: np.ndarray = None, subsample: int = 1000,
+                                       subsample_method: str = 'cdist_equidistant', n_variograms: int = 1,
+                                       n_jobs: int = 1, verbose = False,
+                                       random_state: None | np.random.RandomState | np.random.Generator | int = None,
+                                       bounds: list[tuple[float, float]] = None, p0: list[float] = None,
+                                       **kwargs) -> tuple[pd.DataFrame, pd.DataFrame, Callable[[np.ndarray], np.ndarray]]:
+
+    """
+    Estimate and model the spatial correlation of the input variable by empirical variogram sampling and fitting of a
+    sum of variogram models.
+
+    This function samples an empirical variogram using skgstat.Variogram, then optimizes by weighted least-squares the
+    sum of a defined number of models, using the functions `sample_empirical_variogram` and `fit_sum_model_variogram`.
+
+    :param dvalues: Proxy values of studied variable
+    :param list_models: List of K variogram models to sum for the fit in order from short to long ranges. Can either be
+        a 3-letter string, full string of the variogram name or SciKit-GStat model function (e.g., for a
+        spherical model "Sph", "Spherical" or skgstat.models.spherical).
+    :param estimator: Estimator for the empirical variogram; defaults to Dowd's variogram (see skgstat.Variogram for
+        the list of available estimators).
+    :param gsd: Ground sampling distance
+    :param coords: Coordinates
+    :param subsample: Number of samples to randomly draw from the values
+    :param subsample_method: Spatial subsampling method
+    :param n_variograms: Number of independent empirical variogram estimations
+    :param n_jobs: Number of processing cores
+    :param verbose: Print statements during processing
+    :param random_state: Random state or seed number to use for calculations (to fix random sampling during testing)
+    :param bounds: Bounds of range and sill parameters for each model (shape K x 4 = K x range lower, range upper, sill lower, sill upper).
+    :param p0: Initial guess of ranges and sills of each model (shape K x 2 = K x range first guess, sill first guess).
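+
+    A minimal sketch on a synthetic grid (the values, model choice and subsample size are illustrative only):
+
+    >>> import numpy as np
+    >>> rng = np.random.default_rng(42)
+    >>> dh = rng.normal(0, 1, size=(50, 50))
+    >>> emp_vgm, params, corr_fun = estimate_model_spatial_correlation(
+    ...     dvalues=dh, list_models=['Sph'], gsd=30., subsample=200, random_state=42)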
+
+    :return: Dataframe of empirical variogram, Dataframe of optimized model parameters, Spatial correlation function
+    """
+
+    empirical_variogram = sample_empirical_variogram(values=dvalues, estimator=estimator, gsd=gsd, coords=coords,
+                                                     subsample=subsample, subsample_method=subsample_method,
+                                                     n_variograms=n_variograms, n_jobs=n_jobs, verbose=verbose,
+                                                     random_state=random_state, **kwargs)
+
+    params_variogram_model = fit_sum_model_variogram(list_models=list_models, empirical_variogram=empirical_variogram,
+                                                     bounds=bounds, p0=p0)[1]
+
+    spatial_correlation_func = correlation_from_variogram(params_variogram_model=params_variogram_model)
+
+    return empirical_variogram, params_variogram_model, spatial_correlation_func
+
+def infer_spatial_correlation_from_stable(dvalues: np.ndarray | RasterType,
+                                          stable_mask: np.ndarray | VectorType | gpd.GeoDataFrame,
+                                          list_models: list[str | Callable],
+                                          errors: np.ndarray | RasterType = None,
+                                          estimator = 'dowd', gsd: float = None, coords: np.ndarray = None,
+                                          subsample: int = 1000, subsample_method: str = 'cdist_equidistant',
+                                          n_variograms: int = 1, n_jobs: int = 1, verbose = False,
+                                          random_state: None | np.random.RandomState | np.random.Generator | int = None,
+                                          bounds: list[tuple[float, float]] = None, p0: list[float] = None,
+                                          **kwargs
+                                          ) -> tuple[pd.DataFrame, pd.DataFrame, Callable[[np.ndarray], np.ndarray]]:
+    """
+    Infer spatial correlation of errors from differenced values on stable terrain and a list of variogram models to
+    fit as a sum.
+
+    This function returns a dataframe of the empirical variogram, a dataframe of optimized model parameters, and a
+    spatial correlation function.
+    It is a convenience wrapper for `estimate_model_spatial_correlation` to work on either Raster or array and compute
+    the stable mask.
+
+    :param dvalues: Proxy values as array or Raster
+    :param stable_mask: Vector shapefile of stable terrain (if dvalues is Raster), or boolean array of same shape as dvalues
+    :param errors: Error values to account for heteroscedasticity (ignored if None).
+    :param list_models: List of K variogram models to sum for the fit in order from short to long ranges. Can either be
+        a 3-letter string, full string of the variogram name or SciKit-GStat model function (e.g., for a
+        spherical model "Sph", "Spherical" or skgstat.models.spherical).
+    :param estimator: Estimator for the empirical variogram; defaults to Dowd's variogram (see skgstat.Variogram for
+        the list of available estimators).
+    :param gsd: Ground sampling distance
+    :param coords: Coordinates
+    :param subsample: Number of samples to randomly draw from the values
+    :param subsample_method: Spatial subsampling method
+    :param n_variograms: Number of independent empirical variogram estimations
+    :param n_jobs: Number of processing cores
+    :param verbose: Print statements during processing
+    :param random_state: Random state or seed number to use for calculations (to fix random sampling during testing)
+    :param bounds: Bounds of range and sill parameters for each model (shape K x 4 = K x range lower, range upper, sill lower, sill upper).
+    :param p0: Initial guess of ranges and sills of each model (shape K x 2 = K x range first guess, sill first guess).
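+
+    A hedged usage sketch mirroring the tests (the file names below are hypothetical placeholders):
+
+    >>> dh = Raster('dh.tif')
+    >>> glaciers = Vector('glacier_outlines.shp')
+    >>> emp_vgm, params, corr_fun = infer_spatial_correlation_from_stable(
+    ...     dvalues=dh, stable_mask=~glaciers.create_mask(dh).squeeze(), list_models=['Gau', 'Sph'], subsample=100)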
+
+    :return: Dataframe of empirical variogram, Dataframe of optimized model parameters, Spatial correlation function
+    """
+
+    # Check inputs
+    if not isinstance(dvalues, (Raster, np.ndarray)):
+        raise ValueError('The dvalues must be a Raster or NumPy array.')
+    if not isinstance(stable_mask, (np.ndarray, Vector, gpd.GeoDataFrame)):
+        raise ValueError('The stable mask must be a Vector, GeoDataFrame or NumPy array.')
+
+    # Check that input stable mask can only be a georeferenced vector if the proxy values are a Raster to project onto
+    if isinstance(dvalues, Raster) and isinstance(stable_mask, (Vector, gpd.GeoDataFrame)):
+        raise ValueError(
+            'The stable mask can only passed as a Vector or GeoDataFrame if the input dvalues is a Raster.')
+
+    # Get array if input is a Raster
+    if isinstance(dvalues, Raster):
+        dvalues_arr = get_array_and_mask(dvalues)[0]
+    else:
+        dvalues_arr = dvalues
+
+    # If the stable mask is not an array, create it
+    if not isinstance(stable_mask, np.ndarray):
+
+        # If the stable mask is a geopandas dataframe, wrap it in a Vector object
+        if isinstance(stable_mask, gpd.GeoDataFrame):
+            stable_vector = Vector(stable_mask)
+        else:
+            stable_vector = stable_mask
+
+        # Create the mask
+        stable_mask_arr = stable_vector.create_mask(dvalues)
+    # If the mask is already an array, just pass it
+    else:
+        stable_mask_arr = stable_mask
+
+    # Get the subsets on stable terrain
+    dvalues_stable_arr = dvalues_arr[stable_mask_arr]
+
+    # Perform standardization if error array is provided
+    if errors is not None:
+        if isinstance(errors, Raster):
+            errors_arr = get_array_and_mask(errors)[0]
+        else:
+            errors_arr = errors
+        errors_stable_arr = errors_arr[stable_mask_arr]
+
+        # Standardize
+        dvalues_stable_arr /= errors_stable_arr
+
+    # Estimate and model spatial correlations
+    empirical_variogram, params_variogram_model, spatial_correlation_func = estimate_model_spatial_correlation(
+        dvalues=dvalues_stable_arr, list_models=list_models, estimator=estimator, gsd=gsd, coords=coords,
+        subsample=subsample, subsample_method=subsample_method, n_variograms=n_variograms, n_jobs=n_jobs,
+        verbose=verbose, random_state=random_state, bounds=bounds, p0=p0, **kwargs)
+
+    return empirical_variogram, params_variogram_model, spatial_correlation_func
+
+
 def _check_validity_params_variogram(params_variogram_model: pd.DataFrame):
     """Check the validity of the modelled variogram parameters dataframe (mostly in the case it is passed manually)."""

From 45df8ec0e54fef8a326f53ee276080b5c7ab3ad6 Mon Sep 17 00:00:00 2001
From: rhugonne
Date: Sun, 21 Aug 2022 11:17:46 +0200
Subject: [PATCH 06/21] Finalize convenience error pipeline functions and add tests

---
 tests/test_spatialstats.py | 183 +++++++++++++++++++++++++++++++++++--
 xdem/spatialstats.py       | 104 ++++++++++++++++-----
 2 files changed, 256 insertions(+), 31 deletions(-)

diff --git a/tests/test_spatialstats.py b/tests/test_spatialstats.py
index 4030f731..f2749b6e 100644
--- a/tests/test_spatialstats.py
+++ b/tests/test_spatialstats.py
@@ -8,6 +8,7 @@
 import skgstat
 
 import geoutils as gu
+import geoutils.spatial_tools
 from geoutils import Raster, Vector
 import numpy as np
 import pandas as pd
 import pytest
@@ -22,7 +23,7 @@
 PLOT = False
 
-def load_ref_and_diff() -> tuple[Raster, Raster, np.ndarray]:
+def load_ref_and_diff() -> tuple[Raster, Raster, np.ndarray, Vector]:
     """Load example files to try coregistration methods with."""
 
     reference_raster = Raster(examples.get_path("longyearbyen_ref_dem"))
     glacier_mask = Vector(examples.get_path("longyearbyen_glacier_outlines"))
ddem = Raster(examples.get_path('longyearbyen_ddem')) mask = glacier_mask.create_mask(ddem) - return reference_raster, ddem, mask + return reference_raster, ddem, mask, glacier_mask class TestVariogram: @@ -262,6 +263,65 @@ def test_check_params_variogram_model(self): xdem.spatialstats._check_validity_params_variogram( pd.DataFrame(data={'model': ['stable'], 'range': [100], 'psill': [1], 'smooth': [-1]})) + def test_estimate_model_spatial_correlation_and_infer_from_stable(self): + """Test consistency of outputs and errors in wrapper functions for estimation of spatial correlation""" + + # Load data + diff, mask_glacier, vector_glacier = load_ref_and_diff()[1:4] + + # Keep only data on stable + diff_on_stable = diff.copy() + diff_on_stable.set_mask(mask_glacier) + + # Run wrapper estimate and model function + emp_vgm_1, params_model_vgm_1, _ = \ + xdem.spatialstats.estimate_model_spatial_correlation(dvalues=diff_on_stable, list_models=['Gau', 'Sph'], + subsample=100, random_state=42) + + # Check that the output matches that of the original function under the same random state + emp_vgm_2 = xdem.spatialstats.sample_empirical_variogram(values=diff_on_stable, estimator='dowd', subsample=100, + random_state=42) + pd.testing.assert_frame_equal(emp_vgm_1, emp_vgm_2) + params_model_vgm_2 = xdem.spatialstats.fit_sum_model_variogram(list_models=['Gau', 'Sph'], + empirical_variogram=emp_vgm_2)[1] + pd.testing.assert_frame_equal(params_model_vgm_1, params_model_vgm_2) + + # Run wrapper infer from stable function with a Raster and the mask, and check the consistency there as well + emp_vgm_3, params_model_vgm_3, _ = \ + xdem.spatialstats.infer_spatial_correlation_from_stable(dvalues=diff_on_stable, stable_mask=~mask_glacier.squeeze(), + list_models=['Gau', 'Sph'], subsample=100, random_state=42) + pd.testing.assert_frame_equal(emp_vgm_1, emp_vgm_3) + pd.testing.assert_frame_equal(params_model_vgm_1, params_model_vgm_3) + + # Run again with array instead of Raster as input + diff_on_stable_arr = gu.spatial_tools.get_array_and_mask(diff_on_stable)[0] + emp_vgm_4, params_model_vgm_4, _ = \ + xdem.spatialstats.infer_spatial_correlation_from_stable(dvalues=diff_on_stable_arr, gsd=diff_on_stable.res[0], + stable_mask=~mask_glacier.squeeze(), + list_models=['Gau', 'Sph'], subsample=100, + random_state=42) + pd.testing.assert_frame_equal(emp_vgm_1, emp_vgm_4) + pd.testing.assert_frame_equal(params_model_vgm_1, params_model_vgm_4) + + # Check that errors are raised with wrong input + with pytest.raises(ValueError, match='The dvalues must be a Raster or NumPy array.'): + xdem.spatialstats.infer_spatial_correlation_from_stable(dvalues='not_an_array', stable_mask=~mask_glacier.squeeze(), + list_models=['Gau', 'Sph'], random_state=42) + with pytest.raises(ValueError, match='The stable mask must be a Vector, GeoDataFrame or NumPy array.'): + xdem.spatialstats.infer_spatial_correlation_from_stable(dvalues=diff, + stable_mask='not_a_vector_or_array', + list_models=['Gau', 'Sph'], random_state=42) + with pytest.raises(ValueError, match='The unstable mask must be a Vector, GeoDataFrame or NumPy array.'): + xdem.spatialstats.infer_spatial_correlation_from_stable(dvalues=diff, + unstable_mask='not_a_vector_or_array', + list_models=['Gau', 'Sph'], random_state=42) + diff_on_stable_arr = gu.spatial_tools.get_array_and_mask(diff_on_stable)[0] + with pytest.raises(ValueError, match='The stable mask can only passed as a Vector or GeoDataFrame if the input ' + 'dvalues is a Raster.'): + 
xdem.spatialstats.infer_spatial_correlation_from_stable(dvalues=diff_on_stable_arr, + stable_mask=vector_glacier, + list_models=['Gau', 'Sph'], random_state=42) + def test_empirical_fit_plotting(self): """Verify that the shape of the empirical variogram output works with the fit and plotting""" @@ -401,6 +461,9 @@ def test_neff_exact_and_approx_hugonnet(self): def test_number_effective_samples(self): """Test that the wrapper function for neff functions behaves correctly and that output values are robust""" + raster = load_ref_and_diff()[0] + outlines = load_ref_and_diff()[3] + # The function should return the same result as neff_circular_approx_numerical when using a numerical area area = 10000 params_variogram_model = pd.DataFrame(data={'model':['spherical', 'gaussian'], 'range':[300, 3000], 'psill':[0.5, 0.5]}) @@ -413,7 +476,6 @@ def test_number_effective_samples(self): # The function should return the same results as neff_hugonnet_approx when using a shape area # First, get the vector area and compute with the wrapper function res = 100. - outlines = Vector(xdem.examples.get_path("longyearbyen_glacier_outlines")) outlines_brom = Vector(outlines.ds[outlines.ds['NAME']=='Brombreen']) neff1 = xdem.spatialstats.number_effective_samples(area=outlines_brom, params_variogram_model=params_variogram_model, rasterize_resolution=res, random_state=42) @@ -430,7 +492,6 @@ def test_number_effective_samples(self): assert neff1 == pytest.approx(neff2, rel=0.00001) # Check that using a Raster as input for the resolution works - raster = Raster(examples.get_path("longyearbyen_ref_dem")) neff3 = xdem.spatialstats.number_effective_samples(area=outlines_brom, params_variogram_model=params_variogram_model, rasterize_resolution=raster, random_state=42) @@ -544,9 +605,10 @@ class TestBinning: def test_nd_binning(self): - ref, diff, mask = load_ref_and_diff() + ref, diff, mask = load_ref_and_diff()[0:3] - slope, aspect = xdem.coreg.calculate_slope_and_aspect(ref.data.squeeze()) + ref_arr = gu.spatial_tools.get_array_and_mask(ref)[0] + slope, aspect = xdem.terrain.get_terrain_attribute(ref_arr, resolution=ref.res, attribute=['slope', 'aspect']) # 1D binning, by default will create 10 bins df = xdem.spatialstats.nd_binning(values=diff.data.flatten(), list_var=[slope.flatten()], list_var_names=['slope']) @@ -660,8 +722,10 @@ def test_interp_nd_binning(self): df[np.logical_and.reduce((df['var1'] == i, df['var2'] == j, df['var3'] == k))]['statistic'].values[0] # Check if it works with nd_binning output - ref, diff, mask = load_ref_and_diff() - slope, aspect = xdem.coreg.calculate_slope_and_aspect(ref.data.squeeze()) + ref, diff, mask = load_ref_and_diff()[0:3] + ref_arr = gu.spatial_tools.get_array_and_mask(ref)[0] + + slope, aspect = xdem.terrain.get_terrain_attribute(ref_arr, resolution=ref.res, attribute=['slope', 'aspect']) df = xdem.spatialstats.nd_binning(values=diff.data.flatten(), list_var=[slope.flatten(), ref.data.flatten(), aspect.flatten()], list_var_names=['slope', 'elevation', 'aspect']) @@ -684,7 +748,7 @@ def test_interp_nd_binning(self): # Check a value is returned inside the grid assert np.isfinite(fun([15, 1000])) # Check the nmad increases with slope - assert fun([20, 800]) > fun([0, 800]) + assert fun([40, 800]) > fun([0, 800]) # Check a value is returned outside the grid assert all(np.isfinite(fun(([-5, 50], [-500,3000])))) @@ -699,6 +763,107 @@ def test_interp_nd_binning(self): # Check a value is returned outside the grid assert all(np.isfinite(fun(([-5, 50], [-500,3000], 
[-2*np.pi,4*np.pi])))) + def test_two_step_standardization(self): + """Test two-step standardization function""" + + ref_dem, diff, mask_glacier, vector_glacier = load_ref_and_diff() + + # Get attributes + slope, maximum_curv = xdem.terrain.get_terrain_attribute(ref_dem, attribute=['slope', 'maximum_curvature']) + + # Test this gives the same results as when using the base functions + diff_arr = gu.spatial_tools.get_array_and_mask(diff)[0] + slope_arr = gu.spatial_tools.get_array_and_mask(slope)[0] + maximum_curv_arr = gu.spatial_tools.get_array_and_mask(maximum_curv)[0] + stable_mask_arr = ~vector_glacier.create_mask(ref_dem).squeeze() + + # Reproduce the first steps of binning + df_binning = xdem.spatialstats.nd_binning(values=diff_arr[stable_mask_arr], + list_var=[slope_arr[stable_mask_arr], + maximum_curv_arr[stable_mask_arr]], + list_var_names=['var1', 'var2'], + statistics=[xdem.spatialstats.nmad]) + unscaled_fun = xdem.spatialstats.interp_nd_binning(df_binning, list_var_names=['var1', 'var2'], + statistic='nmad') + # The zscore spread should not be one right after binning + zscores = diff_arr[stable_mask_arr] / unscaled_fun((slope_arr[stable_mask_arr], maximum_curv_arr[stable_mask_arr])) + scale_fac = xdem.spatialstats.nmad(zscores) + assert scale_fac != 1 + + # Filter with a factor of 5 and the standard deviation and check the function outputs the exact same array + zscores[np.abs(zscores) > 3*np.nanstd(zscores)] = np.nan + scale_fac_std = np.nanstd(zscores) + zscores /= scale_fac_std + zscores_2, final_func = xdem.spatialstats.two_step_standardization(diff_arr[stable_mask_arr], + list_var=[slope_arr[stable_mask_arr], + maximum_curv_arr[stable_mask_arr]], + unscaled_error_fun=unscaled_fun, + spread_statistic=np.nanstd, + fac_spread_outliers=3) + assert np.array_equal(zscores, zscores_2, equal_nan=True) + + # Check the output of the scaled function is simply the unscaled one times the spread statistic + test_slopes = np.linspace(0, 50, 50) + test_max_curvs = np.linspace(0, 10, 50) + assert np.array_equal(unscaled_fun((test_slopes, test_max_curvs)) * scale_fac_std, + final_func((test_slopes, test_max_curvs))) + + + def test_estimate_model_heteroscedasticity_and_infer_from_stable(self): + """Test consistency of outputs and errors in wrapper functions for estimation of heteroscedasticity""" + + # Load data + ref_dem, diff, mask_glacier, vector_glacier = load_ref_and_diff() + + # Get attributes + slope, maximum_curv = xdem.terrain.get_terrain_attribute(ref_dem, attribute=['slope', 'maximum_curvature']) + + # Test infer function + errors_1, df_binning_1, err_fun_1 = \ + xdem.spatialstats.infer_heteroscedasticy_from_stable(dvalues=diff, + list_var=[slope, maximum_curv], + unstable_mask=vector_glacier) + + # Test this gives the same results as when using the base functions + diff_arr = gu.spatial_tools.get_array_and_mask(diff)[0] + slope_arr = gu.spatial_tools.get_array_and_mask(slope)[0] + maximum_curv_arr = gu.spatial_tools.get_array_and_mask(maximum_curv)[0] + stable_mask_arr = ~vector_glacier.create_mask(ref_dem).squeeze() + + df_binning_2, err_fun_2 = \ + xdem.spatialstats.estimate_model_heteroscedasticity(dvalues=diff_arr[stable_mask_arr], + list_var=[slope_arr[stable_mask_arr], + maximum_curv_arr[stable_mask_arr]], + list_var_names=['var1', 'var2']) + + pd.testing.assert_frame_equal(df_binning_1, df_binning_2) + test_slopes = np.linspace(0, 50, 50) + test_max_curvs = np.linspace(0, 10, 50) + assert np.array_equal(err_fun_1((test_slopes, test_max_curvs)), err_fun_2((test_slopes, 
test_max_curvs))) + + # Test the error map is consistent as well + errors_2_arr = err_fun_2((slope_arr, maximum_curv_arr)) + errors_1_arr = gu.spatial_tools.get_array_and_mask(errors_1)[0] + assert np.array_equal(errors_1_arr, errors_2_arr, equal_nan=True) + + # Check that errors are raised with wrong input + with pytest.raises(ValueError, match='The dvalues must be a Raster or NumPy array.'): + xdem.spatialstats.infer_heteroscedasticy_from_stable(dvalues='not_an_array', + stable_mask=~mask_glacier.squeeze(), + list_var=[slope_arr]) + with pytest.raises(ValueError, match='The stable mask must be a Vector, GeoDataFrame or NumPy array.'): + xdem.spatialstats.infer_heteroscedasticy_from_stable(dvalues=diff, + stable_mask='not_a_vector_or_array', + list_var=[slope_arr]) + with pytest.raises(ValueError, match='The unstable mask must be a Vector, GeoDataFrame or NumPy array.'): + xdem.spatialstats.infer_heteroscedasticy_from_stable(dvalues=diff, + unstable_mask='not_a_vector_or_array', + list_var=[slope_arr]) + + with pytest.raises(ValueError, match='The stable mask can only passed as a Vector or GeoDataFrame if the input ' + 'dvalues is a Raster.'): + xdem.spatialstats.infer_heteroscedasticy_from_stable(dvalues=diff_arr, stable_mask=vector_glacier, + list_var=[slope_arr]) def test_plot_binning(self): diff --git a/xdem/spatialstats.py b/xdem/spatialstats.py index 7138def1..608dc27f 100644 --- a/xdem/spatialstats.py +++ b/xdem/spatialstats.py @@ -367,6 +367,7 @@ def estimate_model_heteroscedasticity(dvalues: np.ndarray, list_var: Iterable[np @overload def infer_heteroscedasticy_from_stable(dvalues: np.ndarray , list_var: list[np.ndarray | RasterType], stable_mask: np.ndarray | VectorType | gpd.GeoDataFrame, + unstable_mask: np.ndarray | VectorType | gpd.GeoDataFrame, list_var_names: Iterable[str], spread_statistic: Callable, list_var_bins: Optional[Union[int,Iterable[Iterable]]], @@ -379,6 +380,7 @@ def infer_heteroscedasticy_from_stable(dvalues: np.ndarray , list_var: list[np.n @overload def infer_heteroscedasticy_from_stable(dvalues: RasterType , list_var: list[np.ndarray | RasterType], stable_mask: np.ndarray | VectorType | gpd.GeoDataFrame, + unstable_mask: np.ndarray | VectorType | gpd.GeoDataFrame, list_var_names: Iterable[str], spread_statistic: Callable, list_var_bins: Optional[Union[int,Iterable[Iterable]]], @@ -389,7 +391,8 @@ def infer_heteroscedasticy_from_stable(dvalues: RasterType , list_var: list[np.n Callable[[tuple[np.ndarray, ...]], np.ndarray]]: ... def infer_heteroscedasticy_from_stable(dvalues: np.ndarray | RasterType, list_var: list[np.ndarray | RasterType], - stable_mask: np.ndarray | VectorType | gpd.GeoDataFrame, + stable_mask: np.ndarray | VectorType | gpd.GeoDataFrame = None, + unstable_mask: np.ndarray | VectorType | gpd.GeoDataFrame = None, list_var_names: Iterable[str] = None, spread_statistic: Callable = nmad, list_var_bins: Optional[Union[int,Iterable[Iterable]]] = None, @@ -405,9 +408,12 @@ def infer_heteroscedasticy_from_stable(dvalues: np.ndarray | RasterType, list_va It is a convenience wrapper for `estimate_model_heteroscedasticity` to work on either Raster or array, compute the stable mask and return an error map. + If no stable or unstable mask is provided to mask in or out the values, all terrain is used. 
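+
+    For instance (an illustrative sketch with hypothetical file names), unstable terrain such as glaciers can now be
+    masked out directly:
+
+    >>> dh, slope, glaciers = Raster('dh.tif'), Raster('slope.tif'), Vector('glaciers.shp')
+    >>> err_map, df, err_fun = infer_heteroscedasticy_from_stable(dvalues=dh, list_var=[slope],
+    ...                                                           unstable_mask=glaciers)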
+ :param dvalues: Proxy values as array or Raster :param list_var: List of size (L) of explanatory variables as array or Raster of same shape as dvalues :param stable_mask: Vector shapefile of stable terrain (if dvalues is Raster), or boolean array of same shape as dvalues + :param unstable_mask: Vector shapefile of unstable terrain (if dvalues is Raster), or boolean array of same shape as dvalues :param list_var_names: List of size (L) of names of the explanatory variables, otherwise named var1, var2, etc. :param spread_statistic: Statistic to be computed for the spread; defaults to nmad :param list_var_bins: Count of size (1), or list of size (L) of counts or custom bin edges for the explanatory variables; defaults to 10 bins @@ -422,16 +428,18 @@ def infer_heteroscedasticy_from_stable(dvalues: np.ndarray | RasterType, list_va # Check inputs if not isinstance(dvalues, (Raster, np.ndarray)): raise ValueError('The dvalues must be a Raster or NumPy array.') - if not isinstance(stable_mask, (np.ndarray, Vector, gpd.GeoDataFrame)): + if stable_mask is not None and not isinstance(stable_mask, (np.ndarray, Vector, gpd.GeoDataFrame)): raise ValueError('The stable mask must be a Vector, GeoDataFrame or NumPy array.') + if unstable_mask is not None and not isinstance(unstable_mask, (np.ndarray, Vector, gpd.GeoDataFrame)): + raise ValueError('The unstable mask must be a Vector, GeoDataFrame or NumPy array.') # Check that input stable mask can only be a georeferenced vector if the proxy values are a Raster to project onto - if isinstance(dvalues, Raster) and isinstance(stable_mask, (Vector, gpd.GeoDataFrame)): + if not isinstance(dvalues, Raster) and (isinstance(stable_mask, (Vector, gpd.GeoDataFrame)) or isinstance(unstable_mask, (Vector, gpd.GeoDataFrame))): raise ValueError('The stable mask can only passed as a Vector or GeoDataFrame if the input dvalues is a Raster.') # Create placeholder variables names if those don't exist if list_var_names is None: - list_var_names = ['var'+str(i) for i in range(len(list_var))] + list_var_names = ['var'+str(i+1) for i in range(len(list_var))] # Get the arrays for proxy values and explanatory variables if isinstance(dvalues, Raster): @@ -442,7 +450,9 @@ def infer_heteroscedasticy_from_stable(dvalues: np.ndarray | RasterType, list_va for var in list_var if isinstance(var, Raster)] # If the stable mask is not an array, create it - if not isinstance(stable_mask, np.ndarray): + if stable_mask is None: + stable_mask_arr = np.ones(np.shape(dvalues_arr), dtype=bool) + elif not isinstance(stable_mask, np.ndarray): # If the stable mask is a geopandas dataframe, wrap it in a Vector object if isinstance(stable_mask, gpd.GeoDataFrame): @@ -456,6 +466,25 @@ def infer_heteroscedasticy_from_stable(dvalues: np.ndarray | RasterType, list_va else: stable_mask_arr = stable_mask + # If the unstable mask is not an array, create it + if unstable_mask is None: + unstable_mask_arr = np.zeros(np.shape(dvalues_arr), dtype=bool) + elif not isinstance(unstable_mask, np.ndarray): + + # If the unstable mask is a geopandas dataframe, wrap it in a Vector object + if isinstance(unstable_mask, gpd.GeoDataFrame): + unstable_vector = Vector(unstable_mask) + else: + unstable_vector = unstable_mask + + # Create the mask + unstable_mask_arr = unstable_vector.create_mask(dvalues) + # If the mask is already an array, just pass it + else: + unstable_mask_arr = unstable_mask + + stable_mask_arr = np.logical_and(stable_mask_arr, ~unstable_mask_arr).squeeze() + # Get the subsets on stable terrain 
dvalues_stable_arr = dvalues_arr[stable_mask_arr] list_var_stable_arr = [var_arr[stable_mask_arr] for var_arr in list_var_arr] @@ -471,9 +500,9 @@ def infer_heteroscedasticy_from_stable(dvalues: np.ndarray | RasterType, list_va # Return the right type, depending on dvalues input if isinstance(dvalues, Raster): - return dvalues.copy(new_array=error) + return dvalues.copy(new_array=error), df, fun else: - return error + return error, df, fun def _create_circular_mask(shape: Union[int, Sequence[int]], center: Optional[list[float]] = None, @@ -889,17 +918,19 @@ def sample_empirical_variogram(values: Union[np.ndarray, RasterType], gsd: float elif isinstance(values, (np.ndarray, np.ma.masked_array)): values, mask = get_array_and_mask(values) else: - raise TypeError('Values must be of type np.ndarray, np.ma.masked_array or Raster subclass.') + raise ValueError('Values must be of type np.ndarray, np.ma.masked_array or Raster subclass.') values = values.squeeze() # Then, check if the logic between values, coords and gsd is respected if (gsd is not None or subsample_method in ['cdist_equidistant', 'pdist_disk','pdist_ring']) and values.ndim == 1: - raise TypeError('Values array must be 2D when using any of the "cdist_equidistant", "pdist_disk" and ' + raise ValueError('Values array must be 2D when using any of the "cdist_equidistant", "pdist_disk" and ' '"pdist_ring" methods, or providing a ground sampling distance instead of coordinates.') elif coords is not None and values.ndim != 1: - raise TypeError('Values array must be 1D when providing coordinates.') + raise ValueError('Values array must be 1D when providing coordinates.') elif coords is not None and (coords.shape[0] != 2 and coords.shape[1] != 2): - raise TypeError('The coordinates array must have one dimension with length equal to 2') + raise ValueError('The coordinates array must have one dimension with length equal to 2') + elif values.ndim == 2 and gsd is None: + raise ValueError('The ground sampling distance must be defined when passing a 2D values array.') # Check the subsample method provided exists, otherwise list options if subsample_method not in ['cdist_equidistant','cdist_point','pdist_point','pdist_disk','pdist_ring']: @@ -1311,8 +1342,9 @@ def estimate_model_spatial_correlation(dvalues: Union[np.ndarray, RasterType], l return empirical_variogram, params_variogram_model, spatial_correlation_func def infer_spatial_correlation_from_stable(dvalues: np.ndarray | RasterType, - stable_mask: np.ndarray | VectorType | gpd.GeoDataFrame, list_models: list[str | Callable], + stable_mask: np.ndarray | VectorType | gpd.GeoDataFrame = None, + unstable_mask: np.ndarray | VectorType | gpd.GeoDataFrame = None, errors: np.ndarray | RasterType = None, estimator = 'dowd', gsd: float = None, coords: np.ndarray = None, subsample: int = 1000, subsample_method: str = 'cdist_equidistant', @@ -1330,12 +1362,15 @@ def infer_spatial_correlation_from_stable(dvalues: np.ndarray | RasterType, It is a convenience wrapper for `estimate_model_spatial_correlation` to work on either Raster or array and compute the stable mask. + If no stable or unstable mask is provided to mask in or out the values, all terrain is used. + :param dvalues: Proxy values as array or Raster - :param stable_mask: Vector shapefile of stable terrain (if dvalues is Raster), or boolean array of same shape as dvalues - :param errors: Error values to account for heteroscedasticity (ignored if None). 
:param list_models: List of K variogram models to sum for the fit in order from short to long ranges. Can either be
        a 3-letter string, full string of the variogram name or SciKit-GStat model function (e.g., for a spherical
        model "Sph", "Spherical" or skgstat.models.spherical).
+    :param stable_mask: Vector shapefile of stable terrain (if dvalues is Raster), or boolean array of same shape as dvalues
+    :param unstable_mask: Vector shapefile of unstable terrain (if dvalues is Raster), or boolean array of same shape as dvalues
+    :param errors: Error values to account for heteroscedasticity (ignored if None).
     :param estimator: Estimator for the empirical variogram; default to Dowd's variogram (see skgstat.Variogram for
         the list of available estimators).
     :param gsd: Ground sampling distance
@@ -1355,22 +1390,28 @@
     # Check inputs
     if not isinstance(dvalues, (Raster, np.ndarray)):
         raise ValueError('The dvalues must be a Raster or NumPy array.')
-    if not isinstance(stable_mask, (np.ndarray, Vector, gpd.GeoDataFrame)):
+    if stable_mask is not None and not isinstance(stable_mask, (np.ndarray, Vector, gpd.GeoDataFrame)):
         raise ValueError('The stable mask must be a Vector, GeoDataFrame or NumPy array.')
+    if unstable_mask is not None and not isinstance(unstable_mask, (np.ndarray, Vector, gpd.GeoDataFrame)):
+        raise ValueError('The unstable mask must be a Vector, GeoDataFrame or NumPy array.')

     # Check that input stable mask can only be a georeferenced vector if the proxy values are a Raster to project onto
-    if isinstance(dvalues, Raster) and isinstance(stable_mask, (Vector, gpd.GeoDataFrame)):
+    if not isinstance(dvalues, Raster) and isinstance(stable_mask, (Vector, gpd.GeoDataFrame)):
         raise ValueError(
             'The stable mask can only passed as a Vector or GeoDataFrame if the input dvalues is a Raster.')

     # Get array if input is a Raster
     if isinstance(dvalues, Raster):
         dvalues_arr = get_array_and_mask(dvalues)[0]
+        gsd = dvalues.res[0]
     else:
         dvalues_arr = dvalues

     # If the stable mask is not an array, create it
-    if not isinstance(stable_mask, np.ndarray):
+    if stable_mask is None:
+        stable_mask_arr = np.ones(np.shape(dvalues_arr), dtype=bool)
+    elif not isinstance(stable_mask, np.ndarray):

         # If the stable mask is a geopandas dataframe, wrap it in a Vector object
         if isinstance(stable_mask, gpd.GeoDataFrame):
             stable_vector = Vector(stable_mask)
         else:
             stable_vector = stable_mask

         # Create the mask
         stable_mask_arr = stable_vector.create_mask(dvalues)
     # If the mask is already an array, just pass it
     else:
         stable_mask_arr = stable_mask

@@ -1384,8 +1425,28 @@
     else:
         stable_mask_arr = stable_mask

-    # Get the subsets on stable terrain
-    dvalues_stable_arr = dvalues_arr[stable_mask_arr]
+    # If the unstable mask is not an array, create it
+    if unstable_mask is None:
+        unstable_mask_arr = np.zeros(np.shape(dvalues_arr), dtype=bool)
+    elif not isinstance(unstable_mask, np.ndarray):
+
+        # If the unstable mask is a geopandas dataframe, wrap it in a Vector object
+        if isinstance(unstable_mask, gpd.GeoDataFrame):
+            unstable_vector = Vector(unstable_mask)
+        else:
+            unstable_vector = unstable_mask
+
+        # Create the mask
+        unstable_mask_arr = unstable_vector.create_mask(dvalues)
+    # If the mask is already an array, just pass it
+    else:
+        unstable_mask_arr = unstable_mask
+
+    stable_mask_arr = np.logical_and(stable_mask_arr, ~unstable_mask_arr).squeeze()
+
+    # Need to preserve the shape, so setting as NaNs all points not on stable terrain
+    dvalues_stable_arr = dvalues_arr.copy()
+    dvalues_stable_arr[~stable_mask_arr] = np.nan

     # Perform standardization if
error array is provided if errors is not None: @@ -1393,14 +1454,13 @@ def infer_spatial_correlation_from_stable(dvalues: np.ndarray | RasterType, errors_arr = get_array_and_mask(errors)[0] else: errors_arr = errors - errors_stable_arr = errors_arr[stable_mask_arr] # Standardize - dvalues_stable_arr /= errors_stable_arr + dvalues_stable_arr /= errors_arr # Estimate and model spatial correlations empirical_variogram, params_variogram_model, spatial_correlation_func = estimate_model_spatial_correlation( - values=dvalues_stable_arr, list_models=list_models, estimator=estimator, gsd=gsd, coords=coords, + dvalues=dvalues_stable_arr, list_models=list_models, estimator=estimator, gsd=gsd, coords=coords, subsample=subsample, subsample_method=subsample_method, n_variograms=n_variograms, n_jobs=n_jobs, verbose=verbose, random_state=random_state, bounds=bounds, p0=p0, **kwargs) From 30496720ff398a5d008885683466fba0f0263304 Mon Sep 17 00:00:00 2001 From: rhugonne Date: Sun, 21 Aug 2022 17:25:00 +0200 Subject: [PATCH 07/21] Add two basic gallery examples for heteroscedasticity and spatial correlation of errors --- examples/error_map.py | 56 ++++++++++++++++++ ...or.py => heterosc_estimation_modelling.py} | 0 examples/spatial_correlation.py | 59 +++++++++++++++++++ ...r.py => variogram_estimation_modelling.py} | 0 xdem/spatialstats.py | 11 +++- 5 files changed, 123 insertions(+), 3 deletions(-) create mode 100644 examples/error_map.py rename examples/{plot_nonstationary_error.py => heterosc_estimation_modelling.py} (100%) create mode 100644 examples/spatial_correlation.py rename examples/{plot_vgm_error.py => variogram_estimation_modelling.py} (100%) diff --git a/examples/error_map.py b/examples/error_map.py new file mode 100644 index 00000000..987d9ed1 --- /dev/null +++ b/examples/error_map.py @@ -0,0 +1,56 @@ +""" +Elevation error map +=================== + +Digital elevation models have a precision that can vary with terrain and instrument-related variables. Here, we apply +the framework of `Hugonnet et al. (2022) `_ to estimate and model this +variability in elevation error, using terrain slope and maximum curvature as explanatory variables and stable terrain +as an error proxy for moving terrain. + +**References**: `Hugonnet et al. (2022) `_. See in particular Figure 4. +Errors in elevation difference can be converted in elevation errors following Equation 7 (equal if other source of much +higher precision) or Equation 8 (divided by sqrt(2) if the two sources are of same precision). +""" +# sphinx_gallery_thumbnail_number = 1 +import xdem +import geoutils as gu + +# %% +# We load a difference of DEMs at Longyearbyen, already coregistered using :ref:`coregistration_nuthkaab` as shown in +# the :ref:`sphx_glr_auto_examples_plot_nuth_kaab.py` example. We also load the reference DEM to derive terrain +# attributes and the glacier outlines here corresponding to moving terrain. +dh = xdem.DEM(xdem.examples.get_path("longyearbyen_ddem")) +ref_dem = xdem.DEM(xdem.examples.get_path("longyearbyen_ref_dem")) +glacier_outlines = gu.Vector(xdem.examples.get_path("longyearbyen_glacier_outlines")) + +# %% +# We derive the terrain slope and maximum curvature from the reference DEM. 
+slope, maximum_curvature = xdem.terrain.get_terrain_attribute(ref_dem, attribute=['slope', 'maximum_curvature'])
+
+# %%
+# Then, we run the pipeline for inference of elevation heteroscedasticity from stable terrain:
+error_map, df_binning, error_function = \
+    xdem.spatialstats.infer_heteroscedasticy_from_stable(dvalues=dh, list_var=[slope, maximum_curvature],
+                                                         list_var_names=['slope', 'maxc'],
+                                                         unstable_mask=glacier_outlines)
+
+# %%
+# The first output corresponds to the error map for the DEM (1-sigma):
+error_map.show(vmin=2, vmax=7, cmap='Reds', cb_title='Elevation error (1$\sigma$)')
+
+# %%
+# The second output is the dataframe of 2D binning with slope and maximum curvature:
+df_binning
+
+# %%
+# The third output is the 2D binning interpolant, i.e. an error function with the slope and maximum curvature
+# (*Note: below we divide the maximum curvature by 100 to convert it to m-1*):
+print('Error for a slope of {:.0f} degrees and {:.0f} m-1 max. curvature: {:.1f} m'.format(0, 0, error_function((0, 0))))
+print('Error for a slope of {:.0f} degrees and {:.0f} m-1 max. curvature: {:.1f} m'.format(40, 0, error_function((40, 0))))
+print('Error for a slope of {:.0f} degrees and {:.2f} m-1 max. curvature: {:.1f} m'.format(0, 5/100, error_function((0, 5))))
+
+# %%
+# This pipeline will not always work optimally with default parameters: spread estimates can be affected by skewed
+# distributions, the binning by extreme range of values, some DEMs do not have any error variability with terrain (e.g.,
+# terrestrial photogrammetry). **To learn how to tune more parameters and use the subfunctions, see the gallery example:**
+# :ref:`sphx_glr_auto_examples_heterosc_estimation_modelling.py`!
\ No newline at end of file
diff --git a/examples/plot_nonstationary_error.py b/examples/heterosc_estimation_modelling.py
similarity index 100%
rename from examples/plot_nonstationary_error.py
rename to examples/heterosc_estimation_modelling.py
diff --git a/examples/spatial_correlation.py b/examples/spatial_correlation.py
new file mode 100644
index 00000000..6d21446e
--- /dev/null
+++ b/examples/spatial_correlation.py
@@ -0,0 +1,59 @@
+"""
+Spatial correlation of errors
+=============================
+
+Digital elevation models have errors that are spatially correlated due to instrument or processing effects. Here, we apply
+the framework of `Hugonnet et al. (2022) `_ to estimate and model this
+spatial correlation in elevation error, using a sum of variogram forms to model this correlation, and stable terrain
+as an error proxy for moving terrain.
+
+**References**: `Hugonnet et al. (2022) `_. See in particular Figure 5.
+"""
+# sphinx_gallery_thumbnail_number = 1
+import xdem
+import geoutils as gu
+
+# %%
+# We load a difference of DEMs at Longyearbyen, already coregistered using :ref:`coregistration_nuthkaab` as shown in
+# the :ref:`sphx_glr_auto_examples_plot_nuth_kaab.py` example. We also load the glacier outlines here corresponding to
+# moving terrain.
+dh = xdem.DEM(xdem.examples.get_path("longyearbyen_ddem"))
+glacier_outlines = gu.Vector(xdem.examples.get_path("longyearbyen_glacier_outlines"))
+
+# %%
+# Then, we run the pipeline for inference of the spatial correlation of elevation errors from stable terrain (*Note: we pass a
+# random_state argument to ensure a fixed random subsampling in this example, useful for result reproducibility*):
+df_empirical_variogram, df_model_params, spatial_corr_function = \
+    xdem.spatialstats.infer_spatial_correlation_from_stable(dvalues=dh, list_models=['Gaussian', 'Spherical'],
+                                                            unstable_mask=glacier_outlines, random_state=42)
+
+# %%
+# The first output corresponds to the dataframe of the empirical variogram, by default estimated using Dowd's estimator
+# and the circular sampling scheme of `skgstat.RasterEquidistantMetricSpace` (Fig. S13 of Hugonnet et al. (2022)):
+df_empirical_variogram
+
+# %%
+# The second output is the dataframe of optimized model parameters for a sum of Gaussian and spherical models:
+df_model_params
+
+# %%
+# The third output is the spatial correlation function with spatial lags, derived from the variogram:
+print('Errors are correlated at {:.1f}% for a {:,.0f} m spatial lag'.format(spatial_corr_function(0)*100, 0))
+print('Errors are correlated at {:.1f}% for a {:,.0f} m spatial lag'.format(spatial_corr_function(100)*100, 100))
+print('Errors are correlated at {:.1f}% for a {:,.0f} m spatial lag'.format(spatial_corr_function(1000)*100, 1000))
+print('Errors are correlated at {:.1f}% for a {:,.0f} m spatial lag'.format(spatial_corr_function(10000)*100, 10000))
+print('Errors are correlated at {:.1f}% for a {:,.0f} m spatial lag'.format(spatial_corr_function(30000)*100, 30000))
+
+# %%
+# We can plot the empirical variogram and its model on a non-linear X-axis to identify the multi-scale correlations.
+xdem.spatialstats.plot_variogram(df=df_empirical_variogram,
+                                 list_fit_fun=[xdem.spatialstats.get_variogram_model_func(df_model_params)],
+                                 xlabel='Spatial lag (m)',
+                                 ylabel='Variance of\nelevation differences (m²)',
+                                 xscale_range_split=[100, 1000])
+
+# %%
+# This pipeline will not always work optimally with default parameters: variogram sampling is more robust with a lot of
+# samples but takes long computing times, and the fitting might require multiple tries for forms and possibly bounds
+# and first guesses to help the least-squares optimization. **To learn how to tune more parameters and use the
+# subfunctions, see the gallery example:** :ref:`sphx_glr_auto_examples_variogram_estimation_modelling.py`!
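As a pointer, the `df_model_params` dataframe above already follows the `params_variogram_model` format expected by the `xdem.spatialstats.number_effective_samples` wrapper added in PATCH 02, so the fitted correlations can feed error propagation directly. A minimal sketch, assuming variogram ranges in metres and an arbitrary 1 km² disk:

# Sketch only: number of uncorrelated samples implied by the fitted model for a
# disk-shaped area of 10**6 m2 (units must match the variogram ranges)
neff_disk = xdem.spatialstats.number_effective_samples(area=10**6, params_variogram_model=df_model_params)
print('Number of effective samples in a 1 km2 disk: {:.1f}'.format(neff_disk))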
\ No newline at end of file diff --git a/examples/plot_vgm_error.py b/examples/variogram_estimation_modelling.py similarity index 100% rename from examples/plot_vgm_error.py rename to examples/variogram_estimation_modelling.py diff --git a/xdem/spatialstats.py b/xdem/spatialstats.py index 608dc27f..0f75960a 100644 --- a/xdem/spatialstats.py +++ b/xdem/spatialstats.py @@ -1333,6 +1333,8 @@ def estimate_model_spatial_correlation(dvalues: Union[np.ndarray, RasterType], l subsample=subsample, subsample_method=subsample_method, n_variograms=n_variograms, n_jobs=n_jobs, verbose=verbose, random_state=random_state, **kwargs) + # TODO: prevent this last bin with few samples to be returned by `sample_empirical_variogram` + empirical_variogram = empirical_variogram[:-1] params_variogram_model = fit_sum_model_variogram(list_models=list_models, empirical_variogram=empirical_variogram, bounds=bounds, p0=p0)[1] @@ -2059,7 +2061,7 @@ def patches_method(values: np.ndarray, gsd: float, area: float, mask: Optional[n return df_all -def plot_variogram(df: pd.DataFrame, list_fit_fun: Optional[list[Callable[[float], float]]] = None, +def plot_variogram(df: pd.DataFrame, list_fit_fun: Optional[list[Callable[[np.ndarray], np.ndarray]]] = None, list_fit_fun_label: Optional[list[str]] = None, ax: matplotlib.axes.Axes | None = None, xscale='linear', xscale_range_split: Optional[list] = None, xlabel = None, ylabel = None, xlim = None, ylim = None): @@ -2210,12 +2212,15 @@ def plot_variogram(df: pd.DataFrame, list_fit_fun: Optional[list[Callable[[float if ylim is not None: ax1.set_ylim(ylim) else: - ax1.set_ylim((0, np.nanmax(df.exp)+np.nanmean(df.err_exp))) + if np.all(np.isnan(df.err_exp)): + ax1.set_ylim((0, 1.05*np.nanmax(df.exp))) + else: + ax1.set_ylim((0, np.nanmax(df.exp)+np.nanmean(df.err_exp))) if k == int(nb_subpanels/2): ax1.set_xlabel(xlabel) if k == nb_subpanels - 1: - ax1.legend(loc='best') + ax1.legend(loc='lower right') if k == 0: ax1.set_ylabel(ylabel) else: From a82cd1c84314dc0bd295f70ca9a673c5c2d80142 Mon Sep 17 00:00:00 2001 From: rhugonne Date: Sun, 21 Aug 2022 18:19:52 +0200 Subject: [PATCH 08/21] Add spatial error propagation function and tests --- tests/test_spatialstats.py | 30 +++++++++++++++++++++ xdem/spatialstats.py | 53 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) diff --git a/tests/test_spatialstats.py b/tests/test_spatialstats.py index f2749b6e..209b944c 100644 --- a/tests/test_spatialstats.py +++ b/tests/test_spatialstats.py @@ -515,6 +515,36 @@ def test_number_effective_samples(self): xdem.spatialstats.number_effective_samples(area=outlines_brom, params_variogram_model=params_variogram_model, rasterize_resolution=(10, 10)) + def test_spatial_error_propagation(self): + """Test that the spatial error propagation wrapper function runs properly""" + + ref, diff, mask_glacier, vector_glacier = load_ref_and_diff() + + # Get the error map and variogram model with standardization + slope, maxc = xdem.terrain.get_terrain_attribute(ref, attribute=['slope', 'maximum_curvature']) + errors = xdem.spatialstats.infer_heteroscedasticy_from_stable( + dvalues=diff, list_var=[slope, maxc], unstable_mask=vector_glacier)[0] + # Standardize the differences + zscores = diff / errors + params_variogram_model = xdem.spatialstats.infer_spatial_correlation_from_stable( + dvalues=zscores, list_models=['Gaussian', 'Spherical'], unstable_mask=vector_glacier, subsample=100)[1] + + # Run the function with vector areas + areas_vector = 
[vector_glacier.ds[vector_glacier.ds['NAME']=='Brombreen'],
+                        vector_glacier.ds[vector_glacier.ds['NAME']=='Medalsbreen']]
+
+        list_stderr_vec = xdem.spatialstats.spatial_error_propagation(areas=areas_vector, errors=errors,
+                                                                      params_variogram_model=params_variogram_model)
+
+        # Run the function with numeric areas (sum needed for Medalsbreen that has two separate polygons)
+        areas_numeric = [np.sum(area_vec.area.values) for area_vec in areas_vector]
+        list_stderr = xdem.spatialstats.spatial_error_propagation(areas=areas_numeric, errors=errors,
+                                                                  params_variogram_model=params_variogram_model)
+
+        # Check that the outputs are consistent: the numeric method should give smaller neff
+        for i in range(2):
+            assert list_stderr_vec[i] > list_stderr[i]
+

 class TestSubSampling:

diff --git a/xdem/spatialstats.py b/xdem/spatialstats.py
index 0f75960a..37d5e39d 100644
--- a/xdem/spatialstats.py
+++ b/xdem/spatialstats.py
@@ -1884,6 +1884,59 @@ def number_effective_samples(area: float | int | VectorType | gpd.GeoDataFrame,

     return neff

+def spatial_error_propagation(areas: list[float | VectorType | gpd.GeoDataFrame],
+                              errors: RasterType,
+                              params_variogram_model: pd.DataFrame,
+                              **kwargs) -> list[float]:
+    """
+    Spatial propagation of elevation errors to an area using the estimated heteroscedasticity and spatial correlations.
+
+    This function is based on the `number_effective_samples` function to estimate uncorrelated samples. If given a
+    vector area, it uses Equation 18 of Hugonnet et al. (2022), https://doi.org/10.1109/jstars.2022.3188922. If given
+    a numeric area, it uses a generalization of Rolstad et al. (2009), http://dx.doi.org/10.3189/002214309789470950.
+
+    The standard error SE (1-sigma) is then computed as SE = mean(SD) / sqrt(Neff), where mean(SD) is the mean of
+    errors in the area of interest which accounts for heteroscedasticity, and Neff is the number of effective samples.
+
+
+    :param areas: List of areas of interest, each either as a numeric value of surface in the same unit as the
+        variogram ranges (will assume a circular shape), or as a vector (shapefile) of the area
+    :param errors: Errors from heteroscedasticity estimation and modelling, as an array or Raster
+    :param params_variogram_model: Dataframe of variogram models to sum with three to four columns, "model" for the model types
+        (e.g., ["spherical", "matern"]), "range" for the correlation ranges (e.g., [2, 100]), "psill" for the partial
+        sills (e.g., [0.8, 0.2]) and "smooth" for the smoothness parameter if it exists for this model (e.g.,
+        [None, 0.2]).
+    :param kwargs: Keyword argument to pass to the `neff_hugonnet_approx` function.
+ + :return: List of standard errors (1-sigma) for the input areas + """ + + standard_errors = [] + errors_arr = get_array_and_mask(errors)[0] + for area in areas: + # We estimate the number of effective samples in the area + neff = number_effective_samples(area=area, params_variogram_model=params_variogram_model, + rasterize_resolution=errors, **kwargs) + + # We compute the average error in this area + # If the area is only a value, take the average error over the entire Raster + if isinstance(area, float): + average_spread = np.nanmean(errors_arr) + else: + if isinstance(area, gpd.GeoDataFrame): + area_vector = Vector(area) + else: + area_vector = area + area_mask = area_vector.create_mask(errors).squeeze() + + average_spread = np.nanmean(errors_arr[area_mask]) + + # Compute the standard error from those two values + standard_error = average_spread / np.sqrt(neff) + standard_errors.append(standard_error) + + return standard_errors + def _std_err_finite(std: float, neff_tot: float, neff: float) -> float: """ From 3a9b5b2620702b2f7462cf03406513f4171bbec1 Mon Sep 17 00:00:00 2001 From: rhugonne Date: Sun, 21 Aug 2022 19:01:08 +0200 Subject: [PATCH 09/21] Add spatial propagation gallery example --- examples/{error_map.py => infer_heterosc.py} | 5 +- ...lation.py => infer_spatial_correlation.py} | 3 +- examples/spatial_error_propagation.py | 66 +++++++++++++++++++ tests/test_spatialstats.py | 4 +- 4 files changed, 74 insertions(+), 4 deletions(-) rename examples/{error_map.py => infer_heterosc.py} (96%) rename examples/{spatial_correlation.py => infer_spatial_correlation.py} (98%) create mode 100644 examples/spatial_error_propagation.py diff --git a/examples/error_map.py b/examples/infer_heterosc.py similarity index 96% rename from examples/error_map.py rename to examples/infer_heterosc.py index 987d9ed1..b4a07e9d 100644 --- a/examples/error_map.py +++ b/examples/infer_heterosc.py @@ -8,6 +8,7 @@ as an error proxy for moving terrain. **References**: `Hugonnet et al. (2022) `_. See in particular Figure 4. + Errors in elevation difference can be converted in elevation errors following Equation 7 (equal if other source of much higher precision) or Equation 8 (divided by sqrt(2) if the two sources are of same precision). """ @@ -29,14 +30,14 @@ # %% # Then, we run the pipeline for inference of elevation heteroscedasticity from stable terrain: -error_map, df_binning, error_function = \ +errors, df_binning, error_function = \ xdem.spatialstats.infer_heteroscedasticy_from_stable(dvalues=dh, list_var=[slope, maximum_curvature], list_var_names=['slope', 'maxc'], unstable_mask=glacier_outlines) # %% # The first output corresponds to the error map for the DEM (1-sigma): -error_map.show(vmin=2, vmax=7, cmap='Reds', cb_title='Elevation error (1$\sigma$)') +errors.show(vmin=2, vmax=7, cmap='Reds', cb_title='Elevation error (1$\sigma$)') # %% # The second output is the dataframe of 2D binning with slope and maximum curvature: diff --git a/examples/spatial_correlation.py b/examples/infer_spatial_correlation.py similarity index 98% rename from examples/spatial_correlation.py rename to examples/infer_spatial_correlation.py index 6d21446e..72f72d70 100644 --- a/examples/spatial_correlation.py +++ b/examples/infer_spatial_correlation.py @@ -7,7 +7,8 @@ spatial correlation in elevation error, using a sum of variogram forms to model this correlation, and stable terrain as an error proxy for moving terrain. -**References**: `Hugonnet et al. (2022) `_. See in particular Figure 5. +**References**: `Hugonnet et al. 
(2022) `_. See in particular Figure 5, +Equation 13–16. """ # sphinx_gallery_thumbnail_number = 1 import xdem diff --git a/examples/spatial_error_propagation.py b/examples/spatial_error_propagation.py new file mode 100644 index 00000000..64bb4b16 --- /dev/null +++ b/examples/spatial_error_propagation.py @@ -0,0 +1,66 @@ +""" +Spatial propagation of errors +============================= + +Digital elevation models have errors that are spatially correlated due to instrument or processing effects. Here, we apply +the framework of `Hugonnet et al. (2022) `_ to estimate and model this +spatial correlation in elevation error, using a sum of variogram forms to model this correlation, and stable terrain +as an error proxy for moving terrain. + +**References**: `Hugonnet et al. (2022) `_. Figure S16, Equations 17–19, +`Rolstad et al. (2009) `_, Equation 8. +""" +import numpy as np + +import xdem +import geoutils as gu +import matplotlib.pyplot as plt + +# %% +# We load the same data, and perform the same calculations on heteroscedasticity and spatial correlations of errors as +# in the :ref:`sphx_glr_auto_examples_infer_heterosc.py` and :ref:`sphx_glr_auto_examples_infer_spatial_correlation.py` +# examples. + +dh = xdem.DEM(xdem.examples.get_path("longyearbyen_ddem")) +ref_dem = xdem.DEM(xdem.examples.get_path("longyearbyen_ref_dem")) +glacier_outlines = gu.Vector(xdem.examples.get_path("longyearbyen_glacier_outlines")) +slope, maximum_curvature = xdem.terrain.get_terrain_attribute(ref_dem, attribute=['slope', 'maximum_curvature']) +errors, df_binning, error_function = \ + xdem.spatialstats.infer_heteroscedasticy_from_stable(dvalues=dh, list_var=[slope, maximum_curvature], + list_var_names=['slope', 'maxc'], + unstable_mask=glacier_outlines) + +# %% +# We use the error map to standardize the elevation differences before variogram estimation, as in Equation 12 of +# Hugonnet et al. (2022), which is more robust as it removes the variance variability due to heteroscedasticity. +zscores = dh / errors +emp_variogram, params_variogram_model, spatial_corr_function = \ + xdem.spatialstats.infer_spatial_correlation_from_stable(dvalues=zscores, list_models=['Gaussian', 'Spherical'], + unstable_mask=glacier_outlines, random_state=42) + +# %% +# With our estimated heteroscedasticity and spatial correlation, we can now perform the spatial propagation of errors. +# We select two glaciers intersecting this elevation change map in Svalbard. The best estimation of their standard error +# is done by directly providing the shapefile, which relies on Equation 18 of Hugonnet et al. (2022). + +areas = [glacier_outlines.ds[glacier_outlines.ds['NAME'] == 'Brombreen'], + glacier_outlines.ds[glacier_outlines.ds['NAME'] == 'Medalsbreen']] +stderr_glaciers = xdem.spatialstats.spatial_error_propagation(areas=areas, errors=errors, + params_variogram_model=params_variogram_model) + +# When passing a numerical area value, we compute an approximation with disk shape from Equation 8 of Rolstad et al. (2009). +# This approximation is practical to visualize changes in elevation error when averaging over different area sizes, +# but is less accurate to estimate the standard error of a certain area shape. 
+areas = 10**np.linspace(1, 12) +stderrs = xdem.spatialstats.spatial_error_propagation(areas=areas, errors=errors, + params_variogram_model=params_variogram_model) +plt.plot(areas / 10**6 , stderrs) +plt.xlabel('Averaging area (km²)') +plt.ylabel('Standard error (m)') +plt.vlines(x=[np.pi*params_variogram_model['range'].values[0]**2 / 10**6, + np.pi*params_variogram_model['range'].values[1]**2 / 10**6], + ymin=np.min(stderrs), ymax=np.max(stderrs), colors='red', linestyles='dashed', + label='Area of disk with radius the correlation range') +plt.xscale('log') +plt.legend() + diff --git a/tests/test_spatialstats.py b/tests/test_spatialstats.py index 209b944c..1bb735d7 100644 --- a/tests/test_spatialstats.py +++ b/tests/test_spatialstats.py @@ -541,9 +541,11 @@ def test_spatial_error_propagation(self): list_stderr = xdem.spatialstats.spatial_error_propagation(areas=areas_numeric, errors=errors, params_variogram_model=params_variogram_model) - # Check that the outputs are consistent: the numeric method should give smaller neff + # Check that the outputs are consistent: the numeric method should always give smaller neff, but not too far + # off (20% relative) for those two glaciers as their shape is not too different from a disk for i in range(2): assert list_stderr_vec[i] > list_stderr[i] + assert list_stderr_vec[i] == pytest.approx(list_stderr[i], rel=0.2) class TestSubSampling: From b9e6a38740c6255954aa00648d2f68a6f5977224 Mon Sep 17 00:00:00 2001 From: rhugonne Date: Mon, 22 Aug 2022 12:28:11 +0200 Subject: [PATCH 10/21] Incremental commit on updating the documentation --- ... => plot_heterosc_estimation_modelling.py} | 36 +++++++++---------- ...fer_heterosc.py => plot_infer_heterosc.py} | 12 +++---- ...n.py => plot_infer_spatial_correlation.py} | 14 ++++---- ...n.py => plot_spatial_error_propagation.py} | 22 ++++++------ ...=> plot_variogram_estimation_modelling.py} | 7 ++-- 5 files changed, 43 insertions(+), 48 deletions(-) rename examples/{heterosc_estimation_modelling.py => plot_heterosc_estimation_modelling.py} (87%) rename examples/{infer_heterosc.py => plot_infer_heterosc.py} (84%) rename examples/{infer_spatial_correlation.py => plot_infer_spatial_correlation.py} (82%) rename examples/{spatial_error_propagation.py => plot_spatial_error_propagation.py} (78%) rename examples/{variogram_estimation_modelling.py => plot_variogram_estimation_modelling.py} (98%) diff --git a/examples/heterosc_estimation_modelling.py b/examples/plot_heterosc_estimation_modelling.py similarity index 87% rename from examples/heterosc_estimation_modelling.py rename to examples/plot_heterosc_estimation_modelling.py index bc764aec..7d6f63c3 100644 --- a/examples/heterosc_estimation_modelling.py +++ b/examples/plot_heterosc_estimation_modelling.py @@ -1,23 +1,21 @@ """ -Non-stationarity of elevation measurement errors -================================================ - -Digital elevation models have a precision that can vary with terrain and instrument-related variables. However, quantifying -this precision is complex and non-stationarities, i.e. variability of the measurement error, has rarely been -accounted for, with only some studies that used arbitrary filtering thresholds on the slope or other variables (see :ref:`intro`). 
-
-Quantifying the non-stationarities in elevation measurement errors is essential to use stable terrain as a proxy for
-assessing the precision on other types of terrain (Hugonnet et al., in prep) and allows to standardize the measurement
-errors to reach a stationary variance, an assumption necessary for spatial statistics (see :ref:`spatialstats`).
-
-Here, we show an example in which we identify terrain-related non-stationarities for a DEM difference at Longyearbyen.
-We quantify those non-stationarities by `binning `_ robustly
-in N-dimension using :func:`xdem.spatialstats.nd_binning` and applying a N-dimensional interpolation
-:func:`xdem.spatialstats.interp_nd_binning` to estimate a numerical function of the measurement error and derive the spatial
-distribution of elevation measurement errors of the difference of DEMs.
-
-**Reference**: `Hugonnet et al. (2021) `_, applied to the terrain slope
-and quality of stereo-correlation (Equation 1, Extended Data Fig. 3a).
+Estimate and model elevation heteroscedasticity
+===============================================
+
+Digital elevation models have a precision that can vary with terrain and instrument-related variables. This variability
+in variance is called `heteroscedasticity `_,
+and is rarely accounted for in DEM studies (see :ref:`intro`). Quantifying elevation heteroscedasticity is essential to
+use stable terrain as an error proxy for moving terrain, and standardize data towards a stationary variance, necessary
+to apply spatial statistics (see :ref:`spatialstats`).
+
+Here, we show an advanced example in which we look for terrain-dependent explanatory variables to explain the
+heteroscedasticity for a DEM difference at Longyearbyen. We use `data binning `_
+and robust statistics in N-dimension with :func:`xdem.spatialstats.nd_binning`, apply a N-dimensional interpolation with
+:func:`xdem.spatialstats.interp_nd_binning`, and scale our interpolant function by :func:`xdem.spatialstats.two_step_standardization`
+to produce an elevation error function.
+
+**References**: `Hugonnet et al. (2021) `_, Equation 1, Extended Data Fig.
+3a and `Hugonnet et al. (2022) `_, Figs. 4 and S6–S9.
 """
 # sphinx_gallery_thumbnail_number = 8
 import matplotlib.pyplot as plt
diff --git a/examples/infer_heterosc.py b/examples/plot_infer_heterosc.py
similarity index 84%
rename from examples/infer_heterosc.py
rename to examples/plot_infer_heterosc.py
index b4a07e9d..3cbd9cd0 100644
--- a/examples/infer_heterosc.py
+++ b/examples/plot_infer_heterosc.py
@@ -2,12 +2,12 @@
 Elevation error map
 ===================

-Digital elevation models have a precision that can vary with terrain and instrument-related variables. Here, we apply
-the framework of `Hugonnet et al. (2022) `_ to estimate and model this
-variability in elevation error, using terrain slope and maximum curvature as explanatory variables and stable terrain
-as an error proxy for moving terrain.
+Digital elevation models have a precision that can vary with terrain and instrument-related variables. Here, we
+rely on a non-stationary spatial statistics framework to estimate and model this variability in elevation error,
+using terrain slope and maximum curvature as explanatory variables, with stable terrain as an error proxy for moving
+terrain.

-**References**: `Hugonnet et al. (2022) `_. See in particular Figure 4.
+**Reference**: `Hugonnet et al. (2022) `_, Figs. 4 and S6–S9.
 Errors in elevation difference can be converted in elevation errors following Equation 7 (equal if other source of much
 higher precision) or Equation 8 (divided by sqrt(2) if the two sources are of same precision).
@@ -54,4 +54,4 @@
 # This pipeline will not always work optimally with default parameters: spread estimates can be affected by skewed
 # distributions, the binning by extreme range of values, some DEMs do not have any error variability with terrain (e.g.,
 # terrestrial photogrammetry). **To learn how to tune more parameters and use the subfunctions, see the gallery example:**
-# :ref:`sphx_glr_auto_examples_heterosc_estimation_modelling.py`!
\ No newline at end of file
+# :ref:`sphx_glr_auto_examples_plot_heterosc_estimation_modelling.py`!
\ No newline at end of file
diff --git a/examples/infer_spatial_correlation.py b/examples/plot_infer_spatial_correlation.py
similarity index 82%
rename from examples/infer_spatial_correlation.py
rename to examples/plot_infer_spatial_correlation.py
index 72f72d70..b6980901 100644
--- a/examples/infer_spatial_correlation.py
+++ b/examples/plot_infer_spatial_correlation.py
@@ -2,13 +2,11 @@
 Spatial correlation of errors
 =============================

-Digital elevation models have errors that are spatially correlated due to instrument or processing effects. Here, we apply
-the framework of `Hugonnet et al. (2022) `_ to estimate and model this
-spatial correlation in elevation error, using a sum of variogram forms to model this correlation, and stable terrain
-as an error proxy for moving terrain.
+Digital elevation models have errors that are spatially correlated due to instrument or processing effects. Here, we
+rely on a non-stationary spatial statistics framework to estimate and model spatial correlations in elevation error.
+We use a sum of variogram forms to model this correlation, with stable terrain as an error proxy for moving terrain.

-**References**: `Hugonnet et al. (2022) `_. See in particular Figure 5,
-Equation 13–16.
+**Reference**: `Hugonnet et al. (2022) `_, Figure 5 and Equations 13–16.
 """
 # sphinx_gallery_thumbnail_number = 1
 import xdem
@@ -23,7 +21,7 @@

 # %%
 # Then, we run the pipeline for inference of the spatial correlation of elevation errors from stable terrain (*Note: we pass a
-# random_state argument to ensure a fixed random subsampling in this example, useful for result reproducibility*):
+# random_state argument to ensure a fixed, reproducible random subsampling in this example*):
 df_empirical_variogram, df_model_params, spatial_corr_function = \
     xdem.spatialstats.infer_spatial_correlation_from_stable(dvalues=dh, list_models=['Gaussian', 'Spherical'],
                                                             unstable_mask=glacier_outlines, random_state=42)
@@ -57,4 +55,4 @@
 # This pipeline will not always work optimally with default parameters: variogram sampling is more robust with a lot of
 # samples but takes long computing times, and the fitting might require multiple tries for forms and possibly bounds
 # and first guesses to help the least-squares optimization. **To learn how to tune more parameters and use the
-# subfunctions, see the gallery example:** :ref:`sphx_glr_auto_examples_variogram_estimation_modelling.py`!
+# subfunctions, see the gallery example:** :ref:`sphx_glr_auto_examples_plot_variogram_estimation_modelling.py`!
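Two quick sanity checks on the fitted sum of models can help before using it further: the correlation should be 1 at a zero lag, and the partial sills should sum to the correlated variance resolved by the model. A minimal sketch, assuming `numpy` is imported (this example file does not import it itself):

import numpy as np

# Sketch only: correlation is expected to be exactly 1 at a zero lag (no nugget here)
print('Correlation at a zero spatial lag: {:.2f}'.format(spatial_corr_function(0)))
# The 'psill' column holds the partial sill of each summed model, in m² for elevation differences
print('Sum of modelled partial sills: {:.2f} m²'.format(np.sum(df_model_params['psill'].values)))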
\ No newline at end of file diff --git a/examples/spatial_error_propagation.py b/examples/plot_spatial_error_propagation.py similarity index 78% rename from examples/spatial_error_propagation.py rename to examples/plot_spatial_error_propagation.py index 64bb4b16..c846dc69 100644 --- a/examples/spatial_error_propagation.py +++ b/examples/plot_spatial_error_propagation.py @@ -1,15 +1,16 @@ """ -Spatial propagation of errors -============================= +Spatial propagation of elevation errors +======================================= -Digital elevation models have errors that are spatially correlated due to instrument or processing effects. Here, we apply -the framework of `Hugonnet et al. (2022) `_ to estimate and model this -spatial correlation in elevation error, using a sum of variogram forms to model this correlation, and stable terrain -as an error proxy for moving terrain. +Propagating elevation errors spatially accounting for heteroscedasticity and spatial correlation is complex. It +requires computing the pairwise correlations between all points of an area of interest (be it for a sum, mean, or +other operation), which is computationally intensive. Here, we rely on published formulations to perform +computationally-efficient spatial propagation for the mean of elevation (or elevation differences) in an area. **References**: `Hugonnet et al. (2022) `_. Figure S16, Equations 17–19, `Rolstad et al. (2009) `_, Equation 8. """ +# sphinx_gallery_thumbnail_number = 1 import numpy as np import xdem @@ -31,7 +32,7 @@ unstable_mask=glacier_outlines) # %% -# We use the error map to standardize the elevation differences before variogram estimation, as in Equation 12 of +# We use the error map to standardize the elevation differences before variogram estimation, following Equation 12 of # Hugonnet et al. (2022), which is more robust as it removes the variance variability due to heteroscedasticity. zscores = dh / errors emp_variogram, params_variogram_model, spatial_corr_function = \ @@ -42,15 +43,14 @@ # With our estimated heteroscedasticity and spatial correlation, we can now perform the spatial propagation of errors. # We select two glaciers intersecting this elevation change map in Svalbard. The best estimation of their standard error # is done by directly providing the shapefile, which relies on Equation 18 of Hugonnet et al. (2022). - areas = [glacier_outlines.ds[glacier_outlines.ds['NAME'] == 'Brombreen'], glacier_outlines.ds[glacier_outlines.ds['NAME'] == 'Medalsbreen']] stderr_glaciers = xdem.spatialstats.spatial_error_propagation(areas=areas, errors=errors, params_variogram_model=params_variogram_model) -# When passing a numerical area value, we compute an approximation with disk shape from Equation 8 of Rolstad et al. (2009). -# This approximation is practical to visualize changes in elevation error when averaging over different area sizes, -# but is less accurate to estimate the standard error of a certain area shape. +# When passing a numerical area value, we compute an approximation with disk shape from Equation 8 of Rolstad et al. +# (2009). This approximation is practical to visualize changes in elevation error when averaging over different area +# sizes, but is less accurate to estimate the standard error of a certain area shape. 
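The numeric-area branch can also be reproduced by hand from the pieces above, as the mean error over the whole raster divided by the square root of the number of effective samples, following the `spatial_error_propagation` docstring of PATCH 08. A minimal sketch, assuming an arbitrary 1 km² disk, before the plot below:

# Manual cross-check of SE = mean(SD) / sqrt(Neff) for a single numeric area (sketch only)
errors_arr = gu.spatial_tools.get_array_and_mask(errors)[0]
neff_check = xdem.spatialstats.number_effective_samples(area=10**6, params_variogram_model=params_variogram_model)
print('Standard error for a 1 km2 disk: {:.3f} m'.format(np.nanmean(errors_arr) / np.sqrt(neff_check)))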
 areas = 10**np.linspace(1, 12)
 stderrs = xdem.spatialstats.spatial_error_propagation(areas=areas, errors=errors,
                                                       params_variogram_model=params_variogram_model)
 plt.plot(areas / 10**6 , stderrs)
 plt.xlabel('Averaging area (km²)')
 plt.ylabel('Standard error (m)')
 plt.vlines(x=[np.pi*params_variogram_model['range'].values[0]**2 / 10**6,
               np.pi*params_variogram_model['range'].values[1]**2 / 10**6],
            ymin=np.min(stderrs), ymax=np.max(stderrs), colors='red', linestyles='dashed',
            label='Area of disk with radius the correlation range')
 plt.xscale('log')
 plt.legend()

diff --git a/examples/variogram_estimation_modelling.py b/examples/plot_variogram_estimation_modelling.py
similarity index 98%
rename from examples/variogram_estimation_modelling.py
rename to examples/plot_variogram_estimation_modelling.py
index 44569cda..90047561 100644
--- a/examples/variogram_estimation_modelling.py
+++ b/examples/plot_variogram_estimation_modelling.py
@@ -86,8 +86,7 @@
 # large grid data in `scikit-gstat `_, which are encapsulated
 # conveniently by :func:`xdem.spatialstats.sample_empirical_variogram`:

-df = xdem.spatialstats.sample_empirical_variogram(
-    values=dh.data, gsd=dh.res[0], subsample=300, n_variograms=10, random_state=42)
+df = xdem.spatialstats.sample_empirical_variogram(values=dh.data, gsd=dh.res[0], subsample=100, n_variograms=10, random_state=42)

 # %%
 # *Note: in this example, we add a* ``random_state`` *argument to yield a reproducible random sampling of pixels within
@@ -169,7 +168,7 @@
 for area_emp in areas_emp:

     # First, sample intensively circular patches of a given area, and derive the mean elevation differences
-    df_patches = xdem.spatialstats.patches_method(dh.data.data, gsd=dh.res[0], area=area_emp, n_patches=200, random_state=42)
+    df_patches = xdem.spatialstats.patches_method(dh.data.data, gsd=dh.res[0], area=area_emp, n_patches=100, random_state=42)
     # Second, estimate the dispersion of the means of each patch, i.e. the standard error of the mean
     stderr_empirical = np.nanstd(df_patches['nanmedian'].values)
     list_stderr_empirical.append(stderr_empirical)
@@ -219,7 +218,7 @@
 for area in areas:

     # For a double-range model
-    neff_doublerange = xdem.spatialstats.neff_circular_approx_numerical(area = area, params_variogram_model = params_vgm2)
+    neff_doublerange = xdem.spatialstats.neff_circular_approx_numerical(area=area, params_variogram_model=params_vgm2)

     # About 5% of the variance might be fully correlated, the other 95% has the random part that we quantified
     stderr_fullycorr = np.sqrt(0.05*np.nanvar(dh.data))

From 198026d06549bd7e8545ab1100890eab688e20f7 Mon Sep 17 00:00:00 2001
From: rhugonne
Date: Mon, 22 Aug 2022 13:24:40 +0200
Subject: [PATCH 11/21] Move nd column in nd_binning output to first position

---
 xdem/spatialstats.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/xdem/spatialstats.py b/xdem/spatialstats.py
index 37d5e39d..717c8fa9 100644
--- a/xdem/spatialstats.py
+++ b/xdem/spatialstats.py
@@ -98,15 +98,15 @@ def nd_binning(values: np.ndarray, list_var: Iterable[np.ndarray], list_var_name
     list_df_1d = []
     for i, var in enumerate(list_var):
         df_stats_1d = pd.DataFrame()
-        # get statistics
+        # Get statistics
         for j, statistic in enumerate(statistics):
            stats_binned_1d, bedges_1d = binned_statistic(var,values,statistic=statistic,bins=list_var_bins[i],range=list_ranges)[:2]
-           # save in a dataframe
+           # Save in a dataframe
            df_stats_1d[statistics_name[j]] = stats_binned_1d
-           # we need to get the middle of the bins from the edges, to get the same dimension length
+           # We need to get the middle of the bins from the edges, to get the same dimension length
            df_stats_1d[list_var_names[i]] = pd.IntervalIndex.from_breaks(bedges_1d,closed='left')
-           # report number of dimensions used
-           df_stats_1d['nd'] = 1
+           # Report number of dimensions used
+           df_stats_1d.insert(0, 'nd', 1)

         list_df_1d.append(df_stats_1d)

@@ -116,22 +116,22 @@ def nd_binning(values: 
np.ndarray, list_var: Iterable[np.ndarray], list_var_name combs = list(itertools.combinations(list_var_names, 2)) for i, comb in enumerate(combs): var1_name, var2_name = comb - # corresponding variables indexes + # Corresponding variables indexes i1, i2 = list_var_names.index(var1_name), list_var_names.index(var2_name) df_stats_2d = pd.DataFrame() for j, statistic in enumerate(statistics): stats_binned_2d, bedges_var1, bedges_var2 = binned_statistic_2d(list_var[i1],list_var[i2],values,statistic=statistic ,bins=[list_var_bins[i1],list_var_bins[i2]] ,range=list_ranges)[:3] - # get statistics + # Get statistics df_stats_2d[statistics_name[j]] = stats_binned_2d.flatten() - # derive interval indexes and convert bins into 2d indexes + # Derive interval indexes and convert bins into 2d indexes ii1 = pd.IntervalIndex.from_breaks(bedges_var1,closed='left') ii2 = pd.IntervalIndex.from_breaks(bedges_var2,closed='left') df_stats_2d[var1_name] = [i1 for i1 in ii1 for i2 in ii2] df_stats_2d[var2_name] = [i2 for i1 in ii1 for i2 in ii2] - # report number of dimensions used - df_stats_2d['nd'] = 2 + # Report number of dimensions used + df_stats_2d.insert(0, 'nd', 2) list_df_2d.append(df_stats_2d) @@ -153,7 +153,7 @@ def nd_binning(values: np.ndarray, list_var: Iterable[np.ndarray], list_var_name df_stats_nd[var_name] = iind[i].flatten() # Report number of dimensions used - df_stats_nd['nd'] = len(list_var_names) + df_stats_nd.insert(0, 'nd', len(list_var_names)) # Concatenate everything list_all_dfs = list_df_1d + list_df_2d + [df_stats_nd] From dbc3d309b4faf03453c3b8cb239721c646ffd26b Mon Sep 17 00:00:00 2001 From: rhugonne Date: Mon, 22 Aug 2022 14:04:54 +0200 Subject: [PATCH 12/21] Corrections and update of previous non-stationarity example --- .../plot_heterosc_estimation_modelling.py | 110 +++++++++++------- examples/plot_infer_heterosc.py | 11 +- examples/plot_infer_spatial_correlation.py | 8 +- .../plot_variogram_estimation_modelling.py | 4 +- 4 files changed, 77 insertions(+), 56 deletions(-) diff --git a/examples/plot_heterosc_estimation_modelling.py b/examples/plot_heterosc_estimation_modelling.py index 7d6f63c3..ccd47cb5 100644 --- a/examples/plot_heterosc_estimation_modelling.py +++ b/examples/plot_heterosc_estimation_modelling.py @@ -11,11 +11,15 @@ Here, we show an advanced example in which we look for terrain-dependent explanatory variables to explain the heteroscedasticity for a DEM difference at Longyearbyen. We use `data binning `_ and robust statistics in N-dimension with :func:`xdem.spatialstats.nd_binning`, apply a N-dimensional interpolation with -:func:`xdem.spatialstats.interp_nd_binning`, and scale our interpolant function by :func:`xdem.spatialstats.two_step_standardization` -to produce an elevation error function. +:func:`xdem.spatialstats.interp_nd_binning`, and scale our interpolant function with a two-step standardization + :func:`xdem.spatialstats.two_step_standardization` to produce the final elevation error function. **References**: `Hugonnet et al. (2021) `_, Equation 1, Extended Data Fig. 3a and `Hugonnet et al. (2022) `_, Figs. 4 and S6–S9. + +Errors in elevation difference can be converted in elevation errors following Equation 7 (equal if other source of much +higher precision) or Equation 8 (divided by sqrt(2) if the two sources are of same precision). Below we consider errors +in elevation differences. 
""" # sphinx_gallery_thumbnail_number = 8 import matplotlib.pyplot as plt @@ -24,10 +28,8 @@ import geoutils as gu # %% -# We start by loading example files including a difference of DEMs at Longyearbyen, the reference DEM later used to derive -# several terrain attributes, and the outlines to rasterize a glacier mask. -# Prior to differencing, the DEMs were aligned using :ref:`coregistration_nuthkaab` as shown in -# the :ref:`sphx_glr_auto_examples_plot_nuth_kaab.py` example. We later refer to those elevation differences as *dh*. +# Here, we detail the steps used by ``xdem.spatialstats.infer_heteroscedasticity_from_stable`` exemplified in +# :ref:`sphx_glr_auto_examples_plot_infer_heterosc.py`. First, we load example files and create a glacier mask. ref_dem = xdem.DEM(xdem.examples.get_path("longyearbyen_ref_dem")) dh = xdem.DEM(xdem.examples.get_path("longyearbyen_ddem")) @@ -35,17 +37,15 @@ mask_glacier = glacier_outlines.create_mask(dh) # %% -# We use the reference DEM to derive terrain variables such as slope, aspect, curvature (see :ref:`sphx_glr_auto_examples_plot_terrain_attributes.py`) -# that we'll use to explore potential non-stationarities in elevation measurement error - -# We compute the slope, aspect, and both plan and profile curvatures: +# We derive terrain attributes from the reference DEM (see :ref:`sphx_glr_auto_examples_plot_terrain_attributes.py`), +# which we will use to explore the variability in elevation error. slope, aspect, planc, profc = \ xdem.terrain.get_terrain_attribute(dem=ref_dem.data, attribute=['slope','aspect', 'planform_curvature', 'profile_curvature'], resolution=ref_dem.res) # %% -# We remove values on unstable terrain +# We keep only stable terrain for the analysis of variability dh_arr = dh.data[~mask_glacier] slope_arr = slope[~mask_glacier] aspect_arr = aspect[~mask_glacier] @@ -77,21 +77,19 @@ # using :func:`xdem.spatialstats.plot_1d_binning`. # We can start with the slope that has been long known to be related to the elevation measurement error (e.g., # `Toutin (2002) `_). -xdem.spatialstats.plot_1d_binning(df, 'slope', 'nmad', 'Slope (degrees)', 'NMAD of dh (m)') +xdem.spatialstats.plot_1d_binning(df, var_name='slope', statistic_name='nmad', + label_var='Slope (degrees)', label_statistic='NMAD of dh (m)') # %% # We identify a clear variability, with the dispersion estimated from the NMAD increasing from ~2 meters for nearly flat # slopes to above 12 meters for slopes steeper than 50°. -# In statistical terms, such a variability of `variance `_ is referred as -# `heteroscedasticity `_. Here we observe heteroscedastic elevation -# differences due to a non-stationarity of variance with the terrain slope. # # What about the aspect? xdem.spatialstats.plot_1d_binning(df, 'aspect', 'nmad', 'Aspect (degrees)', 'NMAD of dh (m)') # %% -# There is no variability with the aspect which shows a dispersion averaging 2-3 meters, i.e. that of the complete sample. +# There is no variability with the aspect that shows a dispersion averaging 2-3 meters, i.e. that of the complete sample. # # What about the plan curvature? @@ -101,40 +99,40 @@ # The relation with the plan curvature remains ambiguous. # We should better define our bins to avoid sampling bins with too many or too few samples. For this, we can partition # the data in quantiles in :func:`xdem.spatialstats.nd_binning`. -# Note: we need a higher number of bins to work with quantiles and still resolve the edges of the distribution. 
Thus, as -# with many dimensions the N dimensional bin size increases exponentially, we avoid binning all variables at the same -# time and instead bin one at a time. +# *Note: we need a higher number of bins to work with quantiles and still resolve the edges of the distribution. As +# with many dimensions the ND bin size increases exponentially, we avoid binning all variables at the same +# time and instead bin one at a time.* # We define 1000 quantile bins of size 0.001 (equivalent to 0.1% percentile bins) for the profile curvature: df = xdem.spatialstats.nd_binning(values=dh_arr, list_var=[profc_arr], list_var_names=['profc'], statistics=['count', np.nanmedian, xdem.spatialstats.nmad], - list_var_bins=[np.nanquantile(profc_arr,np.linspace(0,1,1000))]) + list_var_bins=[np.nanquantile(profc_arr, np.linspace(0,1,1000))]) xdem.spatialstats.plot_1d_binning(df, 'profc', 'nmad', 'Profile curvature (100 m$^{-1}$)', 'NMAD of dh (m)') # %% -# We now clearly identify the variability with the profile curvature, from 2 meters for low curvatures to above 4 meters +# We clearly identify a variability with the profile curvature, from 2 meters for low curvatures to above 4 meters # for higher positive or negative curvature. +# # What about the role of the plan curvature? df = xdem.spatialstats.nd_binning(values=dh_arr, list_var=[planc_arr], list_var_names=['planc'], statistics=['count', np.nanmedian, xdem.spatialstats.nmad], - list_var_bins=[np.nanquantile(planc_arr,np.linspace(0,1,1000))]) + list_var_bins=[np.nanquantile(planc_arr, np.linspace(0,1,1000))]) xdem.spatialstats.plot_1d_binning(df, 'planc', 'nmad', 'Planform curvature (100 m$^{-1}$)', 'NMAD of dh (m)') # %% # The plan curvature shows a similar relation. Those are symmetrical with 0, and almost equal for both types of curvature. # To simplify the analysis, we here combine those curvatures into the maximum absolute curvature: -# Derive maximum absolute curvature maxc_arr = np.maximum(np.abs(planc_arr),np.abs(profc_arr)) df = xdem.spatialstats.nd_binning(values=dh_arr, list_var=[maxc_arr], list_var_names=['maxc'], statistics=['count', np.nanmedian, xdem.spatialstats.nmad], - list_var_bins=[np.nanquantile(maxc_arr,np.linspace(0,1,1000))]) + list_var_bins=[np.nanquantile(maxc_arr, np.linspace(0,1,1000))]) xdem.spatialstats.plot_1d_binning(df, 'maxc', 'nmad', 'Maximum absolute curvature (100 m$^{-1}$)', 'NMAD of dh (m)') # %% # Here's our simplified relation! We now have both slope and maximum absolute curvature with clear variability of -# the elevation measurement error. +# the elevation error. 
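The next paragraph asks whether the slope and curvature dependencies are one and the same; a quick, hypothetical way to gauge their coupling beforehand is a rank correlation. A minimal sketch, assuming the NaN-filled arrays defined above and SciPy (an existing xdem dependency):

from scipy.stats import spearmanr

# Rank correlation between the two explanatory variables on stable terrain (sketch only)
valid = np.isfinite(slope_arr) & np.isfinite(maxc_arr)
rho = spearmanr(slope_arr[valid], maxc_arr[valid])[0]
print('Spearman correlation between slope and maximum curvature: {:.2f}'.format(rho))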
# # **But, one might wonder: high curvatures might occur more often around steep slopes than flat slope, # so what if those two dependencies are actually one and the same?** @@ -145,7 +143,10 @@ statistics=['count', np.nanmedian, xdem.spatialstats.nmad], list_var_bins=30) -xdem.spatialstats.plot_2d_binning(df, 'slope', 'maxc', 'nmad', 'Slope (degrees)', 'Maximum absolute curvature (100 m$^{-1}$)', 'NMAD of dh (m)') +xdem.spatialstats.plot_2d_binning(df, var_name_1='slope', var_name_2='maxc', statistic_name='nmad', + label_var_name_1='Slope (degrees)', + label_var_name_2='Maximum absolute curvature (100 m$^{-1}$)', + label_statistic='NMAD of dh (m)') # %% # We can see that part of the variability seems to be independent, but with the uniform bins it is hard to tell much @@ -153,18 +154,19 @@ # # If we use custom quantiles for both binning variables, and adjust the plot scale: -custom_bin_slope = np.unique(np.concatenate([np.nanquantile(slope_arr,np.linspace(0,0.95,20)), - np.nanquantile(slope_arr,np.linspace(0.96,0.99,5)), - np.nanquantile(slope_arr,np.linspace(0.991,1,10))])) +custom_bin_slope = np.unique(np.concatenate([np.nanquantile(slope_arr, np.linspace(0,0.95,20)), + np.nanquantile(slope_arr, np.linspace(0.96,0.99,5)), + np.nanquantile(slope_arr, np.linspace(0.991,1,10))])) -custom_bin_curvature = np.unique(np.concatenate([np.nanquantile(maxc_arr,np.linspace(0,0.95,20)), - np.nanquantile(maxc_arr,np.linspace(0.96,0.99,5)), - np.nanquantile(maxc_arr,np.linspace(0.991,1,10))])) +custom_bin_curvature = np.unique(np.concatenate([np.nanquantile(maxc_arr, np.linspace(0,0.95,20)), + np.nanquantile(maxc_arr, np.linspace(0.96,0.99,5)), + np.nanquantile(maxc_arr, np.linspace(0.991,1,10))])) df = xdem.spatialstats.nd_binning(values=dh_arr, list_var=[slope_arr, maxc_arr], list_var_names=['slope', 'maxc'], statistics=['count', np.nanmedian, xdem.spatialstats.nmad], list_var_bins=[custom_bin_slope,custom_bin_curvature]) -xdem.spatialstats.plot_2d_binning(df, 'slope', 'maxc', 'nmad', 'Slope (degrees)', 'Maximum absolute curvature (100 m$^{-1}$)', 'NMAD of dh (m)', scale_var_2='log', vmin=2, vmax=10) +xdem.spatialstats.plot_2d_binning(df, 'slope', 'maxc', 'nmad', 'Slope (degrees)', + 'Maximum absolute curvature (100 m$^{-1}$)', 'NMAD of dh (m)', scale_var_2='log', vmin=2, vmax=10) # %% @@ -180,22 +182,35 @@ # approximation i.e. a piecewise linear interpolation/extrapolation based on the binning results. # To ensure that only robust statistic values are used in the interpolation, we set a ``min_count`` value at 30 samples. -slope_curv_to_dh_err = xdem.spatialstats.interp_nd_binning(df, list_var_names=['slope', 'maxc'], statistic='nmad', min_count=30) +unscaled_dh_err_fun = xdem.spatialstats.interp_nd_binning(df, list_var_names=['slope', 'maxc'], + statistic='nmad', min_count=30) # %% -# The output is an interpolant function of slope and curvature that we can use to estimate the elevation measurement -# error at any point. +# The output is an interpolant function of slope and curvature that predicts the elevation error at any point. 
However,
+# this predicted error might have a spread slightly off from that of the data:
 #
-# For instance:
+# We compare the spread of the elevation difference on stable terrain and the average predicted error:
+dh_err_stable = unscaled_dh_err_fun((slope_arr, maxc_arr))
+
+print('The spread of elevation difference is {:.2f} '
+      'compared to a mean predicted elevation error of {:.2f}.'.format(xdem.spatialstats.nmad(dh_arr),
+                                                                       np.nanmean(dh_err_stable)))
+
+# %%
+# Thus, we rescale the function to exactly match the spread on stable terrain using the
+# ``xdem.spatialstats.two_step_standardization`` function, and get our final error function.
+
+dh_err_fun = xdem.spatialstats.two_step_standardization(dh_arr, list_var=[slope_arr, maxc_arr],
+                                                        unscaled_error_fun=unscaled_dh_err_fun)
 
-for s, c in [(0.,0.1), (50.,0.1), (0.,20.), (50.,20.)]:
+for s, c in [(0., 0.1), (50., 0.1), (0., 20.), (50., 20.)]:
     print('Elevation measurement error for slope of {0:.0f} degrees, '
-          'curvature of {1:.2f} m-1: {2:.1f}'.format(s, c/100, slope_curv_to_dh_err((s,c)))+ ' meters.')
+          'curvature of {1:.2f} m-1: {2:.1f}'.format(s, c/100, dh_err_fun((s, c)))+ ' meters.')
 
 # %%
-# The same function can be used to estimate the spatial distribution of the elevation measurement error over the area:
+# This function can be used to estimate the spatial distribution of the elevation error on the extent of our DEMs:
 maxc = np.maximum(np.abs(profc), np.abs(planc))
-dh_err = slope_curv_to_dh_err((slope, maxc))
+errors = dh_err_fun((slope, maxc))
 
 plt.figure(figsize=(8, 5))
 plt_extent = [
@@ -204,7 +219,14 @@
     ref_dem.bounds.bottom,
     ref_dem.bounds.top,
 ]
-plt.imshow(dh_err.squeeze(), cmap="Reds", vmin=2, vmax=8, extent=plt_extent)
+plt.imshow(errors.squeeze(), cmap="Reds", vmin=2, vmax=8, extent=plt_extent)
 cbar = plt.colorbar()
-cbar.set_label('Elevation measurement error (m)')
-plt.show()
\ No newline at end of file
+cbar.set_label('Elevation error ($1\sigma$, m)')
+plt.show()
+
+# %%
+# These three steps can be done in one go using the ``xdem.spatialstats.estimate_model_heteroscedasticity`` function, which
+# wraps those three functions, expecting ``np.ndarray`` inputs subset to stable terrain.
+
+df, dh_err_fun = xdem.spatialstats.estimate_model_heteroscedasticity(dvalues=dh_arr, list_var=[slope_arr, maxc_arr],
+                                                                     list_var_names=['slope', 'maxc'])
diff --git a/examples/plot_infer_heterosc.py b/examples/plot_infer_heterosc.py
index 3cbd9cd0..8f920c95 100644
--- a/examples/plot_infer_heterosc.py
+++ b/examples/plot_infer_heterosc.py
@@ -10,7 +10,8 @@
 **Reference**: `Hugonnet et al. (2022) `_, Figs. 4 and S6–S9. Errors in
 elevation difference can be converted into elevation errors following Equation 7 (equal if other source of much
-higher precision) or Equation 8 (divided by sqrt(2) if the two sources are of same precision).
+higher precision) or Equation 8 (divided by sqrt(2) if the two sources are of the same precision). Below we consider errors
+in elevation differences.
 """
 # sphinx_gallery_thumbnail_number = 1
 import xdem
@@ -37,7 +38,7 @@
 
 # %%
 # The first output corresponds to the error map for the DEM (1-sigma):
-errors.show(vmin=2, vmax=7, cmap='Reds', cb_title='Elevation error (1$\sigma$)')
+errors.show(vmin=2, vmax=7, cmap='Reds', cb_title='Elevation error (1$\sigma$, m)')
 
 # %%
 # The second output is the dataframe of 2D binning with slope and maximum curvature:
 
 # %%
 # The third output is the 2D binning interpolant, i.e. 
an error function with the slope and maximum curvature # (*Note: below we multiply the maximum curvature by 100 to convert it in m-1*): -print('Error for a slope of {:.0f} degrees and {:.0f} m-1 max. curvature: {:.1f} m'.format(0, 0, error_function((0, 0)))) -print('Error for a slope of {:.0f} degrees and {:.0f} m-1 max. curvature: {:.1f} m'.format(40, 0, error_function((40, 0)))) -print('Error for a slope of {:.0f} degrees and {:.0f} m-1 max. curvature: {:.1f} m'.format(0, 100*5, error_function((0, 5)))) +for slope, maxc in [(0, 0), (40, 0), (0, 5), (40, 5)]: + print('Error for a slope of {:.0f} degrees and' + ' {:.0f} m-1 max. curvature: {:.1f} m'.format(slope, maxc * 100., error_function((slope, maxc)))) # %% # This pipeline will not always work optimally with default parameters: spread estimates can be affected by skewed diff --git a/examples/plot_infer_spatial_correlation.py b/examples/plot_infer_spatial_correlation.py index b6980901..0f0d6c71 100644 --- a/examples/plot_infer_spatial_correlation.py +++ b/examples/plot_infer_spatial_correlation.py @@ -37,11 +37,9 @@ # %% # The third output is the spatial correlation function with spatial lags, derived from the variogram: -print('Errors are correlated at {:.1f}% for a {:,.0f} m spatial lag'.format(spatial_corr_function(0)*100, 0)) -print('Errors are correlated at {:.1f}% for a {:,.0f} m spatial lag'.format(spatial_corr_function(100)*100, 100)) -print('Errors are correlated at {:.1f}% for a {:,.0f} m spatial lag'.format(spatial_corr_function(1000)*100, 1000)) -print('Errors are correlated at {:.1f}% for a {:,.0f} m spatial lag'.format(spatial_corr_function(10000)*100, 10000)) -print('Errors are correlated at {:.1f}% for a {:,.0f} m spatial lag'.format(spatial_corr_function(30000)*100, 30000)) +for spatial_lag in [0, 100, 1000, 10000, 30000]: + print('Errors are correlated at {:.1f}% for a {:,.0f} m spatial lag'. + format(spatial_corr_function(spatial_lag)*100, spatial_lag)) # %% # We can plot the empirical variogram and its model on a non-linear X-axis to identify the multi-scale correlations. diff --git a/examples/plot_variogram_estimation_modelling.py b/examples/plot_variogram_estimation_modelling.py index 90047561..09e6ab48 100644 --- a/examples/plot_variogram_estimation_modelling.py +++ b/examples/plot_variogram_estimation_modelling.py @@ -1,6 +1,6 @@ """ -Spatial correlation of elevation measurement errors -=================================================== +Estimate and model spatial variograms +===================================== Digital elevation models have elevation measurement errors that can vary with terrain or instrument-related variables (see :ref:`sphx_glr_auto_examples_plot_nonstationary_error.py`), but those measurement errors are also often From 5ee882c0d7f10ac706fcaa9ca2b4a88884742915 Mon Sep 17 00:00:00 2001 From: rhugonne Date: Mon, 22 Aug 2022 14:43:36 +0200 Subject: [PATCH 13/21] Update spatial correlation advanced example --- examples/plot_icp_coregistration.py | 4 +- examples/plot_nuth_kaab.py | 4 +- .../plot_variogram_estimation_modelling.py | 59 ++++++++----------- 3 files changed, 27 insertions(+), 40 deletions(-) diff --git a/examples/plot_icp_coregistration.py b/examples/plot_icp_coregistration.py index 0d8b4caf..c1d3495b 100644 --- a/examples/plot_icp_coregistration.py +++ b/examples/plot_icp_coregistration.py @@ -1,6 +1,6 @@ """ -Iterative Closest Point (ICP) coregistration. 
-=============================================
+Iterative Closest Point coregistration
+======================================
 
 Some DEMs may for one or more reasons be erroneously rotated in the X, Y or Z directions.
 Established coregistration approaches like :ref:`coregistration_nuthkaab` work great for X, Y and Z *translations*, but rotation is not accounted for at all.
diff --git a/examples/plot_nuth_kaab.py b/examples/plot_nuth_kaab.py
index edfa4fc6..4de9a509 100644
--- a/examples/plot_nuth_kaab.py
+++ b/examples/plot_nuth_kaab.py
@@ -1,6 +1,6 @@
 """
-Nuth & Kääb (2011) coregistration
-=================================
+Nuth and Kääb coregistration
+============================
 
 Nuth and Kääb (`2011 `_) coregistration allows for horizontal and vertical
 shifts to be estimated and corrected for. In ``xdem``, this approach is implemented through the :class:`xdem.coreg.NuthKaab` class.
diff --git a/examples/plot_variogram_estimation_modelling.py b/examples/plot_variogram_estimation_modelling.py
index 09e6ab48..542549b4 100644
--- a/examples/plot_variogram_estimation_modelling.py
+++ b/examples/plot_variogram_estimation_modelling.py
@@ -2,29 +2,21 @@
 Estimate and model spatial variograms
 =====================================
 
-Digital elevation models have elevation measurement errors that can vary with terrain or instrument-related variables
-(see :ref:`sphx_glr_auto_examples_plot_nonstationary_error.py`), but those measurement errors are also often
-`correlated in space `_.
-While many DEM studies have been using short-range `variogram `_ to
+Digital elevation models have errors that are often `correlated in space `_.
+While many DEM studies have used solely short-range `variograms `_ to
 estimate the correlation of elevation measurement errors (e.g., `Howat et al. (2008) `_
 , `Wang and Kääb (2015) `_), recent studies show that variograms of multiple ranges
-provide larger, more reliable estimates of spatial correlation for DEMs (e.g., `Dehecq et al. (2020) `_
-, `Hugonnet et al. (2021) `_).
-
-Quantifying the spatial correlation in elevation measurement errors is essential to integrate measurement errors over
-an area of interest (e.g, to estimate the error of a mean or sum of samples). Once the spatial correlations are quantified,
-several methods exist to derive the related measurement error integrated in space (`Rolstad et al. (2009) `_
-, Hugonnet et al. (in prep)). More details are available in :ref:`spatialstats`.
-
-Here, we show an example in which we estimate spatially integrated elevation measurement errors for a DEM difference at
-Longyearbyen, demonstrated in :ref:`sphx_glr_auto_examples_plot_nuth_kaab.py`. We first quantify the spatial
-correlations using :func:`xdem.spatialstats.sample_empirical_variogram` based on routines of `scikit-gstat
-`_. We then model the empirical variogram using a sum of variogram
-models using :func:`xdem.spatialstats.fit_sum_model_variogram`.
-Finally, we integrate the variogram models for varying surface areas to estimate the spatially integrated elevation
-measurement errors using :func:`xdem.spatialstats.neff_circ`, and empirically validate the improved robustness of
-our results using :func:`xdem.spatialstats.patches_method`, an intensive Monte-Carlo sampling approach.
+provide larger, more reliable estimates of spatial correlation for DEMs.
+Here, we show an example in which we estimate the spatial correlation for a DEM difference at Longyearbyen, and its
+impact on the standard error with averaging area. 
We first estimate an empirical variogram with
+:func:`xdem.spatialstats.sample_empirical_variogram` based on routines of `scikit-gstat
+`_. We then fit the empirical variogram with a sum of variogram
+models using :func:`xdem.spatialstats.fit_sum_model_variogram`. Finally, we perform spatial propagation for a range of
+averaging areas using :func:`xdem.spatialstats.number_effective_samples`, and empirically validate the improved
+robustness of our results using :func:`xdem.spatialstats.patches_method`, an intensive Monte-Carlo sampling approach.
+
+**Reference:** `Hugonnet et al. (2022) `_, Figure 5 and Equations 13–16.
 """
 # sphinx_gallery_thumbnail_number = 6
 import matplotlib.pyplot as plt
@@ -33,23 +25,19 @@
 import geoutils as gu
 
 # %%
-# We start by loading example files including a difference of DEMs at Longyearbyen and the outlines to rasterize
-# a glacier mask.
-# Prior to differencing, the DEMs were aligned using :ref:`coregistration_nuthkaab` as shown in
-# the :ref:`sphx_glr_auto_examples_plot_nuth_kaab.py` example. We later refer to those elevation differences as *dh*.
+# We load example files.
 
 dh = xdem.DEM(xdem.examples.get_path("longyearbyen_ddem"))
 glacier_outlines = gu.Vector(xdem.examples.get_path("longyearbyen_glacier_outlines"))
 mask_glacier = glacier_outlines.create_mask(dh)
 
 # %%
-# We remove values on glacier terrain in order to isolate stable terrain, our proxy for elevation measurement errors.
+# We exclude values on glacier terrain in order to isolate stable terrain, our proxy for elevation errors.
 
 dh.data[mask_glacier] = np.nan
 
 # %%
-# We estimate the average per-pixel elevation measurement error on stable terrain, using both the standard deviation
-# and normalized median absolute deviation. For this example, we do not account for the non-stationarity in elevation
-# measurement errors quantified in :ref:`sphx_glr_auto_examples_plot_nonstationary_error.py`.
+# We estimate the average per-pixel elevation error on stable terrain, using both the standard deviation
+# and normalized median absolute deviation. For this example, we do not account for elevation heteroscedasticity.
 
 print('STD: {:.2f} meters.'.format(np.nanstd(dh.data)))
 print('NMAD: {:.2f} meters.'.format(xdem.spatialstats.nmad(dh.data)))
@@ -64,16 +52,16 @@
 
 # %%
 # We clearly see that the residual elevation differences on stable terrain are not random. The positive and negative
 # differences (blue and red, respectively) appear correlated over large distances. These correlated errors are what
-# we aim to quantify.
+# we want to estimate and model.
 
 # %%
 # Additionally, we notice that the elevation differences are still polluted by unrealistically large elevation
-# differences near glaciers, probably because the glacier inventory is more recent than the data, and the outlines are too small.
+# differences near glaciers, probably because the glacier inventory is more recent than the data, hence with outlines that are too small.
 # To remedy this, we filter large elevation differences outside 4 NMAD.
 dh.data[np.abs(dh.data) > 4 * xdem.spatialstats.nmad(dh.data)] = np.nan
 
 # %%
-# We plot the elevation differences after filtering to check that we successively removed the reminaing glacier signals.
+# We plot the elevation differences after filtering to check that we successfully removed glacier signals. 
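For reference, a minimal sketch of the NMAD estimator behind this 4-NMAD filter, assuming a plain NumPy array (the pipeline above uses ``xdem.spatialstats.nmad``), illustrating why it is preferred over the standard deviation on outlier-contaminated differences:

import numpy as np

def nmad_sketch(values):
    # Normalized median absolute deviation: equals the standard deviation for
    # normally distributed data, but is barely affected by gross outliers
    return 1.4826 * np.nanmedian(np.abs(values - np.nanmedian(values)))

dh_sample = np.array([0.5, -0.3, 0.2, 30.0, -0.1])  # one unfiltered glacier signal
print('NMAD: {:.2f}, STD: {:.2f}'.format(nmad_sketch(dh_sample), np.nanstd(dh_sample)))
# NMAD stays ~0.4 m while the STD jumps to ~12 m because of the single outlier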
plt.figure(figsize=(8, 5))
 _ = dh.show(ax=plt.gca(), cmap='RdYlBu', vmin=-4, vmax=4, cb_title='Elevation differences (m)')
 
@@ -159,10 +147,9 @@
     list_stderr_doublerange.append(stderr_doublerange)
 
 # %%
-# We add an empirical error based on intensive Monte-Carlo sampling ("patches" method) to validate our results
-# (Dehecq et al. (2020), Hugonnet et al., in prep). This method is implemented in :func:`xdem.spatialstats.patches_method`.
-# Here, we sample fewer areas to avoid for the patches method to run over long processing times, increasing from areas
-# of 5 pixels to areas of 10000 pixels exponentially.
+# We add an empirical error based on intensive Monte-Carlo sampling ("patches" method) to validate our results.
+# This method is implemented in :func:`xdem.spatialstats.patches_method`. Here, we sample fewer areas to avoid long
+# processing times with the patches method, increasing from areas of 5 pixels to areas of 10000 pixels exponentially.
 
 areas_emp = [10 * 400 * 2 ** i for i in range(10)]
 for area_emp in areas_emp:
@@ -186,7 +173,7 @@
 
 # %%
 # *Note: in this example, we add a* ``random_state`` *argument to the patches method to yield a reproducible random
-# sampling, and set* ``n_patches`` *to limit computing time.*
+# sampling, and set* ``n_patches`` *to reduce computing time.*
 
 # %%
 # Using a single-range variogram highly underestimates the measurement error integrated over an area, by over a factor

From 39cb70878d1b84502a15552d74e13d1d29ade04f Mon Sep 17 00:00:00 2001
From: rhugonne 
Date: Mon, 22 Aug 2022 15:37:15 +0200
Subject: [PATCH 14/21] Fix sphinx syntax

---
 .../plot_heterosc_estimation_modelling.py     |  8 ++---
 examples/plot_infer_heterosc.py               |  2 +-
 examples/plot_spatial_error_propagation.py    |  5 ++-
 examples/plot_standardization.py              | 31 +++++++++----------
 xdem/coreg.py                                 |  5 +--
 xdem/spatialstats.py                          | 13 ++++----
 6 files changed, 30 insertions(+), 34 deletions(-)

diff --git a/examples/plot_heterosc_estimation_modelling.py b/examples/plot_heterosc_estimation_modelling.py
index ccd47cb5..e60eed56 100644
--- a/examples/plot_heterosc_estimation_modelling.py
+++ b/examples/plot_heterosc_estimation_modelling.py
@@ -12,7 +12,7 @@
 heteroscedasticity for a DEM difference at Longyearbyen. We use `data binning `_
 and robust statistics in N-dimension with :func:`xdem.spatialstats.nd_binning`, apply a N-dimensional interpolation with
 :func:`xdem.spatialstats.interp_nd_binning`, and scale our interpolant function with a two-step standardization
- :func:`xdem.spatialstats.two_step_standardization` to produce the final elevation error function.
+:func:`xdem.spatialstats.two_step_standardization` to produce the final elevation error function.
 
 **References**: `Hugonnet et al. (2021) `_, Equation 1, Extended Data Fig. 3a
 and `Hugonnet et al. (2022) `_, Figs. 4 and S6–S9.
@@ -55,7 +55,7 @@
 # %%
 # We use :func:`xdem.spatialstats.nd_binning` to perform N-dimensional binning on all those terrain variables, with uniform
 # bin length divided by 30. We use the NMAD as a robust measure of `statistical dispersion `_
-# (see :ref:`robuststats_meanstd`). 
+# (see :ref:`robuststats_meanstd`).
 
 df = xdem.spatialstats.nd_binning(values=dh_arr, list_var=[slope_arr, aspect_arr, planc_arr, profc_arr],
                                   list_var_names=['slope','aspect','planc','profc'],
@@ -124,7 +124,7 @@
 # The plan curvature shows a similar relation. Those are symmetrical with 0, and almost equal for both types of curvature. 
# To simplify the analysis, we here combine those curvatures into the maximum absolute curvature: -maxc_arr = np.maximum(np.abs(planc_arr),np.abs(profc_arr)) +maxc_arr = np.maximum(np.abs(planc_arr), np.abs(profc_arr)) df = xdem.spatialstats.nd_binning(values=dh_arr, list_var=[maxc_arr], list_var_names=['maxc'], statistics=['count', np.nanmedian, xdem.spatialstats.nmad], list_var_bins=[np.nanquantile(maxc_arr, np.linspace(0,1,1000))]) @@ -200,7 +200,7 @@ # Thus, we rescale the function to exactly match the spread on stable terrain using the # ``xdem.spatialstats.two_step_standardization`` function, and get our final error function. -dh_err_fun = xdem.spatialstats.two_step_standardization(dh_arr, list_var=[slope_arr, maxc_arr], +zscores, dh_err_fun = xdem.spatialstats.two_step_standardization(dh_arr, list_var=[slope_arr, maxc_arr], unscaled_error_fun=unscaled_dh_err_fun) for s, c in [(0., 0.1), (50., 0.1), (0., 20.), (50., 20.)]: diff --git a/examples/plot_infer_heterosc.py b/examples/plot_infer_heterosc.py index 8f920c95..f89e4f54 100644 --- a/examples/plot_infer_heterosc.py +++ b/examples/plot_infer_heterosc.py @@ -49,7 +49,7 @@ # (*Note: below we multiply the maximum curvature by 100 to convert it in m-1*): for slope, maxc in [(0, 0), (40, 0), (0, 5), (40, 5)]: print('Error for a slope of {:.0f} degrees and' - ' {:.0f} m-1 max. curvature: {:.1f} m'.format(slope, maxc * 100., error_function((slope, maxc)))) + ' {:.0f} m-1 max. curvature: {:.1f} m'.format(slope, maxc/100, error_function((slope, maxc)))) # %% # This pipeline will not always work optimally with default parameters: spread estimates can be affected by skewed diff --git a/examples/plot_spatial_error_propagation.py b/examples/plot_spatial_error_propagation.py index c846dc69..85b1dc14 100644 --- a/examples/plot_spatial_error_propagation.py +++ b/examples/plot_spatial_error_propagation.py @@ -7,19 +7,18 @@ other operation), which is computationally intensive. Here, we rely on published formulations to perform computationally-efficient spatial propagation for the mean of elevation (or elevation differences) in an area. -**References**: `Hugonnet et al. (2022) `_. Figure S16, Equations 17–19, +**References**: `Hugonnet et al. (2022) `_, Figure S16, Equations 17–19 and `Rolstad et al. (2009) `_, Equation 8. """ # sphinx_gallery_thumbnail_number = 1 import numpy as np - import xdem import geoutils as gu import matplotlib.pyplot as plt # %% # We load the same data, and perform the same calculations on heteroscedasticity and spatial correlations of errors as -# in the :ref:`sphx_glr_auto_examples_infer_heterosc.py` and :ref:`sphx_glr_auto_examples_infer_spatial_correlation.py` +# in the :ref:`sphx_glr_auto_examples_plot_infer_heterosc.py` and :ref:`sphx_glr_auto_examples_plot_infer_spatial_correlation.py` # examples. dh = xdem.DEM(xdem.examples.get_path("longyearbyen_ddem")) diff --git a/examples/plot_standardization.py b/examples/plot_standardization.py index adf32fbd..ebad94f0 100644 --- a/examples/plot_standardization.py +++ b/examples/plot_standardization.py @@ -1,20 +1,17 @@ """ -Standardization for stable terrain as proxy -=========================================== +Standardization for stable terrain as error proxy +================================================= Digital elevation models have both a precision that can vary with terrain or instrument-related variables, and -a spatial correlation of measurement errors that can be due to effects of resolution, processing or instrument noise. 
-Accouting for non-stationarities in elevation measurement errors is essential to use stable terrain as a proxy to
-infer the precision on other types of terrain (Hugonnet et al., in prep) and reliably use spatial statistics (see
-:ref:`spatialstats`).
-
-Here, we show an example to use standardization of the data based on the terrain-dependent nonstationarity in measurement
-error (see :ref:`sphx_glr_auto_examples_plot_nonstationary_error.py`) and combine it with an analysis of spatial
-correlation (see :ref:`sphx_glr_auto_examples_plot_vgm_error.py`) to derive spatially integrated errors for specific
-spatial ensembles.
-
-**Reference**: `Hugonnet et al. (2021) `_, applied to the terrain slope
-and quality of stereo-correlation (Equation 1, Extended Data Fig. 3a).
+a spatial correlation of errors that can be due to effects of resolution, processing or instrument noise.
+Accounting for non-stationarities in elevation errors is essential to use stable terrain as a proxy to infer the
+precision on other types of terrain and reliably use spatial statistics (see :ref:`spatialstats`).
+
+Here, we show an example of standardization of the data based on terrain-dependent explanatory variables
+(see :ref:`sphx_glr_auto_examples_plot_infer_heterosc.py`) and combine it with an analysis of spatial correlation
+(see :ref:`sphx_glr_auto_examples_plot_infer_spatial_correlation.py`).
+
+**Reference**: `Hugonnet et al. (2022) `_, Equation 12.
 """
 # sphinx_gallery_thumbnail_number = 4
 import matplotlib.pyplot as plt
@@ -24,8 +21,8 @@
 from xdem.spatialstats import nmad
 
 # %%
-# We start by estimating the non-stationarities and deriving a terrain-dependent measurement error as a function of both
-# slope and maximum curvature, as shown in the :ref:`sphx_glr_auto_examples_plot_nonstationary_error.py` example.
+# We start by estimating the elevation heteroscedasticity and deriving a terrain-dependent measurement error as a function of both
+# slope and maximum curvature, as shown in the :ref:`sphx_glr_auto_examples_plot_infer_heterosc.py` example.
 
 # Load the data
 ref_dem = xdem.DEM(xdem.examples.get_path("longyearbyen_ref_dem"))
@@ -105,7 +102,7 @@
 plt.show()
 
 # %%
-# Now, we can perform an analysis of spatial correlation as shown in the :ref:`sphx_glr_auto_examples_plot_vgm_error.py`
+# Now, we can perform an analysis of spatial correlation as shown in the :ref:`sphx_glr_auto_examples_plot_variogram_estimation_modelling.py`
 # example, by estimating a variogram and fitting a sum of two models.
 df_vgm = xdem.spatialstats.sample_empirical_variogram(values=z_dh.data.squeeze(), gsd=dh.res[0], subsample=300,
                                                       n_variograms=10, random_state=42)
diff --git a/xdem/coreg.py b/xdem/coreg.py
index 97ee133a..0d418e6c 100644
--- a/xdem/coreg.py
+++ b/xdem/coreg.py
@@ -969,8 +969,9 @@ def __init__(self, degree: int = 1, subsample: Union[int, float] = 5e5):
 
         :param degree: The polynomial degree to estimate. degree=0 is a simple bias correction.
         :param subsample: Factor for subsampling the input raster for speed-up.
-        If <= 1, will be considered a fraction of valid pixels to extract.
-        If > 1 will be considered the number of pixels to extract.
+ """ self.degree = degree self.subsample = subsample diff --git a/xdem/spatialstats.py b/xdem/spatialstats.py index 717c8fa9..43edc049 100644 --- a/xdem/spatialstats.py +++ b/xdem/spatialstats.py @@ -1368,13 +1368,13 @@ def infer_spatial_correlation_from_stable(dvalues: np.ndarray | RasterType, :param dvalues: Proxy values as array or Raster :param list_models: List of K variogram models to sum for the fit in order from short to long ranges. Can either be - a 3-letter string, full string of the variogram name or SciKit-GStat model function (e.g., for a - spherical model "Sph", "Spherical" or skgstat.models.spherical). + a 3-letter string, full string of the variogram name or SciKit-GStat model function (e.g., for a + spherical model "Sph", "Spherical" or skgstat.models.spherical). :param stable_mask: Vector shapefile of stable terrain (if dvalues is Raster), or boolean array of same shape as dvalues :param unstable_mask: Vector shapefile of unstable terrain (if dvalues is Raster), or boolean array of same shape as dvalues :param errors: Error values to account for heteroscedasticity (ignored if None). :param estimator: Estimator for the empirical variogram; default to Dowd's variogram (see skgstat.Variogram for - the list of available estimators). + the list of available estimators). :param gsd: Ground sampling distance :param coords: Coordinates :param subsample: Number of samples to randomly draw from the values @@ -1821,7 +1821,7 @@ def number_effective_samples(area: float | int | VectorType | gpd.GeoDataFrame, range to ensure a sufficiently fine grid for propagation of the shortest range. :param area: Area of interest either as a numeric value of surface in the same unit as the variogram ranges (will - assume a circular shape), or as a vector (shapefile) of the area + assume a circular shape), or as a vector (shapefile) of the area :param params_variogram_model: Dataframe of variogram models to sum with three to four columns, "model" for the model types (e.g., ["spherical", "matern"]), "range" for the correlation ranges (e.g., [2, 100]), "psill" for the partial sills (e.g., [0.8, 0.2]) and "smooth" for the smoothness parameter if it exists for this model (e.g., @@ -1898,10 +1898,9 @@ def spatial_error_propagation(areas: list[float | VectorType | gpd.GeoDataFrame] The standard error SE (1-sigma) is then computed as SE = mean(SD) / Neff, where mean(SD) is the mean of errors in the area of interest which accounts for heteroscedasticity, and Neff is the number of effective samples. - :param areas: Area of interest either as a numeric value of surface in the same unit as the variogram ranges (will - assume a circular shape), or as a vector (shapefile) of the area - :param errors: Errors from heteroscedasticity estimation and modelling, as an array or Raster + assume a circular shape), or as a vector (shapefile) of the area. + :param errors: Errors from heteroscedasticity estimation and modelling, as an array or Raster. 
:param params_variogram_model: Dataframe of variogram models to sum with three to four columns, "model" for the model types (e.g., ["spherical", "matern"]), "range" for the correlation ranges (e.g., [2, 100]), "psill" for the partial sills (e.g., [0.8, 0.2]) and "smooth" for the smoothness parameter if it exists for this model (e.g., From e6a072fa7c637ec353737c749669ef2f92a89701 Mon Sep 17 00:00:00 2001 From: rhugonne Date: Mon, 22 Aug 2022 16:06:45 +0200 Subject: [PATCH 15/21] Update documentation page on spatial statistics --- ...lope.py => spatialstats_heterosc_slope.py} | 0 docs/source/intro_accuracy_precision.rst | 6 ++- docs/source/spatialstats.rst | 41 +++++++++---------- .../plot_heterosc_estimation_modelling.py | 4 +- .../plot_variogram_estimation_modelling.py | 4 +- 5 files changed, 28 insertions(+), 27 deletions(-) rename docs/source/code/{spatialstats_nonstationarity_slope.py => spatialstats_heterosc_slope.py} (100%) diff --git a/docs/source/code/spatialstats_nonstationarity_slope.py b/docs/source/code/spatialstats_heterosc_slope.py similarity index 100% rename from docs/source/code/spatialstats_nonstationarity_slope.py rename to docs/source/code/spatialstats_heterosc_slope.py diff --git a/docs/source/intro_accuracy_precision.rst b/docs/source/intro_accuracy_precision.rst index 86466b65..9bcf0a8c 100644 --- a/docs/source/intro_accuracy_precision.rst +++ b/docs/source/intro_accuracy_precision.rst @@ -31,6 +31,10 @@ for when analyzing DEMs: .. figure:: imgs/precision_accuracy.png :width: 80% +*Note: sometimes "accuracy" is also used to describe both types of errors, and "trueness" systematic errors, following +`ISO 5725-1 `_. Here, we used accuracy for systematic errors +as, to our knowledge, it is more common in remote sensing applications." + Source: `antarcticglaciers.org `_, accessed 29.06.21. @@ -96,6 +100,6 @@ The tools for quantifying DEM precision are described in :ref:`spatialstats`. Functions that are used in several examples create duplicate examples intead of being merged into the list. Circumventing manually by selecting functions used only once in each example for now. -.. minigallery:: xdem.spatialstats.neff_circ xdem.spatialstats.plot_1d_binning +.. minigallery:: xdem.spatialstats.infer_heteroscedasticity_from_stable xdem.spatialstats.infer_spatial_correlation_from_stable :add-heading: Examples that use spatial statistics functions diff --git a/docs/source/spatialstats.rst b/docs/source/spatialstats.rst index dc844a43..2f828dc1 100644 --- a/docs/source/spatialstats.rst +++ b/docs/source/spatialstats.rst @@ -6,16 +6,15 @@ Spatial statistics Spatial statistics, also referred to as `geostatistics `_, are essential for the analysis of observations distributed in space. To analyze DEMs, xDEM integrates spatial statistics tools specific to DEMs described in recent literature, -in particular in `Rolstad et al. (2009) `_, -`Dehecq et al. (2020) `_ and -`Hugonnet et al. (2021) `_. The implementation of these methods relies -partly on the package `scikit-gstat `_. +in particular in `Hugonnet et al. (2022) `_ and +`Rolstad et al. (2009) `_. The implementation of these methods relies +partially on the package `scikit-gstat `_. The spatial statistics tools can be used to assess the precision of DEMs (see the definition of precision in :ref:`intro`). 
In particular, these tools help to:
 
-    - account for non-stationarities of elevation measurement errors (e.g., varying precision of DEMs with terrain slope),
-    - quantify the spatial correlation of measurement errors in DEMs (e.g., native spatial resolution, instrument noise),
+    - account for elevation heteroscedasticity (e.g., varying precision with terrain slope),
+    - quantify the spatial correlation of errors in DEMs (e.g., native spatial resolution, instrument noise),
     - estimate robust errors for observations integrated in space (e.g., average or sum of samples),
     - propagate errors between spatial ensembles at different scales (e.g., sum of glacier volume changes).
 
@@ -73,7 +72,7 @@ Due to the sparsity of synchronous acquisitions, elevation data cannot be easily
 compared at different times. Thus, stable terrain is used as a proxy to assess the precision of a DEM on all its terrain,
 including moving terrain that is generally of greater interest for analysis.
 
-As shown in Hugonnet et al. (in prep), accounting for :ref:`spatialstats_nonstationarity` is needed to reliably
+As shown in `Hugonnet et al. (2022) `_, accounting for :ref:`spatialstats_heterosc` is needed to reliably
 use stable terrain as a proxy for other types of terrain.
 
 .. _spatialstats_metrics:
@@ -99,7 +98,7 @@ To estimate the pixel-wise measurement error for elevation data, two issues aris
 1. The dispersion :math:`\sigma_{dh}` cannot be estimated directly on changing terrain,
 2. The dispersion :math:`\sigma_{dh}` can show important non-stationarities.
 
-The section :ref:`spatialstats_nonstationarity` describes how to quantify the measurement error as a function of
+The section :ref:`spatialstats_heterosc` describes how to quantify the measurement error as a function of
 several explanatory variables by using stable terrain as a proxy.
 
 Spatially-integrated elevation measurement error
@@ -129,23 +128,21 @@ and use those to integrate and propagate measurement errors in space.
 Workflow for DEM precision estimation
 -------------------------------------
 
-.. _spatialstats_nonstationarity:
+.. _spatialstats_heterosc:
 
-Non-stationarity in elevation measurement errors
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Elevation heteroscedasticity
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Elevation data contains significant non-stationarities in elevation measurement errors.
+Elevation data contains significant variability in measurement errors.
 
-xDEM provides tools to **quantify** these non-stationarities along several explanatory variables,
-**model** those numerically to estimate an elevation measurement error, and **standardize** them for further analysis.
+xDEM provides tools to **quantify** this variability using explanatory variables, **model** those numerically to
+estimate a function predicting elevation error, and **standardize** data for further analysis.
 
-Quantify and model non-stationarites
-""""""""""""""""""""""""""""""""""""
+Quantify and model heteroscedasticity
+"""""""""""""""""""""""""""""""""""""
 
-Non-stationarities in elevation measurement errors correspond to a variability of the precision in the elevation
-observations with certain explanatory variables that can be terrain- or instrument-related.
-In statistical terms, it corresponds to an `heteroscedasticity `_
-of elevation observations.
+Elevation `heteroscedasticity `_ corresponds to a variability in
+precision of elevation observations that is linked to terrain or instrument variables.
 
.. 
math:: \sigma_{dh} = \sigma_{dh}(\textrm{var}_{1},\textrm{var}_{2}, \textrm{...}) \neq \textrm{constant} @@ -158,7 +155,7 @@ Owing to the large number of samples of elevation data, we can easily estimate t :lines: 18-19 :language: python -.. plot:: code/spatialstats_nonstationarity_slope.py +.. plot:: code/spatialstats_heterosc_slope.py :width: 90% The most common explanatory variables are: @@ -167,7 +164,7 @@ The most common explanatory variables are: - the quality of stereo-correlation that can explain a large part of the measurement error of DEMs generated by stereophotogrammetry, - the interferometric coherence that can explain a large part of the measurement error of DEMs generated by `InSAR `_. -Once quantified, the non-stationarities can be modelled numerically by linear interpolation across several +Once quantified, elevation heteroscedasticity can be modelled numerically by linear interpolation across several variables using :func:`xdem.spatialstats.interp_nd_binning`. .. literalinclude:: code/spatialstats.py diff --git a/examples/plot_heterosc_estimation_modelling.py b/examples/plot_heterosc_estimation_modelling.py index e60eed56..26e6697a 100644 --- a/examples/plot_heterosc_estimation_modelling.py +++ b/examples/plot_heterosc_estimation_modelling.py @@ -1,6 +1,6 @@ """ -Estimate and model elevation heteroscedasticity -=============================================== +Estimation and modelling of elevation heteroscedasticity +======================================================== Digital elevation models have a precision that can vary with terrain and instrument-related variables. This variability in variance is called `heteroscedasticy `_, diff --git a/examples/plot_variogram_estimation_modelling.py b/examples/plot_variogram_estimation_modelling.py index 542549b4..4483e61d 100644 --- a/examples/plot_variogram_estimation_modelling.py +++ b/examples/plot_variogram_estimation_modelling.py @@ -1,6 +1,6 @@ """ -Estimate and model spatial variograms -===================================== +Estimation and modelling of spatial variograms +============================================== Digital elevation models have errors that are often `correlated in space `_. While many DEM studies used solely short-range `variogram `_ to From 2e9d0f66c4d78484c9c103be468637b82392942d Mon Sep 17 00:00:00 2001 From: rhugonne Date: Mon, 22 Aug 2022 16:34:56 +0200 Subject: [PATCH 16/21] Fix tests --- docs/source/spatialstats.rst | 6 +++--- tests/test_spatialstats.py | 11 ++++++----- xdem/spatialstats.py | 6 ++++-- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/docs/source/spatialstats.rst b/docs/source/spatialstats.rst index 2f828dc1..c3218110 100644 --- a/docs/source/spatialstats.rst +++ b/docs/source/spatialstats.rst @@ -216,8 +216,8 @@ average measurement error of the pixels in the subsample, evaluated through the Estimating the standard error of the mean of the standardized data :math:`\sigma_{\overline{z_{dh}}}\vert_{\mathbb{S}}` requires an analysis of spatial correlation and a spatial integration of this correlation, described in the next sections. -.. minigallery:: xdem.spatialstats.nd_binning - :add-heading: Examples that deal with non-stationarities +.. minigallery:: xdem.spatialstats.nd_binning xdem.spatialstats.infer_heteroscedasticity_from_stable + :add-heading: Examples that deal with elevation heteroscedasticity :heading-level: " .. 
_spatialstats_corr: @@ -307,7 +307,7 @@ This can be performed through the function :func:`xdem.spatialstats.fit_sum_mode :lines: 31 :language: python -.. minigallery:: xdem.spatialstats.sample_empirical_variogram +.. minigallery:: xdem.spatialstats.sample_empirical_variogram xdem.spatialstats.infer_spatial_correlation_from_stable :add-heading: Examples that deal with spatial correlations :heading-level: " diff --git a/tests/test_spatialstats.py b/tests/test_spatialstats.py index 1bb735d7..b65e8adc 100644 --- a/tests/test_spatialstats.py +++ b/tests/test_spatialstats.py @@ -527,24 +527,25 @@ def test_spatial_error_propagation(self): # Standardize the differences zscores = diff / errors params_variogram_model = xdem.spatialstats.infer_spatial_correlation_from_stable( - dvalues=zscores, list_models=['Gaussian', 'Spherical'], unstable_mask=vector_glacier, subsample=100)[1] + dvalues=zscores, list_models=['Gaussian', 'Spherical'], unstable_mask=vector_glacier, subsample=100, + random_state=42)[1] # Run the function with vector areas areas_vector = [vector_glacier.ds[vector_glacier.ds['NAME']=='Brombreen'], vector_glacier.ds[vector_glacier.ds['NAME']=='Medalsbreen']] list_stderr_vec = xdem.spatialstats.spatial_error_propagation(areas=areas_vector, errors=errors, - params_variogram_model=params_variogram_model) + params_variogram_model=params_variogram_model, + random_state=42) # Run the function with numeric areas (sum needed for Medalsbreen that has two separate polygons) areas_numeric = [np.sum(area_vec.area.values) for area_vec in areas_vector] list_stderr = xdem.spatialstats.spatial_error_propagation(areas=areas_numeric, errors=errors, params_variogram_model=params_variogram_model) - # Check that the outputs are consistent: the numeric method should always give smaller neff, but not too far - # off (20% relative) for those two glaciers as their shape is not too different from a disk + # Check that the outputs are consistent: the numeric method should always give a neff that is almost the same + # (20% relative) for those two glaciers as their shape is not too different from a disk for i in range(2): - assert list_stderr_vec[i] > list_stderr[i] assert list_stderr_vec[i] == pytest.approx(list_stderr[i], rel=0.2) diff --git a/xdem/spatialstats.py b/xdem/spatialstats.py index 43edc049..95e28d4c 100644 --- a/xdem/spatialstats.py +++ b/xdem/spatialstats.py @@ -1063,6 +1063,10 @@ def sample_empirical_variogram(values: Union[np.ndarray, RasterType], gsd: float df_mean['count'] = df_count['count'] df = df_mean + # Remove the last spatial lag bin which is always undersampled + # TODO: Solve this problem at the root: how the spatial lag binning is defined, probably? 
+ df.drop(df.tail(1).index, inplace=True) + return df def _get_skgstat_variogram_model_name(model: str | Callable) -> str: @@ -1333,8 +1337,6 @@ def estimate_model_spatial_correlation(dvalues: Union[np.ndarray, RasterType], l subsample=subsample, subsample_method=subsample_method, n_variograms=n_variograms, n_jobs=n_jobs, verbose=verbose, random_state=random_state, **kwargs) - # TODO: prevent this last bin with few samples to be returned by `sample_empirical_variogram` - empirical_variogram = empirical_variogram[:-1] params_variogram_model = fit_sum_model_variogram(list_models=list_models, empirical_variogram=empirical_variogram, bounds=bounds, p0=p0)[1] From f4066a2bed78793f978155ebe90ec2c730c9662e Mon Sep 17 00:00:00 2001 From: rhugonne Date: Mon, 22 Aug 2022 17:07:27 +0200 Subject: [PATCH 17/21] Fix typo in infer_heteroscedasticity function name --- docs/source/spatialstats.rst | 4 +- examples/plot_infer_heterosc.py | 6 +-- examples/plot_spatial_error_propagation.py | 6 +-- tests/test_spatialstats.py | 30 ++++++------ xdem/spatialstats.py | 54 +++++++++++----------- 5 files changed, 50 insertions(+), 50 deletions(-) diff --git a/docs/source/spatialstats.rst b/docs/source/spatialstats.rst index c3218110..6e9ed127 100644 --- a/docs/source/spatialstats.rst +++ b/docs/source/spatialstats.rst @@ -216,7 +216,7 @@ average measurement error of the pixels in the subsample, evaluated through the Estimating the standard error of the mean of the standardized data :math:`\sigma_{\overline{z_{dh}}}\vert_{\mathbb{S}}` requires an analysis of spatial correlation and a spatial integration of this correlation, described in the next sections. -.. minigallery:: xdem.spatialstats.nd_binning xdem.spatialstats.infer_heteroscedasticity_from_stable +.. minigallery:: xdem.spatialstats.infer_heteroscedasticity_from_stable xdem.spatialstats.nd_binning :add-heading: Examples that deal with elevation heteroscedasticity :heading-level: " @@ -307,7 +307,7 @@ This can be performed through the function :func:`xdem.spatialstats.fit_sum_mode :lines: 31 :language: python -.. minigallery:: xdem.spatialstats.sample_empirical_variogram xdem.spatialstats.infer_spatial_correlation_from_stable +.. 
minigallery:: xdem.spatialstats.infer_spatial_correlation_from_stable xdem.spatialstats.sample_empirical_variogram :add-heading: Examples that deal with spatial correlations :heading-level: " diff --git a/examples/plot_infer_heterosc.py b/examples/plot_infer_heterosc.py index f89e4f54..ed602754 100644 --- a/examples/plot_infer_heterosc.py +++ b/examples/plot_infer_heterosc.py @@ -32,9 +32,9 @@ # %% # Then, we run the pipeline for inference of elevation heteroscedasticity from stable terrain: errors, df_binning, error_function = \ - xdem.spatialstats.infer_heteroscedasticy_from_stable(dvalues=dh, list_var=[slope, maximum_curvature], - list_var_names=['slope', 'maxc'], - unstable_mask=glacier_outlines) + xdem.spatialstats.infer_heteroscedasticity_from_stable(dvalues=dh, list_var=[slope, maximum_curvature], + list_var_names=['slope', 'maxc'], + unstable_mask=glacier_outlines) # %% # The first output corresponds to the error map for the DEM (1-sigma): diff --git a/examples/plot_spatial_error_propagation.py b/examples/plot_spatial_error_propagation.py index 85b1dc14..6e1fc66c 100644 --- a/examples/plot_spatial_error_propagation.py +++ b/examples/plot_spatial_error_propagation.py @@ -26,9 +26,9 @@ glacier_outlines = gu.Vector(xdem.examples.get_path("longyearbyen_glacier_outlines")) slope, maximum_curvature = xdem.terrain.get_terrain_attribute(ref_dem, attribute=['slope', 'maximum_curvature']) errors, df_binning, error_function = \ - xdem.spatialstats.infer_heteroscedasticy_from_stable(dvalues=dh, list_var=[slope, maximum_curvature], - list_var_names=['slope', 'maxc'], - unstable_mask=glacier_outlines) + xdem.spatialstats.infer_heteroscedasticity_from_stable(dvalues=dh, list_var=[slope, maximum_curvature], + list_var_names=['slope', 'maxc'], + unstable_mask=glacier_outlines) # %% # We use the error map to standardize the elevation differences before variogram estimation, following Equation 12 of diff --git a/tests/test_spatialstats.py b/tests/test_spatialstats.py index b65e8adc..4fe34189 100644 --- a/tests/test_spatialstats.py +++ b/tests/test_spatialstats.py @@ -522,7 +522,7 @@ def test_spatial_error_propagation(self): # Get the error map and variogram model with standardization slope, maxc = xdem.terrain.get_terrain_attribute(ref, attribute=['slope', 'maximum_curvature']) - errors = xdem.spatialstats.infer_heteroscedasticy_from_stable( + errors = xdem.spatialstats.infer_heteroscedasticity_from_stable( dvalues=diff, list_var=[slope, maxc], unstable_mask=vector_glacier)[0] # Standardize the differences zscores = diff / errors @@ -853,9 +853,9 @@ def test_estimate_model_heteroscedasticity_and_infer_from_stable(self): # Test infer function errors_1, df_binning_1, err_fun_1 = \ - xdem.spatialstats.infer_heteroscedasticy_from_stable(dvalues=diff, - list_var=[slope, maximum_curv], - unstable_mask=vector_glacier) + xdem.spatialstats.infer_heteroscedasticity_from_stable(dvalues=diff, + list_var=[slope, maximum_curv], + unstable_mask=vector_glacier) # Test this gives the same results as when using the base functions diff_arr = gu.spatial_tools.get_array_and_mask(diff)[0] @@ -881,22 +881,22 @@ def test_estimate_model_heteroscedasticity_and_infer_from_stable(self): # Check that errors are raised with wrong input with pytest.raises(ValueError, match='The dvalues must be a Raster or NumPy array.'): - xdem.spatialstats.infer_heteroscedasticy_from_stable(dvalues='not_an_array', - stable_mask=~mask_glacier.squeeze(), - list_var=[slope_arr]) + 
xdem.spatialstats.infer_heteroscedasticity_from_stable(dvalues='not_an_array', + stable_mask=~mask_glacier.squeeze(), + list_var=[slope_arr]) with pytest.raises(ValueError, match='The stable mask must be a Vector, GeoDataFrame or NumPy array.'): - xdem.spatialstats.infer_heteroscedasticy_from_stable(dvalues=diff, - stable_mask='not_a_vector_or_array', - list_var=[slope_arr]) + xdem.spatialstats.infer_heteroscedasticity_from_stable(dvalues=diff, + stable_mask='not_a_vector_or_array', + list_var=[slope_arr]) with pytest.raises(ValueError, match='The unstable mask must be a Vector, GeoDataFrame or NumPy array.'): - xdem.spatialstats.infer_heteroscedasticy_from_stable(dvalues=diff, - unstable_mask='not_a_vector_or_array', - list_var=[slope_arr]) + xdem.spatialstats.infer_heteroscedasticity_from_stable(dvalues=diff, + unstable_mask='not_a_vector_or_array', + list_var=[slope_arr]) with pytest.raises(ValueError, match='The stable mask can only passed as a Vector or GeoDataFrame if the input ' 'dvalues is a Raster.'): - xdem.spatialstats.infer_heteroscedasticy_from_stable(dvalues=diff_arr, stable_mask=vector_glacier, - list_var=[slope_arr]) + xdem.spatialstats.infer_heteroscedasticity_from_stable(dvalues=diff_arr, stable_mask=vector_glacier, + list_var=[slope_arr]) def test_plot_binning(self): diff --git a/xdem/spatialstats.py b/xdem/spatialstats.py index 95e28d4c..16c08602 100644 --- a/xdem/spatialstats.py +++ b/xdem/spatialstats.py @@ -365,40 +365,40 @@ def estimate_model_heteroscedasticity(dvalues: np.ndarray, list_var: Iterable[np @overload -def infer_heteroscedasticy_from_stable(dvalues: np.ndarray , list_var: list[np.ndarray | RasterType], - stable_mask: np.ndarray | VectorType | gpd.GeoDataFrame, - unstable_mask: np.ndarray | VectorType | gpd.GeoDataFrame, - list_var_names: Iterable[str], - spread_statistic: Callable, - list_var_bins: Optional[Union[int,Iterable[Iterable]]], - min_count: Optional[int], - factor_spread_exclude_outliers: float | None, - ) -> tuple[np.ndarray, +def infer_heteroscedasticity_from_stable(dvalues: np.ndarray, list_var: list[np.ndarray | RasterType], + stable_mask: np.ndarray | VectorType | gpd.GeoDataFrame, + unstable_mask: np.ndarray | VectorType | gpd.GeoDataFrame, + list_var_names: Iterable[str], + spread_statistic: Callable, + list_var_bins: Optional[Union[int,Iterable[Iterable]]], + min_count: Optional[int], + factor_spread_exclude_outliers: float | None, + ) -> tuple[np.ndarray, pd.DataFrame, Callable[[tuple[np.ndarray, ...]], np.ndarray]]: ... @overload -def infer_heteroscedasticy_from_stable(dvalues: RasterType , list_var: list[np.ndarray | RasterType], - stable_mask: np.ndarray | VectorType | gpd.GeoDataFrame, - unstable_mask: np.ndarray | VectorType | gpd.GeoDataFrame, - list_var_names: Iterable[str], - spread_statistic: Callable, - list_var_bins: Optional[Union[int,Iterable[Iterable]]], - min_count: Optional[int], - factor_spread_exclude_outliers: float | None, - ) -> tuple[RasterType, +def infer_heteroscedasticity_from_stable(dvalues: RasterType, list_var: list[np.ndarray | RasterType], + stable_mask: np.ndarray | VectorType | gpd.GeoDataFrame, + unstable_mask: np.ndarray | VectorType | gpd.GeoDataFrame, + list_var_names: Iterable[str], + spread_statistic: Callable, + list_var_bins: Optional[Union[int,Iterable[Iterable]]], + min_count: Optional[int], + factor_spread_exclude_outliers: float | None, + ) -> tuple[RasterType, pd.DataFrame, Callable[[tuple[np.ndarray, ...]], np.ndarray]]: ... 
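For readers unfamiliar with this pattern, a minimal sketch of how ``typing.overload`` lets the array and Raster signatures coexist with a single runtime implementation (simplified, hypothetical types standing in for the real ones):

from __future__ import annotations
from typing import overload
import numpy as np

@overload
def double(values: np.ndarray) -> np.ndarray: ...
@overload
def double(values: float) -> float: ...

def double(values: np.ndarray | float) -> np.ndarray | float:
    # Single runtime implementation: type-checkers use the stubs above to
    # infer that the output type matches the input type
    return values * 2

print(double(2.5))                    # float in, float out
print(double(np.array([1.0, 2.0])))   # array in, array out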
-def infer_heteroscedasticy_from_stable(dvalues: np.ndarray | RasterType, list_var: list[np.ndarray | RasterType], - stable_mask: np.ndarray | VectorType | gpd.GeoDataFrame = None, - unstable_mask: np.ndarray | VectorType | gpd.GeoDataFrame = None, - list_var_names: Iterable[str] = None, - spread_statistic: Callable = nmad, - list_var_bins: Optional[Union[int,Iterable[Iterable]]] = None, - min_count: Optional[int] = 100, - fac_spread_outliers: float | None = 7, - ) -> tuple[np.ndarray | RasterType, +def infer_heteroscedasticity_from_stable(dvalues: np.ndarray | RasterType, list_var: list[np.ndarray | RasterType], + stable_mask: np.ndarray | VectorType | gpd.GeoDataFrame = None, + unstable_mask: np.ndarray | VectorType | gpd.GeoDataFrame = None, + list_var_names: Iterable[str] = None, + spread_statistic: Callable = nmad, + list_var_bins: Optional[Union[int,Iterable[Iterable]]] = None, + min_count: Optional[int] = 100, + fac_spread_outliers: float | None = 7, + ) -> tuple[np.ndarray | RasterType, pd.DataFrame, Callable[[tuple[np.ndarray, ...]], np.ndarray]]: """ From 2565aac6034ea8d1892ae6d251f67c85065370d2 Mon Sep 17 00:00:00 2001 From: rhugonne Date: Tue, 23 Aug 2022 14:20:26 +0200 Subject: [PATCH 18/21] Minor corrections for sphinx display --- examples/plot_heterosc_estimation_modelling.py | 4 ++-- examples/plot_infer_heterosc.py | 2 +- examples/plot_spatial_error_propagation.py | 16 ++++++++++++---- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/examples/plot_heterosc_estimation_modelling.py b/examples/plot_heterosc_estimation_modelling.py index 26e6697a..b44e9e82 100644 --- a/examples/plot_heterosc_estimation_modelling.py +++ b/examples/plot_heterosc_estimation_modelling.py @@ -1,6 +1,6 @@ """ -Estimation and modelling of elevation heteroscedasticity -======================================================== +Estimation and modelling of heteroscedasticity +============================================== Digital elevation models have a precision that can vary with terrain and instrument-related variables. This variability in variance is called `heteroscedasticy `_, diff --git a/examples/plot_infer_heterosc.py b/examples/plot_infer_heterosc.py index ed602754..e5628105 100644 --- a/examples/plot_infer_heterosc.py +++ b/examples/plot_infer_heterosc.py @@ -46,7 +46,7 @@ # %% # The third output is the 2D binning interpolant, i.e. an error function with the slope and maximum curvature -# (*Note: below we multiply the maximum curvature by 100 to convert it in m-1*): +# (*Note: below we divide the maximum curvature by 100 to convert it in m-1*): for slope, maxc in [(0, 0), (40, 0), (0, 5), (40, 5)]: print('Error for a slope of {:.0f} degrees and' ' {:.0f} m-1 max. curvature: {:.1f} m'.format(slope, maxc/100, error_function((slope, maxc)))) diff --git a/examples/plot_spatial_error_propagation.py b/examples/plot_spatial_error_propagation.py index 6e1fc66c..7770c74c 100644 --- a/examples/plot_spatial_error_propagation.py +++ b/examples/plot_spatial_error_propagation.py @@ -47,6 +47,10 @@ stderr_glaciers = xdem.spatialstats.spatial_error_propagation(areas=areas, errors=errors, params_variogram_model=params_variogram_model) +for glacier_name, stderr_gla in [('Brombreen', stderr_glaciers[0]), ('Medalsbreen', stderr_glaciers[1])]: + print('The error in mean elevation change for {} is {:.2f} meters (1-sigma).'.format(glacier_name, stderr_gla)) + +# %% # When passing a numerical area value, we compute an approximation with disk shape from Equation 8 of Rolstad et al. # (2009). 
This approximation is practical to visualize changes in elevation error when averaging over different area # sizes, but is less accurate to estimate the standard error of a certain area shape. @@ -56,10 +60,14 @@ plt.plot(areas / 10**6 , stderrs) plt.xlabel('Averaging area (km²)') plt.ylabel('Standard error (m)') -plt.vlines(x=[np.pi*params_variogram_model['range'].values[0]**2 / 10**6, - np.pi*params_variogram_model['range'].values[1]**2 / 10**6], +plt.vlines(x=np.pi*params_variogram_model['range'].values[0]**2 / 10**6, ymin=np.min(stderrs), ymax=np.max(stderrs), colors='red', linestyles='dashed', - label='Area of disk with radius the correlation range') + label='Disk area with radius the\n1st correlation range of {:,.0f} meters' + .format(params_variogram_model['range'].values[0])) +plt.vlines(x=np.pi*params_variogram_model['range'].values[1]**2 / 10**6, + ymin=np.min(stderrs), ymax=np.max(stderrs), colors='blue', linestyles='dashed', + label='Disk area with radius the\n2nd correlation range of {:,.0f} meters' + .format(params_variogram_model['range'].values[1])) plt.xscale('log') plt.legend() - +plt.show() From a67fcc2e19b023cc65b7d22908f091917e6b7dea Mon Sep 17 00:00:00 2001 From: rhugonne Date: Wed, 24 Aug 2022 11:37:03 +0200 Subject: [PATCH 19/21] Account for comments of amaury: fixes, descriptions clarity, typos, etc --- docs/source/coregistration.rst | 2 +- docs/source/intro_accuracy_precision.rst | 19 +++++++----- docs/source/spatialstats.rst | 2 +- .../plot_heterosc_estimation_modelling.py | 17 +++++----- examples/plot_icp_coregistration.py | 4 ++- examples/plot_infer_spatial_correlation.py | 12 +++++-- .../plot_variogram_estimation_modelling.py | 11 +++---- tests/test_spatialstats.py | 18 +++++------ xdem/coreg.py | 1 + xdem/spatialstats.py | 31 ++++++++++--------- 10 files changed, 66 insertions(+), 51 deletions(-) diff --git a/docs/source/coregistration.rst b/docs/source/coregistration.rst index 5eb5b803..c250ba73 100644 --- a/docs/source/coregistration.rst +++ b/docs/source/coregistration.rst @@ -153,7 +153,7 @@ ICP - **Does not support weights** - **Recommended for:** Data with low noise and a high relative rotation. -Iterative Closest Point (ICP) coregistration works by iteratively moving the data until it fits the reference as well as possible. +Iterative Closest Point (ICP) coregistration, which is based on `Besl and McKay (1992) `_, works by iteratively moving the data until it fits the reference as well as possible. The DEMs are read as point clouds; collections of points with X/Y/Z coordinates, and a nearest neighbour analysis is made between the reference and the data to be aligned. After the distances are calculated, a rigid transform is estimated to minimise them. The transform is attempted, and then distances are calculated again. diff --git a/docs/source/intro_accuracy_precision.rst b/docs/source/intro_accuracy_precision.rst index 9bcf0a8c..863361bb 100644 --- a/docs/source/intro_accuracy_precision.rst +++ b/docs/source/intro_accuracy_precision.rst @@ -28,15 +28,14 @@ for when analyzing DEMs: - the **accuracy** (systematic error) of a DEM describes how close a DEM is to the true location of measured elevations on the Earth's surface, - the **precision** (random error) of a DEM describes the typical spread of its error in measurement, independently of a possible bias from the true positioning. -.. 
figure:: imgs/precision_accuracy.png
-   :width: 80%
-
 *Note: sometimes "accuracy" is also used to describe both types of errors, and "trueness" systematic errors, following
 `ISO 5725-1 `_. Here, we used accuracy for systematic errors
-as, to our knowledge, it is more common in remote sensing applications."
+as, to our knowledge, it is more common in remote sensing applications.*

-Source: `antarcticglaciers.org `_, accessed 29.06.21.
+.. figure:: imgs/precision_accuracy.png
+   :width: 80%
+
+   Source: `antarcticglaciers.org `_, accessed 29.06.21.

 Absolute or relative accuracy
 -----------------------------
@@ -51,6 +50,12 @@
 TODO: Add another little schematic!

 Optimizing DEM absolute accuracy
 --------------------------------
+.. figure:: https://github.com/rhugonnet/dem_error_study/blob/main/figures/fig_2.png?raw=true
+   :witdh: 100%
+
+   Source: `Hugonnet et al. (2022) `_.
+
+
 Shifts due to poor absolute accuracy are common in elevation datasets, and can be easily corrected by performing a DEM
 co-registration to precise and accurate, quality-controlled elevation data such as `ICESat `_ and `ICESat-2 `_.
@@ -100,6 +105,6 @@
 The tools for quantifying DEM precision are described in :ref:`spatialstats`.

 Functions that are used in several examples create duplicate examples instead of being merged into the list.
 Circumventing manually by selecting functions used only once in each example for now.

-.. minigallery:: xdem.spatialstats.infer_heteroscedasticity_from_stable xdem.spatialstats.infer_spatial_correlation_from_stable
+.. minigallery:: xdem.spatialstats.infer_heteroscedasticity_from_stable xdem.spatialstats.get_variogram_model_func xdem.spatialstats.sample_empirical_variogram
 :add-heading: Examples that use spatial statistics functions

diff --git a/docs/source/spatialstats.rst b/docs/source/spatialstats.rst
index 6e9ed127..f6a1b351 100644
--- a/docs/source/spatialstats.rst
+++ b/docs/source/spatialstats.rst
@@ -15,7 +15,7 @@
 In particular, these tools help to:

 - account for elevation heteroscedasticity (e.g., varying precision with terrain slope),
 - quantify the spatial correlation of errors in DEMs (e.g., native spatial resolution, instrument noise),
- - estimate robust errors for observations integrated in space (e.g., average or sum of samples),
+ - estimate robust errors for observations analyzed in space (e.g., average or sum of elevation, or of elevation changes),
 - propagate errors between spatial ensembles at different scales (e.g., sum of glacier volume changes).

 .. contents:: Contents
diff --git a/examples/plot_heterosc_estimation_modelling.py b/examples/plot_heterosc_estimation_modelling.py
index b44e9e82..58a86bae 100644
--- a/examples/plot_heterosc_estimation_modelling.py
+++ b/examples/plot_heterosc_estimation_modelling.py
@@ -40,17 +40,16 @@
 # We derive terrain attributes from the reference DEM (see :ref:`sphx_glr_auto_examples_plot_terrain_attributes.py`),
 # which we will use to explore the variability in elevation error.
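# %%
# (A hedged sketch, not from the patch itself: assuming the API targeted by the change just below, a single
# attribute can also be requested on its own, with one output returned per attribute name in the list.)
slope_only = xdem.terrain.get_terrain_attribute(dem=ref_dem, attribute=['slope'])[0]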
slope, aspect, planc, profc = \
- xdem.terrain.get_terrain_attribute(dem=ref_dem.data,
- attribute=['slope','aspect', 'planform_curvature', 'profile_curvature'],
- resolution=ref_dem.res)
+ xdem.terrain.get_terrain_attribute(dem=ref_dem,
+ attribute=['slope','aspect', 'planform_curvature', 'profile_curvature'])

 # %%
-# We keep only stable terrain for the analysis of variability
-dh_arr = dh.data[~mask_glacier]
-slope_arr = slope[~mask_glacier]
-aspect_arr = aspect[~mask_glacier]
-planc_arr = planc[~mask_glacier]
-profc_arr = profc[~mask_glacier]
+# We convert to arrays and keep only stable terrain for the analysis of variability
+dh_arr = gu.spatial_tools.get_array_and_mask(dh)[0][~mask_glacier]
+slope_arr = gu.spatial_tools.get_array_and_mask(slope)[0][~mask_glacier]
+aspect_arr = gu.spatial_tools.get_array_and_mask(aspect)[0][~mask_glacier]
+planc_arr = gu.spatial_tools.get_array_and_mask(planc)[0][~mask_glacier]
+profc_arr = gu.spatial_tools.get_array_and_mask(profc)[0][~mask_glacier]

 # %%
 # We use :func:`xdem.spatialstats.nd_binning` to perform N-dimensional binning on all those terrain variables, with uniform
diff --git a/examples/plot_icp_coregistration.py b/examples/plot_icp_coregistration.py
index c1d3495b..1b77fe34 100644
--- a/examples/plot_icp_coregistration.py
+++ b/examples/plot_icp_coregistration.py
@@ -4,9 +4,11 @@
 Some DEMs may for one or more reasons be erroneously rotated in the X, Y or Z directions.
 Established coregistration approaches like :ref:`coregistration_nuthkaab` work great for X, Y and Z *translations*, but rotation is not accounted for at all.
-ICP is one method that takes both rotation and translation into account.
+Iterative Closest Point (ICP) is one method that takes both rotation and translation into account.
 It is however not as good as :ref:`coregistration_nuthkaab` when it comes to sub-pixel accuracy.
 Fortunately, ``xdem`` provides the best of both worlds by allowing a combination of the two.
+
+**Reference**: `Besl and McKay (1992) `_.
 """
 # sphinx_gallery_thumbnail_number = 2
 import matplotlib.pyplot as plt
diff --git a/examples/plot_infer_spatial_correlation.py b/examples/plot_infer_spatial_correlation.py
index 0f0d6c71..3e56198d 100644
--- a/examples/plot_infer_spatial_correlation.py
+++ b/examples/plot_infer_spatial_correlation.py
@@ -21,18 +21,24 @@
 # %%
 # Then, we run the pipeline for inference of the spatial correlation of elevation errors from stable terrain (*Note: we pass a
-# random_state argument to ensure a fixed, reproducible random subsampling in this example*):
+# random_state argument to ensure a fixed, reproducible random subsampling in this example*). We ask for a fit with
+# a Gaussian model for short range (as it is passed first), and Spherical for long range (as it is passed second):
 df_empirical_variogram, df_model_params, spatial_corr_function = \
 xdem.spatialstats.infer_spatial_correlation_from_stable(dvalues=dh, list_models=['Gaussian', 'Spherical'],
 unstable_mask=glacier_outlines, random_state=42)

 # %%
 # The first output corresponds to the dataframe of the empirical variogram, by default estimated using Dowd's estimator
-# and the circular sampling scheme of `skgstat.RasterEquidistantMetricSpace` (Fig. S13 of Hugonnet et al. (2022)):
+# and the circular sampling scheme of `skgstat.RasterEquidistantMetricSpace` (Fig. S13 of Hugonnet et al. (2022)).
The
+# ``lags`` column is the upper bound of spatial lag bins (lower bound of first bin being 0), the ``exp`` column is the
+# "experimental" variance value of the variogram in that bin, the ``count`` the number of pairwise samples, and
+# ``err_exp`` the 1-sigma error of the "experimental" variance, if more than one variogram is estimated with the
+# ``n_variograms`` parameter.
 df_empirical_variogram

 # %%
-# The second output is the dataframe of optimized model parameters for a sum of gaussian and spherical models:
+# The second output is the dataframe of optimized model parameters (``range``, ``sill``, and possibly ``smoothness``)
+# for a sum of gaussian and spherical models:
 df_model_params

 # %%
diff --git a/examples/plot_variogram_estimation_modelling.py b/examples/plot_variogram_estimation_modelling.py
index 4483e61d..add44442 100644
--- a/examples/plot_variogram_estimation_modelling.py
+++ b/examples/plot_variogram_estimation_modelling.py
@@ -78,8 +78,7 @@
 # %%
 # *Note: in this example, we add a* ``random_state`` *argument to yield a reproducible random sampling of pixels within
-# the grid, and a* ``runs`` *argument to reduce the computing time of* ``sgstat.MetricSpace.RasterEquidistantMetricSpace``
-# *which, by default, samples more data for robustness.*
+# the grid.*

 # %%
 # We plot the empirical variogram:
@@ -88,7 +87,7 @@
 # %%
 # With this plot, it is hard to conclude anything! Properly visualizing the empirical variogram is one of the most
 # important steps. With grid data, we expect short-range correlations close to the resolution of the grid (~20-200
-# meters), but also possibly longer range correlation due to instrument noise or alignment issues (~1-50 km) (Hugonnet et al., in prep).
+# meters), but also possibly longer range correlation due to instrument noise or alignment issues (~1-50 km).
 #
 # To better visualize the variogram, we can change the axis to log-scale, but this might make it more difficult
 # to later compare to variogram models.
 # Another solution is to split the variogram plot into subpanels, each with
@@ -113,7 +112,7 @@
 # is based on `scipy.optimize.curve_fit `_:
 func_sum_vgm1, params_vgm1 = xdem.spatialstats.fit_sum_model_variogram(list_models = ['Spherical'], empirical_variogram=df)
-func_sum_vgm2, params_vgm2 = xdem.spatialstats.fit_sum_model_variogram(list_models = ['Gaussian', 'Spherical'], empirical_variogram=df)
+func_sum_vgm2, params_vgm2 = xdem.spatialstats.fit_sum_model_variogram(list_models = ['Spherical', 'Spherical'], empirical_variogram=df)
 xdem.spatialstats.plot_variogram(df, list_fit_fun=[func_sum_vgm1, func_sum_vgm2],
 list_fit_fun_label=['Single-range model', 'Double-range model'],
@@ -123,11 +122,11 @@
 # The sum of two spherical models fits better, accounting for the small partial sill at longer ranges. Yet this longer
 # range partial sill (correlated variance) is quite small...
 #
-# **So one could ask himself: is it really important to account for this small additional "bump" in the variogram?**
+# **So one could wonder: is it really important to account for this small additional "bump" in the variogram?**
 #
 # To answer this, we compute the precision of the DEM integrated over a certain surface area based on spatial integration of the
 # variogram models using :func:`xdem.spatialstats.neff_circ`, with areas varying from pixel size to grid size.
-# Numerical and exact integration of variogram is fast, allowing us to estimate errors for a wide range of areas radidly.
+# Numerical and exact integration of variogram is fast, allowing us to estimate errors for a wide range of areas rapidly. areas = np.linspace(20**2, 10000**2, 1000) diff --git a/tests/test_spatialstats.py b/tests/test_spatialstats.py index 4fe34189..9e9b26db 100644 --- a/tests/test_spatialstats.py +++ b/tests/test_spatialstats.py @@ -40,7 +40,7 @@ def test_sample_multirange_variogram_default(self): """Verify that the default function runs, and its basic output""" # Load data - diff, mask = load_ref_and_diff()[1:3] + diff = load_ref_and_diff()[1] # Check the variogram output is consistent for a random state df0 = xdem.spatialstats.sample_empirical_variogram( @@ -83,7 +83,7 @@ def test_sample_multirange_variogram_methods(self, subsample_method): """Verify that all other methods run""" # Load data - diff, mask = load_ref_and_diff()[1:3] + diff = load_ref_and_diff()[1] # Check the variogram estimation runs for several methods df = xdem.spatialstats.sample_empirical_variogram( @@ -106,7 +106,7 @@ def test_sample_multirange_variogram_args(self): """Verify that optional parameters run only for their specific method, raise warning otherwise""" # Load data - diff, mask = load_ref_and_diff()[1:3] + diff = load_ref_and_diff()[1] pdist_args = {'pdist_multi_ranges':[0, diff.res[0]*5, diff.res[0]*10]} cdist_args = {'ratio_subsample': 0.5, 'samples': 50, 'runs': 10} @@ -288,7 +288,7 @@ def test_estimate_model_spatial_correlation_and_infer_from_stable(self): # Run wrapper infer from stable function with a Raster and the mask, and check the consistency there as well emp_vgm_3, params_model_vgm_3, _ = \ - xdem.spatialstats.infer_spatial_correlation_from_stable(dvalues=diff_on_stable, stable_mask=~mask_glacier.squeeze(), + xdem.spatialstats.infer_spatial_correlation_from_stable(dvalues=diff, stable_mask=~mask_glacier.squeeze(), list_models=['Gau', 'Sph'], subsample=100, random_state=42) pd.testing.assert_frame_equal(emp_vgm_1, emp_vgm_3) pd.testing.assert_frame_equal(params_model_vgm_1, params_model_vgm_3) @@ -296,7 +296,7 @@ def test_estimate_model_spatial_correlation_and_infer_from_stable(self): # Run again with array instead of Raster as input diff_on_stable_arr = gu.spatial_tools.get_array_and_mask(diff_on_stable)[0] emp_vgm_4, params_model_vgm_4, _ = \ - xdem.spatialstats.infer_spatial_correlation_from_stable(dvalues=diff_on_stable_arr, gsd=diff_on_stable.res[0], + xdem.spatialstats.infer_spatial_correlation_from_stable(dvalues=diff_on_stable_arr, gsd=diff.res[0], stable_mask=~mask_glacier.squeeze(), list_models=['Gau', 'Sph'], subsample=100, random_state=42) @@ -461,8 +461,7 @@ def test_neff_exact_and_approx_hugonnet(self): def test_number_effective_samples(self): """Test that the wrapper function for neff functions behaves correctly and that output values are robust""" - raster = load_ref_and_diff()[0] - outlines = load_ref_and_diff()[3] + raster, _, outlines = load_ref_and_diff()[1:] # The function should return the same result as neff_circular_approx_numerical when using a numerical area area = 10000 @@ -518,7 +517,7 @@ def test_number_effective_samples(self): def test_spatial_error_propagation(self): """Test that the spatial error propagation wrapper function runs properly""" - ref, diff, mask_glacier, vector_glacier = load_ref_and_diff() + ref, diff, vector_glacier = load_ref_and_diff() # Get the error map and variogram model with standardization slope, maxc = xdem.terrain.get_terrain_attribute(ref, attribute=['slope', 'maximum_curvature']) @@ -823,7 +822,8 @@ def 
test_two_step_standardization(self): scale_fac = xdem.spatialstats.nmad(zscores) assert scale_fac != 1 - # Filter with a factor of 5 and the standard deviation and check the function outputs the exact same array + # Filter with a factor of 3 and the standard deviation (not default values) and check the function outputs + # the exact same array zscores[np.abs(zscores) > 3*np.nanstd(zscores)] = np.nan scale_fac_std = np.nanstd(zscores) zscores /= scale_fac_std diff --git a/xdem/coreg.py b/xdem/coreg.py index 0d418e6c..0e5db317 100644 --- a/xdem/coreg.py +++ b/xdem/coreg.py @@ -869,6 +869,7 @@ def _to_matrix_func(self) -> np.ndarray: class ICP(Coreg): """ Iterative Closest Point DEM coregistration. + Based on 3D registration of Besl and McKay (1992), https://doi.org/10.1117/12.57955. Estimates a rigid transform (rotation + translation) between two DEMs. diff --git a/xdem/spatialstats.py b/xdem/spatialstats.py index 16c08602..834aab84 100644 --- a/xdem/spatialstats.py +++ b/xdem/spatialstats.py @@ -291,7 +291,7 @@ def two_step_standardization(dvalues: np.ndarray, list_var: Iterable[np.ndarray] Standardize the proxy differenced values using the modelled heteroscedasticity, re-scaled to the spread statistic, and generate the final standardization function. - :param dvalues: Proxy values array of size (N,) + :param dvalues: Proxy values as array of size (N,) (i.e., differenced values where signal should be zero such as elevation differences on stable terrain) :param list_var: List of size (L) of explanatory variables array of size (N,) :param unscaled_error_fun: Function of the spread with explanatory variables not yet re-scaled :param spread_statistic: Statistic to be computed for the spread; defaults to nmad @@ -339,7 +339,7 @@ def estimate_model_heteroscedasticity(dvalues: np.ndarray, list_var: Iterable[np The functions used are `nd_binning`, `interp_nd_binning` and `two_step_standardization`. - :param dvalues: Proxy values array of size (N,) + :param dvalues: Proxy values as array of size (N,) (i.e., differenced values where signal should be zero such as elevation differences on stable terrain) :param list_var: List of size (L) of explanatory variables array of size (N,) :param list_var_names: List of size (L) of names of the explanatory variables :param spread_statistic: Statistic to be computed for the spread; defaults to nmad @@ -410,7 +410,7 @@ def infer_heteroscedasticity_from_stable(dvalues: np.ndarray | RasterType, list_ If no stable or unstable mask is provided to mask in or out the values, all terrain is used. 
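    A minimal usage sketch (hedged: the Longyearbyen example data and the slope as single explanatory variable are
    illustrative assumptions, not part of the original docstring):

    >>> import xdem
    >>> ref = xdem.DEM(xdem.examples.get_path("longyearbyen_ref_dem"))
    >>> dh = xdem.DEM(xdem.examples.get_path("longyearbyen_ddem"))
    >>> slope = xdem.terrain.slope(ref)
    >>> # Returns the error map, the binning dataframe and the error function of the explanatory variables
    >>> errors, df_binning, error_function = xdem.spatialstats.infer_heteroscedasticity_from_stable(
    ...     dvalues=dh, list_var=[slope], list_var_names=['slope'])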
- :param dvalues: Proxy values as array or Raster + :param dvalues: Proxy values as array or Raster (i.e., differenced values where signal should be zero such as elevation differences on stable terrain) :param list_var: List of size (L) of explanatory variables as array or Raster of same shape as dvalues :param stable_mask: Vector shapefile of stable terrain (if dvalues is Raster), or boolean array of same shape as dvalues :param unstable_mask: Vector shapefile of unstable terrain (if dvalues is Raster), or boolean array of same shape as dvalues @@ -904,7 +904,7 @@ def sample_empirical_variogram(values: Union[np.ndarray, RasterType], gsd: float :param coords: Coordinates :param subsample: Number of samples to randomly draw from the values :param subsample_method: Spatial subsampling method - :param n_variograms: Number of independent empirical variogram estimations + :param n_variograms: Number of independent empirical variogram estimations (to estimate empirical variogram spread) :param n_jobs: Number of processing cores :param verbose: Print statements during processing :param random_state: Random state or seed number to use for calculations (to fix random sampling during testing) @@ -1310,10 +1310,13 @@ def estimate_model_spatial_correlation(dvalues: Union[np.ndarray, RasterType], l Estimate and model the spatial correlation of the input variable by empirical variogram sampling and fitting of a sum of variogram model. + The spatial correlation is returned as a function of spatial lags (in units of the input coordinates) which gives a + correlation value between 0 and 1. + This function samples an empirical variogram using skgstat.Variogram, then optimizes by weighted least-squares the sum of a defined number of models, using the functions `sample_empirical_variogram` and `fit_sum_model_variogram`. - :param dvalues: Proxy values of studied variable + :param dvalues: Proxy values as array or Raster (i.e., differenced values where signal should be zero such as elevation differences on stable terrain) :param list_models: List of K variogram models to sum for the fit in order from short to long ranges. Can either be a 3-letter string, full string of the variogram name or SciKit-GStat model function (e.g., for a spherical model "Sph", "Spherical" or skgstat.models.spherical). @@ -1323,14 +1326,14 @@ def estimate_model_spatial_correlation(dvalues: Union[np.ndarray, RasterType], l :param coords: Coordinates :param subsample: Number of samples to randomly draw from the values :param subsample_method: Spatial subsampling method - :param n_variograms: Number of independent empirical variogram estimations + :param n_variograms: Number of independent empirical variogram estimations (to estimate empirical variogram spread) :param n_jobs: Number of processing cores :param verbose: Print statements during processing :param random_state: Random state or seed number to use for calculations (to fix random sampling during testing) :param bounds: Bounds of range and sill parameters for each model (shape K x 4 = K x range lower, range upper, sill lower, sill upper). :param p0: Initial guess of ranges and sills each model (shape K x 2 = K x range first guess, sill first guess). 
- :return: Dataframe of empirical variogram, Dataframe of optimized model parameters, Spatial correlation function + :return: Dataframe of empirical variogram, Dataframe of optimized model parameters, Function of spatial correlation (0 to 1) with spatial lags """ empirical_variogram = sample_empirical_variogram(values=dvalues, estimator=estimator, gsd=gsd, coords=coords, @@ -1353,8 +1356,8 @@ def infer_spatial_correlation_from_stable(dvalues: np.ndarray | RasterType, estimator = 'dowd', gsd: float = None, coords: np.ndarray = None, subsample: int = 1000, subsample_method: str = 'cdist_equidistant', n_variograms: int = 1, n_jobs: int = 1, verbose = False, - random_state: None | np.random.RandomState | np.random.Generator | int = None, bounds: list[tuple[float, float]] = None, p0: list[float] = None, + random_state: None | np.random.RandomState | np.random.Generator | int = None, **kwargs ) -> tuple[pd.DataFrame, pd.DataFrame, Callable[[np.ndarray], np.ndarray]]: """ @@ -1362,13 +1365,14 @@ def infer_spatial_correlation_from_stable(dvalues: np.ndarray | RasterType, as a sum. This function returns a dataframe of the empirical variogram, a dataframe of optimized model parameters, and a - spatial correlation function. + spatial correlation function. The spatial correlation is returned as a function of spatial lags + (in units of the input coordinates) which gives a correlation value between 0 and 1. It is a convenience wrapper for `estimate_model_spatial_correlation` to work on either Raster or array and compute the stable mask. If no stable or unstable mask is provided to mask in or out the values, all terrain is used. - :param dvalues: Proxy values as array or Raster + :param dvalues: Proxy values as array or Raster (i.e., differenced values where signal should be zero such as elevation differences on stable terrain) :param list_models: List of K variogram models to sum for the fit in order from short to long ranges. Can either be a 3-letter string, full string of the variogram name or SciKit-GStat model function (e.g., for a spherical model "Sph", "Spherical" or skgstat.models.spherical). @@ -1381,14 +1385,14 @@ def infer_spatial_correlation_from_stable(dvalues: np.ndarray | RasterType, :param coords: Coordinates :param subsample: Number of samples to randomly draw from the values :param subsample_method: Spatial subsampling method - :param n_variograms: Number of independent empirical variogram estimations + :param n_variograms: Number of independent empirical variogram estimations (to estimate empirical variogram spread) :param n_jobs: Number of processing cores :param verbose: Print statements during processing - :param random_state: Random state or seed number to use for calculations (to fix random sampling during testing) :param bounds: Bounds of range and sill parameters for each model (shape K x 4 = K x range lower, range upper, sill lower, sill upper). :param p0: Initial guess of ranges and sills each model (shape K x 2 = K x range first guess, sill first guess). 
+ :param random_state: Random state or seed number to use for calculations (to fix random sampling during testing) - :return: Dataframe of empirical variogram, Dataframe of optimized model parameters, Spatial correlation function + :return: Dataframe of empirical variogram, Dataframe of optimized model parameters, Function of spatial correlation (0 to 1) with spatial lags """ # Check inputs @@ -1411,7 +1415,6 @@ def infer_spatial_correlation_from_stable(dvalues: np.ndarray | RasterType, else: dvalues_arr = dvalues - # If the stable mask is not an array, create it # If the stable mask is not an array, create it if stable_mask is None: stable_mask_arr = np.ones(np.shape(dvalues_arr), dtype=bool) From 175b35bf728d4ed9f589c7de4b869e3cd8e8199d Mon Sep 17 00:00:00 2001 From: rhugonne Date: Wed, 24 Aug 2022 13:13:31 +0200 Subject: [PATCH 20/21] Fix sphinx syntax issues --- docs/source/intro_accuracy_precision.rst | 2 +- examples/plot_heterosc_estimation_modelling.py | 2 +- examples/plot_infer_spatial_correlation.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/intro_accuracy_precision.rst b/docs/source/intro_accuracy_precision.rst index 863361bb..eb823e58 100644 --- a/docs/source/intro_accuracy_precision.rst +++ b/docs/source/intro_accuracy_precision.rst @@ -51,7 +51,7 @@ Optimizing DEM absolute accuracy -------------------------------- .. figure:: https://github.com/rhugonnet/dem_error_study/blob/main/figures/fig_2.png?raw=true - :witdh: 100% + :width: 100% Source: `Hugonnet et al. (2022) `_. diff --git a/examples/plot_heterosc_estimation_modelling.py b/examples/plot_heterosc_estimation_modelling.py index 58a86bae..77a6aff4 100644 --- a/examples/plot_heterosc_estimation_modelling.py +++ b/examples/plot_heterosc_estimation_modelling.py @@ -34,7 +34,7 @@ ref_dem = xdem.DEM(xdem.examples.get_path("longyearbyen_ref_dem")) dh = xdem.DEM(xdem.examples.get_path("longyearbyen_ddem")) glacier_outlines = gu.Vector(xdem.examples.get_path("longyearbyen_glacier_outlines")) -mask_glacier = glacier_outlines.create_mask(dh) +mask_glacier = glacier_outlines.create_mask(dh).squeeze() # %% # We derive terrain attributes from the reference DEM (see :ref:`sphx_glr_auto_examples_plot_terrain_attributes.py`), diff --git a/examples/plot_infer_spatial_correlation.py b/examples/plot_infer_spatial_correlation.py index 3e56198d..89547d7d 100644 --- a/examples/plot_infer_spatial_correlation.py +++ b/examples/plot_infer_spatial_correlation.py @@ -29,7 +29,7 @@ # %% # The first output corresponds to the dataframe of the empirical variogram, by default estimated using Dowd's estimator -# and the circular sampling scheme of `skgstat.RasterEquidistantMetricSpace` (Fig. S13 of Hugonnet et al. (2022)). The +# and the circular sampling scheme of ``skgstat.RasterEquidistantMetricSpace`` (Fig. S13 of Hugonnet et al. (2022)). 
The
 # ``lags`` column is the upper bound of spatial lag bins (lower bound of first bin being 0), the ``exp`` column is the
 # "experimental" variance value of the variogram in that bin, the ``count`` the number of pairwise samples, and
 # ``err_exp`` the 1-sigma error of the "experimental" variance, if more than one variogram is estimated with the
 # ``n_variograms`` parameter.

From 2f3145eceebae24e338124e9e9e9ff9c217d6b2f Mon Sep 17 00:00:00 2001
From: rhugonne
Date: Wed, 24 Aug 2022 14:19:51 +0200
Subject: [PATCH 21/21] Fixes on warnings, wrong edits when accounting for comments

---
 docs/source/intro_accuracy_precision.rst | 33 ++++++++++++-------
 .../plot_heterosc_estimation_modelling.py | 32 +++++------------
 .../plot_variogram_estimation_modelling.py | 4 +--
 tests/test_spatialstats.py | 2 +-
 xdem/spatialstats.py | 2 +-
 5 files changed, 33 insertions(+), 40 deletions(-)

diff --git a/docs/source/intro_accuracy_precision.rst b/docs/source/intro_accuracy_precision.rst
index eb823e58..6c9fc1ae 100644
--- a/docs/source/intro_accuracy_precision.rst
+++ b/docs/source/intro_accuracy_precision.rst
@@ -22,21 +22,34 @@
 calculations consistent, reproducible, and easy.

 Accuracy and precision
 ----------------------

-Both `accuracy and precision `_ are important factors to account
-for when analyzing DEMs:
+`Accuracy and precision `_ describe systematic and random errors,
+respectively.

-- the **accuracy** (systematic error) of a DEM describes how close a DEM is to the true location of measured elevations on the Earth's surface,
-- the **precision** (random error) of a DEM describes the typical spread of its error in measurement, independently of a possible bias from the true positioning.
-
-*Note: sometimes "accuracy" is also used to describe both types of errors, and "trueness" systematic errors, following
-`ISO 5725-1 `_. Here, we used accuracy for systematic errors
-as, to our knowledge, it is more common in remote sensing applications.*
+*Note: sometimes "accuracy" is also used to describe both types of errors, and "trueness" systematic errors, as defined
+in* `ISO 5725-1 `_ *. Here, we use accuracy for systematic
+errors as, to our knowledge, it is the more common terminology in remote sensing applications.*

 .. figure:: imgs/precision_accuracy.png
 :width: 80%

 Source: `antarcticglaciers.org `_, accessed 29.06.21.

+For DEMs, we thus have:
+
+- **DEM accuracy** (systematic error) describes how close a DEM is to the true location of measured elevations on the Earth's surface,
+- **DEM precision** (random error) describes the typical spread of its error in measurement, independently of a possible bias from the true positioning.
+
+The spatial structure of DEMs complicates the notion of accuracy and precision, however. Spatially structured
+systematic errors are often related to the gridded nature of DEMs, creating **affine biases**, while other, **specific
+biases** exist at the pixel scale. For random errors, a variability in error magnitude or **heteroscedasticity** exists
+across the DEM, while spatially structured patterns of errors are linked to **spatial correlations**.
+
+.. figure:: https://github.com/rhugonnet/dem_error_study/blob/main/figures/fig_2.png?raw=true
+ :width: 100%
+
+ Source: `Hugonnet et al. (2022) `_.
+
 Absolute or relative accuracy
 -----------------------------
@@ -50,7 +63,6 @@
 TODO: Add another little schematic!

 Optimizing DEM absolute accuracy
 --------------------------------
-..
figure:: https://github.com/rhugonnet/dem_error_study/blob/main/figures/fig_2.png?raw=true
- :width: 100%
-
- Source: `Hugonnet et al. (2022) `_.

 Shifts due to poor absolute accuracy are common in elevation datasets, and can be easily corrected by performing a DEM
diff --git a/examples/plot_heterosc_estimation_modelling.py b/examples/plot_heterosc_estimation_modelling.py
index 77a6aff4..23eedcdb 100644
--- a/examples/plot_heterosc_estimation_modelling.py
+++ b/examples/plot_heterosc_estimation_modelling.py
@@ -177,9 +177,10 @@
 # We also identify that steep slopes (> 40°) only correspond to high curvature, while the opposite is not true, hence
 # the importance of mapping the variability in two dimensions.
 #
-# Now we need to account for the non-stationarities identified. For this, the simplest approach is a numerical
-# approximation i.e. a piecewise linear interpolation/extrapolation based on the binning results.
-# To ensure that only robust statistic values are used in the interpolation, we set a ``min_count`` value at 30 samples.
+# Now we need to account for the heteroscedasticity identified. For this, the simplest approach is a numerical
+# approximation, i.e., a piecewise linear interpolation/extrapolation based on the binning results available through
+# the function :func:`xdem.spatialstats.interp_nd_binning`. To ensure that only robust statistic values are used
+# in the interpolation, we set a ``min_count`` value at 30 samples.
 unscaled_dh_err_fun = xdem.spatialstats.interp_nd_binning(df, list_var_names=['slope', 'maxc'],
 statistic='nmad', min_count=30)
@@ -197,7 +198,7 @@
 # %%
 # Thus, we rescale the function to exactly match the spread on stable terrain using the
-# ``xdem.spatialstats.two_step_standardization`` function, and get our final error function.
+# :func:`xdem.spatialstats.two_step_standardization` function, and get our final error function.
 zscores, dh_err_fun = xdem.spatialstats.two_step_standardization(dh_arr, list_var=[slope_arr, maxc_arr],
 unscaled_error_fun=unscaled_dh_err_fun)
@@ -209,23 +210,6 @@
 # %%
 # This function can be used to estimate the spatial distribution of the elevation error on the extent of our DEMs:
 maxc = np.maximum(np.abs(profc), np.abs(planc))
-errors = dh_err_fun((slope, maxc))
-
-plt.figure(figsize=(8, 5))
-plt_extent = [
- ref_dem.bounds.left,
- ref_dem.bounds.right,
- ref_dem.bounds.bottom,
- ref_dem.bounds.top,
-]
-plt.imshow(errors.squeeze(), cmap="Reds", vmin=2, vmax=8, extent=plt_extent)
-cbar = plt.colorbar()
-cbar.set_label('Elevation error ($1\sigma$, m)')
-plt.show()
-
-# %%
-# These 3 steps can be done in one go using the ``xdem.spatialstats.estimate_model_heteroscedasticity`` function, which
-# wraps those three funtions, expecting ``np.ndarray`` inputs subset to stable terrain.
-
-df, dh_err_fun = xdem.spatialstats.estimate_model_heteroscedasticity(dvalues=dh_arr, list_var=[slope_arr, maxc_arr],
- list_var_names=['slope', 'maxc'])
+errors = dh.copy(new_array=dh_err_fun((slope, maxc)))
+
+errors.show(cmap='Reds', vmin=2, vmax=8, cb_title='Elevation error ($1\sigma$, m)')
diff --git a/examples/plot_variogram_estimation_modelling.py b/examples/plot_variogram_estimation_modelling.py
index add44442..f6b4f4f1 100644
--- a/examples/plot_variogram_estimation_modelling.py
+++ b/examples/plot_variogram_estimation_modelling.py
@@ -33,12 +33,12 @@
 # %%
 # We exclude values on glacier terrain in order to isolate stable terrain, our proxy for elevation errors.
-dh.data[mask_glacier] = np.nan
+dh.set_mask(mask_glacier)

 # %%
 # We estimate the average per-pixel elevation error on stable terrain, using both the standard deviation
 # and the normalized median absolute deviation. For this example, we do not account for elevation heteroscedasticity.
-print('STD: {:.2f} meters.'.format(np.nanstd(dh.data)))
+print('STD: {:.2f} meters.'.format(np.ma.std(dh.data)))
 print('NMAD: {:.2f} meters.'.format(xdem.spatialstats.nmad(dh.data)))

 # %%
diff --git a/tests/test_spatialstats.py b/tests/test_spatialstats.py
index 9e9b26db..51dfd162 100644
--- a/tests/test_spatialstats.py
+++ b/tests/test_spatialstats.py
@@ -517,7 +517,7 @@ def test_number_effective_samples(self):
 def test_spatial_error_propagation(self):
 """Test that the spatial error propagation wrapper function runs properly"""
- ref, diff, vector_glacier = load_ref_and_diff()
+ ref, diff, _, vector_glacier = load_ref_and_diff()

 # Get the error map and variogram model with standardization
 slope, maxc = xdem.terrain.get_terrain_attribute(ref, attribute=['slope', 'maximum_curvature'])
diff --git a/xdem/spatialstats.py b/xdem/spatialstats.py
index 834aab84..73d146c0 100644
--- a/xdem/spatialstats.py
+++ b/xdem/spatialstats.py
@@ -38,7 +38,7 @@ def nmad(data: np.ndarray, nfact: float = 1.4826) -> float:
 Calculate the normalized median absolute deviation (NMAD) of an array.
 Default scaling factor is 1.4826 to scale the median absolute deviation (MAD) to the dispersion of a normal
 distribution (see https://en.wikipedia.org/wiki/Median_absolute_deviation#Relation_to_standard_deviation, and
- e.g. http://dx.doi.org/10.1016/j.isprsjprs.2009.02.003)
+ e.g. Höhle and Höhle (2009), http://dx.doi.org/10.1016/j.isprsjprs.2009.02.003)

 :param data: Input data
 :param nfact: Normalization factor for the data
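    A worked sketch of the statistic described above (illustrative values; for normally distributed data the NMAD
    approaches the standard deviation, here 2):

    >>> import numpy as np
    >>> data = np.random.default_rng(seed=42).normal(0, 2, 100000)
    >>> nmad_manual = 1.4826 * np.nanmedian(np.abs(data - np.nanmedian(data)))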