Merge pull request #185 from NOAA-OWP/memory_analysis

Memory analysis

fernando-aristizabal committed Jun 14, 2024
2 parents 9d5bc26 + 462c3c7 commit 19f8c1d

Showing 12 changed files with 79 additions and 34 deletions.
2 changes: 1 addition & 1 deletion notebooks/Catalog Tutorial.ipynb
@@ -884,7 +884,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
"version": "3.10.14"
}
},
"nbformat": 4,
2 changes: 1 addition & 1 deletion notebooks/Continuous Comparison Tutorial.ipynb
@@ -710,7 +710,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
"version": "3.10.14"
}
},
"nbformat": 4,
2 changes: 1 addition & 1 deletion notebooks/Tutorial.ipynb
@@ -1158,7 +1158,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
"version": "3.10.14"
}
},
"nbformat": 4,
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -12,7 +12,7 @@ authors = [
requires-python = ">=3.8"
keywords = ["geospatial", "evaluations"]
license = {text = "MIT"}
version = "0.2.6"
version = "0.2.7"
dynamic = ["readme", "dependencies"]

[project.optional-dependencies]
7 changes: 3 additions & 4 deletions requirements.txt
@@ -1,17 +1,16 @@
rioxarray>=0.13.4
dask==2023.5.0
xarray-spatial==0.3.5
dask>=2023.5.0,<2025
pandera==0.15.1
shapely==2.0.1
geocube>=0.3.3
pandas==2.0.2
odc-geo==0.4.1
pydantic==1.10.10
pydantic>=1.10.13
rio-cogeo==4.0.0
matplotlib==3.7.1
contextily==1.3.0
flox==0.7.2
xskillscore==0.0.24
pyogrio==0.7.2
pyogrio>=0.7.2,<=0.8.0
pystac-client==0.7.5
s3fs<=2023.12.1
7 changes: 6 additions & 1 deletion src/gval/catalogs/catalogs.py
@@ -6,6 +6,7 @@
# __all__ = ['*']
__author__ = "Fernando Aristizabal"

import gc
from typing import Iterable, Optional, Callable, Tuple
import os

@@ -172,7 +173,7 @@ def loadxr(map, open_kwargs):
else:
raise ValueError("compare_type must be str or Callable")

# write agreement map to file
# Write agreement map to file
if (agreement_map_field is not None) & isinstance(
agreement_map, (xr.DataArray, xr.Dataset)
):
@@ -181,6 +182,10 @@ def loadxr(map, open_kwargs):
row[agreement_map_field], **agreement_map_write_kwargs
)

# Unfortunately necessary until a fix is found in xarray/rioxarray io
del candidate_map, benchmark_map, agreement_map
gc.collect()

return metrics_df

# make kwargs for dask apply
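A minimal sketch (not part of the commit) of the cleanup pattern this hunk adds to the catalog comparison loop; the file paths, the `compare_row` name, and the subtraction used as a stand-in comparison are illustrative only:

```python
import gc

import rioxarray


def compare_row(candidate_path: str, benchmark_path: str, agreement_path: str) -> None:
    """Illustrative per-row comparison; the actual comparison logic is a placeholder."""
    candidate_map = rioxarray.open_rasterio(candidate_path, masked=True)
    benchmark_map = rioxarray.open_rasterio(benchmark_path, masked=True)

    # Stand-in for the real agreement computation.
    agreement_map = candidate_map - benchmark_map
    agreement_map.rio.to_raster(agreement_path)

    # Drop references and force a collection so rioxarray/xarray buffers and
    # file handles are released before the next catalog row is processed.
    del candidate_map, benchmark_map, agreement_map
    gc.collect()
```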
53 changes: 41 additions & 12 deletions src/gval/comparison/pairing_functions.py
@@ -17,10 +17,34 @@
from numbers import Number

import numpy as np
import numba as nb


@nb.vectorize(nopython=True)
from numba import vectorize, uint8, int32, int64, float32, float64, boolean


# Numba Type Definitions
one_param_function_types = [
uint8(uint8),
int32(int32),
int64(int64),
float32(float32),
float64(float64),
]
two_param_function_types = [
uint8(uint8, uint8),
int32(int32, int32),
int64(int64, int64),
float32(float32, float32),
float64(float64, float64),
]
not_natural_number_types = [
boolean(uint8, boolean),
boolean(int32, boolean),
int64(int64, boolean),
float32(float32, boolean),
float64(float64, boolean),
]


@vectorize(not_natural_number_types, nopython=True)
def _is_not_natural_number(
x: Number, raise_exception: bool
) -> bool: # pragma: no cover
@@ -49,7 +73,7 @@ def _is_not_natural_number(
return False # treated as natural for this use case

# checks for non-negative and whole number
elif (x < 0) | ((x - nb.int64(x)) != 0):
elif (x < 0) | ((x - int64(x)) != 0):
if raise_exception:
raise ValueError(
"Non natural number found (non-negative integers, excluding Inf) [0, 1, 2, 3, 4, ...)"
@@ -62,7 +86,7 @@
return False


@nb.vectorize(nopython=True)
@vectorize(two_param_function_types, nopython=True)
def cantor_pair(c: Number, b: Number) -> Number: # pragma: no cover
"""
Produces unique natural number for two non-negative natural numbers (0,1,2,...)
@@ -92,7 +116,7 @@ def cantor_pair(c: Number, b: Number) -> Number:  # pragma: no cover
return 0.5 * (c**2 + c + 2 * c * b + 3 * b + b**2)


@nb.vectorize(nopython=True)
@vectorize(two_param_function_types, nopython=True)
def szudzik_pair(c: Number, b: Number) -> Number: # pragma: no cover
"""
Produces unique natural number for two non-negative natural numbers (0,1,2,3,...).
@@ -122,7 +146,7 @@ def szudzik_pair(c: Number, b: Number) -> Number:  # pragma: no cover
return c**2 + c + b if c >= b else b**2 + c


@nb.vectorize(nopython=True)
@vectorize(one_param_function_types, nopython=True)
def _negative_value_transformation(x: Number) -> Number: # pragma: no cover
"""
Transforms negative values for use with pairing functions that only accept non-negative integers.
@@ -147,7 +171,7 @@ def _negative_value_transformation(x: Number) -> Number:  # pragma: no cover
return 2 * x if x >= 0 else -2 * x - 1


@nb.vectorize(nopython=True)
@vectorize(two_param_function_types, nopython=True)
def cantor_pair_signed(c: Number, b: Number) -> Number: # pragma: no cover
"""
Output unique natural number for each unique combination of whole numbers using Cantor signed method.
@@ -177,7 +201,7 @@ def cantor_pair_signed(c: Number, b: Number) -> Number:  # pragma: no cover
return cantor_pair(ct, bt)


@nb.vectorize(nopython=True)
# from typing import TypeVar
#
# T = TypeVar("T")


@vectorize(two_param_function_types, nopython=True)
def szudzik_pair_signed(c: Number, b: Number) -> Number: # pragma: no cover
"""
Output unique natural number for each unique combination of whole numbers using Szudzik signed method._summary_
@@ -386,10 +415,10 @@ def pairing_dict_fn(
"Value combination found not accounted for in pairing dictionary"
)

return nb.vectorize(nopython=True)(pairing_dict_fn)
return vectorize(two_param_function_types, nopython=True)(pairing_dict_fn)


@nb.vectorize(nopython=True)
@vectorize(two_param_function_types, nopython=True)
def difference(c: Number, b: Number) -> Number: # pragma: no cover
"""
Calculates the difference between candidate and benchmark.
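A hedged sketch (not from this repository) of what the explicit signature lists above buy: `@vectorize` with signatures compiles eagerly at import time and fixes the accepted input/output dtype combinations, so results keep a predictable dtype rather than whatever a lazily compiled ufunc infers at call time. The `add_pair` function is illustrative only:

```python
import numpy as np
from numba import vectorize, int32, int64, float32, float64

two_param_function_types = [
    int32(int32, int32),
    int64(int64, int64),
    float32(float32, float32),
    float64(float64, float64),
]


@vectorize(two_param_function_types, nopython=True)
def add_pair(c, b):
    # Illustrative stand-in for a pairing function.
    return c + b


a = np.arange(3, dtype=np.int32)
b = np.arange(3, dtype=np.int32)
print(add_pair(a, b).dtype)  # int32 -- the signature, not promotion, decides the dtype
```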
14 changes: 10 additions & 4 deletions src/gval/comparison/tabulation.py
@@ -91,16 +91,20 @@ def _crosstab_2d_DataArrays(
is_dsk = True

agreement_map.name = "group"
ag_dtype = agreement_map.dtype

if is_dsk:
agreement_counts = xarray_reduce(
agreement_map,
agreement_map,
engine="numba",
expected_groups=dask.array.unique(agreement_map.data),
func="count",
)
else:
agreement_counts = xarray_reduce(agreement_map, agreement_map, func="count")
agreement_counts = xarray_reduce(
agreement_map, agreement_map, engine="numba", func="count"
)

def not_nan(number):
return not np.isnan(number)
@@ -129,13 +133,15 @@ def not_nan(number):
for x in filter(not_nan, agreement_counts.coords["group"].values)
],
"agreement_values": list(
filter(not_nan, agreement_counts.coords["group"].values.astype(float))
filter(
not_nan, agreement_counts.coords["group"].values.astype(ag_dtype)
)
),
"counts": [
x
for x, y in zip(
agreement_counts.values.astype(float),
agreement_counts.coords["group"].values.astype(float),
agreement_counts.values.astype(ag_dtype),
agreement_counts.coords["group"].values.astype(ag_dtype),
)
if not np.isnan(y)
],
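A small sketch (synthetic data, not from the commit) of the flox call pattern used above: the agreement map is grouped by itself and its members counted, with `engine="numba"` selecting flox's numba-backed aggregation kernels; for dask-backed arrays the diff additionally passes precomputed `expected_groups`:

```python
import numpy as np
import xarray as xr
from flox.xarray import xarray_reduce

agreement_map = xr.DataArray(
    np.array([[0, 1, 1], [2, 2, 2]], dtype=np.uint8),
    dims=("y", "x"),
    name="group",
)

# Group the array by itself and count members of each unique value.
agreement_counts = xarray_reduce(
    agreement_map, agreement_map, engine="numba", func="count"
)
print(agreement_counts.coords["group"].values)  # unique agreement values
print(agreement_counts.values)                  # their counts
```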
12 changes: 6 additions & 6 deletions tests/cases_catalogs.py
@@ -79,15 +79,15 @@
pd.DataFrame(
{
"map_id_candidate": [
"s3://gval-test/candidate_continuous_0.tif",
"s3://gval-test/candidate_continuous_1.tif",
"s3://gval-test/candidate_continuous_1.tif",
f"{TEST_DATA_DIR}/candidate_continuous_0.tif",
f"{TEST_DATA_DIR}/candidate_continuous_1.tif",
f"{TEST_DATA_DIR}/candidate_continuous_1.tif",
],
"compare_id": ["compare1", "compare2", "compare2"],
"map_id_benchmark": [
"s3://gval-test/benchmark_continuous_0.tif",
"s3://gval-test/benchmark_continuous_1.tif",
"s3://gval-test/benchmark_continuous_1.tif",
f"{TEST_DATA_DIR}/benchmark_continuous_0.tif",
f"{TEST_DATA_DIR}/benchmark_continuous_1.tif",
f"{TEST_DATA_DIR}/benchmark_continuous_1.tif",
],
"value1_candidate": [1, 2, 2],
"value2_candidate": [5, 6, 6],
2 changes: 1 addition & 1 deletion tests/cases_compare.py
@@ -225,7 +225,7 @@ def case_make_pairing_dict(
pairing_dict_fn_inputs = [
(1, 2, {(1, 2): 3}, 3),
(9, 10, {(9, 10.0): 1}, 1),
(-1, 10, {(-1, 10): np.nan}, np.nan),
(-1.0, 10.0, {(-1.0, 10.0): np.nan}, np.nan),
]


2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -18,7 +18,7 @@
from gval.comparison.pairing_functions import PairingDict

# name of S3 for test data
TEST_DATA_S3_NAME = "gval-test"
TEST_DATA_S3_NAME = "gval"
TEST_DATA_DIR = f"s3://{TEST_DATA_S3_NAME}"


8 changes: 7 additions & 1 deletion tests/test_homogenize.py
@@ -10,6 +10,7 @@
import xarray as xr
import numpy as np
import geopandas as gpd
from geopandas.testing import assert_geodataframe_equal

from gval.homogenize.spatial_alignment import (
_matching_crs,
@@ -191,7 +192,12 @@ def test_vectorize_raster_success(raster_map, expected):
vector_df = _vectorize_data(raster_data=raster_map)

assert isinstance(vector_df, gpd.GeoDataFrame)
assert vector_df.equals(expected)
assert_geodataframe_equal(
vector_df.sort_values("geometry", ignore_index=True),
expected.sort_values("geometry", ignore_index=True),
check_index_type=False,
check_dtype=False,
)


@parametrize_with_cases(
Expand Down
