Skip to content

Commit

Permalink
Create validator for evaluating geoshape WKT values
Browse files Browse the repository at this point in the history
Why these changes are being introduced:
* This is part of an effort to add some data validation to the TIMDEX
pipeline, with an aim to catch errors with records in earlier stages
(e.g., during the harvest). This also proposes a workflow wherein
validation methods can easily be applied as decorators to field methods
for a given SourceRecord.

How this addresses that need:
* Add 'shapely' as a dependency
* Create custom warning for invalid field values
* Create decorator class for validating geoshape WKT values
   * Returns None if geoshape WKT value is invalid (cannot be parsed)

Side effects of this change:
* None

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/GDT-159
  • Loading branch information
jonavellecuerdo committed Feb 5, 2024
1 parent da958f4 commit 79a593d
Show file tree
Hide file tree
Showing 6 changed files with 189 additions and 19 deletions.
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ lxml = "*"
jsonlines = "*"
pycountry = "*"
pygit2 = "*"
shapely = "*"

[dev-packages]
black = "*"
Expand Down
127 changes: 108 additions & 19 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions harvester/records/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@
from jsonschema.exceptions import ValidationError


class FieldValueInvalidWarning(Warning):
    """Warning logged when a validator finds a field method's value to be invalid.

    This is a Warning subclass rather than an Exception subclass.
    """


class FieldMethodError(Exception):
"""Exception to raise from normalize() method"""

Expand Down
57 changes: 57 additions & 0 deletions harvester/records/validators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import functools
import logging
import re
from ast import literal_eval
from collections.abc import Callable

import shapely

from harvester.records.exceptions import FieldValueInvalidWarning

logger = logging.getLogger(__name__)


class ValidateGeoshapeWKT:
    """Decorator class for validating geoshape WKT values.

    The validator should be applied to any field methods that retrieve geoshape
    WKT values. The validator logs a warning if the WKT value cannot be parsed
    using the shapely module. If the WKT value cannot be parsed, the validator
    resets the value to None. A value that is already None is passed through
    untouched (there is nothing to parse, so no warning is logged).

    The class implements the descriptor protocol (``__get__``) so that it can
    decorate instance methods: when the decorated attribute is looked up on an
    instance, that instance is bound as the first argument to the wrapped field
    method. It can still wrap and call a plain zero-argument callable directly.

    Note: shapely does not currently support WKT values for bounding box regions.
    A workaround relies on manually retrieving the vertices for the envelope using
    regex and passing these values into the shapely.box() method.
    """

    invalid_wkt_warning_message: str = (
        "field: {field_name}, unable to parse WKT: '{value}'; setting value to None"
    )

    # Compiled once at class creation instead of on every validation call.
    _envelope_regex = re.compile(r"^ENVELOPE\s?(.*)")

    def __init__(self, field_method: Callable):
        """Wrap ``field_method``, copying its metadata (name, docstring) onto self."""
        functools.update_wrapper(self, field_method)
        self.field_method = field_method

    def __get__(
        self, obj: object, objtype: type | None = None
    ) -> "ValidateGeoshapeWKT | functools.partial[str | None]":
        """Bind the owning instance so the validator works on instance methods.

        Without this, looking up the decorated attribute on an instance returns
        the bare validator and the wrapped method never receives its instance.

        Returns:
            The validator itself for class-level access (``obj`` is None),
            otherwise a partial of ``__call__`` with ``obj`` pre-bound.
        """
        if obj is None:
            return self
        return functools.partial(self.__call__, obj)

    def __call__(self, *args: object, **kwargs: object) -> str | None:
        """Run the wrapped field method and validate the WKT value it returns.

        Any positional/keyword arguments are forwarded unchanged to the wrapped
        field method (for a decorated instance method the first positional
        argument is the instance bound by ``__get__``).

        Returns:
            The original value if it can be parsed as a geoshape, otherwise None.
        """
        field_name = self.field_method.__name__.removeprefix("_")
        value = self.field_method(*args, **kwargs)

        # A missing value is not an unparseable one; skip validation quietly.
        if value is None:
            return None

        try:
            self.create_geoshape(value)
        # Deliberately broad: parsing can fail in several ways (regex on a
        # non-string, literal_eval, tuple unpacking, shapely itself) and every
        # one of them means the value is unusable.
        except Exception:  # noqa: BLE001
            logger.warning(
                FieldValueInvalidWarning(
                    self.invalid_wkt_warning_message.format(
                        field_name=field_name, value=value
                    )
                )
            )
            return None
        return value

    @staticmethod
    def create_geoshape(wkt: str) -> shapely.Geometry:
        """Build a shapely geometry from a WKT (or ENVELOPE) string.

        Strings starting with ``ENVELOPE`` are handled manually: the remainder
        of the string is evaluated as a Python literal, unpacked as
        ``(xmin, xmax, ymax, ymin)`` and passed to ``shapely.box``. Everything
        else is handed to ``shapely.from_wkt``.

        Raises:
            Exception: whatever the regex match, ``literal_eval``, the tuple
                unpacking, or shapely raise for input that cannot be parsed.
        """
        if envelope_match := ValidateGeoshapeWKT._envelope_regex.match(wkt):
            xmin, xmax, ymax, ymin = literal_eval(envelope_match.group(1))
            # shapely.box takes (xmin, ymin, xmax, ymax), so reorder the values.
            return shapely.box(xmin, ymin, xmax, ymax)
        return shapely.from_wkt(wkt)
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ disallow_untyped_calls = true
disallow_untyped_defs = true
exclude = ["tests/", "output/"]

[[tool.mypy.overrides]]
module = ["shapely"]
ignore_missing_imports = true

[tool.pytest.ini_options]
log_level = "INFO"
markers = [
Expand Down
15 changes: 15 additions & 0 deletions tests/test_records/test_validator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from unittest.mock import MagicMock

from harvester.records.validators import ValidateGeoshapeWKT


def test_validator_invalid_geoshape_wkt_logs_warning_and_resets_value(caplog):
    """An unparseable WKT value is reported in the log and replaced with None."""
    caplog.set_level("DEBUG")

    # Stand-in field method named "_dcat_bbox" that returns a bare "ENVELOPE".
    field_method_stub = MagicMock()
    field_method_stub.__name__ = "_dcat_bbox"
    field_method_stub.return_value = "ENVELOPE"

    validated_field_method = ValidateGeoshapeWKT(field_method_stub)
    result = validated_field_method()

    expected_log_message = (
        "field: dcat_bbox, unable to parse WKT: 'ENVELOPE'; setting value to None"
    )
    assert expected_log_message in caplog.text
    assert result is None

0 comments on commit 79a593d

Please sign in to comment.