generated from MITLibraries/python-cli-template
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Create validator for evaluating geoshape WKT values
Why these changes are being introduced:
* This is part of an effort to add some data validation to the TIMDEX pipeline, with an aim to catch errors with records in earlier stages (e.g., during the harvest). This also proposes a workflow wherein validation methods can easily be applied as decorators to field methods for a given SourceRecord.

How this addresses that need:
* Add 'shapely' as a dependency
* Create custom warning for invalid field values
* Create decorator method for validating geoshape WKT values
* Returns None if geoshape WKT value is invalid (cannot be parsed)

Side effects of this change:
* None

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/GDT-159
- Loading branch information
1 parent
da958f4
commit 79a593d
Showing
6 changed files
with
189 additions
and
19 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,6 +15,7 @@ lxml = "*" | |
jsonlines = "*" | ||
pycountry = "*" | ||
pygit2 = "*" | ||
shapely = "*" | ||
|
||
[dev-packages] | ||
black = "*" | ||
|
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
import functools | ||
import logging | ||
import re | ||
from ast import literal_eval | ||
from collections.abc import Callable | ||
|
||
import shapely | ||
|
||
from harvester.records.exceptions import FieldValueInvalidWarning | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class ValidateGeoshapeWKT:
    """Decorator class for validating geoshape WKT values.

    Apply the validator to any field method that retrieves a geoshape WKT
    value. When the decorated method is called, its return value is parsed
    with the shapely module. If parsing raises, a FieldValueInvalidWarning is
    logged at WARNING level and None is returned in place of the value;
    otherwise the value is returned unchanged.

    The validator works both on plain callables and on methods defined in a
    class body: the descriptor protocol (__get__) binds the owning instance so
    that it is forwarded to the wrapped field method.

    Note: shapely does not currently support WKT values for bounding box
    regions ("ENVELOPE(...)"). A workaround relies on manually retrieving the
    vertices for the envelope using regex and passing these values into the
    shapely.box() method.
    """

    invalid_wkt_warning_message: str = (
        "field: {field_name}, unable to parse WKT: '{value}'; setting value to None"
    )

    # Compiled once at class creation instead of on every validation call.
    envelope_regex: re.Pattern[str] = re.compile(r"^ENVELOPE\s?(.*)")

    def __init__(self, field_method: Callable):
        """Wrap field_method, copying its name and docstring onto the validator."""
        functools.update_wrapper(self, field_method)
        self.field_method = field_method

    def __get__(self, obj: object, objtype: type | None = None) -> Callable:
        """Bind the validator to the instance it is accessed through.

        A class instance used as a decorator is not automatically bound like a
        function is; without this method the owning instance would never reach
        the wrapped field method.
        """
        if obj is None:
            # Accessed on the class itself: expose the validator unbound.
            return self
        return functools.partial(self, obj)

    def __call__(self, *args: object, **kwargs: object) -> str | None:
        """Call the wrapped field method and validate the value it returns.

        Any arguments (including a bound instance supplied by __get__) are
        forwarded unchanged to the wrapped field method.

        Returns:
            The value produced by the field method, or None if that value
            could not be parsed as a geoshape.
        """
        field_name = self.field_method.__name__.removeprefix("_")
        value = self.field_method(*args, **kwargs)

        try:
            self.create_geoshape(value)
        # Deliberately broad: any failure to parse means the value is invalid.
        except Exception:  # noqa: BLE001
            logger.warning(
                FieldValueInvalidWarning(
                    self.invalid_wkt_warning_message.format(
                        field_name=field_name, value=value
                    )
                )
            )
            return None
        return value

    @staticmethod
    def create_geoshape(wkt: str) -> shapely.Geometry:
        """Parse a WKT (or ENVELOPE) string into a shapely geometry.

        Raises whatever the underlying parsing raises when the value cannot be
        interpreted; callers use that as the signal that the value is invalid.
        """
        if geoshape_string := ValidateGeoshapeWKT.envelope_regex.match(wkt):
            # The envelope lists its x values first, then ymax before ymin;
            # shapely.box() expects (xmin, ymin, xmax, ymax).
            xmin, xmax, ymax, ymin = literal_eval(geoshape_string.group(1))
            return shapely.box(xmin, ymin, xmax, ymax)
        return shapely.from_wkt(wkt)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
from unittest.mock import MagicMock | ||
|
||
from harvester.records.validators import ValidateGeoshapeWKT | ||
|
||
|
||
def test_validator_invalid_geoshape_wkt_logs_warning_and_resets_value(caplog):
    """An unparseable WKT value is logged as a warning and replaced with None."""
    expected_warning = (
        "field: dcat_bbox, unable to parse WKT: 'ENVELOPE'; setting value to None"
    )
    caplog.set_level("DEBUG")

    # Stand-in for a field method; __name__ supplies the field name in the log.
    field_method = MagicMock(return_value="ENVELOPE")
    field_method.__name__ = "_dcat_bbox"

    validated_field_method = ValidateGeoshapeWKT(field_method)
    result = validated_field_method()

    assert expected_warning in caplog.text
    assert result is None