From 6a979122007871bb0cb5fced9dd3a40641c27f9e Mon Sep 17 00:00:00 2001 From: Graham Hukill Date: Mon, 8 Jan 2024 10:58:08 -0500 Subject: [PATCH] Adjustments for successful OpenSearch indexing Why these changes are being introduced: After an MITAardvark transformation was mostly completed, a couple of small bugs were discovered when attempting to index the transformed TIMDEX records into OpenSearch. This commit addresses those bugs, both full fixes and workarounds, to allow continued pipeline testing and improvements to the MITAardvark transformation. How this addresses that need: * adds kind property to Dates and Identifiers * sets get_locations() field method to return empty list until mapping approach confirmed Side effects of this change: * Anticipating successful indexing attempts from transformed records Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/GDT-54 --- tests/fixtures/aardvark_records.jsonl | 4 ++-- tests/sources/json/test_aardvark.py | 13 ++++++----- transmogrifier/sources/json/aardvark.py | 29 +++++++++++++------------ 3 files changed, 24 insertions(+), 22 deletions(-) diff --git a/tests/fixtures/aardvark_records.jsonl b/tests/fixtures/aardvark_records.jsonl index 4946524..4c89e60 100644 --- a/tests/fixtures/aardvark_records.jsonl +++ b/tests/fixtures/aardvark_records.jsonl @@ -1,2 +1,2 @@ -{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 1", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "id": "mit:123", "locn_geometry": ""} -{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 2", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "id": "ogm:456", "locn_geometry": ""} \ No newline at end of file +{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 1", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "id": "mit:123", "locn_geometry": 
"ENVELOPE(-111.1, -104.0, 45.0, 40.9)"} +{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 2", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "id": "ogm:456", "locn_geometry": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)"} \ No newline at end of file diff --git a/tests/sources/json/test_aardvark.py b/tests/sources/json/test_aardvark.py index 5779b5e..a688530 100644 --- a/tests/sources/json/test_aardvark.py +++ b/tests/sources/json/test_aardvark.py @@ -75,6 +75,7 @@ def test_aardvark_get_dates_success(aardvark_record_all_fields): timdex.Date(kind="Coverage", value="1945"), timdex.Date(kind="Coverage", value="1946"), timdex.Date( + kind="Coverage", range=timdex.Date_Range(gte="1943", lte="1946"), ), ] @@ -97,7 +98,7 @@ def test_parse_solr_date_range_invalid_date_range_string_raises_error(): def test_aardvark_get_identifiers_success(aardvark_record_all_fields): assert MITAardvark.get_identifiers(next(aardvark_record_all_fields)) == [ - timdex.Identifier(value="abc123") + timdex.Identifier(value="abc123", kind="Not specified") ] @@ -129,11 +130,11 @@ def test_aardvark_get_links_logs_warning_for_invalid_json(caplog): ) -def test_aardvark_get_locations_success(aardvark_record_all_fields): - assert MITAardvark.get_locations(next(aardvark_record_all_fields), "123") == [ - timdex.Location(kind="Bounding Box", geodata=[-111.1, -104.0, 45.0, 40.9]), - timdex.Location(kind="Geometry", geodata=[-111.1, -104.0, 45.0, 40.9]), - ] +def test_aardvark_get_locations_success(caplog, aardvark_record_all_fields): + caplog.set_level("DEBUG") + assert "Geometry field 'dcat_bbox' found, but currently not mapped." + assert "Geometry field 'locn_geometry' found, but currently not mapped." 
+ assert MITAardvark.get_locations(next(aardvark_record_all_fields), "123") == [] def test_aardvark_get_notes_success(aardvark_record_all_fields): diff --git a/transmogrifier/sources/json/aardvark.py b/transmogrifier/sources/json/aardvark.py index e6b8416..5e5f971 100644 --- a/transmogrifier/sources/json/aardvark.py +++ b/transmogrifier/sources/json/aardvark.py @@ -3,7 +3,6 @@ import re import transmogrifier.models as timdex -from transmogrifier.helpers import parse_geodata_string from transmogrifier.sources.transformer import JSON, JSONTransformer logger = logging.getLogger(__name__) @@ -228,9 +227,10 @@ def _range_dates( ) range_dates.append( timdex.Date( + kind="Coverage", range=timdex.Date_Range( gte=date_range_values[0], lte=date_range_values[1] - ) + ), ) ) return range_dates @@ -262,7 +262,7 @@ def parse_solr_date_range_string( def get_identifiers(source_record: dict) -> list[timdex.Identifier]: """Get values from source record for TIMDEX identifiers field.""" return [ - timdex.Identifier(value=identifier_value) + timdex.Identifier(value=identifier_value, kind="Not specified") for identifier_value in source_record.get("dct_identifier_sm", []) ] @@ -292,8 +292,13 @@ def get_links(source_record: dict, source_record_id: str) -> list[timdex.Link]: def get_locations( source_record: dict, source_record_id: str ) -> list[timdex.Location]: - """Get values from source record for TIMDEX locations field.""" - locations = [] + """Get values from source record for TIMDEX locations field. + + WIP: Currently in the process of determining our approach for storing geographic + geometry data in the TIMDEX record and how this dovetails with the OpenSearch + mapping. At this time, this method returns an empty list of Locations. 
+ """ + locations: list[timdex.Location] = [] aardvark_location_fields = { "dcat_bbox": "Bounding Box", @@ -303,15 +308,11 @@ def get_locations( if aardvark_location_field not in source_record: continue try: - if geodata_points := parse_geodata_string( - source_record[aardvark_location_field], source_record_id - ): - locations.append( - timdex.Location( - geodata=geodata_points, - kind=kind_value, - ) - ) + message = ( + f"Geometry field '{aardvark_location_field}' found, but " + f"currently not mapped." + ) + logger.debug(message) except ValueError as exception: logger.warning(exception) return locations