Skip to content

Commit

Permalink
Adjustments for successful OpenSearch indexing
Browse files Browse the repository at this point in the history
Why these changes are being introduced:

After the MITAardvark transformation was mostly complete, a couple of small bugs were discovered when attempting to index the
transformed TIMDEX records into OpenSearch. This commit addresses those bugs, with both full fixes and workarounds, to allow
continued pipeline testing and improvements to the MITAardvark transformation.

How this addresses that need:
* adds kind property to Dates and Identifiers
* sets the get_locations() field method to return an empty list until the mapping approach is confirmed

Side effects of this change:
* Anticipating successful indexing attempts from transformed records

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/GDT-54
  • Loading branch information
ghukill committed Jan 8, 2024
1 parent a941359 commit 6a97912
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 22 deletions.
4 changes: 2 additions & 2 deletions tests/fixtures/aardvark_records.jsonl
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 1", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "id": "mit:123", "locn_geometry": ""}
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 2", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "id": "ogm:456", "locn_geometry": ""}
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 1", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "id": "mit:123", "locn_geometry": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)"}
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 2", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "id": "ogm:456", "locn_geometry": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)"}
13 changes: 7 additions & 6 deletions tests/sources/json/test_aardvark.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ def test_aardvark_get_dates_success(aardvark_record_all_fields):
timdex.Date(kind="Coverage", value="1945"),
timdex.Date(kind="Coverage", value="1946"),
timdex.Date(
kind="Coverage",
range=timdex.Date_Range(gte="1943", lte="1946"),
),
]
Expand All @@ -97,7 +98,7 @@ def test_parse_solr_date_range_invalid_date_range_string_raises_error():

def test_aardvark_get_identifiers_success(aardvark_record_all_fields):
assert MITAardvark.get_identifiers(next(aardvark_record_all_fields)) == [
timdex.Identifier(value="abc123")
timdex.Identifier(value="abc123", kind="Not specified")
]


Expand Down Expand Up @@ -129,11 +130,11 @@ def test_aardvark_get_links_logs_warning_for_invalid_json(caplog):
)


def test_aardvark_get_locations_success(aardvark_record_all_fields):
assert MITAardvark.get_locations(next(aardvark_record_all_fields), "123") == [
timdex.Location(kind="Bounding Box", geodata=[-111.1, -104.0, 45.0, 40.9]),
timdex.Location(kind="Geometry", geodata=[-111.1, -104.0, 45.0, 40.9]),
]
def test_aardvark_get_locations_success(caplog, aardvark_record_all_fields):
    """Check that get_locations() returns no locations and logs unmapped fields.

    While the geometry mapping approach is undecided, get_locations() is
    expected to return an empty list and emit one DEBUG message per geometry
    field it finds in the source record.

    NOTE(review): assumes the `aardvark_record_all_fields` fixture contains
    both 'dcat_bbox' and 'locn_geometry' -- confirm against the fixture.
    """
    caplog.set_level("DEBUG")

    # Call the method first: the log records only exist after it has run.
    locations = MITAardvark.get_locations(next(aardvark_record_all_fields), "123")

    assert locations == []
    # Check membership in the captured log text; a bare `assert "<string>"`
    # is always true and would verify nothing.
    assert (
        "Geometry field 'dcat_bbox' found, but currently not mapped." in caplog.text
    )
    assert (
        "Geometry field 'locn_geometry' found, but currently not mapped."
        in caplog.text
    )


def test_aardvark_get_notes_success(aardvark_record_all_fields):
Expand Down
29 changes: 15 additions & 14 deletions transmogrifier/sources/json/aardvark.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import re

import transmogrifier.models as timdex
from transmogrifier.helpers import parse_geodata_string
from transmogrifier.sources.transformer import JSON, JSONTransformer

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -228,9 +227,10 @@ def _range_dates(
)
range_dates.append(
timdex.Date(
kind="Coverage",
range=timdex.Date_Range(
gte=date_range_values[0], lte=date_range_values[1]
)
),
)
)
return range_dates
Expand Down Expand Up @@ -262,7 +262,7 @@ def parse_solr_date_range_string(
def get_identifiers(source_record: dict) -> list[timdex.Identifier]:
"""Get values from source record for TIMDEX identifiers field."""
return [
timdex.Identifier(value=identifier_value)
timdex.Identifier(value=identifier_value, kind="Not specified")
for identifier_value in source_record.get("dct_identifier_sm", [])
]

Expand Down Expand Up @@ -292,8 +292,13 @@ def get_links(source_record: dict, source_record_id: str) -> list[timdex.Link]:
def get_locations(
source_record: dict, source_record_id: str
) -> list[timdex.Location]:
"""Get values from source record for TIMDEX locations field."""
locations = []
"""Get values from source record for TIMDEX locations field.
WIP: Currently in the process of determining our approach for storing geographic
geometry data in the TIMDEX record and how this dovetails with the OpenSearch
mapping. At this time, this method returns an empty list of Locations.
"""
locations: list[timdex.Location] = []

aardvark_location_fields = {
"dcat_bbox": "Bounding Box",
Expand All @@ -303,15 +308,11 @@ def get_locations(
if aardvark_location_field not in source_record:
continue
try:
if geodata_points := parse_geodata_string(
source_record[aardvark_location_field], source_record_id
):
locations.append(
timdex.Location(
geodata=geodata_points,
kind=kind_value,
)
)
message = (
f"Geometry field '{aardvark_location_field}' found, but "
f"currently not mapped."
)
logger.debug(message)
except ValueError as exception:
logger.warning(exception)
return locations
Expand Down

0 comments on commit 6a97912

Please sign in to comment.