Skip to content

Commit

Permalink
Gdt 116 update locations (#113)
Browse files Browse the repository at this point in the history
* Add dct_spatial_sm to get_subjects

* Update record_is_deleted method

* Add functionality to record_is_deleted method as well as corresponding unit tests to reflect possible data scenarios
* Add unit tests for write_output_files method now that deleted records are properly processed
* Update aardvark_records fixtures to include deleted records

* Update Location class to use WKT strings

Why these changes are being introduced:
* OpenSearch can parse the WKT strings that are found in MITAardvark records so the values don't require the previously expected parsing.

How this addresses that need:
* Rename Location.geodata attribute to geoshape as well as update corresponding unit tests and fixtures
* Refactor get_locations method to store WKT strings as well as update corresponding unit tests and fixtures
* Remove parse_geodata_string from helpers.py and corresponding unit tests
* Add default kind value to get_identifiers method

Side effects of this change:
* None

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/GDT-116

* Updates dependencies

* Updates based on discussion in PR #113

* Update aardvark fixtures
* Refactor record_is_deleted method and update corresponding unit test
* Update kind values for get_subjects method
  • Loading branch information
ehanson8 committed Jan 22, 2024
1 parent cbddb32 commit 4f83410
Show file tree
Hide file tree
Showing 10 changed files with 296 additions and 617 deletions.
706 changes: 172 additions & 534 deletions Pipfile.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def timdex_record_all_fields_and_subfields():
timdex.Location(
value="A point on the globe",
kind="Data was gathered here",
geodata=[-77.025955, 38.942142],
geoshape="BBOX(-77.025955, 38.942142)",
)
],
notes=[timdex.Note(value=["This book is awesome"], kind="opinion")],
Expand Down
2 changes: 1 addition & 1 deletion tests/fixtures/aardvark/aardvark_record_all_fields.jsonl
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"id": "mit:123", "dcat_bbox": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_accessRights_s": "Access note", "dct_alternative_sm": ["Alternate title"], "dct_creator_sm": ["Smith, Jane", "Smith, John"], "dct_description_sm": ["A description"], "dct_format_s": "Shapefile", "dct_identifier_sm": ["abc123"], "dct_issued_s": "2003-10-23", "dct_language_sm": ["eng"], "dct_license_sm": ["http://license.license", "http://another_license.another_license"], "dct_publisher_sm": ["ML InfoMap (Firm)"], "dct_references_s": "{\"https://schema.org/downloadUrl\": [{\"label\": \"Source Metadata\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.source.fgdc.xml\"}, {\"label\": \"Normalized Metadata\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.normalized.aardvark.json\"}, {\"label\": \"Data Zipfile\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.zip\"}]}", "dct_rights_sm": ["Some person has the rights"], "dct_rightsHolder_sm": ["The person with the rights", "Another person with the rights"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "dct_temporal_sm": ["1943", "1979"], "dct_title_s": "Test title 1", "gbl_dateRange_drsim": ["[1943 TO 1946]"], "gbl_displayNote_sm": ["Danger: This text will be displayed in a red box","Info: This text will be displayed in a blue box","Tip: This text will be displayed in a green box","Warning: This text will be displayed in a yellow box","This is text without a tag and it will be assigned default 'note' style"], "gbl_indexYear_im": [1943,1944,1945,1946], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "locn_geometry": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "schema_provider_s": "MIT"}
{"id": "mit:123", "dcat_bbox": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_accessRights_s": "Access note", "dct_alternative_sm": ["Alternate title"], "dct_creator_sm": ["Smith, Jane", "Smith, John"], "dct_description_sm": ["A description"], "dct_format_s": "Shapefile", "dct_identifier_sm": ["abc123"], "dct_issued_s": "2003-10-23", "dct_language_sm": ["eng"], "dct_license_sm": ["http://license.license", "http://another_license.another_license"], "dct_publisher_sm": ["ML InfoMap (Firm)"], "dct_references_s": "{\"https://schema.org/downloadUrl\": [{\"label\": \"Source Metadata\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.source.fgdc.xml\"}, {\"label\": \"Normalized Metadata\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.normalized.aardvark.json\"}, {\"label\": \"Data Zipfile\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.zip\"}]}", "dct_rights_sm": ["Some person has the rights"], "dct_rightsHolder_sm": ["The person with the rights", "Another person with the rights"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "dct_temporal_sm": ["1943", "1979"], "dct_title_s": "Test title 1", "gbl_dateRange_drsim": ["[1943 TO 1946]"], "gbl_displayNote_sm": ["Danger: This text will be displayed in a red box","Info: This text will be displayed in a blue box","Tip: This text will be displayed in a green box","Warning: This text will be displayed in a yellow box","This is text without a tag and it will be assigned default 'note' style"], "gbl_indexYear_im": [1943,1944,1945,1946], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "gbl_suppressed_b": false, "locn_geometry": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "schema_provider_s": "MIT"}
5 changes: 3 additions & 2 deletions tests/fixtures/aardvark_records.jsonl
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 1", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "id": "mit:123", "locn_geometry": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)"}
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 2", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "id": "ogm:456", "locn_geometry": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)"}
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 1", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "gbl_suppressed_b": false, "id": "mit:123", "locn_geometry": ""}
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 2", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "gbl_suppressed_b": false, "id": "ogm:456", "locn_geometry": ""}
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 3", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "gbl_suppressed_b": true, "id": "ogm:789", "locn_geometry": ""}
94 changes: 85 additions & 9 deletions tests/sources/json/test_aardvark.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,33 @@
from pathlib import Path

import pytest

import transmogrifier.models as timdex
from transmogrifier.sources.json.aardvark import MITAardvark


def test_mitaardvark_transform_and_write_output_files_writes_output_files(
tmp_path, aardvark_records
):
output_file = str(tmp_path / "output_file.json")
transformer = MITAardvark("cool-repo", aardvark_records)
assert not Path(tmp_path / "output_file.json").exists()
assert not Path(tmp_path / "output_file.txt").exists()
transformer.transform_and_write_output_files(output_file)
assert Path(tmp_path / "output_file.json").exists()
assert Path(tmp_path / "output_file.txt").exists()


def test_mitaardvark_transform_and_write_output_files_no_txt_file_if_not_needed(
tmp_path, aardvark_record_all_fields
):
output_file = str(tmp_path / "output_file.json")
transformer = MITAardvark("cool-repo", aardvark_record_all_fields)
transformer.transform_and_write_output_files(output_file)
assert len(list(tmp_path.iterdir())) == 1
assert next(tmp_path.iterdir()).name == "output_file.json"


def test_aardvark_get_required_fields_returns_expected_values(aardvark_records):
transformer = MITAardvark("cool-repo", aardvark_records)
assert transformer.get_required_fields(next(aardvark_records)) == {
Expand Down Expand Up @@ -43,6 +67,38 @@ def test_aardvark_get_main_titles_success(aardvark_record_all_fields):
]


def test_aardvark_record_is_deleted_returns_false_if_field_missing(
aardvark_record_all_fields,
):
assert MITAardvark.record_is_deleted(next(aardvark_record_all_fields)) is False


def test_aardvark_record_is_deleted_raises_error_if_value_is_string(
aardvark_record_all_fields,
):
with pytest.raises(
ValueError,
match="Record ID '123': 'gbl_suppressed_b' value is not a boolean",
):
aardvark_record = next(aardvark_record_all_fields)
aardvark_record["gbl_suppressed_b"] = "True"
MITAardvark.record_is_deleted(aardvark_record)


def test_aardvark_record_is_deleted_returns_false_if_value_is_false(
aardvark_record_all_fields,
):
aardvark_record = next(aardvark_record_all_fields)
aardvark_record["gbl_suppressed_b"] = False
assert MITAardvark.record_is_deleted(aardvark_record) is False


def test_aardvark_record_is_deleted_success(aardvark_record_all_fields):
aardvark_record = next(aardvark_record_all_fields)
aardvark_record["gbl_suppressed_b"] = True
assert MITAardvark.record_is_deleted(aardvark_record) is True


def test_aardvark_get_source_record_id_success(aardvark_record_all_fields):
assert MITAardvark.get_source_record_id(next(aardvark_record_all_fields)) == "123"

Expand Down Expand Up @@ -130,11 +186,30 @@ def test_aardvark_get_links_logs_warning_for_invalid_json(caplog):
)


def test_aardvark_get_locations_success(caplog, aardvark_record_all_fields):
caplog.set_level("DEBUG")
assert "Geometry field 'dcat_bbox' found, but currently not mapped."
assert "Geometry field 'locn_geometry' found, but currently not mapped."
assert MITAardvark.get_locations(next(aardvark_record_all_fields), "123") == []
def test_aardvark_get_locations_success(aardvark_record_all_fields):
assert MITAardvark.get_locations(next(aardvark_record_all_fields), "123") == [
timdex.Location(
kind="Bounding Box", geoshape="BBOX (-111.1, -104.0, 45.0, 40.9)"
),
timdex.Location(kind="Geometry", geoshape="BBOX (-111.1, -104.0, 45.0, 40.9)"),
]


def test_parse_get_locations_string_invalid_geostring_logs_warning(
aardvark_record_all_fields, caplog
):
aardvark_record = next(aardvark_record_all_fields)
aardvark_record["dcat_bbox"] = "Invalid"
aardvark_record["locn_geometry"] = "Invalid"
assert MITAardvark.get_locations(aardvark_record, "123") == []
assert (
"Record ID '123': Unable to parse geodata string 'Invalid' in 'dcat_bbox'"
in caplog.text
)
assert (
"Record ID '123': Unable to parse geodata string 'Invalid' in 'locn_geometry'"
in caplog.text
)


def test_aardvark_get_notes_success(aardvark_record_all_fields):
Expand Down Expand Up @@ -184,10 +259,11 @@ def test_aardvark_get_rights_success(aardvark_record_all_fields):

def test_aardvark_get_subjects_success(aardvark_record_all_fields):
assert MITAardvark.get_subjects(next(aardvark_record_all_fields)) == [
timdex.Subject(value=["Country"], kind="DCAT Keyword"),
timdex.Subject(value=["Political boundaries"], kind="DCAT Theme"),
timdex.Subject(value=["Geography"], kind="Dublin Core Subject"),
timdex.Subject(value=["Earth"], kind="Dublin Core Subject"),
timdex.Subject(value=["Country"], kind="DCAT; Keyword"),
timdex.Subject(value=["Political boundaries"], kind="DCAT; Theme"),
timdex.Subject(value=["Some city, Some country"], kind="Dublin Core; Spatial"),
timdex.Subject(value=["Geography"], kind="Dublin Core; Subject"),
timdex.Subject(value=["Earth"], kind="Dublin Core; Subject"),
timdex.Subject(value=["Dataset"], kind="Subject scheme not provided"),
timdex.Subject(value=["Vector data"], kind="Subject scheme not provided"),
]
20 changes: 0 additions & 20 deletions tests/test_helpers.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
from datetime import datetime

import pytest

import transmogrifier.models as timdex
from transmogrifier.helpers import (
generate_citation,
parse_date_from_string,
parse_geodata_string,
validate_date,
validate_date_range,
)
Expand Down Expand Up @@ -259,23 +256,6 @@ def test_parse_date_from_string_invalid_date_returns_none():
assert parse_date_from_string("circa 1930s") is None


def test_parse_geodata_string_success():
assert parse_geodata_string("ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "123") == [
-111.1,
-104.0,
45.0,
40.9,
]


def test_parse_geodata_string_invalid_geodata_string_raises_error():
with pytest.raises(
ValueError,
match="Record ID '123': Unable to parse geodata string 'Invalid'",
):
parse_geodata_string("Invalid", "123")


def test_validate_date_success():
assert validate_date("1930", "1234") is True

Expand Down
10 changes: 5 additions & 5 deletions tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,10 +181,10 @@ def test_timdex_record_all_fields_and_subfields(timdex_record_all_fields_and_sub
timdex_record_all_fields_and_subfields.locations[0].kind
== "Data was gathered here"
)
assert timdex_record_all_fields_and_subfields.locations[0].geodata == [
-77.025955,
38.942142,
]
assert (
timdex_record_all_fields_and_subfields.locations[0].geoshape
== "BBOX(-77.025955, 38.942142)"
)
assert timdex_record_all_fields_and_subfields.notes[0].value == [
"This book is awesome"
]
Expand Down Expand Up @@ -305,7 +305,7 @@ def test_record_asdict_includes_all_fields(timdex_record_all_fields_and_subfield
"literary_form": "nonfiction",
"locations": [
{
"geodata": [-77.025955, 38.942142],
"geoshape": "BBOX(-77.025955, 38.942142)",
"kind": "Data was gathered here",
"value": "A point on the globe",
}
Expand Down
25 changes: 0 additions & 25 deletions transmogrifier/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,31 +75,6 @@ def parse_date_from_string(
return None


def parse_geodata_string(geodata_string: str, source_record_id: str) -> list[float]:
"""Get list of values from a formatted geodata string.
Example:
- "ENVELOPE(-111.1, -104.0, 45.0, 40.9)"
Args:
geodata_string: Formatted geodata string to parse.
source_record_id: The ID of the record containing the string to parse.
"""
geodata_points = []
try:
raw_geodata_points = geodata_string.split("(")[-1].split(")")[0].split(",")
stripped_geodata_points = map(str.strip, raw_geodata_points)
geodata_floats = list(map(float, stripped_geodata_points))
geodata_points.extend(geodata_floats)
except ValueError:
message = (
f"Record ID '{source_record_id}': "
f"Unable to parse geodata string '{geodata_string}'"
)
raise ValueError(message)
return geodata_points


def validate_date(
date_string: str,
source_record_id: str,
Expand Down
4 changes: 1 addition & 3 deletions transmogrifier/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,7 @@ class Link:
class Location:
value: Optional[str] = field(default=None, validator=optional(instance_of(str)))
kind: Optional[str] = field(default=None, validator=optional(instance_of(str)))
geodata: Optional[list[float]] = field(
default=None, validator=optional(list_of(float))
)
geoshape: Optional[str] = field(default=None, validator=optional(instance_of(str)))


@define
Expand Down
45 changes: 28 additions & 17 deletions transmogrifier/sources/json/aardvark.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,14 @@ def record_is_deleted(cls, source_record: dict) -> bool:
Args:
source_record: A JSON object representing a source record.
"""
return False
if isinstance(source_record["gbl_suppressed_b"], bool):
return source_record["gbl_suppressed_b"]
else:
message = (
f"Record ID '{cls.get_source_record_id(source_record)}': "
"'gbl_suppressed_b' value is not a boolean"
)
raise ValueError(message)

def get_optional_fields(self, source_record: dict) -> dict | None:
"""
Expand Down Expand Up @@ -292,13 +299,8 @@ def get_links(source_record: dict, source_record_id: str) -> list[timdex.Link]:
def get_locations(
source_record: dict, source_record_id: str
) -> list[timdex.Location]:
"""Get values from source record for TIMDEX locations field.
WIP: Currently in the process of determining our approach for storing geographic
geometry data in the TIMDEX record and how this dovetails with the OpenSearch
mapping. At this time, this method returns an empty list of Locations.
"""
locations: list[timdex.Location] = []
"""Get values from source record for TIMDEX locations field."""
locations = []

aardvark_location_fields = {
"dcat_bbox": "Bounding Box",
Expand All @@ -307,14 +309,22 @@ def get_locations(
for aardvark_location_field, kind_value in aardvark_location_fields.items():
if aardvark_location_field not in source_record:
continue
try:
if (
geodata_string := source_record[aardvark_location_field]
) and "ENVELOPE" in source_record[aardvark_location_field]:
locations.append(
timdex.Location(
geoshape=geodata_string.replace("ENVELOPE", "BBOX "),
kind=kind_value,
)
)
else:
message = (
f"Geometry field '{aardvark_location_field}' found, but "
f"currently not mapped."
f"Record ID '{source_record_id}': "
f"Unable to parse geodata string '{geodata_string}' "
f"in '{aardvark_location_field}'"
)
logger.debug(message)
except ValueError as exception:
logger.warning(exception)
logger.warning(message)
return locations

@staticmethod
Expand Down Expand Up @@ -385,9 +395,10 @@ def get_subjects(source_record: dict) -> list[timdex.Subject]:
subjects = []

aardvark_subject_fields = {
"dcat_keyword_sm": "DCAT Keyword",
"dcat_theme_sm": "DCAT Theme",
"dct_subject_sm": "Dublin Core Subject",
"dcat_keyword_sm": "DCAT; Keyword",
"dcat_theme_sm": "DCAT; Theme",
"dct_spatial_sm": "Dublin Core; Spatial",
"dct_subject_sm": "Dublin Core; Subject",
"gbl_resourceClass_sm": "Subject scheme not provided",
"gbl_resourceType_sm": "Subject scheme not provided",
}
Expand Down

0 comments on commit 4f83410

Please sign in to comment.