Skip to content

Commit

Permalink
Merge pull request #233 from MITLibraries/GDT-241-post-normalize-data…
Browse files Browse the repository at this point in the history
…-cleanup

Post normalize data cleanup methods
  • Loading branch information
ghukill committed Mar 27, 2024
2 parents a5ef992 + 035c425 commit 3425250
Show file tree
Hide file tree
Showing 4 changed files with 123 additions and 10 deletions.
48 changes: 38 additions & 10 deletions harvester/records/record.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import json
import logging
from abc import abstractmethod
from typing import Any, Literal
from typing import Any, Literal, TypeAlias

import marcalyx # type: ignore[import-untyped]
from attrs import asdict, define, field, fields
Expand All @@ -26,6 +26,8 @@

# module-level Config instance
CONFIG = Config()

# value types that a single parsed MITAardvark field may hold during normalization
MITAardvarkFieldValue: TypeAlias = str | list | bool | None


@define
class Record:
Expand Down Expand Up @@ -326,12 +328,15 @@ def normalize(self) -> MITAardvark:
Exceptions encountered during normalization will bubble up to the Harvester
calling context, where it will be handled and recorded as a Record.exception,
thereby allowing the harvest to continue with other records.
Lastly, values parsed for fields run through a series of post normalization
quality improvements like removing empty strings, None values from lists, etc.
"""
# get MITAardvark fields
aardvark_fields = fields(MITAardvark)

# loop through fields and attempt field-level child class methods if defined
all_field_values = {}
all_field_values: dict[str, MITAardvarkFieldValue] = {}
for aardvark_field in aardvark_fields:
if field_method := getattr(self, f"_{aardvark_field.name}", None):
try:
Expand All @@ -343,16 +348,39 @@ def normalize(self) -> MITAardvark:
logger.exception(message)
raise FieldMethodError(exc, message) from exc

# dedupe all list fields
for field_name, field_values in all_field_values.items():
if isinstance(field_values, list):
deduped_field_values = [
value for value in field_values if value is not None
]
all_field_values[field_name] = dedupe_list_of_values(deduped_field_values)
# post normalization quality improvements
for field_name, original_value in all_field_values.items():
clean_value = self._remove_none_and_blank_strings(original_value)
clean_value = self._dedupe_list_fields(clean_value)
all_field_values[field_name] = clean_value

# initialize a new MITAardvark instance and return
return MITAardvark(**all_field_values)
return MITAardvark(**all_field_values) # type: ignore[arg-type]

@staticmethod
def _remove_none_and_blank_strings(
    original_value: MITAardvarkFieldValue,
) -> MITAardvarkFieldValue:
    """Drop None entries and blank strings from a MITAardvark field value.

    - A string that is empty or whitespace-only is replaced by None.
    - A list is returned as a new list without None items and without
      empty or whitespace-only string items; other items are kept in order.
    - Any other value is returned unchanged.
    """

    def _is_blank(candidate: object) -> bool:
        # True only for strings that are empty once surrounding whitespace is removed
        return isinstance(candidate, str) and not candidate.strip()

    if isinstance(original_value, list):
        kept = []
        for item in original_value:
            if item is None or _is_blank(item):
                continue
            kept.append(item)
        return kept

    if _is_blank(original_value):
        return None

    return original_value

@staticmethod
def _dedupe_list_fields(
    original_value: MITAardvarkFieldValue,
) -> MITAardvarkFieldValue:
    """Deduplicate a MITAardvark field value when it is a list.

    List values are passed through dedupe_list_of_values; any non-list value
    is returned unchanged.
    """
    # guard clause: only lists are candidates for deduplication
    if not isinstance(original_value, list):
        return original_value
    return dedupe_list_of_values(original_value)

####################################
# Abstract Required Field Methods
Expand Down
16 changes: 16 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -777,3 +777,19 @@ def alma_harvester(mocked_timdex_alma_s3_export):
from_date="2024-01-01",
until_date="2024-01-02",
)


@pytest.fixture
def aardvark_empty_strings():
    """OGMAardvark record built from the raw bytes of the empty-strings fixture."""
    fixture_path = "tests/fixtures/records/generic/ogm_aardvark_empty_strings.json"
    repo_config = {
        "name": "Earth",
        "metadata_format": "aardvark",
    }
    # read the raw bytes first, then build the record from them
    with open(fixture_path, "rb") as fixture_file:
        raw_bytes = fixture_file.read()
    return OGMAardvark(
        identifier="abc123",
        data=raw_bytes,
        event="created",
        ogm_repo_config=repo_config,
    )
60 changes: 60 additions & 0 deletions tests/fixtures/records/generic/ogm_aardvark_empty_strings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
{
"dct_title_s": "Map of Alaska",
"dct_description_sm": [
"Relief shown by contours and spot heights.; Ancillary maps: Southeastern Alaska -- Relation of Alaska to North America and the Orient -- Index map.",
"122 x 158 centimeters",
"Scale 1:1,250,000; 19.7285; approximately 20 miles to 1 inch",
"General Map Collection"
],
"dct_language_sm": [
"eng"
],
"dct_creator_sm": [
"Briesemeister, William A."
],
"dct_publisher_sm": [
"American Geographical Society (New York, N.Y)"
],
"schema_provider_s": "University of Minnesota",
"gbl_resourceClass_sm": [
"Maps"
],
"dcat_keyword_sm": [
"",
"2022-creator-sprint"
],
"dct_temporal_sm": [
"1923"
],
"dct_issued_s": "1923",
"gbl_indexYear_im": [
"1923"
],
"gbl_dateRange_drsim": [
"[1923 TO 1923]"
],
"dct_spatial_sm": [
"Alaska"
],
"locn_geometry": "POLYGON((-169.0 68.0, -139.0 68.0, -139.0 54.0, -169.0 54.0, -169.0 68.0))",
"dcat_bbox": "ENVELOPE(-169.0,-139.0,68.0,54.0)",
"dcat_centroid": "61.0,-154.0",
"pcdm_memberOf_sm": [
"64bd8c4c-8e60-4956-b43d-bdc3f93db488"
],
"dct_isPartOf_sm": [
"05d-01",
"p16022coll230"
],
"dct_rights_sm": [
"Use of this item may be governed by US and international copyright laws. You may be able to use this item, but copyright and other considerations may apply. For possible additional information or guidance on your use, please contact the contributing organization."
],
"dct_accessRights_s": "Public",
"dct_format_s": "JPEG",
"dct_references_s": "{\"http://iiif.io/api/image\":\"https://cdm16022.contentdm.oclc.org/digital/iiif/p16022coll230/2590/info.json\",\"http://schema.org/url\":\"https://umedia.lib.umn.edu/item/p16022coll230:2590\",\"http://iiif.io/api/presentation#manifest\":\"https://cdm16022.contentdm.oclc.org/iiif/info/p16022coll230/2590/manifest.json\"}",
"id": "p16022coll230:2590",
"dct_identifier_sm": [
"UMN_ALMA:9927552850001701"
],
"gbl_mdVersion_s": "Aardvark"
}
9 changes: 9 additions & 0 deletions tests/test_records/test_record.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,3 +408,12 @@ def test_controlled_resource_type_variant_matches(
generic_source_record.get_controlled_gbl_resourceType_sm_terms(raw_values)
== controlled_values
)


def test_empty_strings_filtered_from_output_aardvark(aardvark_empty_strings):
    """Empty keyword strings in the source record are absent after normalize()."""
    # the parsed source record still carries the empty string next to the real keyword
    raw_keywords = aardvark_empty_strings.parsed_data["dcat_keyword_sm"]
    assert raw_keywords == ["", "2022-creator-sprint"]

    # after normalization only the non-empty keyword remains
    normalized_record = aardvark_empty_strings.normalize()
    assert normalized_record.dcat_keyword_sm == ["2022-creator-sprint"]

0 comments on commit 3425250

Please sign in to comment.