Skip to content

Commit

Permalink
Merge pull request #246 from MITLibraries/handle-ogm-missing-data
Browse files Browse the repository at this point in the history
Prevent error logging for handled OGM harvest errors
  • Loading branch information
ghukill authored Mar 29, 2024
2 parents 5b9b729 + 99e22a2 commit e727587
Show file tree
Hide file tree
Showing 7 changed files with 27 additions and 10 deletions.
2 changes: 1 addition & 1 deletion harvester/harvest/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ def filter_failed_records(self, records: Iterator[Record]) -> Iterator[Record]:
}
self.failed_records.append(failure_dict)
message = f"Record error: {failure_dict}"
logger.error(message)
logger.debug(message)
else:
yield record

Expand Down
9 changes: 9 additions & 0 deletions harvester/records/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,12 @@ def get_formatted_message(self) -> str:
return "\n".join(
["The normalized MITAardvark record is invalid:", *error_messages]
)


class NoExternalUrlError(Exception):
"""Exception to raise when external URL cannot be determined from OGM record."""

def __init__(
self, message: str = "Could not determine external URL from source metadata"
) -> None:
super().__init__(message)
7 changes: 6 additions & 1 deletion harvester/records/formats/gbl1.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,12 @@ def _convert_scalar_to_array(self, field_name: str) -> list[str]:
##########################

def _dct_accessRights_s(self) -> str:
return self.parsed_data["dc_rights_s"]
"""Field method: dct_accessRights_s.
While dc_rights_s is a required GBL1 field, some OGM records do not have values.
In these scenarios, default to value "Public" for this required Aardvark field.
"""
return self.parsed_data.get("dc_rights_s", "Public")

def _dct_title_s(self) -> str | None:
return self.parsed_data["dc_title_s"]
Expand Down
4 changes: 2 additions & 2 deletions harvester/records/record.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,11 +342,11 @@ def normalize(self) -> MITAardvark:
if field_method := getattr(self, f"_{aardvark_field.name}", None):
try:
all_field_values[aardvark_field.name] = field_method()
except Exception as exc:
except Exception as exc: # noqa: BLE001
message = (
f"Error getting value for field '{aardvark_field.name}': {exc}"
)
logger.exception(message)
logger.debug(message)
raise FieldMethodError(exc, message) from exc

# post normalization quality improvements
Expand Down
7 changes: 3 additions & 4 deletions harvester/records/sources/ogm.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from harvester.config import Config
from harvester.records import SourceRecord
from harvester.records.exceptions import NoExternalUrlError
from harvester.records.formats import FGDC, GBL1, ISO19139, Aardvark

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -68,8 +69,7 @@ def _dct_references_s(self) -> str:
refs_dict = json.loads(self.parsed_data["dct_references_s"])
url = refs_dict.get("http://schema.org/url")
if not url:
error_message = "Could not determine external URL from source metadata"
raise ValueError(error_message)
raise NoExternalUrlError
urls_dict = {"http://schema.org/url": url}

# extract optional download url
Expand Down Expand Up @@ -133,8 +133,7 @@ def _dct_references_s(self) -> str:
# extract required external URL
url = refs_dict.get("http://schema.org/url")
if not url:
error_message = "Could not determine external URL from source metadata"
raise ValueError(error_message)
raise NoExternalUrlError
urls_dict = {"http://schema.org/url": url}

# extract optional download url
Expand Down
4 changes: 3 additions & 1 deletion tests/test_records/test_aardvark.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

import pytest

from harvester.records.exceptions import NoExternalUrlError


def test_aardvark_required_dct_accessRights_s(aardvark_all_fields):
assert aardvark_all_fields._dct_accessRights_s() == "Restricted"
Expand Down Expand Up @@ -52,7 +54,7 @@ def test_aardvark_required_dct_references_s(aardvark_all_fields):
def test_aardvark_required_dct_references_s_no_url_raise_error(aardvark_all_fields):
aardvark_all_fields._parsed_data = {"dct_references_s": "{}"}
with pytest.raises(
ValueError, match="Could not determine external URL from source metadata"
NoExternalUrlError, match="Could not determine external URL from source metadata"
):
aardvark_all_fields._dct_references_s()

Expand Down
4 changes: 3 additions & 1 deletion tests/test_records/test_gbl1.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

import pytest

from harvester.records.exceptions import NoExternalUrlError


def test_gbl1_convert_scalar_to_array_no_value_get_list(gbl1_all_fields):
assert gbl1_all_fields._convert_scalar_to_array("watermelon") == []
Expand Down Expand Up @@ -184,7 +186,7 @@ def test_gbl1_alternate_url_strategy_field_value_non_url_return_none(gbl1_all_fi
"gbl1_field": "layer_slug_s",
}
with pytest.raises(
ValueError, match="Could not determine external URL from source metadata"
NoExternalUrlError, match="Could not determine external URL from source metadata"
):
json.loads(gbl1_all_fields._dct_references_s())

Expand Down

0 comments on commit e727587

Please sign in to comment.