Skip to content

Commit

Permalink
Aardvark field method refactor
Browse files Browse the repository at this point in the history
Why these changes are being introduced:
* Update Aardvark class to use current field method conventions

How this addresses that need:
* Add field methods and associated private methods for alternate_titles, content_type, contributors, and dates
* Update unit tests with current conventions

Side effects of this change:
* None

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/TIMX-285
  • Loading branch information
ehanson8 committed Jul 10, 2024
1 parent dd664d3 commit e28712d
Show file tree
Hide file tree
Showing 2 changed files with 123 additions and 83 deletions.
98 changes: 78 additions & 20 deletions tests/sources/json/test_aardvark.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,14 +147,54 @@ def test_aardvark_record_is_deleted_returns_false_if_value_is_false(
assert MITAardvark.record_is_deleted(source_record) is False


def test_aardvark_get_alternate_titles_success(aardvark_record_all_fields):
assert MITAardvark.get_alternate_titles(next(aardvark_record_all_fields)) == [
def test_aardvark_get_alternate_titles_success(aardvark_records):
source_record = next(aardvark_records)
source_record["dct_alternative_sm"] = ["Alternate title"]
assert MITAardvark.get_alternate_titles(source_record) == [
timdex.AlternateTitle(value="Alternate title")
]


def test_aardvark_get_contributors_success(aardvark_record_all_fields):
assert MITAardvark.get_contributors(next(aardvark_record_all_fields)) == [
def test_aardvark_get_alternate_titles_transforms_correctly_if_fields_blank(
    aardvark_records,
):
    """An empty ``dct_alternative_sm`` list must produce None."""
    record = next(aardvark_records)
    # Blank out the field so the method sees a present-but-empty value.
    record["dct_alternative_sm"] = []
    alternate_titles = MITAardvark.get_alternate_titles(record)
    assert alternate_titles is None


def test_aardvark_get_alternate_titles_transforms_correctly_if_fields_missing(
    aardvark_records,
):
    """A record passed through unmodified must produce None.

    NOTE(review): presumably the first ``aardvark_records`` record has no
    ``dct_alternative_sm`` key -- confirm against the fixture.
    """
    record = next(aardvark_records)
    alternate_titles = MITAardvark.get_alternate_titles(record)
    assert alternate_titles is None


def test_aardvark_get_content_type_success(aardvark_records):
    """A populated ``gbl_resourceType_sm`` list comes back as the content type."""
    record = next(aardvark_records)
    record["gbl_resourceType_sm"] = ["Vector data"]
    content_type = MITAardvark.get_content_type(record)
    assert content_type == ["Vector data"]


def test_aardvark_get_content_type_transforms_correctly_if_fields_blank(
    aardvark_records,
):
    """An empty ``gbl_resourceType_sm`` list must produce None."""
    record = next(aardvark_records)
    # Blank out the field so the method sees a present-but-empty value.
    record["gbl_resourceType_sm"] = []
    content_type = MITAardvark.get_content_type(record)
    assert content_type is None


def test_aardvark_get_content_type_transforms_correctly_if_fields_missing(
    aardvark_records,
):
    """A record passed through unmodified must produce None.

    NOTE(review): presumably the first ``aardvark_records`` record has no
    ``gbl_resourceType_sm`` key -- confirm against the fixture.
    """
    record = next(aardvark_records)
    content_type = MITAardvark.get_content_type(record)
    assert content_type is None


def test_aardvark_get_contributors_success(aardvark_records):
source_record = next(aardvark_records)
source_record["dct_creator_sm"] = ["Smith, Jane", "Smith, John"]
assert MITAardvark.get_contributors(source_record) == [
timdex.Contributor(
value="Smith, Jane",
kind="Creator",
Expand All @@ -166,8 +206,28 @@ def test_aardvark_get_contributors_success(aardvark_record_all_fields):
]


def test_aardvark_get_dates_success(aardvark_record_all_fields):
assert MITAardvark.get_dates(next(aardvark_record_all_fields), "123") == [
def test_aardvark_get_contributors_transforms_correctly_if_fields_blank(
    aardvark_records,
):
    """An empty ``dct_creator_sm`` list must produce None."""
    record = next(aardvark_records)
    # Blank out the field so the method sees a present-but-empty value.
    record["dct_creator_sm"] = []
    contributors = MITAardvark.get_contributors(record)
    assert contributors is None


def test_aardvark_get_contributors_transforms_correctly_if_fields_missing(
    aardvark_records,
):
    """A record passed through unmodified must produce None.

    NOTE(review): presumably the first ``aardvark_records`` record has no
    ``dct_creator_sm`` key -- confirm against the fixture.
    """
    record = next(aardvark_records)
    contributors = MITAardvark.get_contributors(record)
    assert contributors is None


def test_aardvark_get_dates_success(aardvark_records):
source_record = next(aardvark_records)
source_record["dct_issued_s"] = "2003-10-23"
source_record["dct_temporal_sm"] = ["1943", "1979"]
source_record["gbl_dateRange_drsim"] = ["[1943 TO 1946]"]
source_record["gbl_indexYear_im"] = [1943, 1944, 1945, 1946]
assert MITAardvark.get_dates(source_record) == [
timdex.Date(kind="Issued", value="2003-10-23"),
timdex.Date(kind="Coverage", value="1943"),
timdex.Date(kind="Coverage", value="1979"),
Expand All @@ -185,30 +245,28 @@ def test_aardvark_get_dates_drops_dates_with_invalid_strings(
caplog, aardvark_record_all_fields
):
caplog.set_level("DEBUG")
record = next(aardvark_record_all_fields)
record["dct_issued_s"] = "1933?" # dropped
record["dct_temporal_sm"] = [
source_record = next(aardvark_record_all_fields)
source_record["dct_issued_s"] = "1933?" # dropped
source_record["dct_temporal_sm"] = [
"2000-01-01",
"1999",
"approximately 1569", # dropped
"absolute junky date", # dropped
]
record["gbl_dateRange_drsim"] = [
source_record["gbl_dateRange_drsim"] = [
"[1943 TO 1946]",
"[apples TO oranges]", # logged and dropped
]
assert MITAardvark.get_dates(record, "123") == [
timdex.Date(kind="Coverage", note=None, range=None, value="2000-01-01"),
timdex.Date(kind="Coverage", note=None, range=None, value="1999"),
timdex.Date(kind="Coverage", note=None, range=None, value="1943"),
timdex.Date(kind="Coverage", note=None, range=None, value="1944"),
timdex.Date(kind="Coverage", note=None, range=None, value="1945"),
timdex.Date(kind="Coverage", note=None, range=None, value="1946"),
assert MITAardvark.get_dates(source_record) == [
timdex.Date(kind="Coverage", value="2000-01-01"),
timdex.Date(kind="Coverage", value="1999"),
timdex.Date(kind="Coverage", value="1943"),
timdex.Date(kind="Coverage", value="1944"),
timdex.Date(kind="Coverage", value="1945"),
timdex.Date(kind="Coverage", value="1946"),
timdex.Date(
kind="Coverage",
note=None,
range=timdex.DateRange(gt=None, gte="1943", lt=None, lte="1946"),
value=None,
range=timdex.DateRange(gte="1943", lte="1946"),
),
]
assert "Unable to parse date range string" in caplog.text
Expand Down
108 changes: 45 additions & 63 deletions transmogrifier/sources/json/aardvark.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import json
import logging
import re
from collections.abc import Iterator

import transmogrifier.models as timdex
from transmogrifier.helpers import validate_date
from transmogrifier.helpers import validate_date, validate_date_range
from transmogrifier.sources.jsontransformer import JSONTransformer
from transmogrifier.sources.transformer import JSON

Expand Down Expand Up @@ -124,16 +125,16 @@ def get_optional_fields(self, source_record: dict) -> dict | None:
source_record_id = self.get_source_record_id(source_record)

# alternate_titles
fields["alternate_titles"] = self.get_alternate_titles(source_record) or None
fields["alternate_titles"] = self.get_alternate_titles(source_record)

# content_type
fields["content_type"] = source_record.get("gbl_resourceType_sm")
fields["content_type"] = self.get_content_type(source_record)

# contributors
fields["contributors"] = self.get_contributors(source_record) or None
fields["contributors"] = self.get_contributors(source_record)

# dates
fields["dates"] = self.get_dates(source_record, source_record_id) or None
fields["dates"] = self.get_dates(source_record)

# edition not used in MITAardvark

Expand Down Expand Up @@ -178,58 +179,42 @@ def get_optional_fields(self, source_record: dict) -> dict | None:

return fields

@staticmethod
def get_alternate_titles(source_record: dict) -> list[timdex.AlternateTitle]:
"""Get values from source record for TIMDEX alternate_titles field."""
@classmethod
def get_alternate_titles(
cls, source_record: dict
) -> list[timdex.AlternateTitle] | None:
return [
timdex.AlternateTitle(value=title_value)
for title_value in source_record.get("dct_alternative_sm", [])
]
] or None

@staticmethod
def get_contributors(source_record: dict) -> list[timdex.Contributor]:
"""Get values from source record for TIMDEX contributors field."""
@classmethod
def get_content_type(cls, source_record: dict) -> list[str] | None:
return source_record.get("gbl_resourceType_sm") or None

@classmethod
def get_contributors(cls, source_record: dict) -> list[timdex.Contributor] | None:
return [
timdex.Contributor(value=contributor_value, kind="Creator")
for contributor_value in source_record.get("dct_creator_sm", [])
]
] or None

@classmethod
def get_dates(cls, source_record: dict, source_record_id: str) -> list[timdex.Date]:
"""Get values from source record for TIMDEX dates field.
This method aggregates dates from a variety of Aardvark fields. Once aggregated,
the results are filtered to allow only well formed DateRanges or validated date
strings.
"""
dates = (
cls._issued_dates(source_record)
+ cls._coverage_dates(source_record)
+ cls._range_dates(source_record, source_record_id)
)
return [
date
for date in dates
# skip value validation for DateRange type dates
if isinstance(date.range, timdex.DateRange)
# validate date string if not None
or (date.value is not None and validate_date(date.value, source_record_id))
]
def get_dates(cls, source_record: dict) -> list[timdex.Date] | None:
    """Gather issued, coverage, and range dates into one list.

    The three private helpers are consumed in that order. Returns None
    when none of them yields a date.
    """
    collected = [
        *cls._issued_dates(source_record),
        *cls._coverage_dates(source_record),
        *cls._range_dates(source_record),
    ]
    return collected or None

@classmethod
def _issued_dates(cls, source_record: dict) -> list[timdex.Date]:
"""Get values for issued dates."""
issued_dates = []
if "dct_issued_s" in source_record:
issued_dates.append(
timdex.Date(value=source_record["dct_issued_s"], kind="Issued")
)
return issued_dates
def _issued_dates(cls, source_record: dict) -> Iterator[timdex.Date]:
    """Yield an "Issued" date when ``dct_issued_s`` holds a valid value.

    Nothing is yielded when the field is missing or falsy, or when
    ``validate_date`` rejects the value.
    """
    issued_value = source_record.get("dct_issued_s")
    if not issued_value:
        return
    # The record id is only looked up once there is a value to validate.
    record_id = cls.get_source_record_id(source_record)
    if validate_date(issued_value, record_id):
        yield timdex.Date(value=issued_value, kind="Issued")

@classmethod
def _coverage_dates(cls, source_record: dict) -> list[timdex.Date]:
"""Get values for coverage dates."""
coverage_dates = []
def _coverage_dates(cls, source_record: dict) -> Iterator[timdex.Date]:
coverage_date_values = []
coverage_date_values.extend(source_record.get("dct_temporal_sm", []))
coverage_date_values.extend(
Expand All @@ -239,37 +224,34 @@ def _coverage_dates(cls, source_record: dict) -> list[timdex.Date]:
if str(date_value) not in coverage_date_values
]
)
coverage_dates.extend(
[
timdex.Date(value=coverage_date_value, kind="Coverage")
for coverage_date_value in coverage_date_values
]
)
return coverage_dates

for coverage_date_value in coverage_date_values:
if validate_date(
coverage_date_value, cls.get_source_record_id(source_record)
):
yield timdex.Date(value=coverage_date_value, kind="Coverage")

@classmethod
def _range_dates(
cls, source_record: dict, source_record_id: str
) -> list[timdex.Date]:
"""Get values for issued dates."""
range_dates = []
def _range_dates(cls, source_record: dict) -> Iterator[timdex.Date]:
for date_range_string in source_record.get("gbl_dateRange_drsim", []):
try:
date_range_values = cls.parse_solr_date_range_string(
date_range_string, source_record_id
date_range_string, cls.get_source_record_id(source_record)
)
except ValueError as exc:
logger.warning(exc)
except ValueError as error:
logger.warning(error)
continue
range_dates.append(
timdex.Date(
if validate_date_range(
date_range_values[0],
date_range_values[1],
cls.get_source_record_id(source_record),
):
yield timdex.Date(
kind="Coverage",
range=timdex.DateRange(
gte=date_range_values[0], lte=date_range_values[1]
),
)
)
return range_dates

@classmethod
def parse_solr_date_range_string(
Expand Down

0 comments on commit e28712d

Please sign in to comment.