Skip to content

Commit

Permalink
Additional MITAardvark methods
Browse files Browse the repository at this point in the history
Why these changes are being introduced:
* Additional methods are needed for the MITAardvark class

How this addresses that need:
* Update get_optional_fields method to include format and summary values and add corresponding unit test
* Add get_alternate_titles, get_contributors, get_notes, get_publication_information, and get_rights field methods along with calls in get_optional_fields and corresponding unit tests
* Update aardvark_record_all_fields fixture to include new fields

Side effects of this change:
* None

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/GDT-54
  • Loading branch information
ehanson8 committed Dec 19, 2023
1 parent 1aba987 commit e9e00c8
Show file tree
Hide file tree
Showing 4 changed files with 179 additions and 14 deletions.
6 changes: 2 additions & 4 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,8 @@ def runner():

@pytest.fixture
def aardvark_record_all_fields():
return next(
JsonTransformer.parse_source_file(
"tests/fixtures/aardvark/aardvark_record_all_fields.jsonl"
)
return JsonTransformer.parse_source_file(
"tests/fixtures/aardvark/aardvark_record_all_fields.jsonl"
)


Expand Down
2 changes: 1 addition & 1 deletion tests/fixtures/aardvark/aardvark_record_all_fields.jsonl
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"id": "123", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "dct_title_s": "Test title 1"}
{"id": "123", "dcat_bbox": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_accessRights_s": "Access note", "dct_alternative_sm": ["Alternate title"], "dct_creator_sm": ["Smith, Jane", "Smith, John"], "dct_description_sm": ["A description"], "dct_format_s": "Shapefile", "dct_language_sm": ["eng"], "dct_license_sm": "http://license.license", "dct_publisher_sm": ["ML InfoMap (Firm)"], "dct_rights_sm": ["Some person has the rights"], "dct_rightsHolder_sm": ["The person with the rights", "Another person with the rights"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "dct_title_s": "Test title 1", "gbl_displayNote_sm": ["Danger: This text will be displayed in a red box","Info: This text will be displayed in a blue box","Tip: This text will be displayed in a green box","Warning: This text will be displayed in a yellow box","This is text without a tag and it will be assigned default 'note' style"], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "locn_geometry": "POLYGON((-80 25, -65 18, -64 33, -80 25))", "schema_provider_s": "MIT"}
83 changes: 79 additions & 4 deletions tests/sources/json/test_aardvark.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def test_aardvark_get_required_fields_returns_expected_values(aardvark_records):
}


def test_jsontransformer_transform_returns_timdex_record(aardvark_records):
def test_aardvark_transform_returns_timdex_record(aardvark_records):
transformer = MITAardvark("cool-repo", aardvark_records)
assert next(transformer) == timdex.TimdexRecord(
source="A Cool Repository",
Expand All @@ -24,16 +24,91 @@ def test_jsontransformer_transform_returns_timdex_record(aardvark_records):
)


def test_aardvark_get_optional_fields_non_field_method_values_success(
    aardvark_record_all_fields,
):
    """Check optional fields that are set without a dedicated field method.

    Transforms the first record of the all-fields fixture and verifies the
    format, languages, and summary values on the resulting record.
    """
    timdex_record = next(MITAardvark("cool-repo", aardvark_record_all_fields))
    expected_values = {
        "format": "Shapefile",
        "languages": ["eng"],
        "summary": ["A description"],
    }
    for field_name, expected_value in expected_values.items():
        assert getattr(timdex_record, field_name) == expected_value


def test_aardvark_get_main_titles_success(aardvark_record_all_fields):
assert MITAardvark.get_main_titles(aardvark_record_all_fields) == ["Test title 1"]
assert MITAardvark.get_main_titles(next(aardvark_record_all_fields)) == [
"Test title 1"
]


def test_aardvark_get_source_record_id_success(aardvark_record_all_fields):
assert MITAardvark.get_source_record_id(aardvark_record_all_fields) == "123"
assert MITAardvark.get_source_record_id(next(aardvark_record_all_fields)) == "123"


def test_aardvark_get_alternate_titles_success(aardvark_record_all_fields):
    """Check that the fixture's alternate title becomes an AlternateTitle."""
    source_record = next(aardvark_record_all_fields)
    alternate_titles = MITAardvark.get_alternate_titles(source_record)
    assert alternate_titles == [timdex.AlternateTitle(value="Alternate title")]


def test_aardvark_get_contributors_success(aardvark_record_all_fields):
    """Check that each fixture creator becomes a Contributor of kind "Creator"."""
    source_record = next(aardvark_record_all_fields)
    expected_contributors = [
        timdex.Contributor(value=creator_name, kind="Creator")
        for creator_name in ("Smith, Jane", "Smith, John")
    ]
    assert MITAardvark.get_contributors(source_record) == expected_contributors


def test_aardvark_get_notes_success(aardvark_record_all_fields):
    """Check that each fixture display note becomes its own "Display note" Note."""
    source_record = next(aardvark_record_all_fields)
    expected_note_values = [
        "Danger: This text will be displayed in a red box",
        "Info: This text will be displayed in a blue box",
        "Tip: This text will be displayed in a green box",
        "Warning: This text will be displayed in a yellow box",
        "This is text without a tag and it will be assigned default 'note' style",
    ]
    expected_notes = [
        timdex.Note(value=[note_value], kind="Display note")
        for note_value in expected_note_values
    ]
    assert MITAardvark.get_notes(source_record) == expected_notes


def test_aardvark_get_publication_information_success(aardvark_record_all_fields):
    """Check that publisher values are followed by the provider value."""
    source_record = next(aardvark_record_all_fields)
    publication_information = MITAardvark.get_publication_information(source_record)
    assert publication_information == ["ML InfoMap (Firm)", "MIT"]


def test_aardvark_get_rights_success(aardvark_record_all_fields):
    """Check the Rights built from access, license, rights, and rights-holder values."""
    source_record = next(aardvark_record_all_fields)
    expected_rights = [
        timdex.Rights(description="Access note", kind="Access"),
        timdex.Rights(uri="http://license.license"),
        timdex.Rights(description="Some person has the rights"),
        timdex.Rights(
            description="The person with the rights. Another person with the rights"
        ),
    ]
    assert MITAardvark.get_rights(source_record) == expected_rights


def test_aardvark_get_subjects_success(aardvark_record_all_fields):
assert MITAardvark.get_subjects(aardvark_record_all_fields) == [
assert MITAardvark.get_subjects(next(aardvark_record_all_fields)) == [
timdex.Subject(value=["Country"], kind="DCAT Keyword"),
timdex.Subject(value=["Political boundaries"], kind="DCAT Theme"),
timdex.Subject(value=["Geography"], kind="Dublin Core Subject"),
Expand Down
102 changes: 97 additions & 5 deletions transmogrifier/sources/json/aardvark.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,25 +56,27 @@ def get_optional_fields(self, source_record: dict) -> dict | None:
Overrides metaclass get_optional_fields() method.
Args:
xml: A BeautifulSoup Tag representing a single Datacite record in
oai_datacite XML.
source_record: A JSON object representing a source record.
"""
fields: dict = {}

# alternate_titles
fields["alternate_titles"] = self.get_alternate_titles(source_record) or None

# content_type
fields["content_type"] = ["Geospatial data"]

# contributors
fields["contributors"] = self.get_contributors(source_record) or None

# dates

# edition
# edition not used in MITAardvark

# format
fields["format"] = source_record.get("dct_format_s")

# funding_information
# funding_information not used in MITAardvark

# identifiers

Expand All @@ -86,19 +88,106 @@ def get_optional_fields(self, source_record: dict) -> dict | None:
# locations

# notes
fields["notes"] = self.get_notes(source_record) or None

# publication_information
fields["publication_information"] = (
self.get_publication_information(source_record) or None
)

# related_items
# related_items not used in MITAardvark

# rights
fields["rights"] = self.get_rights(source_record) or None

# subjects
fields["subjects"] = self.get_subjects(source_record) or None

# summary field
fields["summary"] = source_record.get("dct_description_sm")

return fields

@staticmethod
def get_alternate_titles(source_record: dict) -> list[timdex.AlternateTitle]:
    """Get values from source record for TIMDEX alternate_titles field.

    Args:
        source_record: A JSON object representing a source record.

    Returns:
        One timdex.AlternateTitle per value in "dct_alternative_sm", in source
        order, or an empty list when the record does not have that key.
    """
    if "dct_alternative_sm" not in source_record:
        return []

    return [
        timdex.AlternateTitle(value=alternate_title)
        for alternate_title in source_record["dct_alternative_sm"]
    ]

@staticmethod
def get_contributors(source_record: dict) -> list[timdex.Contributor]:
    """Get values from source record for TIMDEX contributors field.

    Args:
        source_record: A JSON object representing a source record.

    Returns:
        One timdex.Contributor of kind "Creator" per value in
        "dct_creator_sm", in source order, or an empty list when the record
        does not have that key.
    """
    if "dct_creator_sm" not in source_record:
        return []

    return [
        timdex.Contributor(value=creator_name, kind="Creator")
        for creator_name in source_record["dct_creator_sm"]
    ]

@staticmethod
def get_notes(source_record: dict) -> list[timdex.Note]:
    """Get values from source record for TIMDEX notes field.

    Args:
        source_record: A JSON object representing a source record.

    Returns:
        One timdex.Note of kind "Display note" per value in
        "gbl_displayNote_sm", each wrapping its text in a single-item list,
        or an empty list when the record does not have that key.
    """
    if "gbl_displayNote_sm" not in source_record:
        return []

    return [
        timdex.Note(value=[display_note], kind="Display note")
        for display_note in source_record["gbl_displayNote_sm"]
    ]

@staticmethod
def get_publication_information(source_record: dict) -> list[str]:
"""Get values from source record for TIMDEX publication_information field."""
publication_information = []

if "dct_publisher_sm" in source_record:
publication_information.extend(source_record["dct_publisher_sm"])

if "schema_provider_s" in source_record:
publication_information.append(source_record["schema_provider_s"])

return publication_information

@staticmethod
def get_rights(source_record: dict) -> list[timdex.Rights]:
    """Get values from source record for TIMDEX rights field.

    Rights are emitted in this order: the access rights statement
    ("dct_accessRights_s", kind "Access"), license URI(s) ("dct_license_sm"),
    then one entry each for "dct_rights_sm" and "dct_rightsHolder_sm" whose
    values are joined with ". " into a single description.

    Args:
        source_record: A JSON object representing a source record.

    Returns:
        A list of timdex.Rights, empty when none of the fields are present.
    """
    rights = []

    if "dct_accessRights_s" in source_record:
        rights.append(
            timdex.Rights(
                description=source_record["dct_accessRights_s"], kind="Access"
            )
        )

    if "dct_license_sm" in source_record:
        license_value = source_record["dct_license_sm"]
        if isinstance(license_value, (list, tuple)):
            # "_sm" fields are multi-valued: emit one Rights per license URI
            # instead of passing the whole list through as a single uri.
            rights.extend(
                timdex.Rights(uri=license_uri) for license_uri in license_value
            )
        else:
            # Single (scalar) value: keep the original one-entry behavior.
            rights.append(timdex.Rights(uri=license_value))

    for aardvark_rights_field in ["dct_rights_sm", "dct_rightsHolder_sm"]:
        if aardvark_rights_field in source_record:
            rights.append(
                timdex.Rights(
                    description=". ".join(source_record[aardvark_rights_field])
                )
            )

    return rights

@staticmethod
def get_subjects(source_record: dict) -> list[timdex.Subject]:
"""Get values from source record for TIMDEX subjects field.
Expand All @@ -115,18 +204,21 @@ def get_subjects(source_record: dict) -> list[timdex.Subject]:
source_record: A JSON object representing a source record.
"""
subjects = []

aardvark_subject_fields = {
"dcat_keyword_sm": "DCAT Keyword",
"dcat_theme_sm": "DCAT Theme",
"dct_subject_sm": "Dublin Core Subject",
"gbl_resourceClass_sm": "Subject scheme not provided",
"gbl_resourceType_sm": "Subject scheme not provided",
}

for aardvark_subject_field, kind_value in {
key: value
for key, value in aardvark_subject_fields.items()
if key in source_record
}.items():
for subject in source_record[aardvark_subject_field]:
subjects.append(timdex.Subject(value=[subject], kind=kind_value))

return subjects

0 comments on commit e9e00c8

Please sign in to comment.