From e9e00c811ea8f9c088437abb41c0b2c9df312aea Mon Sep 17 00:00:00 2001 From: Eric Hanson Date: Tue, 19 Dec 2023 14:28:10 -0500 Subject: [PATCH] Additional MITAardvark methods Why these changes are being introduced: * Additional methods are needed for the MITAardvark class How this addresses that need: * Update get_optional_fields method to include format and summary values and add corresponding unit test * Add get_alternate_titles, get_contributors, get_notes, get_publication_information, and get_rights field methods along with calls in get_optional_fields and corresponding unit tests * Update aardvark_record_all_fields fixture to include new fields Side effects of this change: * None Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/GDT-54 --- tests/conftest.py | 6 +- .../aardvark/aardvark_record_all_fields.jsonl | 2 +- tests/sources/json/test_aardvark.py | 83 +++++++++++++- transmogrifier/sources/json/aardvark.py | 102 +++++++++++++++++- 4 files changed, 179 insertions(+), 14 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 8616c13..c2bab99 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -48,10 +48,8 @@ def runner(): @pytest.fixture def aardvark_record_all_fields(): - return next( - JsonTransformer.parse_source_file( - "tests/fixtures/aardvark/aardvark_record_all_fields.jsonl" - ) + return JsonTransformer.parse_source_file( + "tests/fixtures/aardvark/aardvark_record_all_fields.jsonl" ) diff --git a/tests/fixtures/aardvark/aardvark_record_all_fields.jsonl b/tests/fixtures/aardvark/aardvark_record_all_fields.jsonl index af39020..43511dc 100644 --- a/tests/fixtures/aardvark/aardvark_record_all_fields.jsonl +++ b/tests/fixtures/aardvark/aardvark_record_all_fields.jsonl @@ -1 +1 @@ -{"id": "123", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector 
data"], "dct_title_s": "Test title 1"} \ No newline at end of file +{"id": "123", "dcat_bbox": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_accessRights_s": "Access note", "dct_alternative_sm": ["Alternate title"], "dct_creator_sm": ["Smith, Jane", "Smith, John"], "dct_description_sm": ["A description"], "dct_format_s": "Shapefile", "dct_language_sm": ["eng"], "dct_license_sm": "http://license.license", "dct_publisher_sm": ["ML InfoMap (Firm)"], "dct_rights_sm": ["Some person has the rights"], "dct_rightsHolder_sm": ["The person with the rights", "Another person with the rights"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "dct_title_s": "Test title 1", "gbl_displayNote_sm": ["Danger: This text will be displayed in a red box","Info: This text will be displayed in a blue box","Tip: This text will be displayed in a green box","Warning: This text will be displayed in a yellow box","This is text without a tag and it will be assigned default 'note' style"], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "locn_geometry": "POLYGON((-80 25, -65 18, -64 33, -80 25))", "schema_provider_s": "MIT"} \ No newline at end of file diff --git a/tests/sources/json/test_aardvark.py b/tests/sources/json/test_aardvark.py index a87637e..8389642 100644 --- a/tests/sources/json/test_aardvark.py +++ b/tests/sources/json/test_aardvark.py @@ -12,7 +12,7 @@ def test_aardvark_get_required_fields_returns_expected_values(aardvark_records): } -def test_jsontransformer_transform_returns_timdex_record(aardvark_records): +def test_aardvark_transform_returns_timdex_record(aardvark_records): transformer = MITAardvark("cool-repo", aardvark_records) assert next(transformer) == timdex.TimdexRecord( source="A Cool Repository", @@ -24,16 +24,91 @@ def test_jsontransformer_transform_returns_timdex_record(aardvark_records): ) +def 
test_aardvark_get_optional_fields_non_field_method_values_success( + aardvark_record_all_fields, +): + transformer = MITAardvark("cool-repo", aardvark_record_all_fields) + record = next(transformer) + assert record.format == "Shapefile" + assert record.languages == ["eng"] + assert record.summary == ["A description"] + + def test_aardvark_get_main_titles_success(aardvark_record_all_fields): - assert MITAardvark.get_main_titles(aardvark_record_all_fields) == ["Test title 1"] + assert MITAardvark.get_main_titles(next(aardvark_record_all_fields)) == [ + "Test title 1" + ] def test_aardvark_get_source_record_id_success(aardvark_record_all_fields): - assert MITAardvark.get_source_record_id(aardvark_record_all_fields) == "123" + assert MITAardvark.get_source_record_id(next(aardvark_record_all_fields)) == "123" + + +def test_aardvark_get_alternate_titles_success(aardvark_record_all_fields): + assert MITAardvark.get_alternate_titles(next(aardvark_record_all_fields)) == [ + timdex.AlternateTitle(value="Alternate title") + ] + + +def test_aardvark_get_contributors_success(aardvark_record_all_fields): + assert MITAardvark.get_contributors(next(aardvark_record_all_fields)) == [ + timdex.Contributor( + value="Smith, Jane", + kind="Creator", + ), + timdex.Contributor( + value="Smith, John", + kind="Creator", + ), + ] + + +def test_aardvark_get_notes_success(aardvark_record_all_fields): + assert MITAardvark.get_notes(next(aardvark_record_all_fields)) == [ + timdex.Note( + value=["Danger: This text will be displayed in a red box"], + kind="Display note", + ), + timdex.Note( + value=["Info: This text will be displayed in a blue box"], + kind="Display note", + ), + timdex.Note( + value=["Tip: This text will be displayed in a green box"], + kind="Display note", + ), + timdex.Note( + value=["Warning: This text will be displayed in a yellow box"], + kind="Display note", + ), + timdex.Note( + value=[ + "This is text without a tag and it will be assigned default 'note' style" + ], + 
kind="Display note", + ), + ] + + +def test_aardvark_get_publication_information_success(aardvark_record_all_fields): + assert MITAardvark.get_publication_information( + next(aardvark_record_all_fields) + ) == ["ML InfoMap (Firm)", "MIT"] + + +def test_aardvark_get_rights_success(aardvark_record_all_fields): + assert MITAardvark.get_rights(next(aardvark_record_all_fields)) == [ + timdex.Rights(description="Access note", kind="Access"), + timdex.Rights(uri="http://license.license"), + timdex.Rights(description="Some person has the rights"), + timdex.Rights( + description="The person with the rights. Another person with the rights" + ), + ] def test_aardvark_get_subjects_success(aardvark_record_all_fields): - assert MITAardvark.get_subjects(aardvark_record_all_fields) == [ + assert MITAardvark.get_subjects(next(aardvark_record_all_fields)) == [ timdex.Subject(value=["Country"], kind="DCAT Keyword"), timdex.Subject(value=["Political boundaries"], kind="DCAT Theme"), timdex.Subject(value=["Geography"], kind="Dublin Core Subject"), diff --git a/transmogrifier/sources/json/aardvark.py b/transmogrifier/sources/json/aardvark.py index 25429cf..7e0ec5f 100644 --- a/transmogrifier/sources/json/aardvark.py +++ b/transmogrifier/sources/json/aardvark.py @@ -56,25 +56,27 @@ def get_optional_fields(self, source_record: dict) -> dict | None: Overrides metaclass get_optional_fields() method. Args: - xml: A BeautifulSoup Tag representing a single Datacite record in - oai_datacite XML. + source_record: A JSON object representing a source record. 
""" fields: dict = {} # alternate_titles + fields["alternate_titles"] = self.get_alternate_titles(source_record) or None # content_type fields["content_type"] = ["Geospatial data"] # contributors + fields["contributors"] = self.get_contributors(source_record) or None # dates - # edition + # edition not used in MITAardvark # format + fields["format"] = source_record.get("dct_format_s") - # funding_information + # funding_information not used in MITAardvark # identifiers @@ -86,19 +88,106 @@ def get_optional_fields(self, source_record: dict) -> dict | None: # locations # notes + fields["notes"] = self.get_notes(source_record) or None # publication_information + fields["publication_information"] = ( + self.get_publication_information(source_record) or None + ) - # related_items + # related_items not used in MITAardvark # rights + fields["rights"] = self.get_rights(source_record) or None # subjects fields["subjects"] = self.get_subjects(source_record) or None # summary field + fields["summary"] = source_record.get("dct_description_sm") + return fields + @staticmethod + def get_alternate_titles(source_record: dict) -> list[timdex.AlternateTitle]: + """Get values from source record for TIMDEX alternate_titles field.""" + alternate_titles = [] + + if "dct_alternative_sm" in source_record: + for title_value in [ + title_value for title_value in source_record["dct_alternative_sm"] + ]: + alternate_titles.append(timdex.AlternateTitle(value=title_value)) + + return alternate_titles + + @staticmethod + def get_contributors(source_record: dict) -> list[timdex.Contributor]: + """Get values from source record for TIMDEX contributors field.""" + contributors = [] + + if "dct_creator_sm" in source_record: + for contributor_value in [ + contributor_value + for contributor_value in source_record["dct_creator_sm"] + ]: + contributors.append( + timdex.Contributor(value=contributor_value, kind="Creator") + ) + + return contributors + + @staticmethod + def get_notes(source_record: dict) 
-> list[timdex.Note]: + """Get values from source record for TIMDEX notes field.""" + notes = [] + + if "gbl_displayNote_sm" in source_record: + for note_value in [ + note_value for note_value in source_record["gbl_displayNote_sm"] + ]: + notes.append(timdex.Note(value=[note_value], kind="Display note")) + + return notes + + @staticmethod + def get_publication_information(source_record: dict) -> list[str]: + """Get values from source record for TIMDEX publication_information field.""" + publication_information = [] + + if "dct_publisher_sm" in source_record: + publication_information.extend(source_record["dct_publisher_sm"]) + + if "schema_provider_s" in source_record: + publication_information.append(source_record["schema_provider_s"]) + + return publication_information + + @staticmethod + def get_rights(source_record: dict) -> list[timdex.Rights]: + """Get values from source record for TIMDEX rights field.""" + rights = [] + + if "dct_accessRights_s" in source_record: + rights.append( + timdex.Rights( + description=source_record["dct_accessRights_s"], kind="Access" + ) + ) + + if "dct_license_sm" in source_record: + rights.append(timdex.Rights(uri=source_record["dct_license_sm"])) + + for aardvark_rights_field in ["dct_rights_sm", "dct_rightsHolder_sm"]: + if aardvark_rights_field in source_record: + rights.append( + timdex.Rights( + description=". ".join(source_record[aardvark_rights_field]) + ) + ) + + return rights + @staticmethod def get_subjects(source_record: dict) -> list[timdex.Subject]: """Get values from source record for TIMDEX subjects field. @@ -115,6 +204,7 @@ def get_subjects(source_record: dict) -> list[timdex.Subject]: source_record: A JSON object representing a source record. 
""" subjects = [] + aardvark_subject_fields = { "dcat_keyword_sm": "DCAT Keyword", "dcat_theme_sm": "DCAT Theme", @@ -122,6 +212,7 @@ def get_subjects(source_record: dict) -> list[timdex.Subject]: "gbl_resourceClass_sm": "Subject scheme not provided", "gbl_resourceType_sm": "Subject scheme not provided", } + for aardvark_subject_field, kind_value in { key: value for key, value in aardvark_subject_fields.items() @@ -129,4 +220,5 @@ def get_subjects(source_record: dict) -> list[timdex.Subject]: }.items(): for subject in source_record[aardvark_subject_field]: subjects.append(timdex.Subject(value=[subject], kind=kind_value)) + return subjects