Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Additional MITAardvark methods #109

Merged
merged 3 commits into from
Dec 20, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,8 @@ def runner():

@pytest.fixture
def aardvark_record_all_fields():
return next(
JsonTransformer.parse_source_file(
"tests/fixtures/aardvark/aardvark_record_all_fields.jsonl"
)
return JsonTransformer.parse_source_file(
Copy link
Contributor

@jonavellecuerdo jonavellecuerdo Dec 20, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, how do we feel about renaming:
JsonTransformer -> JSONTransformer--
using common capitalized abbreviations when naming functions?

Asking @ghukill , too! 🤓

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is following the capitalization pattern of classes like Ead where is was decided at some point in the past that all-caps class names like EAD weren't preferred. Not sure if that is following some linting principle but I don't have strong feelings either way about the capitalization

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just my two cents: I prefer, and have used, all caps like JSONTransformer or SQSClient as examples. But I noticed Transmog follows a pretty strict CamelCasing for classes. Doesn't feel like a deal breaker either way to me, but maybe leaning towards consistency in the project.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, I don't think it's based on a linting rule--since it seemed pass without error in geo-harvester, but I do think CamelCasing is generally preferred. I'm okay with it being consistent just within the repo at the least, so it's fine as is!

Copy link
Contributor Author

@ehanson8 ehanson8 Dec 20, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Honestly, I have to stop myself typing JSON rather than Json every time and we aren't consistent about it in other repos so I'll switch it to JSONTransformer and XMLTransformer

"tests/fixtures/aardvark/aardvark_record_all_fields.jsonl"
)


Expand Down
2 changes: 1 addition & 1 deletion tests/fixtures/aardvark/aardvark_record_all_fields.jsonl
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"id": "123", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "dct_title_s": "Test title 1"}
{"id": "123", "dcat_bbox": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_accessRights_s": "Access note", "dct_alternative_sm": ["Alternate title"], "dct_creator_sm": ["Smith, Jane", "Smith, John"], "dct_description_sm": ["A description"], "dct_format_s": "Shapefile", "dct_language_sm": ["eng"], "dct_license_sm": ["http://license.license", "http://another_license.another_license"], "dct_publisher_sm": ["ML InfoMap (Firm)"], "dct_rights_sm": ["Some person has the rights"], "dct_rightsHolder_sm": ["The person with the rights", "Another person with the rights"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "dct_title_s": "Test title 1", "gbl_displayNote_sm": ["Danger: This text will be displayed in a red box","Info: This text will be displayed in a blue box","Tip: This text will be displayed in a green box","Warning: This text will be displayed in a yellow box","This is text without a tag and it will be assigned default 'note' style"], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "locn_geometry": "POLYGON((-80 25, -65 18, -64 33, -80 25))", "schema_provider_s": "MIT"}
84 changes: 80 additions & 4 deletions tests/sources/json/test_aardvark.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def test_aardvark_get_required_fields_returns_expected_values(aardvark_records):
}


def test_jsontransformer_transform_returns_timdex_record(aardvark_records):
def test_aardvark_transform_returns_timdex_record(aardvark_records):
transformer = MITAardvark("cool-repo", aardvark_records)
assert next(transformer) == timdex.TimdexRecord(
source="A Cool Repository",
Expand All @@ -24,16 +24,92 @@ def test_jsontransformer_transform_returns_timdex_record(aardvark_records):
)


def test_aardvark_get_optional_fields_non_field_method_values_success(
aardvark_record_all_fields,
):
transformer = MITAardvark("cool-repo", aardvark_record_all_fields)
record = next(transformer)
assert record.format == "Shapefile"
assert record.languages == ["eng"]
assert record.summary == ["A description"]


def test_aardvark_get_main_titles_success(aardvark_record_all_fields):
assert MITAardvark.get_main_titles(aardvark_record_all_fields) == ["Test title 1"]
assert MITAardvark.get_main_titles(next(aardvark_record_all_fields)) == [
"Test title 1"
]


def test_aardvark_get_source_record_id_success(aardvark_record_all_fields):
assert MITAardvark.get_source_record_id(aardvark_record_all_fields) == "123"
assert MITAardvark.get_source_record_id(next(aardvark_record_all_fields)) == "123"


def test_aardvark_get_alternate_titles_success(aardvark_record_all_fields):
assert MITAardvark.get_alternate_titles(next(aardvark_record_all_fields)) == [
timdex.AlternateTitle(value="Alternate title")
]


def test_aardvark_get_contributors_success(aardvark_record_all_fields):
assert MITAardvark.get_contributors(next(aardvark_record_all_fields)) == [
timdex.Contributor(
value="Smith, Jane",
kind="Creator",
),
timdex.Contributor(
value="Smith, John",
kind="Creator",
),
]


def test_aardvark_get_notes_success(aardvark_record_all_fields):
assert MITAardvark.get_notes(next(aardvark_record_all_fields)) == [
timdex.Note(
value=["Danger: This text will be displayed in a red box"],
kind="Display note",
),
timdex.Note(
value=["Info: This text will be displayed in a blue box"],
kind="Display note",
),
timdex.Note(
value=["Tip: This text will be displayed in a green box"],
kind="Display note",
),
timdex.Note(
value=["Warning: This text will be displayed in a yellow box"],
kind="Display note",
),
timdex.Note(
value=[
"This is text without a tag and it will be assigned default 'note' style"
],
kind="Display note",
),
]


def test_aardvark_get_publication_information_success(aardvark_record_all_fields):
assert MITAardvark.get_publication_information(
next(aardvark_record_all_fields)
) == ["ML InfoMap (Firm)", "MIT"]


def test_aardvark_get_rights_success(aardvark_record_all_fields):
assert MITAardvark.get_rights(next(aardvark_record_all_fields)) == [
timdex.Rights(description="Access note", kind="Access"),
timdex.Rights(uri="http://license.license"),
timdex.Rights(uri="http://another_license.another_license"),
timdex.Rights(description="Some person has the rights"),
timdex.Rights(
description="The person with the rights. Another person with the rights"
),
]


def test_aardvark_get_subjects_success(aardvark_record_all_fields):
assert MITAardvark.get_subjects(aardvark_record_all_fields) == [
assert MITAardvark.get_subjects(next(aardvark_record_all_fields)) == [
timdex.Subject(value=["Country"], kind="DCAT Keyword"),
timdex.Subject(value=["Political boundaries"], kind="DCAT Theme"),
timdex.Subject(value=["Geography"], kind="Dublin Core Subject"),
Expand Down
90 changes: 84 additions & 6 deletions transmogrifier/sources/json/aardvark.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,30 +51,32 @@ def record_is_deleted(cls, source_record: dict) -> bool:

def get_optional_fields(self, source_record: dict) -> dict | None:
"""
Retrieve optional TIMDEX fields from a Aardvar JSON record.
Retrieve optional TIMDEX fields from an Aardvark JSON record.

Overrides metaclass get_optional_fields() method.

Args:
xml: A BeautifulSoup Tag representing a single Datacite record in
oai_datacite XML.
source_record: A JSON object representing a source record.
"""
fields: dict = {}

# alternate_titles
fields["alternate_titles"] = self.get_alternate_titles(source_record) or None
ehanson8 marked this conversation as resolved.
Show resolved Hide resolved

# content_type
fields["content_type"] = ["Geospatial data"]

# contributors
fields["contributors"] = self.get_contributors(source_record) or None

# dates

# edition
# edition not used in MITAardvark

# format
fields["format"] = source_record.get("dct_format_s")

# funding_information
# funding_information not used in MITAardvark

# identifiers

Expand All @@ -86,19 +88,92 @@ def get_optional_fields(self, source_record: dict) -> dict | None:
# locations

# notes
fields["notes"] = self.get_notes(source_record) or None

# publication_information
fields["publication_information"] = (
self.get_publication_information(source_record) or None
)

# related_items
# related_items not used in MITAardvark

# rights
fields["rights"] = self.get_rights(source_record) or None

# subjects
fields["subjects"] = self.get_subjects(source_record) or None

# summary field
fields["summary"] = source_record.get("dct_description_sm")

return fields

@staticmethod
def get_alternate_titles(source_record: dict) -> list[timdex.AlternateTitle]:
"""Get values from source record for TIMDEX alternate_titles field."""
return [
timdex.AlternateTitle(value=title_value)
for title_value in source_record.get("dct_alternative_sm", [])
]

@staticmethod
def get_contributors(source_record: dict) -> list[timdex.Contributor]:
"""Get values from source record for TIMDEX contributors field."""
return [
timdex.Contributor(value=contributor_value, kind="Creator")
for contributor_value in source_record.get("dct_creator_sm", [])
]

@staticmethod
def get_notes(source_record: dict) -> list[timdex.Note]:
"""Get values from source record for TIMDEX notes field."""
return [
timdex.Note(value=[note_value], kind="Display note")
for note_value in source_record.get("gbl_displayNote_sm", [])
]

@staticmethod
def get_publication_information(source_record: dict) -> list[str]:
"""Get values from source record for TIMDEX publication_information field."""
publication_information = []

if "dct_publisher_sm" in source_record:
publication_information.extend(source_record["dct_publisher_sm"])

if "schema_provider_s" in source_record:
publication_information.append(source_record["schema_provider_s"])

return publication_information
ehanson8 marked this conversation as resolved.
Show resolved Hide resolved

@staticmethod
def get_rights(source_record: dict) -> list[timdex.Rights]:
"""Get values from source record for TIMDEX rights field."""
rights = []

if "dct_accessRights_s" in source_record:
rights.append(
timdex.Rights(
description=source_record["dct_accessRights_s"], kind="Access"
)
)

rights.extend(
[
timdex.Rights(uri=rights_uri_value)
for rights_uri_value in source_record.get("dct_license_sm", [])
]
)
Comment on lines +160 to +165
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks good!


for aardvark_rights_field in ["dct_rights_sm", "dct_rightsHolder_sm"]:
if aardvark_rights_field in source_record:
rights.append(
timdex.Rights(
description=". ".join(source_record[aardvark_rights_field])
)
)

return rights

@staticmethod
def get_subjects(source_record: dict) -> list[timdex.Subject]:
"""Get values from source record for TIMDEX subjects field.
Expand All @@ -115,18 +190,21 @@ def get_subjects(source_record: dict) -> list[timdex.Subject]:
source_record: A JSON object representing a source record.
"""
subjects = []

aardvark_subject_fields = {
"dcat_keyword_sm": "DCAT Keyword",
"dcat_theme_sm": "DCAT Theme",
"dct_subject_sm": "Dublin Core Subject",
"gbl_resourceClass_sm": "Subject scheme not provided",
"gbl_resourceType_sm": "Subject scheme not provided",
}

for aardvark_subject_field, kind_value in {
key: value
for key, value in aardvark_subject_fields.items()
if key in source_record
}.items():
for subject in source_record[aardvark_subject_field]:
subjects.append(timdex.Subject(value=[subject], kind=kind_value))

return subjects
Loading