Skip to content

Commit

Permalink
Updates based on discussion in PR #17
Browse files Browse the repository at this point in the history
* Remove unnecessary .get from alt title processing in datacite and dspace_dim transforms
* Refactor content_type processing code
* Remove unnecessary variable from dspace_mets transform
* Refactor contributor processing code
* Update publication_year conditional logic
* Refactor file_formats processing
* Update funding_information processing logic
* Update datacite and dspace_dim to use a default value when an identifier type is missing
* Add links parameter to datacite transform for digital object link
* Remove unnecessary conditional from notes processing
* Update publisher conditional logic
* Refactor rights processing
* Refactor subjects processing
* Refactor summary processing
  • Loading branch information
ehanson8 committed Jun 30, 2022
1 parent b2bd2d7 commit 236c019
Show file tree
Hide file tree
Showing 4 changed files with 121 additions and 87 deletions.
22 changes: 22 additions & 0 deletions tests/test_datacite.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
Date_Range,
Funder,
Identifier,
Link,
Location,
Note,
RelatedItem,
Expand Down Expand Up @@ -116,6 +117,13 @@ def test_datacite_record_all_fields(
Identifier(value="1234567.5524464", kind="IsIdenticalTo"),
],
locations=[Location(value="A point on the globe")],
links=[
Link(
url="https://example.com/doi:10.7910/DVN/19PPE7",
kind="Digital object URL",
text="Digital object URL",
)
],
languages=["en_US"],
notes=[
Note(value=["Survey Data"], kind="Datacite resource type"),
Expand Down Expand Up @@ -222,6 +230,13 @@ def test_datacite_optional_fields_blank_transforms_correctly(
title="The Impact of Maternal Literacy and Participation Programs",
format="electronic resource",
identifiers=[Identifier(value="10.7910/DVN/19PPE7", kind="DOI")],
links=[
Link(
url="https://example.com/doi:10.7910/DVN/19PPE7",
kind="Digital object URL",
text="Digital object URL",
)
],
)


Expand All @@ -244,6 +259,13 @@ def test_datacite_record_optional_fields_missing_transforms_correctly(
title="The Impact of Maternal Literacy and Participation Programs",
format="electronic resource",
identifiers=[Identifier(value="10.7910/DVN/19PPE7", kind="DOI")],
links=[
Link(
url="https://example.com/doi:10.7910/DVN/19PPE7",
kind="Digital object URL",
text="Digital object URL",
)
],
)


Expand Down
175 changes: 93 additions & 82 deletions transmogrifier/sources/datacite.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,69 +76,66 @@ def create_from_datacite_xml(
kwargs.setdefault("alternate_titles", []).append(
timdex.AlternateTitle(
value=alternate_title.string,
kind=alternate_title.get("titleType"),
kind=alternate_title["titleType"],
)
)

# content_type
resource_type = xml.metadata.find("resourceType")
if resource_type is None:
if resource_type and resource_type.string:
kwargs["notes"] = [
timdex.Note(value=[resource_type.string], kind="Datacite resource type")
]
if resource_type["resourceTypeGeneral"]:
kwargs["content_type"] = [resource_type["resourceTypeGeneral"]]
else:
logger.warning(
"Datacite record %s missing required Datacite field resourceType",
source_record_id,
)
else:
if resource_type.string:
kwargs["notes"] = [
timdex.Note(
value=[resource_type.string], kind="Datacite resource type"
)
]
if resource_type["resourceTypeGeneral"]:
kwargs["content_type"] = [resource_type["resourceTypeGeneral"]]

# contributors
for creator in [
c
for c in xml.metadata.find_all("creator")
if c.find("creatorName") and c.find("creatorName").string
]:
kwargs.setdefault("contributors", []).append(
timdex.Contributor(
value=creator.find("creatorName").string,
affiliation=[a.string for a in creator.find_all("affiliation")]
or None,
identifier=[
cls.generate_name_identifier_url(name_identifier)
for name_identifier in creator.find_all("nameIdentifier")
]
or None,
kind="Creator",
for creator in xml.metadata.find_all("creator"):
creator_name_element = creator.find("creatorName")
if creator_name_element and creator_name_element.string:
kwargs.setdefault("contributors", []).append(
timdex.Contributor(
value=creator_name_element.string,
affiliation=[a.string for a in creator.find_all("affiliation")]
or None,
identifier=[
cls.generate_name_identifier_url(name_identifier)
for name_identifier in creator.find_all("nameIdentifier")
]
or None,
kind="Creator",
)
)
)

for contributor in [
c
for c in xml.metadata.find_all("contributor")
if c.find("contributorName") and c.find("contributorName").string
]:
kwargs.setdefault("contributors", []).append(
timdex.Contributor(
value=contributor.find("contributorName").string,
affiliation=[a.string for a in contributor.find_all("affiliation")]
or None,
identifier=[
cls.generate_name_identifier_url(name_identifier)
for name_identifier in contributor.find_all("nameIdentifier")
]
or None,
kind=contributor["contributorType"],
for contributor in xml.metadata.find_all("contributor"):
contributor_name_element = contributor.find("contributorName")
if contributor_name_element and contributor_name_element.string:
kwargs.setdefault("contributors", []).append(
timdex.Contributor(
value=contributor_name_element.string,
affiliation=[
a.string for a in contributor.find_all("affiliation")
]
or None,
identifier=[
cls.generate_name_identifier_url(name_identifier)
for name_identifier in contributor.find_all(
"nameIdentifier"
)
]
or None,
kind=contributor["contributorType"],
)
)
)

# dates
publication_year = xml.metadata.find("publicationYear")
if publication_year is None or publication_year.string is None:
if not publication_year or not publication_year.string:
logger.warning(
"Datacite record %s missing required Datacite field publicationYear",
source_record_id,
Expand Down Expand Up @@ -168,21 +165,21 @@ def create_from_datacite_xml(
kwargs["edition"] = edition.string

# file_formats
for file_format in [f for f in xml.metadata.find_all("format") if f.string]:
kwargs.setdefault("file_formats", []).append(file_format.string)
kwargs["file_formats"] = [
f.string for f in xml.metadata.find_all("format") if f.string
] or None

# format
kwargs["format"] = "electronic resource"

# funding_information
for funding_reference in [
fr
for fr in xml.metadata.find_all("fundingReference")
if fr.find("funderName").string
fr for fr in xml.metadata.find_all("fundingReference")
]:
f = timdex.Funder(
funder_name=funding_reference.find("funderName").string,
)
f = timdex.Funder()
funder_name = funding_reference.find("funderName")
if funder_name.string:
f.funder_name = funder_name.string
award_number = funding_reference.find("awardNumber")
if award_number and award_number.string:
f.award_number = award_number.string
Expand All @@ -191,14 +188,17 @@ def create_from_datacite_xml(
if funder_identifier and funder_identifier.string:
f.funder_identifier = funder_identifier.string
f.funder_identifier_type = funder_identifier.get("funderIdentifierType")
kwargs.setdefault("funding_information", []).append(f)
if f != timdex.Funder():
kwargs.setdefault("funding_information", []).append(f)

# identifiers
identifier_xml = xml.metadata.find("identifier")
kwargs["identifiers"] = [
timdex.Identifier(
value=identifier_xml.string,
kind=identifier_xml["identifierType"],
kind=identifier_xml.get(
"identifierType", "Identifier kind not specified"
),
),
]
for alternate_identifier in [
Expand All @@ -207,7 +207,9 @@ def create_from_datacite_xml(
kwargs["identifiers"].append(
timdex.Identifier(
value=alternate_identifier.string,
kind=alternate_identifier.get("alternateIdentifierType"),
kind=alternate_identifier.get(
"alternateIdentifierType", "Identifier kind not specified"
),
)
)

Expand All @@ -219,7 +221,9 @@ def create_from_datacite_xml(
]:
i = timdex.Identifier(
value=cls.generate_related_item_identifier_url(related_identifier),
kind=related_identifier.get("relationType"),
kind=related_identifier.get(
"relationType", "Identifier kind not specified"
),
)
kwargs["identifiers"].append(i)

Expand All @@ -228,6 +232,15 @@ def create_from_datacite_xml(
if language and language.string:
kwargs["languages"] = [language.string]

# links
kwargs["links"] = [
timdex.Link(
kind="Digital object URL",
text="Digital object URL",
url=kwargs["source_link"],
)
]

# locations
for location in [
gl for gl in xml.metadata.find_all("geoLocationPlace") if gl.string
Expand All @@ -245,18 +258,17 @@ def create_from_datacite_xml(
"@descriptionType",
source_record_id,
)
else:
if description.get("descriptionType") != "Abstract":
kwargs.setdefault("notes", []).append(
timdex.Note(
value=[description.string],
kind=description.get("descriptionType"),
)
if description.get("descriptionType") != "Abstract":
kwargs.setdefault("notes", []).append(
timdex.Note(
value=[description.string],
kind=description.get("descriptionType"),
)
)

# publication_information
publisher = xml.metadata.find("publisher")
if publisher is None or publisher.string is None:
if not publication_year or not publication_year.string:
logger.warning(
"Datacite record %s missing required Datacite field publisher",
source_record_id,
Expand All @@ -274,14 +286,14 @@ def create_from_datacite_xml(
)

# rights
for rights in xml.metadata.find_all("rights"):
if rights.string or rights.get("rightsURI"):
r = timdex.Rights()
if rights.string:
r.description = rights.string
if rights.get("rightsURI"):
r.uri = rights.get("rightsURI")
kwargs.setdefault("rights", []).append(r)
for right in [
r for r in xml.metadata.find_all("rights") if r.string or r.get("rightsURI")
]:
kwargs.setdefault("rights", []).append(
timdex.Rights(
description=right.string or None, uri=right.get("rightsURI")
)
)

# subjects
subjects_dict: Dict[str, List[str]] = {}
Expand All @@ -294,18 +306,17 @@ def create_from_datacite_xml(
subjects_dict.setdefault(subject.attrs["subjectScheme"], []).append(
subject.string
)
for key, value in subjects_dict.items():
kwargs.setdefault("subjects", []).append(
timdex.Subject(value=value, kind=key)
)
kwargs["subjects"] = [
timdex.Subject(value=value, kind=key)
for key, value in subjects_dict.items()
] or None

# summary, uses description list retrieved for notes field
for description in [
d
kwargs["summary"] = [
d.string
for d in descriptions
if d.get("descriptionType") == "Abstract" and d.string
]:
kwargs.setdefault("summary", []).append(description.string)
] or None

# citation, generate citation from other fields
kwargs["citation"] = generate_citation(kwargs)
Expand Down
6 changes: 3 additions & 3 deletions transmogrifier/sources/dspace_dim.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def create_from_dspace_dim(
kwargs.setdefault("alternate_titles", []).append(
timdex.AlternateTitle(
value=alternate_title.string,
kind=alternate_title.get("qualifier"),
kind=alternate_title["qualifier"],
)
)

Expand Down Expand Up @@ -177,7 +177,7 @@ def create_from_dspace_dim(
kwargs.setdefault("identifiers", []).append(
timdex.Identifier(
value=identifier.string,
kind=identifier.get("qualifier"),
kind=identifier.get("qualifier", "Identifier kind not specified"),
)
)

Expand Down Expand Up @@ -267,7 +267,7 @@ def create_from_dspace_dim(
subject.string
)
else:
subjects_dict.setdefault(subject.attrs["qualifier"], []).append(
subjects_dict.setdefault(subject["qualifier"], []).append(
subject.string
)
for key, value in subjects_dict.items():
Expand Down
5 changes: 3 additions & 2 deletions transmogrifier/sources/dspace_mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,9 +105,10 @@ def create_from_dspace_mets_xml(
kwargs["citation"] = citation.string if citation and citation.string else None

# content_type
content_types = xml.find_all("mods:genre")
kwargs["content_type"] = [
content_type.string for content_type in content_types if content_type.string
content_type.string
for content_type in xml.find_all("mods:genre")
if content_type.string
] or None

# contents: relevant field in DSpace (dc.description.tableofcontents) is not
Expand Down

0 comments on commit 236c019

Please sign in to comment.