Updates based on discussion in PR #17

* Remove unnecessary .get from alt title processing in datacite and dspace_dim transforms
* Refactor content_type processing code
* Remove unnecessary variable from dspace_mets transform
* Refactor contributor processing code
* Update publication_year conditional logic
* Refactor file_formats processing
* Update funding_information processing logic
* Update datacite and dspace_dim transforms to use a default value when an identifier type is missing
* Add links parameter to datacite transform for digital object link
* Remove unnecessary conditional from notes processing
* Update publisher conditional logic
* Refactor rights processing
* Refactor subjects processing
* Refactor summary processing
MITLibraries · Jun 30, 2022 · 236c019 · 236c019
1 parent b2bd2d7
commit 236c019
Show file tree

Hide file tree

Showing 4 changed files with 121 additions and 87 deletions.
diff --git a/tests/test_datacite.py b/tests/test_datacite.py
@@ -8,6 +8,7 @@
     Date_Range,
     Funder,
     Identifier,
+    Link,
     Location,
     Note,
     RelatedItem,
@@ -116,6 +117,13 @@ def test_datacite_record_all_fields(
             Identifier(value="1234567.5524464", kind="IsIdenticalTo"),
         ],
         locations=[Location(value="A point on the globe")],
+        links=[
+            Link(
+                url="https://example.com/doi:10.7910/DVN/19PPE7",
+                kind="Digital object URL",
+                text="Digital object URL",
+            )
+        ],
         languages=["en_US"],
         notes=[
             Note(value=["Survey Data"], kind="Datacite resource type"),
@@ -222,6 +230,13 @@ def test_datacite_optional_fields_blank_transforms_correctly(
         title="The Impact of Maternal Literacy and Participation Programs",
         format="electronic resource",
         identifiers=[Identifier(value="10.7910/DVN/19PPE7", kind="DOI")],
+        links=[
+            Link(
+                url="https://example.com/doi:10.7910/DVN/19PPE7",
+                kind="Digital object URL",
+                text="Digital object URL",
+            )
+        ],
     )
 
 
@@ -244,6 +259,13 @@ def test_datacite_record_optional_fields_missing_transforms_correctly(
         title="The Impact of Maternal Literacy and Participation Programs",
         format="electronic resource",
         identifiers=[Identifier(value="10.7910/DVN/19PPE7", kind="DOI")],
+        links=[
+            Link(
+                url="https://example.com/doi:10.7910/DVN/19PPE7",
+                kind="Digital object URL",
+                text="Digital object URL",
+            )
+        ],
     )
 
 

diff --git a/transmogrifier/sources/datacite.py b/transmogrifier/sources/datacite.py
@@ -76,69 +76,66 @@ def create_from_datacite_xml(
             kwargs.setdefault("alternate_titles", []).append(
                 timdex.AlternateTitle(
                     value=alternate_title.string,
-                    kind=alternate_title.get("titleType"),
+                    kind=alternate_title["titleType"],
                 )
             )
 
         # content_type
         resource_type = xml.metadata.find("resourceType")
-        if resource_type is None:
+        if resource_type and resource_type.string:
+            kwargs["notes"] = [
+                timdex.Note(value=[resource_type.string], kind="Datacite resource type")
+            ]
+            if resource_type["resourceTypeGeneral"]:
+                kwargs["content_type"] = [resource_type["resourceTypeGeneral"]]
+        else:
             logger.warning(
                 "Datacite record %s missing required Datacite field resourceType",
                 source_record_id,
             )
-        else:
-            if resource_type.string:
-                kwargs["notes"] = [
-                    timdex.Note(
-                        value=[resource_type.string], kind="Datacite resource type"
-                    )
-                ]
-            if resource_type["resourceTypeGeneral"]:
-                kwargs["content_type"] = [resource_type["resourceTypeGeneral"]]
 
         # contributors
-        for creator in [
-            c
-            for c in xml.metadata.find_all("creator")
-            if c.find("creatorName") and c.find("creatorName").string
-        ]:
-            kwargs.setdefault("contributors", []).append(
-                timdex.Contributor(
-                    value=creator.find("creatorName").string,
-                    affiliation=[a.string for a in creator.find_all("affiliation")]
-                    or None,
-                    identifier=[
-                        cls.generate_name_identifier_url(name_identifier)
-                        for name_identifier in creator.find_all("nameIdentifier")
-                    ]
-                    or None,
-                    kind="Creator",
+        for creator in xml.metadata.find_all("creator"):
+            creator_name_element = creator.find("creatorName")
+            if creator_name_element and creator_name_element.string:
+                kwargs.setdefault("contributors", []).append(
+                    timdex.Contributor(
+                        value=creator_name_element.string,
+                        affiliation=[a.string for a in creator.find_all("affiliation")]
+                        or None,
+                        identifier=[
+                            cls.generate_name_identifier_url(name_identifier)
+                            for name_identifier in creator.find_all("nameIdentifier")
+                        ]
+                        or None,
+                        kind="Creator",
+                    )
                 )
-            )
 
-        for contributor in [
-            c
-            for c in xml.metadata.find_all("contributor")
-            if c.find("contributorName") and c.find("contributorName").string
-        ]:
-            kwargs.setdefault("contributors", []).append(
-                timdex.Contributor(
-                    value=contributor.find("contributorName").string,
-                    affiliation=[a.string for a in contributor.find_all("affiliation")]
-                    or None,
-                    identifier=[
-                        cls.generate_name_identifier_url(name_identifier)
-                        for name_identifier in contributor.find_all("nameIdentifier")
-                    ]
-                    or None,
-                    kind=contributor["contributorType"],
+        for contributor in xml.metadata.find_all("contributor"):
+            contributor_name_element = contributor.find("contributorName")
+            if contributor_name_element and contributor_name_element.string:
+                kwargs.setdefault("contributors", []).append(
+                    timdex.Contributor(
+                        value=contributor_name_element.string,
+                        affiliation=[
+                            a.string for a in contributor.find_all("affiliation")
+                        ]
+                        or None,
+                        identifier=[
+                            cls.generate_name_identifier_url(name_identifier)
+                            for name_identifier in contributor.find_all(
+                                "nameIdentifier"
+                            )
+                        ]
+                        or None,
+                        kind=contributor["contributorType"],
+                    )
                 )
-            )
 
         # dates
         publication_year = xml.metadata.find("publicationYear")
-        if publication_year is None or publication_year.string is None:
+        if not publication_year or not publication_year.string:
             logger.warning(
                 "Datacite record %s missing required Datacite field publicationYear",
                 source_record_id,
@@ -168,21 +165,21 @@ def create_from_datacite_xml(
             kwargs["edition"] = edition.string
 
         # file_formats
-        for file_format in [f for f in xml.metadata.find_all("format") if f.string]:
-            kwargs.setdefault("file_formats", []).append(file_format.string)
+        kwargs["file_formats"] = [
+            f.string for f in xml.metadata.find_all("format") if f.string
+        ] or None
 
         # format
         kwargs["format"] = "electronic resource"
 
         # funding_information
         for funding_reference in [
-            fr
-            for fr in xml.metadata.find_all("fundingReference")
-            if fr.find("funderName").string
+            fr for fr in xml.metadata.find_all("fundingReference")
         ]:
-            f = timdex.Funder(
-                funder_name=funding_reference.find("funderName").string,
-            )
+            f = timdex.Funder()
+            funder_name = funding_reference.find("funderName")
+            if funder_name.string:
+                f.funder_name = funder_name.string
             award_number = funding_reference.find("awardNumber")
             if award_number and award_number.string:
                 f.award_number = award_number.string
@@ -191,14 +188,17 @@ def create_from_datacite_xml(
             if funder_identifier and funder_identifier.string:
                 f.funder_identifier = funder_identifier.string
                 f.funder_identifier_type = funder_identifier.get("funderIdentifierType")
-            kwargs.setdefault("funding_information", []).append(f)
+            if f != timdex.Funder():
+                kwargs.setdefault("funding_information", []).append(f)
 
         # identifiers
         identifier_xml = xml.metadata.find("identifier")
         kwargs["identifiers"] = [
             timdex.Identifier(
                 value=identifier_xml.string,
-                kind=identifier_xml["identifierType"],
+                kind=identifier_xml.get(
+                    "identifierType", "Identifier kind not specified"
+                ),
             ),
         ]
         for alternate_identifier in [
@@ -207,7 +207,9 @@ def create_from_datacite_xml(
             kwargs["identifiers"].append(
                 timdex.Identifier(
                     value=alternate_identifier.string,
-                    kind=alternate_identifier.get("alternateIdentifierType"),
+                    kind=alternate_identifier.get(
+                        "alternateIdentifierType", "Identifier kind not specified"
+                    ),
                 )
             )
 
@@ -219,7 +221,9 @@ def create_from_datacite_xml(
         ]:
             i = timdex.Identifier(
                 value=cls.generate_related_item_identifier_url(related_identifier),
-                kind=related_identifier.get("relationType"),
+                kind=related_identifier.get(
+                    "relationType", "Identifier kind not specified"
+                ),
             )
             kwargs["identifiers"].append(i)
 
@@ -228,6 +232,15 @@ def create_from_datacite_xml(
         if language and language.string:
             kwargs["languages"] = [language.string]
 
+        # links
+        kwargs["links"] = [
+            timdex.Link(
+                kind="Digital object URL",
+                text="Digital object URL",
+                url=kwargs["source_link"],
+            )
+        ]
+
         # locations
         for location in [
             gl for gl in xml.metadata.find_all("geoLocationPlace") if gl.string
@@ -245,18 +258,17 @@ def create_from_datacite_xml(
                     "@descriptionType",
                     source_record_id,
                 )
-            else:
-                if description.get("descriptionType") != "Abstract":
-                    kwargs.setdefault("notes", []).append(
-                        timdex.Note(
-                            value=[description.string],
-                            kind=description.get("descriptionType"),
-                        )
+            if description.get("descriptionType") != "Abstract":
+                kwargs.setdefault("notes", []).append(
+                    timdex.Note(
+                        value=[description.string],
+                        kind=description.get("descriptionType"),
                     )
+                )
 
         # publication_information
         publisher = xml.metadata.find("publisher")
-        if publisher is None or publisher.string is None:
+        if not publication_year or not publication_year.string:
             logger.warning(
                 "Datacite record %s missing required Datacite field publisher",
                 source_record_id,
@@ -274,14 +286,14 @@ def create_from_datacite_xml(
             )
 
         # rights
-        for rights in xml.metadata.find_all("rights"):
-            if rights.string or rights.get("rightsURI"):
-                r = timdex.Rights()
-                if rights.string:
-                    r.description = rights.string
-                if rights.get("rightsURI"):
-                    r.uri = rights.get("rightsURI")
-                kwargs.setdefault("rights", []).append(r)
+        for right in [
+            r for r in xml.metadata.find_all("rights") if r.string or r.get("rightsURI")
+        ]:
+            kwargs.setdefault("rights", []).append(
+                timdex.Rights(
+                    description=right.string or None, uri=right.get("rightsURI")
+                )
+            )
 
         # subjects
         subjects_dict: Dict[str, List[str]] = {}
@@ -294,18 +306,17 @@ def create_from_datacite_xml(
                 subjects_dict.setdefault(subject.attrs["subjectScheme"], []).append(
                     subject.string
                 )
-        for key, value in subjects_dict.items():
-            kwargs.setdefault("subjects", []).append(
-                timdex.Subject(value=value, kind=key)
-            )
+        kwargs["subjects"] = [
+            timdex.Subject(value=value, kind=key)
+            for key, value in subjects_dict.items()
+        ] or None
 
         # summary, uses description list retrieved for notes field
-        for description in [
-            d
+        kwargs["summary"] = [
+            d.string
             for d in descriptions
             if d.get("descriptionType") == "Abstract" and d.string
-        ]:
-            kwargs.setdefault("summary", []).append(description.string)
+        ] or None
 
         # citation, generate citation from other fields
         kwargs["citation"] = generate_citation(kwargs)

diff --git a/transmogrifier/sources/dspace_dim.py b/transmogrifier/sources/dspace_dim.py
@@ -78,7 +78,7 @@ def create_from_dspace_dim(
             kwargs.setdefault("alternate_titles", []).append(
                 timdex.AlternateTitle(
                     value=alternate_title.string,
-                    kind=alternate_title.get("qualifier"),
+                    kind=alternate_title["qualifier"],
                 )
             )
 
@@ -177,7 +177,7 @@ def create_from_dspace_dim(
             kwargs.setdefault("identifiers", []).append(
                 timdex.Identifier(
                     value=identifier.string,
-                    kind=identifier.get("qualifier"),
+                    kind=identifier.get("qualifier", "Identifier kind not specified"),
                 )
             )
 
@@ -267,7 +267,7 @@ def create_from_dspace_dim(
                     subject.string
                 )
             else:
-                subjects_dict.setdefault(subject.attrs["qualifier"], []).append(
+                subjects_dict.setdefault(subject["qualifier"], []).append(
                     subject.string
                 )
         for key, value in subjects_dict.items():

diff --git a/transmogrifier/sources/dspace_mets.py b/transmogrifier/sources/dspace_mets.py
@@ -105,9 +105,10 @@ def create_from_dspace_mets_xml(
         kwargs["citation"] = citation.string if citation and citation.string else None
 
         # content_type
-        content_types = xml.find_all("mods:genre")
         kwargs["content_type"] = [
-            content_type.string for content_type in content_types if content_type.string
+            content_type.string
+            for content_type in xml.find_all("mods:genre")
+            if content_type.string
         ] or None
 
         # contents: relevant field in DSpace (dc.description.tableofcontents) is not