zenodo soure & testing changes

MITLibraries · Nov 6, 2020 · f86c4df · f86c4df
1 parent aa01a11
commit f86c4df
Show file tree

Hide file tree

Showing 16 changed files with 847 additions and 58 deletions.
diff --git a/Pipfile b/Pipfile
@@ -26,6 +26,7 @@ sqlalchemy = "*"
 cx-oracle = "*"
 structlog = "*"
 colorama = "*"
+lxml = "*"
 
 [requires]
 python_version = "3.8"

diff --git a/hoard/cli.py b/hoard/cli.py
@@ -12,7 +12,7 @@
 from hoard.models import Dataset
 from hoard.names import AuthorService, engine, Warehouse
 from hoard.names.logging import s3_log
-from hoard.sources import JPAL, LincolnLab, WHOAS
+from hoard.sources import JPAL, LincolnLab, WHOAS, Zenodo
 
 
 @click.group()
@@ -38,7 +38,8 @@ def main():
 
 @main.command()
 @click.argument(
-    "source", type=click.Choice(["jpal", "llab", "whoas"], case_sensitive=False)
+    "source",
+    type=click.Choice(["jpal", "llab", "whoas", "zenodo"], case_sensitive=False),
 )
 @click.argument("source_url")
 @click.option("--key", "-k", envvar="HOARD_RDR_KEY", help="RDR authentication key.")
@@ -82,8 +83,12 @@ def ingest(
         stream = open(source_url)
         records = LincolnLab(stream)
     elif source == "whoas":
+        # do we need to be able to specify the set for whoas and zenodo?
         client = OAIClient(source_url, "dim", "com_1912_4134")
         records = WHOAS(client)
+    elif source == "zenodo":
+        client = OAIClient(source_url, "oai_datacite", "????")
+        records = Zenodo(client)
     for record in records:
         try:
             dv_id, p_id = rdr.create(record, parent=parent)

diff --git a/hoard/sources/__init__.py b/hoard/sources/__init__.py
@@ -1,6 +1,7 @@
 from hoard.sources.jpal import JPAL
 from hoard.sources.lincolnLab import LincolnLab
 from hoard.sources.whoas import WHOAS
+from hoard.sources.zenodo import Zenodo
 
 
-__all__ = ["JPAL", "LincolnLab", "WHOAS"]
+__all__ = ["JPAL", "LincolnLab", "WHOAS", "Zenodo"]
diff --git a/hoard/sources/zenodo.py b/hoard/sources/zenodo.py
@@ -0,0 +1,173 @@
+from datetime import datetime
+from typing import Any, Dict, Iterator
+
+import pycountry  # type: ignore
+import structlog  # type: ignore
+import xml.etree.ElementTree as ET
+
+from hoard.client import OAIClient
+from hoard.models import (
+    Author,
+    Contact,
+    Contributor,
+    Dataset,
+    Description,
+    Keyword,
+    OtherId,
+    Producer,
+    Publication,
+)
+
+logger = structlog.get_logger()
+
+namespace = {
+    "oai": "http://www.openarchives.org/OAI/2.0/",
+    "datacite": "http://schema.datacite.org/oai/oai-1.0/",
+    "datacite3": "http://datacite.org/schema/kernel-3",
+}
+
+
+class Zenodo:
+    def __init__(self, client: OAIClient) -> None:
+        self.client = client
+
+    def __iter__(self) -> Iterator[Dataset]:
+        return self
+
+    def __next__(self) -> Dataset:
+        while True:
+            record = next(self.client)
+            parsed_record = ET.fromstring(record)
+            if parsed_record.find(".//oai:error", namespace) is not None:
+                continue
+            else:
+                try:
+                    dataset = create_from_zenodo_datacite_xml(record, self.client)
+                    return dataset
+                except TypeError as ex:
+                    id_elem = parsed_record.find(".//oai:identifier", namespace)
+                    if id_elem is not None:
+                        rec_id = id_elem.text
+                    logger.info(f"Error with {rec_id}: {str(ex)}")
+
+
+def create_from_zenodo_datacite_xml(data: str, client: OAIClient) -> Dataset:
+    kwargs: Dict[str, Any] = {}
+    record = ET.fromstring(data)
+    titles = record.findall(".//datacite3:title", namespace)
+    # how to deal with multiple titles?
+    for title in titles:
+        kwargs["title"] = title.text
+    creators = record.findall(".//datacite3:creator", namespace)
+    for creator in creators:
+        creatorName = get_child_elem_text(creator, "creatorName")
+        creatorAffiliation = get_child_elem_text(creator, "affiliation")
+        creatorNameIdentifier = get_child_elem_text(creator, "nameIdentifier")
+        kwargs.setdefault("authors", []).append(
+            Author(
+                authorName=creatorName,
+                authorAffiliation=creatorAffiliation,
+                authorIdentifier=creatorNameIdentifier,
+            )
+        )
+    kwargs["contacts"] = [
+        Contact(datasetContactName="!!!!!!", datasetContactEmail="!!!!!!",)
+    ]
+    descriptions = record.findall(".//datacite3:description", namespace)
+    for description in [
+        d
+        for d in descriptions
+        if "descriptionType" in d.attrib and d.attrib["descriptionType"] == "Abstract"
+    ]:
+        if description.text is not None:
+            kwargs.setdefault("description", []).append(
+                Description(dsDescriptionValue=description.text)
+            )
+    subjects = record.findall(".//datacite3:subject", namespace)
+    for subject in subjects:
+        kwargs.setdefault("keywords", []).append(Keyword(keywordValue=subject.text))
+    dates = record.findall(".//datacite3:date", namespace)
+    for date in [
+        d for d in dates if "dateType" in d.attrib and d.attrib["dateType"] == "Issued"
+    ]:
+        if date.text is not None:
+            try:
+                datetime.strptime(date.text, "%Y-%m-%d")
+                kwargs["distributionDate"] = date.text
+            except ValueError:
+                pass
+    identifier = record.find(".//datacite3:identifier", namespace)
+    if identifier is not None:
+        kwargs["alternativeURL"] = identifier.text
+    alternate_ids = record.findall(".//datacite3:alternateIdentifier", namespace)
+    for alternate_id in alternate_ids:
+        kwargs.setdefault("otherIds", []).append(
+            OtherId(otherIdValue=alternate_id.text)
+        )
+    resource_types = record.findall(".//datacite3:resourceType", namespace)
+    for resource_type in resource_types:
+        if resource_type.text is not None:
+            kwargs.setdefault("kindOfData", []).append(resource_type.text)
+        if resource_type.attrib["resourceTypeGeneral"] is not None:
+            kwargs.setdefault("kindOfData", []).append(
+                resource_type.attrib["resourceTypeGeneral"]
+            )
+    contributors = record.findall(".//datacite3:contributor", namespace)
+    for contributor in contributors:
+        contributorName = get_child_elem_text(contributor, "contributorName")
+        contributorType = get_child_elem_text(contributor, "contributorType")
+        kwargs.setdefault("contributors", []).append(
+            Contributor(
+                contributorName=contributorName, contributorType=contributorType
+            )
+        )
+    publishers = record.findall(".//datacite3:publisher", namespace)
+    for publisher in [p for p in publishers if p is not None]:
+        kwargs.setdefault("producers", []).append(Producer(producerName=publisher.text))
+    related_ids = record.findall(".//datacite3:relatedIdentifier", namespace)
+    for related_id in [
+        r
+        for r in related_ids
+        if "relationType" in r.attrib and r.attrib["relationType"] == "IsSupplementTo"
+    ]:
+        publicationIDNumber = None
+        publicationIDType = None
+        if related_id.text is not None:
+            publicationIDNumber = related_id.text
+        if "relatedIdentifierType" in related_id.attrib:
+            publicationIDType = related_id.attrib["relatedIdentifierType"]
+        kwargs.setdefault("publications", []).append(
+            Publication(
+                publicationIDNumber=publicationIDNumber,
+                publicationIDType=publicationIDType,
+            )
+        )
+    languages = record.findall(".//datacite3:language", namespace)
+    for language in [x for x in languages if x is not None]:
+        if language.text is not None:
+            lang_value = pycountry.languages.get(alpha_2=language.text[:2])
+            if lang_value != "":
+                kwargs.setdefault("language", []).append(lang_value.name)
+    rights_list = record.findall(".//datacite3:rights", namespace)
+    # workaround until we figure out parsing
+    all_rights = ""
+    for rights in rights_list:
+        if rights.text is not None:
+            if all_rights != "":
+                all_rights += f". {rights.text}"
+            else:
+                all_rights = rights.text
+    if all_rights != "":
+        kwargs["license"] = all_rights
+        kwargs["termsOfUse"] = all_rights
+    kwargs["subjects"] = ["!!!!!!"]
+    if "description" not in kwargs:
+        kwargs["description"] = [Description(dsDescriptionValue=kwargs["title"])]
+    return Dataset(**kwargs)
+
+
+def get_child_elem_text(child_elem, child_elem_name):
+    child_elem = child_elem.find(f"datacite3:{child_elem_name}", namespace)
+    if child_elem is not None:
+        child_elem = child_elem.text
+    return child_elem
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -110,9 +110,6 @@ def whoas_oai_server(requests_mock, shared_datadir, request):
         "oai:darchive.mblwhoilibrary.org:1912/2372": (
             shared_datadir / "whoas/GetRecord_06.xml"
         ).read_text(),
-        "oai:darchive.mblwhoilibrary.org:1912/2373": (
-            shared_datadir / "whoas/GetRecord_07.xml"
-        ).read_text(),
     }
     requests_mock.get(
         f"{url}?verb=ListIdentifiers",
@@ -127,7 +124,45 @@ def whoas_oai_server(requests_mock, shared_datadir, request):
         records["oai:darchive.mblwhoilibrary.org:1912/2370"],
         records["oai:darchive.mblwhoilibrary.org:1912/2371"],
         records["oai:darchive.mblwhoilibrary.org:1912/2372"],
-        records["oai:darchive.mblwhoilibrary.org:1912/2373"],
+    ]
+
+
+@pytest.fixture
+def zenodo_oai_server(requests_mock, shared_datadir, request):
+    url = "http+mock://example.com/oai"
+    records = {
+        "oai:zenodo.org:807748": (
+            shared_datadir / "zenodo/GetRecord_01.xml"
+        ).read_text(),
+        "oai:zenodo.org:807749": (
+            shared_datadir / "zenodo/GetRecord_02.xml"
+        ).read_text(),
+        "oai:zenodo.org:807750": (
+            shared_datadir / "zenodo/GetRecord_03.xml"
+        ).read_text(),
+        "oai:zenodo.org:807751": (
+            shared_datadir / "zenodo/GetRecord_04.xml"
+        ).read_text(),
+        "oai:zenodo.org:807752": (
+            shared_datadir / "zenodo/GetRecord_05.xml"
+        ).read_text(),
+        "oai:zenodo.org:807753": (
+            shared_datadir / "zenodo/GetRecord_06.xml"
+        ).read_text(),
+    }
+    requests_mock.get(
+        f"{url}?verb=ListIdentifiers",
+        text=(shared_datadir / "zenodo/ListRecords.xml").read_text(),
+    )
+    for k, v in records.items():
+        requests_mock.get(f"{url}?identifier={k}", text=v)
+    return [
+        records["oai:zenodo.org:807748"],
+        records["oai:zenodo.org:807749"],
+        records["oai:zenodo.org:807750"],
+        records["oai:zenodo.org:807751"],
+        records["oai:zenodo.org:807752"],
+        records["oai:zenodo.org:807753"],
     ]
 
 

diff --git a/tests/data/whoas/GetRecord_07.xml b/tests/data/whoas/GetRecord_07.xml
diff --git a/tests/data/whoas/ListRecords.xml b/tests/data/whoas/ListRecords.xml
@@ -28,5 +28,29 @@
       <setSpec>com_1912_4</setSpec>
       <setSpec>col_1912_2364</setSpec>
     </header>
+    <header>
+      <identifier>oai:darchive.mblwhoilibrary.org:1912/2370</identifier>
+      <datestamp>2016-09-26T17:42:49Z</datestamp>
+      <setSpec>com_1912_1726</setSpec>
+      <setSpec>com_1912_1725</setSpec>
+      <setSpec>com_1912_4</setSpec>
+      <setSpec>col_1912_2364</setSpec>
+    </header>
+    <header>
+      <identifier>oai:darchive.mblwhoilibrary.org:1912/2371</identifier>
+      <datestamp>2016-09-26T17:42:49Z</datestamp>
+      <setSpec>com_1912_1726</setSpec>
+      <setSpec>com_1912_1725</setSpec>
+      <setSpec>com_1912_4</setSpec>
+      <setSpec>col_1912_2364</setSpec>
+    </header>
+    <header>
+      <identifier>oai:darchive.mblwhoilibrary.org:1912/2372</identifier>
+      <datestamp>2016-09-26T17:42:49Z</datestamp>
+      <setSpec>com_1912_1726</setSpec>
+      <setSpec>com_1912_1725</setSpec>
+      <setSpec>com_1912_4</setSpec>
+      <setSpec>col_1912_2364</setSpec>
+    </header>
   </ListIdentifiers>
 </OAI-PMH>