Skip to content
This repository has been archived by the owner on Jun 2, 2022. It is now read-only.

Commit

Permalink
zenodo source & testing changes
Browse files Browse the repository at this point in the history
  • Loading branch information
ehanson8 committed Nov 6, 2020
1 parent aa01a11 commit f86c4df
Show file tree
Hide file tree
Showing 16 changed files with 847 additions and 58 deletions.
1 change: 1 addition & 0 deletions Pipfile
Expand Up @@ -26,6 +26,7 @@ sqlalchemy = "*"
cx-oracle = "*"
structlog = "*"
colorama = "*"
lxml = "*"

[requires]
python_version = "3.8"
Expand Down
9 changes: 7 additions & 2 deletions hoard/cli.py
Expand Up @@ -12,7 +12,7 @@
from hoard.models import Dataset
from hoard.names import AuthorService, engine, Warehouse
from hoard.names.logging import s3_log
from hoard.sources import JPAL, LincolnLab, WHOAS
from hoard.sources import JPAL, LincolnLab, WHOAS, Zenodo


@click.group()
Expand All @@ -38,7 +38,8 @@ def main():

@main.command()
@click.argument(
"source", type=click.Choice(["jpal", "llab", "whoas"], case_sensitive=False)
"source",
type=click.Choice(["jpal", "llab", "whoas", "zenodo"], case_sensitive=False),
)
@click.argument("source_url")
@click.option("--key", "-k", envvar="HOARD_RDR_KEY", help="RDR authentication key.")
Expand Down Expand Up @@ -82,8 +83,12 @@ def ingest(
stream = open(source_url)
records = LincolnLab(stream)
elif source == "whoas":
# do we need to be able to specify the set for whoas and zenodo?
client = OAIClient(source_url, "dim", "com_1912_4134")
records = WHOAS(client)
elif source == "zenodo":
client = OAIClient(source_url, "oai_datacite", "????")
records = Zenodo(client)
for record in records:
try:
dv_id, p_id = rdr.create(record, parent=parent)
Expand Down
3 changes: 2 additions & 1 deletion hoard/sources/__init__.py
@@ -1,6 +1,7 @@
from hoard.sources.jpal import JPAL
from hoard.sources.lincolnLab import LincolnLab
from hoard.sources.whoas import WHOAS
from hoard.sources.zenodo import Zenodo


__all__ = ["JPAL", "LincolnLab", "WHOAS"]
__all__ = ["JPAL", "LincolnLab", "WHOAS", "Zenodo"]
173 changes: 173 additions & 0 deletions hoard/sources/zenodo.py
@@ -0,0 +1,173 @@
from datetime import datetime
from typing import Any, Dict, Iterator

import pycountry # type: ignore
import structlog # type: ignore
import xml.etree.ElementTree as ET

from hoard.client import OAIClient
from hoard.models import (
Author,
Contact,
Contributor,
Dataset,
Description,
Keyword,
OtherId,
Producer,
Publication,
)

logger = structlog.get_logger()

# Prefix-to-URI map passed as the namespace argument to the ElementTree
# find()/findall() calls in this module. The visible lookups use the "oai"
# and "datacite3" prefixes.
namespace = {
    "oai": "http://www.openarchives.org/OAI/2.0/",
    "datacite": "http://schema.datacite.org/oai/oai-1.0/",
    "datacite3": "http://datacite.org/schema/kernel-3",
}


class Zenodo:
    """Iterator that converts records pulled from an OAI client into Datasets.

    Each call to ``next()`` pulls raw XML records from the wrapped client
    until one can be converted. Records containing an ``oai:error`` element
    are skipped, and records whose conversion raises ``TypeError`` are logged
    and skipped.
    """

    def __init__(self, client: OAIClient) -> None:
        self.client = client

    def __iter__(self) -> Iterator[Dataset]:
        return self

    def __next__(self) -> Dataset:
        while True:
            # Presumably the client raises StopIteration when it is exhausted,
            # which propagates from here and ends iteration.
            record = next(self.client)
            parsed_record = ET.fromstring(record)
            if parsed_record.find(".//oai:error", namespace) is not None:
                continue
            try:
                return create_from_zenodo_datacite_xml(record, self.client)
            except TypeError as ex:
                # Always bind rec_id before logging so a record without an
                # identifier element is still reported instead of failing
                # (or being dropped silently) inside the error handler.
                id_elem = parsed_record.find(".//oai:identifier", namespace)
                rec_id = (
                    id_elem.text if id_elem is not None else "<unknown identifier>"
                )
                logger.info(f"Error with {rec_id}: {str(ex)}")


def create_from_zenodo_datacite_xml(data: str, client: OAIClient) -> Dataset:
    """Build a Dataset from one Zenodo OAI record serialized as DataCite XML.

    Args:
        data: XML text of a single record.
        client: Kept for interface compatibility; not used by this function.

    Returns:
        A Dataset constructed from the keyword arguments gathered below.

    Raises:
        xml.etree.ElementTree.ParseError: If ``data`` is not well-formed XML.
    """
    kwargs: Dict[str, Any] = {}
    record = ET.fromstring(data)

    # how to deal with multiple titles? For now the last one found wins.
    for title in record.findall(".//datacite3:title", namespace):
        kwargs["title"] = title.text

    for creator in record.findall(".//datacite3:creator", namespace):
        kwargs.setdefault("authors", []).append(
            Author(
                authorName=get_child_elem_text(creator, "creatorName"),
                authorAffiliation=get_child_elem_text(creator, "affiliation"),
                authorIdentifier=get_child_elem_text(creator, "nameIdentifier"),
            )
        )

    # Placeholder contact values carried over unchanged.
    kwargs["contacts"] = [
        Contact(datasetContactName="!!!!!!", datasetContactEmail="!!!!!!",)
    ]

    # Only descriptions typed as "Abstract" with non-empty text are kept.
    for description in record.findall(".//datacite3:description", namespace):
        if description.attrib.get("descriptionType") != "Abstract":
            continue
        if description.text is not None:
            kwargs.setdefault("description", []).append(
                Description(dsDescriptionValue=description.text)
            )

    for subject in record.findall(".//datacite3:subject", namespace):
        kwargs.setdefault("keywords", []).append(Keyword(keywordValue=subject.text))

    # Use "Issued" dates, and only when they are full YYYY-MM-DD values.
    for date in record.findall(".//datacite3:date", namespace):
        if date.attrib.get("dateType") != "Issued" or date.text is None:
            continue
        try:
            datetime.strptime(date.text, "%Y-%m-%d")
        except ValueError:
            # Not a full calendar date; leave distributionDate unset.
            continue
        kwargs["distributionDate"] = date.text

    identifier = record.find(".//datacite3:identifier", namespace)
    if identifier is not None:
        kwargs["alternativeURL"] = identifier.text

    for alternate_id in record.findall(".//datacite3:alternateIdentifier", namespace):
        kwargs.setdefault("otherIds", []).append(
            OtherId(otherIdValue=alternate_id.text)
        )

    for resource_type in record.findall(".//datacite3:resourceType", namespace):
        if resource_type.text is not None:
            kwargs.setdefault("kindOfData", []).append(resource_type.text)
        # .get() avoids a KeyError when the attribute is absent.
        resource_type_general = resource_type.attrib.get("resourceTypeGeneral")
        if resource_type_general is not None:
            kwargs.setdefault("kindOfData", []).append(resource_type_general)

    for contributor in record.findall(".//datacite3:contributor", namespace):
        # NOTE(review): DataCite kernel-3 appears to carry contributorType as
        # an attribute of <contributor> rather than a child element, in which
        # case this lookup always yields None -- confirm before changing.
        kwargs.setdefault("contributors", []).append(
            Contributor(
                contributorName=get_child_elem_text(contributor, "contributorName"),
                contributorType=get_child_elem_text(contributor, "contributorType"),
            )
        )

    for publisher in record.findall(".//datacite3:publisher", namespace):
        kwargs.setdefault("producers", []).append(Producer(producerName=publisher.text))

    # Only "IsSupplementTo" relations are recorded as publications.
    for related_id in record.findall(".//datacite3:relatedIdentifier", namespace):
        if related_id.attrib.get("relationType") != "IsSupplementTo":
            continue
        kwargs.setdefault("publications", []).append(
            Publication(
                publicationIDNumber=related_id.text,
                publicationIDType=related_id.attrib.get("relatedIdentifierType"),
            )
        )

    for language in record.findall(".//datacite3:language", namespace):
        if language.text is None:
            continue
        # The first two characters are looked up as an ISO 639-1 code.
        # An unsuccessful lookup is expected to return None (not ""), so
        # check for None before reading .name.
        lang_value = pycountry.languages.get(alpha_2=language.text[:2])
        if lang_value is not None:
            kwargs.setdefault("language", []).append(lang_value.name)

    # workaround until we figure out parsing: join every rights statement
    # into one string used for both license and termsOfUse.
    all_rights = ". ".join(
        rights.text
        for rights in record.findall(".//datacite3:rights", namespace)
        if rights.text is not None
    )
    if all_rights != "":
        kwargs["license"] = all_rights
        kwargs["termsOfUse"] = all_rights

    # Placeholder subject value carried over unchanged.
    kwargs["subjects"] = ["!!!!!!"]

    # Fall back to the title when there is no abstract. If the record has no
    # title either, leave both unset instead of raising KeyError here;
    # presumably Dataset(**kwargs) then reports the missing fields itself.
    if "description" not in kwargs and "title" in kwargs:
        kwargs["description"] = [Description(dsDescriptionValue=kwargs["title"])]
    return Dataset(**kwargs)


def get_child_elem_text(child_elem, child_elem_name):
    """Return the text of a ``datacite3`` child of ``child_elem``, or None.

    ``child_elem`` is the element searched; ``child_elem_name`` is the
    un-prefixed tag name to look for. None is returned both when no matching
    child exists and when the match has no text.
    """
    match = child_elem.find(f"datacite3:{child_elem_name}", namespace)
    return None if match is None else match.text
43 changes: 39 additions & 4 deletions tests/conftest.py
Expand Up @@ -110,9 +110,6 @@ def whoas_oai_server(requests_mock, shared_datadir, request):
"oai:darchive.mblwhoilibrary.org:1912/2372": (
shared_datadir / "whoas/GetRecord_06.xml"
).read_text(),
"oai:darchive.mblwhoilibrary.org:1912/2373": (
shared_datadir / "whoas/GetRecord_07.xml"
).read_text(),
}
requests_mock.get(
f"{url}?verb=ListIdentifiers",
Expand All @@ -127,7 +124,45 @@ def whoas_oai_server(requests_mock, shared_datadir, request):
records["oai:darchive.mblwhoilibrary.org:1912/2370"],
records["oai:darchive.mblwhoilibrary.org:1912/2371"],
records["oai:darchive.mblwhoilibrary.org:1912/2372"],
records["oai:darchive.mblwhoilibrary.org:1912/2373"],
]


@pytest.fixture
def zenodo_oai_server(requests_mock, shared_datadir, request):
    """Mock a Zenodo OAI endpoint and return the record bodies it serves.

    Registers a ListIdentifiers response and one response per record
    identifier on ``requests_mock``, reading every body from the ``zenodo``
    directory under ``shared_datadir``. Returns the six record bodies in
    identifier order.
    """
    url = "http+mock://example.com/oai"
    fixture_files = (
        ("oai:zenodo.org:807748", "zenodo/GetRecord_01.xml"),
        ("oai:zenodo.org:807749", "zenodo/GetRecord_02.xml"),
        ("oai:zenodo.org:807750", "zenodo/GetRecord_03.xml"),
        ("oai:zenodo.org:807751", "zenodo/GetRecord_04.xml"),
        ("oai:zenodo.org:807752", "zenodo/GetRecord_05.xml"),
        ("oai:zenodo.org:807753", "zenodo/GetRecord_06.xml"),
    )
    records = {
        identifier: (shared_datadir / path).read_text()
        for identifier, path in fixture_files
    }
    requests_mock.get(
        f"{url}?verb=ListIdentifiers",
        text=(shared_datadir / "zenodo/ListRecords.xml").read_text(),
    )
    for identifier, body in records.items():
        requests_mock.get(f"{url}?identifier={identifier}", text=body)
    # Dict insertion order matches fixture_files, so this is identifier order.
    return list(records.values())


Expand Down
50 changes: 0 additions & 50 deletions tests/data/whoas/GetRecord_07.xml

This file was deleted.

24 changes: 24 additions & 0 deletions tests/data/whoas/ListRecords.xml
Expand Up @@ -28,5 +28,29 @@
<setSpec>com_1912_4</setSpec>
<setSpec>col_1912_2364</setSpec>
</header>
<header>
<identifier>oai:darchive.mblwhoilibrary.org:1912/2370</identifier>
<datestamp>2016-09-26T17:42:49Z</datestamp>
<setSpec>com_1912_1726</setSpec>
<setSpec>com_1912_1725</setSpec>
<setSpec>com_1912_4</setSpec>
<setSpec>col_1912_2364</setSpec>
</header>
<header>
<identifier>oai:darchive.mblwhoilibrary.org:1912/2371</identifier>
<datestamp>2016-09-26T17:42:49Z</datestamp>
<setSpec>com_1912_1726</setSpec>
<setSpec>com_1912_1725</setSpec>
<setSpec>com_1912_4</setSpec>
<setSpec>col_1912_2364</setSpec>
</header>
<header>
<identifier>oai:darchive.mblwhoilibrary.org:1912/2372</identifier>
<datestamp>2016-09-26T17:42:49Z</datestamp>
<setSpec>com_1912_1726</setSpec>
<setSpec>com_1912_1725</setSpec>
<setSpec>com_1912_4</setSpec>
<setSpec>col_1912_2364</setSpec>
</header>
</ListIdentifiers>
</OAI-PMH>

0 comments on commit f86c4df

Please sign in to comment.