Skip to content

Commit

Permalink
Update WHOAS and Zenodo content type filters
Browse files Browse the repository at this point in the history
Why these changes are being introduced:

Some adjustments need to be made to these content type filters per
stakeholder review after ingest with the initial filters. In addition to
updating the types we want to filter, stakeholders also requested that
WHOAS records with no content type in the source record be skipped
rather than ingested with the default value of "Not specified".

How this addresses that need:
* Updates Zenodo content type filter to select records based on valid
  content types instead of invalid ones, and adds to valid types per
  stakeholder request.
* Updates dspace_dim transform to include a get_content_types method
  since the method needs to be different for the base transform vs.
  WHOAS
* Adds some types to the WHOAS invalid content types list, including
  "no content type in source record", and overrides the
  get_content_types method in the Whoas class to set content type to that
  value if there are no content types provided in the source record.

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/RDI-243
* https://mitlibraries.atlassian.net/browse/RDI-246
  • Loading branch information
hakbailey committed Jul 29, 2022
1 parent 0a629ae commit 73a93e3
Show file tree
Hide file tree
Showing 8 changed files with 124 additions and 31 deletions.
8 changes: 6 additions & 2 deletions tests/fixtures/dspace/dspace_dim_record_all_fields.xml
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
<records>
<record xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<record xmlns="http://www.openarchives.org/OAI/2.0/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<header>
<identifier>oai:darchive.mblwhoilibrary.org:1912/2641</identifier>
<datestamp>2020-01-28T19:30:01Z</datestamp>
<setSpec>com_1912_3</setSpec>
<setSpec>col_1912_534</setSpec>
</header>
<metadata>
<dim:dim xmlns:dim="http://www.dspace.org/xmlns/dspace/dim" xmlns:doc="http://www.lyncode.com/xoai" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.dspace.org/xmlns/dspace/dim http://www.dspace.org/schema/dim.xsd">
<dim:dim xmlns:dim="http://www.dspace.org/xmlns/dspace/dim"
xmlns:doc="http://www.lyncode.com/xoai"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.dspace.org/xmlns/dspace/dim http://www.dspace.org/schema/dim.xsd">
<dim:field mdschema="dc" element="contributor" qualifier="author" authority="0a81af48-388e-49c4-936a-547560e4ad1c" confidence="600">LaFountain, James R.</dim:field>
<dim:field mdschema="dc" element="contributor" qualifier="author" authority="0a81af48-388e-49c4-936a-547560e4ad1c" confidence="600">Oldenbourg, Rudolf</dim:field>
<dim:field mdschema="dc" element="coverage" qualifier="spatial">Central equatorial Pacific Ocean</dim:field>
Expand Down Expand Up @@ -54,6 +57,7 @@ Movie01_LaFountainOldenbourg_MeiosisI_MPEG4.mov: 31105110 bytes, checksum: 5b08f
<dim:field mdschema="dc" element="title" lang="en">Time lapse movie of meiosis I in a living spermatocyte from the crane fly, Nephrotoma suturalis, viewed with polarized light microscopy</dim:field>
<dim:field mdschema="dc" element="title" qualifier="alternative" lang="en">An Alternative Title</dim:field>
<dim:field mdschema="dc" element="type" lang="en">Moving Image</dim:field>
<dim:field mdschema="dc" element="type" lang="en">Dataset</dim:field>
</dim:dim>
</metadata>
</record>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.dspace.org/xmlns/dspace/dim http://www.dspace.org/schema/dim.xsd">
<dim:field mdschema="dc" element="type" lang="en">Book</dim:field>
<dim:field mdschema="dc" element="type" lang="en">Book chapter</dim:field>
<dim:field mdschema="dc" element="type" lang="en">Text</dim:field>
</dim:dim>
</metadata>
</record>
Expand All @@ -32,7 +33,7 @@
<record xmlns="http://www.openarchives.org/OAI/2.0/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<header>
<identifier>oai:darchive.mblwhoilibrary.org:valid_content_type</identifier>
<identifier>oai:darchive.mblwhoilibrary.org:valid_content_types</identifier>
<datestamp>2020-01-28T19:30:01Z</datestamp>
<setSpec>com_1912_3</setSpec>
<setSpec>col_1912_534</setSpec>
Expand All @@ -42,6 +43,22 @@
xmlns:doc="http://www.lyncode.com/xoai"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.dspace.org/xmlns/dspace/dim http://www.dspace.org/schema/dim.xsd">
<dim:field mdschema="dc" element="type" lang="en">Moving Image</dim:field>
<dim:field mdschema="dc" element="type" lang="en">Dataset</dim:field>
</dim:dim>
</metadata>
</record>
<record xmlns="http://www.openarchives.org/OAI/2.0/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<header>
<identifier>oai:darchive.mblwhoilibrary.org:no_content_type</identifier>
<datestamp>2020-01-28T19:30:01Z</datestamp>
<setSpec>com_1912_3</setSpec>
<setSpec>col_1912_534</setSpec>
</header>
<metadata>
<dim:dim xmlns:dim="http://www.dspace.org/xmlns/dspace/dim"
xmlns:doc="http://www.lyncode.com/xoai"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.dspace.org/xmlns/dspace/dim http://www.dspace.org/schema/dim.xsd">
</dim:dim>
</metadata>
</record>
Expand Down
2 changes: 1 addition & 1 deletion tests/test_dspace_dim.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def test_dspace_dim_transform_with_all_fields_transforms_correctly():
alternate_titles=[
timdex.AlternateTitle(value="An Alternative Title", kind="alternative"),
],
content_type=["Moving Image"],
content_type=["Moving Image", "Dataset"],
contents=["Chapter 1"],
contributors=[
timdex.Contributor(
Expand Down
24 changes: 19 additions & 5 deletions tests/test_whoas.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,40 @@


def test_valid_content_types_with_all_invalid():
content_types = ["Book", "Thesis"]
content_types = [
"Article",
"authority list",
"book",
"book chapter",
"course",
"no content type in source record",
"other",
"preprint",
"presentation",
"Technical report",
"thesis",
"text",
"working paper",
]
assert Whoas.valid_content_types(content_types) is False


def test_valid_content_types_with_some_invalid():
content_types = ["Preprint", "Dataset"]
content_types = ["Preprint", "dataset"]
assert Whoas.valid_content_types(content_types) is True


def test_valid_content_types_with_all_valid():
content_types = ["Dataset", "Image"]
content_types = ["Dataset", "image"]
assert Whoas.valid_content_types(content_types) is True


def test_whoas_skips_records_with_only_invalid_content_types():
def test_whoas_skips_records_with_only_invalid_or_not_present_content_types():
input_records = list(
parse_xml_records(
"tests/fixtures/dspace/whoas_records_with_valid_and_invalid_content_types.xml"
)
)
assert len(input_records) == 3
assert len(input_records) == 4
output_records = Whoas("whoas", iter(input_records))
assert len(list(output_records)) == 2
17 changes: 15 additions & 2 deletions tests/test_zenodo.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def test_zenodo_create_source_record_id_generates_correct_id():


def test_valid_content_types_with_all_invalid():
content_types = ["lesson", "poster"]
content_types = ["journalarticle", "poster"]
assert Zenodo.valid_content_types(content_types) is False


Expand All @@ -21,7 +21,20 @@ def test_valid_content_types_with_some_invalid():


def test_valid_content_types_with_all_valid():
content_types = ["dataset", "image"]
content_types = [
"dataset",
"diagram",
"drawing",
"figure",
"image",
"other",
"photo",
"physicalobject",
"plot",
"software",
"taxonomictreatment",
"video",
]
assert Zenodo.valid_content_types(content_types) is True


Expand Down
23 changes: 18 additions & 5 deletions transmogrifier/sources/dspace_dim.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,9 @@ def get_optional_fields(self, xml: Tag) -> Optional[dict]:
fields["citation"] = citation.string if citation and citation.string else None

# content_type
if content_type_list := [
t.string for t in xml.find_all("dim:field", element="type") if t.string
]:
if self.valid_content_types(content_type_list):
fields["content_type"] = content_type_list
if content_types := self.get_content_types(xml):
if self.valid_content_types(content_types):
fields["content_type"] = content_types
else:
return None

Expand Down Expand Up @@ -251,6 +249,21 @@ def get_optional_fields(self, xml: Tag) -> Optional[dict]:

return fields

@classmethod
def get_content_types(cls, xml: Tag) -> Optional[list[str]]:
    """
    Collect the content type values of a DSpace DIM XML record.

    Source subclasses that need to retrieve content type values differently
    may override this method.

    Args:
        xml: A BeautifulSoup Tag representing a single DSpace DIM XML record.

    Returns:
        The strings of the matching "dim:field" elements with element="type",
        or None when no such element is found.
    """
    # string=True restricts the search to fields that carry a string value.
    type_fields = xml.find_all("dim:field", element="type", string=True)
    content_types = [field.string for field in type_fields]
    if not content_types:
        return None
    return content_types

@classmethod
def get_main_titles(cls, xml: Tag) -> list[Tag]:
"""
Expand Down
41 changes: 30 additions & 11 deletions transmogrifier/sources/whoas.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,41 @@
from bs4 import Tag

from transmogrifier.sources.dspace_dim import DspaceDim

INVALID_CONTENT_TYPES = [
"Article",
"Authority List",
"Book",
"Book chapter",
"Course",
"Preprint",
"Presentation",
"Technical Report",
"Thesis",
"Working Paper",
"article",
"authority list",
"book",
"book chapter",
"course",
"no content type in source record",
"other",
"preprint",
"presentation",
"technical report",
"thesis",
"text",
"working paper",
]


class Whoas(DspaceDim):
"""Whoas transformer class."""

@classmethod
def get_content_types(cls, xml: Tag) -> list[str]:
    """
    Retrieve content types from a DSpace DIM XML record.

    Overrides the base DspaceDim.get_content_types() method so that a record
    without any content type is given the placeholder value
    "no content type in source record" instead of None.

    Args:
        xml: A BeautifulSoup Tag representing a single DSpace DIM XML record.

    Returns:
        The content type values found in the record, or a single-item list
        holding the placeholder value when the record provides none.
    """
    # Reuse the base class lookup rather than repeating its query here; only
    # the fallback for records without a content type is WHOAS-specific.
    content_types = super().get_content_types(xml)
    return content_types or ["no content type in source record"]

@classmethod
def valid_content_types(cls, content_type_list: list[str]) -> bool:
"""
Expand All @@ -27,7 +46,7 @@ def valid_content_types(cls, content_type_list: list[str]) -> bool:
Args:
content_type_list: A list of content_type values.
"""
if all(item in INVALID_CONTENT_TYPES for item in content_type_list):
if all(item.lower() in INVALID_CONTENT_TYPES for item in content_type_list):
return False
else:
return True
21 changes: 17 additions & 4 deletions transmogrifier/sources/zenodo.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,20 @@

from transmogrifier.sources.datacite import Datacite

INVALID_CONTENT_TYPES = ["lesson", "poster", "presentation", "publication"]
# Content type values accepted by the Zenodo content type filter. Entries are
# lowercase because Zenodo.valid_content_types() lowercases each value before
# checking membership in this list.
VALID_CONTENT_TYPES = [
"dataset",
"diagram",
"drawing",
"figure",
"image",
"other",
"photo",
"physicalobject",
"plot",
"software",
"taxonomictreatment",
"video",
]


class Zenodo(Datacite):
Expand Down Expand Up @@ -33,7 +46,7 @@ def valid_content_types(cls, content_type_list: List[str]) -> bool:
Args:
content_type_list: A list of content_type values.
"""
if all(item in INVALID_CONTENT_TYPES for item in content_type_list):
return False
else:
if any(item.lower() in VALID_CONTENT_TYPES for item in content_type_list):
return True
else:
return False

0 comments on commit 73a93e3

Please sign in to comment.