Skip to content

Commit

Permalink
Re-map MARC portfolio items to holdings and links
Browse files Browse the repository at this point in the history
Why these changes are being introduced:
Our initial mapping for e-portfolio items (data enrichment field 986)
didn't include all the possible subfields where the URL could exist. In
addition, we weren't adding these to links, only holdings, which was
potentially confusing since they are links to the items.

How this addresses that need:
* Adds a new static method in the Marc class to get a single subfield
  value string if present in the datafield. This is clearer than using
  the method intended to retrieve multiple values in cases like this,
  where there will never be more than one value.
* Refactors the 986 field mapping to use the new static method to
  retrieve a single value from each single non-repeatable subfield.
* Adds additional subfield mappings for location value to the holdings
  mapping from the MARC 986 field.
* If an electronic holding has a location value, also add it as a Link
  (location value must be present because URL is required in the Link
  field).
* Adds/updates tests and fixtures to reflect changes.

Side effects of this change:
Some electronic item URLs will now appear in both the links and
holdings fields. We decided this was ok for now.

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/TIMX-184
  • Loading branch information
hakbailey committed Feb 16, 2023
1 parent 5202fcf commit 2e70a98
Show file tree
Hide file tree
Showing 4 changed files with 175 additions and 33 deletions.
7 changes: 5 additions & 2 deletions tests/fixtures/marc/marc_record_all_fields.xml
Original file line number Diff line number Diff line change
Expand Up @@ -465,8 +465,11 @@
<datafield tag="986" ind1=" " ind2=" ">
<subfield code="k">Alexander Street Press Parent Record</subfield>
<subfield code="j">Music Online: Classical Music Library - United States</subfield>
<subfield code="j">Music Online: Classical Music Library</subfield>
<subfield code="f">http://BLCMIT.NaxosMusicLibrary.com/catalogue/item.asp?cid=19029653</subfield>
<subfield code="l">http://BLCMIT.NaxosMusicLibrary.com/catalogue/item.asp?cid=19029653</subfield>
</datafield>
<datafield tag="986" ind1=" " ind2=" ">
<subfield code="j">O'Reilly Online Learning</subfield>
<subfield code="d">http://link-resolver-url</subfield>
</datafield>
</record>
</collection>
Original file line number Diff line number Diff line change
Expand Up @@ -858,19 +858,38 @@
<subfield code="bb">bb</subfield>
<subfield code="i">HUM</subfield>
</datafield>
<datafield tag="986" ind1=" " ind2=" ">
<datafield tag="986">
<subfield></subfield>
<subfield code=""></subfield>
<subfield code="d"></subfield>
<subfield code="f"></subfield>
<subfield code="i"></subfield>
<subfield code="j"></subfield>
<subfield code="l"></subfield>
</datafield>
<datafield tag="986" ind1=" " ind2=" ">
<subfield></subfield>
<subfield code=""></subfield>
<subfield code="f">f</subfield>
<subfield code="i">i</subfield>
<subfield code="j">j</subfield>
<datafield tag="986">
<subfield code="d">only subfield d</subfield>
</datafield>
<datafield tag="986">
<subfield code="f">only subfield f</subfield>
</datafield>
<datafield tag="986">
<subfield code="i">only subfield i</subfield>
</datafield>
<datafield tag="986">
<subfield code="j">only subfield j</subfield>
</datafield>
<datafield tag="986">
<subfield code="l">only subfield l</subfield>
</datafield>
<datafield tag="986">
<subfield code="d">d: should not be used</subfield>
<subfield code="f">f: d and l present</subfield>
<subfield code="l">l: should not be used</subfield>
</datafield>
<datafield tag="986">
<subfield code="d">d: should not be used</subfield>
<subfield code="l">l: d present</subfield>
</datafield>
</record>
</collection>
103 changes: 93 additions & 10 deletions tests/test_marc.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import logging

from bs4 import BeautifulSoup

import transmogrifier.models as timdex
from transmogrifier.helpers import parse_xml_records
from transmogrifier.sources.marc import Marc
Expand Down Expand Up @@ -175,14 +177,15 @@ def test_marc_record_all_fields_transform_correctly():
note="Available from 06/01/2001 volume: 1 issue: 1.",
),
timdex.Holding(
collection=(
"Music Online: Classical Music Library - United States, Music "
"Online: Classical Music Library"
),
collection="Music Online: Classical Music Library - United States",
format="electronic resource",
location=(
"http://BLCMIT.NaxosMusicLibrary.com/catalogue/item.asp?cid=19029653"
),
location="http://BLCMIT.NaxosMusicLibrary.com/catalogue/item.asp?"
"cid=19029653",
),
timdex.Holding(
collection="O'Reilly Online Learning",
format="electronic resource",
location="http://link-resolver-url",
),
],
identifiers=[
Expand Down Expand Up @@ -212,6 +215,23 @@ def test_marc_record_all_fields_transform_correctly():
"Sung in French",
],
links=[
timdex.Link(
kind="Digital object URL",
text="HeinOnline U.S. Congressional Documents Library",
url="http://BLCMIT.NaxosMusicLibrary.com/catalogue/item.asp?"
"cid=ACC24383",
),
timdex.Link(
kind="Digital object URL",
text="Music Online: Classical Music Library - United States",
url="http://BLCMIT.NaxosMusicLibrary.com/catalogue/item.asp?"
"cid=19029653",
),
timdex.Link(
kind="Digital object URL",
text="O'Reilly Online Learning",
url="http://link-resolver-url",
),
timdex.Link(
url="http://catalog.hathitrust.org/api/volumes/oclc/1606890.html",
kind="Hathi Trust",
Expand Down Expand Up @@ -534,10 +554,32 @@ def test_marc_record_attribute_and_subfield_variations_transforms_correctly():
note="g",
),
timdex.Holding(
collection="j",
format="electronic resource",
location="f",
note="i",
location="only subfield d",
),
timdex.Holding(
format="electronic resource",
location="only subfield f",
),
timdex.Holding(
format="electronic resource",
note="only subfield i",
),
timdex.Holding(
format="electronic resource",
collection="only subfield j",
),
timdex.Holding(
format="electronic resource",
location="only subfield l",
),
timdex.Holding(
format="electronic resource",
location="f: d and l present",
),
timdex.Holding(
format="electronic resource",
location="l: d present",
),
],
identifiers=[
Expand All @@ -558,6 +600,26 @@ def test_marc_record_attribute_and_subfield_variations_transforms_correctly():
"Aljamía",
],
links=[
timdex.Link(
kind="Digital object URL",
url="only subfield d",
),
timdex.Link(
kind="Digital object URL",
url="only subfield f",
),
timdex.Link(
kind="Digital object URL",
url="only subfield l",
),
timdex.Link(
kind="Digital object URL",
url="f: d and l present",
),
timdex.Link(
kind="Digital object URL",
url="l: d present",
),
timdex.Link(url="u", kind="3", restrictions="z", text="y"),
timdex.Link(url="u", kind="3", restrictions="z", text="y"),
],
Expand Down Expand Up @@ -750,6 +812,27 @@ def test_create_subfield_value_string_from_datafield_with_blank_values():
assert Marc.create_subfield_value_string_from_datafield(datafield, "ad") == ""


def test_get_single_subfield_string_returns_expected_string():
    """A populated subfield yields its text with surrounding whitespace stripped."""
    # The fixture must be well-formed XML: the closing tag is </datafield>.
    datafield = BeautifulSoup(
        '<datafield><subfield code="found"> the string </subfield></datafield>', "xml"
    )
    assert Marc.get_single_subfield_string(datafield, "found") == "the string"


def test_get_single_subfield_string_returns_none_if_no_string():
    """A subfield that exists but has no text content yields None."""
    # The fixture must be well-formed XML: the closing tag is </datafield>.
    datafield = BeautifulSoup(
        '<datafield><subfield code="empty"></subfield></datafield>', "xml"
    )
    assert Marc.get_single_subfield_string(datafield, "empty") is None


def test_get_single_subfield_string_returns_none_if_whitespace_string():
    """A subfield whose text is only whitespace yields None."""
    datafield = BeautifulSoup(
        '<datafield><subfield code="whitespace"> </subfield></datafield>', "xml"
    )
    # Look up the code that is actually present in the fixture; querying a code
    # that does not exist would return None without ever reaching the
    # whitespace-stripping logic this test is meant to cover.
    assert Marc.get_single_subfield_string(datafield, "whitespace") is None


def test_json_crosswalk_code_to_name_returns_none_if_invalid(
caplog, marc_content_type_crosswalk
):
Expand Down
65 changes: 51 additions & 14 deletions transmogrifier/sources/marc.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ def get_optional_fields(self, xml: Tag) -> Optional[dict]:
# funding_information

# holdings
# physical items
for datafield in xml.find_all("datafield", tag="985"):
holding_call_number_value = (
self.create_subfield_value_string_from_datafield(datafield, ["bb"])
Expand Down Expand Up @@ -271,23 +272,37 @@ def get_optional_fields(self, xml: Tag) -> Optional[dict]:
note=holding_note_value or None,
)
)
for datafield in xml.find_all("datafield", tag="986"):
holding_collection_value = self.create_subfield_value_string_from_datafield(
datafield, "j", ", "
# electronic portfolio items
for field_986 in xml.find_all("datafield", tag="986"):
electronic_item_collection = self.get_single_subfield_string(field_986, "j")
electronic_item_location = (
self.get_single_subfield_string(field_986, "f")
or self.get_single_subfield_string(field_986, "l")
or self.get_single_subfield_string(field_986, "d")
)
holding_location_value = self.create_subfield_value_string_from_datafield(
datafield, "f", ", "
)
holding_note_value = self.create_subfield_value_string_from_datafield(
datafield, "i", ", "
)
if holding_collection_value or holding_location_value or holding_note_value:
electronic_item_note = self.get_single_subfield_string(field_986, "i")
if any(
[
electronic_item_collection,
electronic_item_location,
electronic_item_note,
]
):
fields.setdefault("holdings", []).append(
timdex.Holding(
collection=holding_collection_value or None,
collection=electronic_item_collection,
format="electronic resource",
location=holding_location_value or None,
note=holding_note_value or None,
location=electronic_item_location,
note=electronic_item_note,
)
)
# If there's a URL, add to links field as well
if electronic_item_location:
fields.setdefault("links", []).append(
timdex.Link(
url=electronic_item_location,
kind="Digital object URL",
text=electronic_item_collection,
)
)

Expand Down Expand Up @@ -363,7 +378,7 @@ def get_optional_fields(self, xml: Tag) -> Optional[dict]:

fields["languages"] = list(dict.fromkeys(languages)) or None

# links
# links - see also: holdings field for electronic portfolio items
# If indicator 1 is 4 and indicator 2 is 0 or 1, take the URL from subfield u,
# the kind from subfield 3, link text from subfield y, and restrictions from
# subfield z.
Expand Down Expand Up @@ -718,6 +733,28 @@ def create_subfield_value_string_from_datafield(
Marc.create_subfield_value_list_from_datafield(xml_element, subfield_codes)
)

@staticmethod
def get_single_subfield_string(
    xml_element: Tag, subfield_code: str
) -> Optional[str]:
    """
    Return the stripped text of the first matching <subfield>, if it has any.

    Searches xml_element with bs4's find() for a <subfield> element whose code
    attribute equals subfield_code and which contains a string. Only the first
    such element is considered. None is returned when no such element is found,
    or when the element's string is empty once surrounding whitespace is removed.

    Args:
        xml_element: A BeautifulSoup Tag representing a single MARC XML element.
        subfield_code: The code attribute of the subfield to extract.
    """
    first_match = xml_element.find("subfield", code=subfield_code, string=True)
    if not first_match:
        return None
    # A whitespace-only value strips down to "", which is reported as None.
    stripped_value = str(first_match.string).strip()
    return stripped_value if stripped_value else None

@staticmethod
def json_crosswalk_code_to_name(
code: str, crosswalk: dict, record_id: str, field_name: str
Expand Down

0 comments on commit 2e70a98

Please sign in to comment.