Skip to content

Commit

Permalink
Rename xml > source_record
Browse files Browse the repository at this point in the history
  • Loading branch information
ehanson8 committed May 2, 2024
1 parent 7f742e2 commit cee6ffc
Showing 1 changed file with 43 additions and 30 deletions.
73 changes: 43 additions & 30 deletions transmogrifier/sources/xml/dspace_dim.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,23 +13,24 @@
class DspaceDim(XMLTransformer):
"""DSpace DIM transformer."""

def get_optional_fields(self, xml: Tag) -> dict | None:
def get_optional_fields(self, source_record: Tag) -> dict | None:
"""
Retrieve optional TIMDEX fields from a DSpace DIM XML record.
Overrides metaclass get_optional_fields() method.
Args:
xml: A BeautifulSoup Tag representing a single DSpace DIM XML record.
source_record: A BeautifulSoup Tag representing a single DSpace DIM XML
record.
"""
fields: dict = {}

source_record_id = self.get_source_record_id(xml)
source_record_id = self.get_source_record_id(source_record)

# alternate_titles
for alternate_title in [
t
for t in xml.find_all("dim:field", element="title")
for t in source_record.find_all("dim:field", element="title")
if "qualifier" in t.attrs and t.string
]:
fields.setdefault("alternate_titles", []).append(
Expand All @@ -39,29 +40,31 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
)
)
# If the record has more than one main title, add extras to alternate_titles
for index, title in enumerate(self.get_main_titles(xml)):
for index, title in enumerate(self.get_main_titles(source_record)):
if index > 0:
fields.setdefault("alternate_titles", []).append(
timdex.AlternateTitle(value=title)
)

# citation
citation = xml.find("dim:field", element="identifier", qualifier="citation")
citation = source_record.find(
"dim:field", element="identifier", qualifier="citation"
)
fields["citation"] = citation.string if citation and citation.string else None

# content_type
if content_types := self.get_content_types(xml):
if content_types := self.get_content_types(source_record):
if self.valid_content_types(content_types):
fields["content_type"] = content_types
else:
return None

# contents
fields["contents"] = self.get_contents(xml) or None
fields["contents"] = self.get_contents(source_record) or None

# contributors
for creator in [
c for c in xml.find_all("dim:field", element="creator") if c.string
c for c in source_record.find_all("dim:field", element="creator") if c.string
]:
fields.setdefault("contributors", []).append(
timdex.Contributor(
Expand All @@ -71,7 +74,9 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
)

for contributor in [
c for c in xml.find_all("dim:field", element="contributor") if c.string
c
for c in source_record.find_all("dim:field", element="contributor")
if c.string
]:
fields.setdefault("contributors", []).append(
timdex.Contributor(
Expand All @@ -81,12 +86,12 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
)

# dates
fields["dates"] = self.get_dates(xml, source_record_id) or None
fields["dates"] = self.get_dates(source_record, source_record_id) or None

# file_formats
fields["file_formats"] = [
f.string
for f in xml.find_all("dim:field", element="format")
for f in source_record.find_all("dim:field", element="format")
if f.get("qualifier") == "mimetype" and f.string
] or None

Expand All @@ -96,7 +101,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
# funding_information
for funding_reference in [
f
for f in xml.find_all(
for f in source_record.find_all(
"dim:field", element="description", qualifier="sponsorship"
)
if f.string
Expand All @@ -108,7 +113,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
)

# identifiers
identifiers = xml.find_all("dim:field", element="identifier")
identifiers = source_record.find_all("dim:field", element="identifier")
for identifier in [
i for i in identifiers if i.get("qualifier") != "citation" and i.string
]:
Expand All @@ -121,7 +126,9 @@ def get_optional_fields(self, xml: Tag) -> dict | None:

# language
fields["languages"] = [
la.string for la in xml.find_all("dim:field", element="language") if la.string
la.string
for la in source_record.find_all("dim:field", element="language")
if la.string
] or None

# links, uses identifiers list retrieved for identifiers field
Expand All @@ -139,12 +146,14 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
# locations
fields["locations"] = [
timdex.Location(value=lo.string)
for lo in xml.find_all("dim:field", element="coverage", qualifier="spatial")
for lo in source_record.find_all(
"dim:field", element="coverage", qualifier="spatial"
)
if lo.string
] or None

# notes
descriptions = xml.find_all("dim:field", element="description")
descriptions = source_record.find_all("dim:field", element="description")
for description in [
d
for d in descriptions
Expand All @@ -167,13 +176,13 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
# publishers
fields["publishers"] = [
timdex.Publisher(name=p.string)
for p in xml.find_all("dim:field", element="publisher")
for p in source_record.find_all("dim:field", element="publisher")
if p.string
] or None

# related_items
for related_item in [
r for r in xml.find_all("dim:field", element="relation") if r.string
r for r in source_record.find_all("dim:field", element="relation") if r.string
]:
if related_item.get("qualifier") == "uri":
ri = timdex.RelatedItem(
Expand All @@ -188,7 +197,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None:

# rights
for rights in [
r for r in xml.find_all("dim:field", element="rights") if r.string
r for r in source_record.find_all("dim:field", element="rights") if r.string
]:
if rights.get("qualifier") == "uri":
rg = timdex.Rights(uri=rights.string)
Expand All @@ -201,7 +210,7 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
# subjects
subjects_dict: dict[str, list[str]] = {}
for subject in [
s for s in xml.find_all("dim:field", element="subject") if s.string
s for s in source_record.find_all("dim:field", element="subject") if s.string
]:
if not subject.get("qualifier"):
subjects_dict.setdefault("Subject scheme not provided", []).append(
Expand Down Expand Up @@ -298,47 +307,51 @@ def _parse_date_range(
return None

@classmethod
def get_content_types(cls, xml: Tag) -> list[str] | None:
def get_content_types(cls, source_record: Tag) -> list[str] | None:
"""
Retrieve content types from a DSpace DIM XML record.
May be overridden by source subclasses that retrieve content type values
differently.
Args:
xml: A BeautifulSoup Tag representing a single DSpace DIM XML record.
source_record: A BeautifulSoup Tag representing a single DSpace DIM XML
record.
"""
return [
t.string for t in xml.find_all("dim:field", element="type", string=True)
t.string
for t in source_record.find_all("dim:field", element="type", string=True)
] or None

@classmethod
def get_main_titles(cls, xml: Tag) -> list[str]:
def get_main_titles(cls, source_record: Tag) -> list[str]:
"""
Retrieve main title(s) from a DSpace DIM XML record.
Overrides metaclass get_main_titles() method.
Args:
xml: A BeautifulSoup Tag representing a single DSpace DIM XML record.
source_record: A BeautifulSoup Tag representing a single DSpace DIM XML
record.
"""
return [
t.string
for t in xml.find_all("dim:field", element="title", string=True)
for t in source_record.find_all("dim:field", element="title", string=True)
if "qualifier" not in t.attrs
]

@classmethod
def get_source_record_id(cls, xml: Tag) -> str:
def get_source_record_id(cls, source_record: Tag) -> str:
"""
Get the source record ID from a DSpace DIM XML record.
Overrides metaclass get_source_record_id() method.
Args:
xml: A BeautifulSoup Tag representing a single DSpace DIM XML record.
source_record: A BeautifulSoup Tag representing a single DSpace DIM XML
record.
"""
return xml.header.identifier.string.split(":")[2]
return source_record.header.identifier.string.split(":")[2]

@classmethod
def valid_content_types(cls, _content_type_list: list[str]) -> bool:
Expand Down

0 comments on commit cee6ffc

Please sign in to comment.