Skip to content

Commit

Permalink
Add method option to harvest command
Browse files Browse the repository at this point in the history
Why these changes are being introduced:
We would generally prefer to use the Sickle library's built-in
ListRecords command to harvest records, but it doesn't work correctly
for full harvests from ArchivesSpace. The best solution for now is to
add the list method as an option, set as the default, with the current
list-identifiers-then-get-each-record (get method) still available as an
option when needed.

How this addresses that need:
* Adds `list_records` method to the OAIClient, along with a
  `retrieve_records` method that handles calling the internal logic of
  calling the correct internal method sequence based on the passed
  `method` argument.
* Adds a `method` option to the harvest command with "get" and "list"
  options and "list" as the default.
* Updates the OAIClient `get_identifiers` and `get_records` methods for
  clarity, efficienty, better error handling/logging, and to use
  Sickle's built-in deleted records handling.
* Adds tests for new functionality, updates some existing tests and one
  cassette fixture to reflect changes.
* Also adds an optional env variable to change the interval for logging
  status updates during a harvest.

Side effects of this change:
* External tools using this app will need to specify the get method when
  harvesting from ArchivesSpace.
* The harvest command no longer logs the number of records to retrieve
  prior to harvesting those records, because 1) this is not possible at
  all using the list method and 2) when using the get method, it adds a
  memory inefficiency (storing all the identifiers in a list in memory
  instead of retrieving them on-demand via the generator function) that
  can be signficant for large harvests.

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/RDI-176
* https://mitlibraries.atlassian.net/browse/RDI-177
* https://mitlibraries.atlassian.net/browse/RDI-181
  • Loading branch information
hakbailey committed Jul 8, 2022
1 parent f4df5d3 commit 7a91b16
Show file tree
Hide file tree
Showing 9 changed files with 4,868 additions and 3,536 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,4 +57,5 @@ If you have your credentials stored locally, you can omit the passed params like

## Required ENV
`SENTRY_DSN` = If set to a valid Sentry DSN, enables Sentry exception monitoring. This is not needed for local development.
`STATUS_UPDATE_INTERVAL` = The transform process logs the # of records transformed every nth record (1000 by default). Set this env variable to any integer to change the frequency of logging status updates. Can be useful for development/debugging.
`WORKSPACE` = Set to `dev` for local development, this will be set to `stage` and `prod` in those environments by Terraform.
32 changes: 21 additions & 11 deletions harvester/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,15 @@ def main(ctx, host, output_file, verbose):


@main.command()
@click.option(
"--method",
default="list",
show_default=True,
help="Record retrieval method to use. Default 'list' method is faster and should "
"be used in most cases; 'get' method should be used for ArchivesSpace due to "
"errors retrieving a full record set with the 'list' method.",
type=click.Choice(["get", "list"], case_sensitive=False),
)
@click.option(
"-m",
"--metadata-format",
Expand Down Expand Up @@ -78,36 +87,37 @@ def main(ctx, host, output_file, verbose):
is_flag=True,
)
@click.pass_context
def harvest(ctx, metadata_format, from_date, until_date, set_spec, exclude_deleted):
def harvest(
ctx, method, metadata_format, from_date, until_date, set_spec, exclude_deleted
):
"""Harvest records from an OAI-PMH compliant source and write to an output file."""
logger.info(
"OAI-PMH harvesting from source %s with parameters: metadata_format=%s, "
"from_date=%s, until_date=%s, set=%s, exclude_deleted=%s",
"OAI-PMH harvesting from source %s with parameters: method=%s, "
"metadata_format=%s, from_date=%s, until_date=%s, set=%s, exclude_deleted=%s",
ctx.obj["HOST"],
method,
metadata_format,
from_date,
until_date,
set_spec,
exclude_deleted,
)

oai_client = OAIClient(
ctx.obj["HOST"], metadata_format, from_date, until_date, set_spec
)
records = oai_client.retrieve_records(method, exclude_deleted=exclude_deleted)

logger.info("Writing records to output file: %s", ctx.obj["OUTPUT_FILE"])
try:
identifiers = oai_client.get_identifiers()
count = write_records(records, ctx.obj["OUTPUT_FILE"])
except NoRecordsMatch:
logger.error(
"No records harvested: the combination of the provided options results in "
"an empty list."
)
sys.exit()
logger.info(
"Number of records to harvest (including deleted records): %d",
len(identifiers),
)
records = oai_client.get_records(identifiers, exclude_deleted)
logger.info("Writing records to output file: %s", ctx.obj["OUTPUT_FILE"])
count = write_records(records, ctx.obj["OUTPUT_FILE"])

logger.info(
"Harvest completed. Total records harvested (%sincluding deleted records): %d",
"not " if exclude_deleted else "",
Expand Down
62 changes: 41 additions & 21 deletions harvester/oai.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
"""oai.py module."""
import json
import logging
from typing import Iterator, Optional
import os
from typing import Iterator, Literal, Optional

import smart_open
from sickle import Sickle
from sickle.models import Record
from sickle.oaiexceptions import IdDoesNotExist

from harvester.config import DEFAULT_RETRY_AFTER, MAX_RETRIES, RETRY_STATUS_CODES

Expand Down Expand Up @@ -49,43 +51,61 @@ def _set_params(
params["set"] = set_spec
self.params = params

def get_identifiers(self) -> list[str]:
responses = self.client.ListIdentifiers(**self.params)
return [record.identifier for record in responses]
def get_identifiers(self, exclude_deleted: bool) -> Iterator[str]:
responses = self.client.ListIdentifiers(
ignore_deleted=exclude_deleted, **self.params
)
for record in responses:
yield record.identifier

def get_records(
self, identifiers: list[str], exclude_deleted: bool
) -> Iterator[Record]:
def get_records(self, identifiers: Iterator[str]) -> Iterator[Record]:
for identifier in identifiers:
record = self.client.GetRecord(
identifier=identifier, metadataPrefix=self.metadata_format
)
logger.debug(
"Record retrieved:\n Deleted:%s\n Header:%s\n Raw:%s\n",
record.deleted,
record.header,
record.raw,
)
if exclude_deleted is True and record.deleted is True:
continue
try:
record = self.client.GetRecord(
identifier=identifier, metadataPrefix=self.metadata_format
)
logger.debug("Record retrieved: %s", identifier)
except IdDoesNotExist:
logger.warning(
"Identifier %s retrieved in identifiers list returned 'id does not "
"exist' during getRecord request",
identifier,
)
yield record

def get_sets(self):
responses = self.client.ListSets()
sets = [{"Set name": set.setName, "Set spec": set.setSpec} for set in responses]
return sets

def list_records(self, exclude_deleted: bool) -> Iterator[Record]:
return self.client.ListRecords(ignore_deleted=exclude_deleted, **self.params)

def retrieve_records(
self, method: Literal["get", "list"], exclude_deleted: bool
) -> Iterator[Record]:
if method == "get":
identifiers = self.get_identifiers(exclude_deleted)
return self.get_records(identifiers)
elif method == "list":
return self.list_records(exclude_deleted)
else:
raise ValueError(
'Method must be either "get" or "list", method provided was "{method}"'
)


def write_records(records: Iterator, filepath: str) -> int:
count = 0
with smart_open.open(filepath, "wb") as file:
file.write("<records>\n".encode())
file.write('<?xml version="1.0" encoding="UTF-8"?>\n<records>\n'.encode())
for record in records:
file.write(" ".encode() + record.raw.encode() + "\n".encode())
count += 1
if count % 1000 == 0:
if count % int(os.getenv("STATUS_UPDATE_INTERVAL", "1000")) == 0:
logger.info(
"Status update: %s records written to output file so far!", count
"Status update: %s records written to output file so far!",
count,
)
file.write("</records>".encode())
return count
Expand Down
Loading

0 comments on commit 7a91b16

Please sign in to comment.