Skip to content

Commit

Permalink
Merge pull request #551 from MITLibraries/TIMX-258-graceful-record-skip
Browse files Browse the repository at this point in the history
Skip errors during harvest and report to Sentry
  • Loading branch information
ghukill committed Nov 20, 2023
2 parents 052ae12 + 7948e35 commit 0b5c2ab
Show file tree
Hide file tree
Showing 12 changed files with 1,001 additions and 569 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -107,5 +107,6 @@ tmp/*
out.xml
.DS_Store
.vscode
.idea

output/
3 changes: 2 additions & 1 deletion Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ click = "*"
sentry-sdk = "*"
sickle = "*"
smart-open = {extras = ["s3"], version = "*"}
urllib3 = "==1.26.18"

[dev-packages]
bandit = "*"
Expand All @@ -17,7 +18,7 @@ flake8 = "*"
isort = "*"
mypy = "*"
pytest = "*"
vcrpy = "*"
vcrpy = "==4.2.1"

[requires]
python_version = "3.11"
Expand Down
1,130 changes: 572 additions & 558 deletions Pipfile.lock

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion harvester/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@

DEFAULT_RETRY_AFTER = 30
MAX_RETRIES = 10
RETRY_STATUS_CODES = (429, 500, 503)
RETRY_STATUS_CODES = [429, 500, 503]
MAX_ALLOWED_ERRORS = 10


def configure_logger(logger: logging.Logger, verbose: bool) -> str:
Expand Down
7 changes: 7 additions & 0 deletions harvester/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""exceptions.py module."""


class MaxAllowedErrorsReached(Exception):
"""Thrown when maximum numbers of errors reached during GetRecords harvest."""

pass
52 changes: 45 additions & 7 deletions harvester/oai.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,24 @@
"""oai.py module."""

import json
import logging
import os
from typing import Iterator, Literal, Optional
from typing import Any, Iterator, Literal, Optional

from requests import HTTPError
import smart_open
from sickle import Sickle
from sickle.models import Record
from sickle.oaiexceptions import IdDoesNotExist
from sickle.oaiexceptions import IdDoesNotExist, OAIError

from harvester.config import DEFAULT_RETRY_AFTER, MAX_RETRIES, RETRY_STATUS_CODES
from harvester.config import (
DEFAULT_RETRY_AFTER,
MAX_RETRIES,
RETRY_STATUS_CODES,
MAX_ALLOWED_ERRORS,
)
from harvester.exceptions import MaxAllowedErrorsReached
from harvester.utils import send_sentry_message

logger = logging.getLogger(__name__)

Expand All @@ -22,13 +31,15 @@ def __init__(
from_date: Optional[str] = None,
until_date: Optional[str] = None,
set_spec: Optional[str] = None,
max_retries: Optional[int] = MAX_RETRIES,
retry_status_codes: list[int] = RETRY_STATUS_CODES,
) -> None:
self.source_url = source_url
self.client = Sickle(
self.source_url,
default_retry_after=DEFAULT_RETRY_AFTER,
max_retries=MAX_RETRIES,
retry_status_codes=RETRY_STATUS_CODES,
max_retries=max_retries,
retry_status_codes=retry_status_codes,
)
self.metadata_format = metadata_format
self._set_params(metadata_format, from_date, until_date, set_spec)
Expand Down Expand Up @@ -59,9 +70,23 @@ def get_identifiers(self, exclude_deleted: bool) -> Iterator[str]:
yield record.identifier

def get_records(
self, identifiers: Iterator[str], skip_list: Optional[tuple[str]] = None
self,
identifiers: Iterator[str],
skip_list: Optional[tuple[str]] = None,
max_allowed_errors: int = MAX_ALLOWED_ERRORS,
) -> Iterator[Record]:
failed_records: list[tuple[str, Any | str | None]] = []
for identifier in identifiers:
if len(failed_records) == max_allowed_errors:
message = (
f"OAI harvest ABORTED, max errors reached: {max_allowed_errors}."
)
send_sentry_message(
message,
{"failed_records": failed_records},
)
raise MaxAllowedErrorsReached(message)

if skip_list and identifier in skip_list:
logger.warning(
"Skipped retrieving record with identifier %s because it is in the "
Expand All @@ -74,6 +99,12 @@ def get_records(
identifier=identifier, metadataPrefix=self.metadata_format
)
logger.debug("Record retrieved: %s", identifier)
except (HTTPError, OAIError) as e:
logger.warning(
"GetRecord error for identifier %s, reporting to Sentry", identifier
)
failed_records.append((identifier, getattr(e.request, "url", None)))
continue
except IdDoesNotExist:
logger.warning(
"Identifier %s retrieved in identifiers list returned 'id does not "
Expand All @@ -83,6 +114,13 @@ def get_records(
continue
yield record

if len(failed_records) > 0:
send_sentry_message(
f"OAI harvest COMPLETED, but with errors: {len(failed_records)} "
f"records skipped.",
{"failed_records": failed_records},
)

def get_sets(self):
responses = self.client.ListSets()
sets = [{"Set name": set.setName, "Set spec": set.setSpec} for set in responses]
Expand All @@ -104,7 +142,7 @@ def retrieve_records(
return self.list_records(exclude_deleted)
else:
raise ValueError(
'Method must be either "get" or "list", method provided was "{method}"'
f'Method must be either "get" or "list", method provided was "{method}"'
)


Expand Down
28 changes: 28 additions & 0 deletions harvester/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""utils.py module."""

import sentry_sdk


def send_sentry_message(
message: str,
scopes: dict | None = None,
level: str = "warning",
):
"""Send message directly to Sentry.
This allows both reporting information without raising an Exception, and optionally
including additional information in the sent Sentry message via "scopes".
Args:
message: Primary message string for Sentry message.
scopes: Dictionary of key/value pairs which become additional information in the
Sentry message.
level: String of [info,debug,warning,error] that will set the severity of the
Sentry message.
"""
with sentry_sdk.push_scope() as scope:
if scopes:
for scope_key, scope_value in scopes.items():
scope.set_extra(scope_key, scope_value)
send_receipt = sentry_sdk.capture_message(message, level=level)
return send_receipt
3 changes: 3 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,8 @@ ignore_missing_imports = True
[mypy-smart_open.*]
ignore_missing_imports = True

[mypy-requests.*]
ignore_missing_imports = True

[tool:pytest]
log_level = DEBUG
8 changes: 8 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import pytest
from click.testing import CliRunner
from unittest.mock import patch


@pytest.fixture(autouse=True)
Expand All @@ -14,3 +15,10 @@ def test_env():
def cli_runner():
runner = CliRunner()
return runner


@pytest.fixture
def mock_sentry_capture_message():
with patch("harvester.utils.sentry_sdk.capture_message") as mock_capture:
mock_capture.return_value = "message-sent-ok"
yield mock_capture
Loading

0 comments on commit 0b5c2ab

Please sign in to comment.