Skip to content

Commit

Permalink
Merge pull request #257 from MITLibraries/GDT-269-ogm-incremental-use…
Browse files Browse the repository at this point in the history
…-rss-vs-api

GDT 269 - Incremental OGM harvests use Github RSS feed vs API
  • Loading branch information
ghukill committed Apr 4, 2024
2 parents 5a32497 + 5e6257a commit 0fd059d
Show file tree
Hide file tree
Showing 13 changed files with 411 additions and 353 deletions.
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ shapely = "*"
requests = "*"
pyaml = "*"
marcalyx = "*"
xmltodict = "*"

[dev-packages]
black = "*"
Expand Down
571 changes: 329 additions & 242 deletions Pipfile.lock

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ GEOHARVESTER_SQS_TOPIC_NAME=### default value for CLI argument --sqs-topic-name
OGM_CONFIG_FILEPATH=### optional location for OGM configuration YAML
OGM_CLONE_ROOT_URL=### optional base URL or filepath for where to clone OGM repositories from
OGM_CLONE_ROOT_DIR=### optional location for where cloned repositories are saved locally
GITHUB_API_TOKEN=### optional Github API token to avoid potential rate limiting for OGM incremental harvests
```

## CLI Commands
Expand Down
15 changes: 9 additions & 6 deletions docs/ogm_harvests.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,14 @@ sequenceDiagram
ogm_config_yaml-->>geo_harv: Detailed list of repositories, paths, <br> and formats
loop GitHub Repository
geo_harv->>ogm_repo: Clone repo
geo_harv->>ogm_repo: Get list of commits after X date
ogm_repo-->>geo_harv: Git commits
geo_harv->>geo_harv: Parse commits and determine metadata <br> files that were modified or deleted
geo_harv->>geo_harv: Filter to list of files based on <br> supported metadata formats from config
geo_harv->>s3_timdex: Write MIT aardvark
geo_harv->>ogm_repo: Get recent commits via RSS XML
alt Has commit on or after X date
geo_harv->>ogm_repo: Clone repo
ogm_repo-->>geo_harv: Repo files
geo_harv->>geo_harv: Get list of commits on or after X date
geo_harv->>geo_harv: Parse commits and determine metadata <br> files that were modified or deleted
geo_harv->>geo_harv: Filter to list of files based on <br> supported metadata formats from config
geo_harv->>s3_timdex: Write MIT aardvark
end
end
```
9 changes: 0 additions & 9 deletions harvester/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ class Config:
"OGM_CONFIG_FILEPATH",
"OGM_CLONE_ROOT_URL",
"OGM_CLONE_ROOT_DIR",
"GITHUB_API_TOKEN",
)

def check_required_env_vars(self) -> None:
Expand Down Expand Up @@ -59,14 +58,6 @@ def ogm_clone_root_url(self) -> str:
def ogm_clone_root_dir(self) -> str:
return os.getenv("OGM_CLONE_ROOT_DIR", "output/ogm")

@property
def github_api_base_url(self) -> str:
return "https://api.github.com/repos/OpenGeoMetadata" # pragma: no cover

@property
def github_api_token(self) -> str:
return self.GITHUB_API_TOKEN


def configure_logger(
logger: logging.Logger,
Expand Down
4 changes: 0 additions & 4 deletions harvester/harvest/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,3 @@ class OGMFilenameFilterMethodError(Exception):

class OGMFromDateExceedsEpochDateError(Exception):
"""Only dates after 1979-01-01 are supported for OGM incremental harvests."""


class GithubApiRateLimitExceededError(Exception):
"""Raised when Github rate limit exceeded."""
58 changes: 23 additions & 35 deletions harvester/harvest/ogm.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,18 @@
import shutil
import time
from collections.abc import Callable, Iterator
from http import HTTPStatus

import pygit2 # type: ignore[import-untyped]
import requests
import smart_open # type: ignore[import-untyped]
import xmltodict # type: ignore[import-untyped]
import yaml
from attrs import define, field

from harvester.config import Config
from harvester.harvest import Harvester
from harvester.harvest.exceptions import (
GithubApiRateLimitExceededError,
OGMFilenameFilterMethodError,
OGMFromDateExceedsEpochDateError,
)
Expand All @@ -38,9 +39,6 @@

CONFIG = Config()

HTTP_NOT_FOUND = 404
HTTP_FORBIDDEN = 403


@define
class OGMHarvester(Harvester):
Expand Down Expand Up @@ -242,10 +240,10 @@ def get_all_records(self) -> Iterator["OGMRecord"]:
def get_modified_records(self, from_date: str) -> Iterator["OGMRecord"]:
"""Get all modified files since a "from" date.
To avoid cloning a full repository only to discover no commits after the target
date, this method first makes an HTTP call to the Github API to retrieve a list of
recent commits. If no commits are found after the target date, it returns an
empty list immediately.
To avoid cloning a full repository only to discover no commits on or after the
target date, this method first retrieves a list of recent commits via a Github
Atom RSS endpoint for the repository. If no commits are found on or after the
target date an empty list is returned.
If commits are found, the repository is cloned so that the git history can be
interrogated locally for modified files that are within the harvesting scope of
Expand Down Expand Up @@ -286,39 +284,29 @@ def get_modified_records(self, from_date: str) -> Iterator["OGMRecord"]:
)

def _remote_repository_has_new_commits(self, target_date: str) -> bool:
"""Return boolean if remote repository has commits on or after "from" date.
If env var GITHUB_API_TOKEN is not set, perform unauthenticated requests which may
result in rate limiting.
"""
headers = {}
if CONFIG.github_api_token:
headers["Authorization"] = f"token {CONFIG.github_api_token}"
else:
message = "Github API token not set, may encounter rate limiting."
logger.warning(message)
response = requests.get(
f"{CONFIG.github_api_base_url}/{self.name}/commits",
headers=headers,
timeout=20,
"""Return boolean if remote repository has commits on or after "from" date."""
atom_rss_response = requests.get(
f"https://github.com/OpenGeoMetadata/{self.name}/commits.atom", timeout=10
)

# handle unknown repository
if response.status_code == HTTP_NOT_FOUND and response.reason == "Not Found":
if atom_rss_response.status_code == HTTPStatus.NOT_FOUND:
message = f"Repository not found: OpenGeoMetadata/{self.name}"
raise requests.HTTPError(message)
if atom_rss_response.status_code != HTTPStatus.OK:
message = (
f"HTTP Error retrieving commits RSS: OpenGeoMetadata/{self.name}, "
f"{atom_rss_response.status_code}"
)
raise requests.HTTPError(message)

# handle rate limit
if (
response.status_code == HTTP_FORBIDDEN
and response.reason == "rate limit exceeded"
):
message = "Retry later or set env var GITHUB_API_TOKEN."
raise GithubApiRateLimitExceededError(message)
rss = xmltodict.parse(atom_rss_response.content)
if "entry" not in rss["feed"]:
return False
commits = rss["feed"]["entry"]
if isinstance(commits, dict):
commits = [commits]

commits = response.json()
return any(
date_parser(commit["commit"]["committer"]["date"]) >= date_parser(target_date)
date_parser(commit["updated"]) >= date_parser(target_date)
for commit in commits
)

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ ignore_missing_imports = true
[tool.pytest.ini_options]
log_level = "INFO"
markers = [
"use_github_api: marker to bypass autoused fixture that bypasses Github api calls"
"use_github_rss: marker to bypass autoused fixture that make Github RSS calls"
]

[tool.ruff]
Expand Down
31 changes: 12 additions & 19 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,6 @@ def _test_env(monkeypatch):
monkeypatch.setenv("OGM_CONFIG_FILEPATH", "tests/fixtures/ogm/ogm_test_config.yaml")
monkeypatch.setenv("OGM_CLONE_ROOT_URL", "tests/fixtures/ogm/repositories")
monkeypatch.setenv("OGM_CLONE_ROOT_DIR", "output/ogm")
if "GITHUB_API_TOKEN" in os.environ:
monkeypatch.delenv("GITHUB_API_TOKEN")


@pytest.fixture
Expand Down Expand Up @@ -476,7 +474,7 @@ def ogm_config():

@pytest.fixture(autouse=True)
def mock_remote_repository_has_commits(request):
if "use_github_api" not in request.keywords:
if "use_github_rss" not in request.keywords:
with patch(
"harvester.harvest.ogm.OGMRepository._remote_repository_has_new_commits"
) as mocked_method:
Expand All @@ -487,34 +485,29 @@ def mock_remote_repository_has_commits(request):


@pytest.fixture
def mocked_github_api_url():
return "https://api.github.com/repos/OpenGeoMetadata/edu.earth/commits"
def mocked_github_commits_rss():
return "https://github.com/OpenGeoMetadata/edu.earth/commits.atom"


@pytest.fixture
def _mock_github_api_response_one_2010_commit(mocked_github_api_url):
response = [
{
"sha": "94d2c8d2d34b41381fa3c80712f235788f5a1cd8",
"commit": {"committer": {"date": "2010-01-01T00:00:00Z"}},
"message": "I am a commit.",
}
]
responses.add(responses.GET, mocked_github_api_url, json=response, status=200)
def _mock_github_rss_response_one_2010_commit(mocked_github_commits_rss):
with open("tests/fixtures/github_commits_rss/single_commit.xml", "rb") as f:
responses.add(responses.GET, mocked_github_commits_rss, body=f.read(), status=200)


@pytest.fixture
def _mock_github_api_response_zero_commits(mocked_github_api_url):
responses.add(responses.GET, mocked_github_api_url, json=[], status=200)
def _mock_github_rss_response_zero_commits(mocked_github_commits_rss):
with open("tests/fixtures/github_commits_rss/no_commits.xml", "rb") as f:
responses.add(responses.GET, mocked_github_commits_rss, body=f.read(), status=200)


@pytest.fixture
def _mock_github_api_response_404_not_found(mocked_github_api_url):
responses.add(responses.GET, mocked_github_api_url, status=404)
def _mock_github_rss_response_404_not_found(mocked_github_commits_rss):
responses.add(responses.GET, mocked_github_commits_rss, status=404)


@pytest.fixture
def _mock_github_api_response_403_rate_limit(mocked_github_api_url):
def _mock_github_api_response_403_rate_limit(mocked_github_commits_rss):
"""Generic mocked response to set custom .reason attribute on response"""

class MockResponse:
Expand Down
7 changes: 7 additions & 0 deletions tests/fixtures/github_commits_rss/no_commits.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom" xmlns:media="http://search.yahoo.com/mrss/"
xml:lang="en-US">
<id>tag:github.com,2008:/OpenGeoMetadata/edu.earth/commits/main</id>
<title>Recent Commits to edu.harvard:main</title>
<updated>2024-01-08T20:01:10Z</updated>
</feed>
13 changes: 13 additions & 0 deletions tests/fixtures/github_commits_rss/single_commit.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom" xmlns:media="http://search.yahoo.com/mrss/"
xml:lang="en-US">
<id>tag:github.com,2008:/OpenGeoMetadata/edu.earth/commits/main</id>
<title>Recent Commits to edu.harvard:main</title>
<updated>2024-01-08T20:01:10Z</updated>
<entry>
<id>tag:github.com,2008:Grit::Commit/f6e09517b333a4e7a3224b91945e7f2400ef9c92</id>
<title>Commit title</title>
<updated>2024-01-08T20:01:10Z</updated>
<content type="html">Commit notes here.</content>
</entry>
</feed>
10 changes: 0 additions & 10 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,3 @@ def test_config_env_var_access_error(config_instance):

def test_config_cdn_root(config_instance):
assert config_instance.http_cdn_root == "https://cdn.dev1.mitlibrary.net/geo"


def test_config_github_api_token_set_from_env_var(monkeypatch, config_instance):
fake_token = "abc123" # noqa: S105
monkeypatch.setenv("GITHUB_API_TOKEN", fake_token)
assert config_instance.github_api_token == fake_token


def test_config_github_api_token_none_from_no_env_var(monkeypatch, config_instance):
assert config_instance.github_api_token is None
42 changes: 16 additions & 26 deletions tests/test_harvest/test_ogm_harvester.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

from harvester.config import Config
from harvester.harvest.exceptions import (
GithubApiRateLimitExceededError,
OGMFilenameFilterMethodError,
OGMFromDateExceedsEpochDateError,
)
Expand Down Expand Up @@ -249,54 +248,45 @@ def test_ogm_harvester_no_modified_records_when_remote_repo_no_commits(
assert list(ogm_repository_earth.get_modified_records("2000-01-01")) == []


@pytest.mark.use_github_api
@pytest.mark.usefixtures("_mock_github_api_response_one_2010_commit")
@pytest.mark.use_github_rss
@pytest.mark.usefixtures("_mock_github_rss_response_one_2010_commit")
@responses.activate
def test_ogm_repository_check_remote_repo_commits_has_commits_success(
ogm_repository_earth,
):
assert ogm_repository_earth._remote_repository_has_new_commits("2005-01-01")


@pytest.mark.use_github_api
@pytest.mark.usefixtures("_mock_github_api_response_zero_commits")
@pytest.mark.use_github_rss
@pytest.mark.usefixtures("_mock_github_rss_response_zero_commits")
@responses.activate
def test_ogm_repository_check_remote_repo_commits_has_no_commits_success(
ogm_repository_earth,
):
assert not ogm_repository_earth._remote_repository_has_new_commits("2005-01-01")


@pytest.mark.use_github_api
@pytest.mark.usefixtures("_mock_github_api_response_404_not_found")
@pytest.mark.use_github_rss
@pytest.mark.usefixtures("_mock_github_rss_response_404_not_found")
@responses.activate
def test_ogm_repository_check_remote_repo_unknown_repository_raise_error(
ogm_repository_earth,
):
with pytest.raises(requests.HTTPError):
with pytest.raises(
requests.HTTPError,
match="Repository not found: OpenGeoMetadata/edu.earth",
):
ogm_repository_earth._remote_repository_has_new_commits("2005-01-01")


@pytest.mark.use_github_api
@pytest.mark.use_github_rss
@pytest.mark.usefixtures("_mock_github_api_response_403_rate_limit")
@responses.activate
def test_ogm_repository_check_remote_repo_api_rate_limit_exceeded_raises_error(
def test_ogm_repository_check_remote_repo_rate_limit_error(
ogm_repository_earth,
):
with pytest.raises(GithubApiRateLimitExceededError):
ogm_repository_earth._remote_repository_has_new_commits("2005-01-01")


@pytest.mark.use_github_api
@pytest.mark.usefixtures("_mock_github_api_response_one_2010_commit")
@responses.activate
def test_ogm_repository_check_remote_repo_github_token_set_as_auth_header_success(
monkeypatch, ogm_repository_earth
):
fake_github_token = "abc123" # noqa: S105
monkeypatch.setenv("GITHUB_API_TOKEN", fake_github_token)
with mock.patch("requests.get") as mocked_get_request:
with pytest.raises(
requests.HTTPError,
match="HTTP Error retrieving commits RSS: OpenGeoMetadata/edu.earth, 403",
):
ogm_repository_earth._remote_repository_has_new_commits("2005-01-01")
assert mocked_get_request.call_args[1]["headers"] == {
"Authorization": f"token {fake_github_token}"
}

0 comments on commit 0fd059d

Please sign in to comment.