27 changes: 27 additions & 0 deletions lambdas/commands.py
@@ -46,6 +46,33 @@ def generate_extract_command(
)
extract_command.append(source.removeprefix("gis"))

elif source == "mitlibwebsite":
# defaults
extract_command.append("harvest")
extract_command.append("--include-fulltext")

# required
extract_command.append(
f"--config-yaml-file={input_data['btrix-config-yaml-file']}"
)
extract_command.append(
f"--metadata-output-file=s3://{timdex_bucket}/{extract_output_file}"
)

# optional
if sitemaps := input_data.get("btrix-sitemaps"):
extract_command.extend([f"--sitemap={sitemap}" for sitemap in sitemaps])
if run_type == "daily":
extract_command.append(
f"--sitemap-from-date={helpers.generate_harvest_from_date(run_date)}"
)
if sitemap_urls_out := input_data.get("btrix-sitemap-urls-output-file"):
extract_command.append(f"--sitemap-urls-output-file={sitemap_urls_out}")
if sitemap_urls_previous := input_data.get("btrix-previous-sitemap-urls-file"):
extract_command.append(
f"--previous-sitemap-urls-file={sitemap_urls_previous}"
)

else:
extract_command.append(f"--host={input_data['oai-pmh-host']}")
extract_command.append(
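For orientation, a minimal sketch of how this new branch is exercised, mirroring the tests added later in this diff; the bucket names and S3 paths are illustrative, and the "from lambdas import commands" import path is assumed from the repo layout:

from lambdas import commands  # import path assumed from the repo layout

input_data = {
    "run-date": "2022-01-02T12:13:14Z",
    "run-type": "full",
    "next-step": "extract",
    "source": "mitlibwebsite",
    "btrix-config-yaml-file": "s3://bucket/config.yaml",  # illustrative path
    "btrix-sitemaps": ["https://libraries.mit.edu/sitemap.xml"],
    "btrix-sitemap-urls-output-file": "s3://bucket/output.txt",
}

# Returns {"extract-command": [...]} beginning with "harvest"; the fully asserted
# output is in test_generate_extract_command_mitlibwebsite_full below.
result = commands.generate_extract_command(
    input_data, "2022-01-02", "test-timdex-bucket", False
)
assert result["extract-command"][0] == "harvest"

For daily runs the assembled command also carries --sitemap-from-date (derived from the run date via helpers.generate_harvest_from_date, e.g. 2022-01-01 for a 2022-01-02 run) and --previous-sitemap-urls-file when supplied, as the daily test below asserts.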
23 changes: 21 additions & 2 deletions lambdas/config.py
@@ -17,11 +17,30 @@ class Config:
     INDEX_ALIASES: ClassVar = {
         "geo": GIS_SOURCES,
         "rdi": ["jpal", "whoas", "zenodo"],
-        "timdex": ["alma", "aspace", "dspace", "libguides", "researchdatabases"],
-        "use": ["aspace", "dspace", *list(GIS_SOURCES), "libguides", "researchdatabases"],
+        "timdex": [
+            "alma",
+            "aspace",
+            "dspace",
+            "libguides",
+            "mitlibwebsite",
+            "researchdatabases",
+        ],
+        "use": [
+            "aspace",
+            "dspace",
+            *list(GIS_SOURCES),
+            "libguides",
+            "mitlibwebsite",
+            "researchdatabases",
+        ],
     }
     REQUIRED_FIELDS = ("next-step", "run-date", "run-type", "source")
     REQUIRED_OAI_HARVEST_FIELDS = ("oai-pmh-host", "oai-metadata-format")
+    REQUIRED_BTRIX_HARVEST_FIELDS = (
+        "btrix-config-yaml-file",
+        "btrix-sitemaps",
+        "btrix-sitemap-urls-output-file",
+    )
     VALID_DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%SZ")
     VALID_RUN_TYPES = ("full", "daily")
     VALID_STEPS = ("extract", "transform", "load")
2 changes: 2 additions & 0 deletions lambdas/format_input.py
@@ -36,6 +36,8 @@ def lambda_handler(event: dict, _context: dict) -> dict:
if next_step == "extract":
if source in CONFIG.GIS_SOURCES:
result["harvester-type"] = "geo"
elif source == "mitlibwebsite":
result["harvester-type"] = "browsertrix"
else:
result["harvester-type"] = "oai"
result["next-step"] = "transform"
40 changes: 30 additions & 10 deletions lambdas/helpers.py
@@ -48,19 +48,35 @@ def validate_input(input_data: dict) -> None:
         raise ValueError(message)
 
     # If next step is extract step, required harvest fields are present
-    # ruff: noqa: SIM102
     if input_data["next-step"] == "extract":
-        if input_data["source"] not in CONFIG.GIS_SOURCES:
-            if missing_harvest_fields := [
-                field
-                for field in CONFIG.REQUIRED_OAI_HARVEST_FIELDS
-                if field not in input_data
-            ]:
+        missing_harvest_fields = None
+        if input_data["source"] in CONFIG.GIS_SOURCES:
+            pass  # Currently no specific GeoHarvester requirements
+        elif input_data["source"] == "mitlibwebsite":
+            missing_harvest_fields = set(CONFIG.REQUIRED_BTRIX_HARVEST_FIELDS).difference(
+                set(input_data.keys())
+            )
+            # require previous sitemaps URLs argument for daily runs
+            if (
+                input_data["run-type"] == "daily"
+                and "btrix-previous-sitemap-urls-file" not in input_data
+            ):
                 message = (
-                    "Input must include all required harvest fields when starting "
-                    f"with harvest step. Missing fields: {missing_harvest_fields}"
+                    "Field 'btrix-previous-sitemap-urls-file' "
+                    "required when 'run-type=daily'"
                 )
                 raise ValueError(message)
+        else:
+            missing_harvest_fields = set(CONFIG.REQUIRED_OAI_HARVEST_FIELDS).difference(
+                set(input_data.keys())
+            )
+
+        if missing_harvest_fields:
+            message = (
+                "Input must include all required harvest fields when starting "
+                f"with harvest step. Missing fields: {list(missing_harvest_fields)}"
+            )
+            raise ValueError(message)
 
 
 def format_run_date(input_date: str) -> str:
@@ -109,7 +125,11 @@ def generate_step_output_filename(
     """
     sequence_suffix = f"_{sequence}" if sequence else ""
     if step == "extract":
-        file_type = "jsonl" if source in CONFIG.GIS_SOURCES else "xml"
+        file_type = (
+            "jsonl"
+            if (source in CONFIG.GIS_SOURCES or source == "mitlibwebsite")
+            else "xml"
+        )
     elif load_type == "delete":
         file_type = "txt"
     else:
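A hedged sketch of the new validation behavior for daily runs, matching the cases covered by the new tests in tests/test_helpers.py below (import path assumed from the repo layout):

import pytest

from lambdas import helpers  # import path assumed from the repo layout

daily_event = {
    "next-step": "extract",
    "run-date": "2022-01-02",
    "run-type": "daily",
    "source": "mitlibwebsite",
    "btrix-config-yaml-file": "s3://bucket/config.yaml",
    "btrix-sitemaps": ["https://example.com/sitemap.xml"],
    "btrix-sitemap-urls-output-file": "s3://bucket/output.txt",
    # "btrix-previous-sitemap-urls-file" intentionally omitted
}

# Daily runs without the previous sitemap-URLs file are rejected.
with pytest.raises(ValueError, match="required when 'run-type=daily'"):
    helpers.validate_input(daily_event)

# Supplying it satisfies the daily requirement and validate_input returns None.
daily_event["btrix-previous-sitemap-urls-file"] = "s3://bucket/previous.txt"
assert helpers.validate_input(daily_event) is None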
16 changes: 16 additions & 0 deletions tests/fixtures/event_payloads/mitlibwebsite-daily-extract.json
@@ -0,0 +1,16 @@
{
"next-step": "extract",
"run-date": "2025-10-14",
"run-type": "daily",
"source": "mitlibwebsite",
"verbose": "true",
"run-id": "abc123",
"btrix-config-yaml-file": "s3://timdex-bucket/mitlibwebsite/config/mitlibwebsite.yaml",
"btrix-sitemaps": [
"https://libraries.mit.edu/sitemap.xml",
"https://libraries.mit.edu/news/sitemap.xml"
],
"btrix-sitemap-from-date": "2025-10-13",
"btrix-sitemap-urls-output-file": "s3://timdex-bucket/mitlibwebsite/last-sitemaps-urls.txt",
"btrix-previous-sitemap-urls-file": "s3://timdex-bucket/mitlibwebsite/last-sitemaps-urls.txt"
}
14 changes: 14 additions & 0 deletions tests/fixtures/event_payloads/mitlibwebsite-full-extract.json
@@ -0,0 +1,14 @@
{
"next-step": "extract",
"run-date": "2025-10-14",
"run-type": "full",
"source": "mitlibwebsite",
"verbose": "true",
"run-id": "abc123",
"btrix-config-yaml-file": "s3://timdex-bucket/mitlibwebsite/config/mitlibwebsite.yaml",
"btrix-sitemaps": [
"https://libraries.mit.edu/sitemap.xml",
"https://libraries.mit.edu/news/sitemap.xml"
],
"btrix-sitemap-urls-output-file": "s3://timdex-bucket/mitlibwebsite/last-sitemaps-urls.txt"
}
57 changes: 57 additions & 0 deletions tests/test_commands.py
@@ -77,6 +77,63 @@ def test_generate_extract_command_geoharvester():
}


def test_generate_extract_command_mitlibwebsite_full():
input_data = {
"run-date": "2022-01-02T12:13:14Z",
"run-type": "full",
"next-step": "extract",
"source": "mitlibwebsite",
"btrix-config-yaml-file": "s3://bucket/config.yaml",
"btrix-sitemaps": [
"https://libraries.mit.edu/sitemap.xml",
"https://libraries.mit.edu/news/sitemap.xml",
],
"btrix-sitemap-urls-output-file": "s3://bucket/output.txt",
}
assert commands.generate_extract_command(
input_data, "2022-01-02", "test-timdex-bucket", False
) == {
"extract-command": [
"harvest",
"--include-fulltext",
"--config-yaml-file=s3://bucket/config.yaml",
"--metadata-output-file=s3://test-timdex-bucket/mitlibwebsite/"
"mitlibwebsite-2022-01-02-full-extracted-records-to-index.jsonl",
"--sitemap=https://libraries.mit.edu/sitemap.xml",
"--sitemap=https://libraries.mit.edu/news/sitemap.xml",
"--sitemap-urls-output-file=s3://bucket/output.txt",
]
}


def test_generate_extract_command_mitlibwebsite_daily():
input_data = {
"run-date": "2022-01-02T12:13:14Z",
"run-type": "daily",
"next-step": "extract",
"source": "mitlibwebsite",
"btrix-config-yaml-file": "s3://bucket/config.yaml",
"btrix-sitemaps": ["https://libraries.mit.edu/sitemap.xml"],
"btrix-sitemap-urls-output-file": "s3://bucket/output.txt",
"btrix-previous-sitemap-urls-file": "s3://bucket/previous.txt",
}
assert commands.generate_extract_command(
input_data, "2022-01-02", "test-timdex-bucket", False
) == {
"extract-command": [
"harvest",
"--include-fulltext",
"--config-yaml-file=s3://bucket/config.yaml",
"--metadata-output-file=s3://test-timdex-bucket/mitlibwebsite/"
"mitlibwebsite-2022-01-02-daily-extracted-records-to-index.jsonl",
"--sitemap=https://libraries.mit.edu/sitemap.xml",
"--sitemap-from-date=2022-01-01",
"--sitemap-urls-output-file=s3://bucket/output.txt",
"--previous-sitemap-urls-file=s3://bucket/previous.txt",
]
}


def test_generate_transform_commands_required_input_fields(run_id, run_timestamp):
input_data = {
"next-step": "transform",
73 changes: 73 additions & 0 deletions tests/test_format_input.py
@@ -34,6 +34,79 @@ def test_lambda_handler_with_next_step_extract():
}


def test_lambda_handler_with_next_step_extract_mitlibwebsite_full():
event = {
"run-date": "2022-01-02T12:13:14Z",
"run-type": "full",
"next-step": "extract",
"source": "mitlibwebsite",
"run-id": "run-abc-123",
"btrix-config-yaml-file": "s3://bucket/config.yaml",
"btrix-sitemaps": [
"https://libraries.mit.edu/sitemap.xml",
"https://libraries.mit.edu/news/sitemap.xml",
],
"btrix-sitemap-urls-output-file": "s3://bucket/output.txt",
}
output = format_input.lambda_handler(event, {})
assert output == {
"run-date": "2022-01-02",
"run-type": "full",
"source": "mitlibwebsite",
"verbose": False,
"harvester-type": "browsertrix",
"next-step": "transform",
"extract": {
"extract-command": [
"harvest",
"--include-fulltext",
"--config-yaml-file=s3://bucket/config.yaml",
"--metadata-output-file=s3://test-timdex-bucket/mitlibwebsite/"
"mitlibwebsite-2022-01-02-full-extracted-records-to-index.jsonl",
"--sitemap=https://libraries.mit.edu/sitemap.xml",
"--sitemap=https://libraries.mit.edu/news/sitemap.xml",
"--sitemap-urls-output-file=s3://bucket/output.txt",
]
},
}


def test_lambda_handler_with_next_step_extract_mitlibwebsite_daily():
event = {
"run-date": "2022-01-02T12:13:14Z",
"run-type": "daily",
"next-step": "extract",
"source": "mitlibwebsite",
"run-id": "run-abc-123",
"btrix-config-yaml-file": "s3://bucket/config.yaml",
"btrix-sitemaps": ["https://libraries.mit.edu/sitemap.xml"],
"btrix-sitemap-urls-output-file": "s3://bucket/output.txt",
"btrix-previous-sitemap-urls-file": "s3://bucket/previous.txt",
}
output = format_input.lambda_handler(event, {})
assert output == {
"run-date": "2022-01-02",
"run-type": "daily",
"source": "mitlibwebsite",
"verbose": False,
"harvester-type": "browsertrix",
"next-step": "transform",
"extract": {
"extract-command": [
"harvest",
"--include-fulltext",
"--config-yaml-file=s3://bucket/config.yaml",
"--metadata-output-file=s3://test-timdex-bucket/mitlibwebsite/"
"mitlibwebsite-2022-01-02-daily-extracted-records-to-index.jsonl",
"--sitemap=https://libraries.mit.edu/sitemap.xml",
"--sitemap-from-date=2022-01-01",
"--sitemap-urls-output-file=s3://bucket/output.txt",
"--previous-sitemap-urls-file=s3://bucket/previous.txt",
]
},
}


def test_lambda_handler_with_next_step_transform_files_present(s3_client, run_timestamp):
s3_client.put_object(
Bucket="test-timdex-bucket",
62 changes: 62 additions & 0 deletions tests/test_helpers.py
@@ -90,6 +90,68 @@ def test_validate_input_with_all_required_harvest_fields_returns_none():
assert helpers.validate_input(event) is None


def test_validate_input_mitlibwebsite_missing_harvest_fields_raises_error():
event = {
"next-step": "extract",
"run-date": "2022-01-02",
"run-type": "full",
"source": "mitlibwebsite",
"btrix-config-yaml-file": "s3://bucket/config.yaml",
"btrix-sitemaps": ["https://example.com/sitemap.xml"],
}
with pytest.raises(ValueError) as error:
helpers.validate_input(event)
assert "Input must include all required harvest fields when starting with " in str(
error.value
)
assert "'btrix-sitemap-urls-output-file'" in str(error.value)


def test_validate_input_mitlibwebsite_daily_missing_previous_urls_raises_error():
event = {
"next-step": "extract",
"run-date": "2022-01-02",
"run-type": "daily",
"source": "mitlibwebsite",
"btrix-config-yaml-file": "s3://bucket/config.yaml",
"btrix-sitemaps": ["https://example.com/sitemap.xml"],
"btrix-sitemap-urls-output-file": "s3://bucket/output.txt",
}
with pytest.raises(ValueError) as error:
helpers.validate_input(event)
assert (
"Field 'btrix-previous-sitemap-urls-file' required when 'run-type=daily'"
in str(error.value)
)


def test_validate_input_mitlibwebsite_full_without_previous_urls_returns_none():
event = {
"next-step": "extract",
"run-date": "2022-01-02",
"run-type": "full",
"source": "mitlibwebsite",
"btrix-config-yaml-file": "s3://bucket/config.yaml",
"btrix-sitemaps": ["https://example.com/sitemap.xml"],
"btrix-sitemap-urls-output-file": "s3://bucket/output.txt",
}
assert helpers.validate_input(event) is None


def test_validate_input_mitlibwebsite_daily_with_all_required_fields_returns_none():
event = {
"next-step": "extract",
"run-date": "2022-01-02",
"run-type": "daily",
"source": "mitlibwebsite",
"btrix-config-yaml-file": "s3://bucket/config.yaml",
"btrix-sitemaps": ["https://example.com/sitemap.xml"],
"btrix-sitemap-urls-output-file": "s3://bucket/output.txt",
"btrix-previous-sitemap-urls-file": "s3://bucket/previous.txt",
}
assert helpers.validate_input(event) is None


def test_format_run_date_valid_run_date_string():
assert helpers.format_run_date("2022-01-02T12:13:14Z") == "2022-01-02"
