diff --git a/lambdas/commands.py b/lambdas/commands.py index b57b769..38cc105 100644 --- a/lambdas/commands.py +++ b/lambdas/commands.py @@ -46,6 +46,33 @@ def generate_extract_command( ) extract_command.append(source.removeprefix("gis")) + elif source == "mitlibwebsite": + # defaults + extract_command.append("harvest") + extract_command.append("--include-fulltext") + + # required + extract_command.append( + f"--config-yaml-file={input_data['btrix-config-yaml-file']}" + ) + extract_command.append( + f"--metadata-output-file=s3://{timdex_bucket}/{extract_output_file}" + ) + + # optional + if sitemaps := input_data.get("btrix-sitemaps"): + extract_command.extend([f"--sitemap={sitemap}" for sitemap in sitemaps]) + if run_type == "daily": + extract_command.append( + f"--sitemap-from-date={helpers.generate_harvest_from_date(run_date)}" + ) + if sitemap_urls_out := input_data.get("btrix-sitemap-urls-output-file"): + extract_command.append(f"--sitemap-urls-output-file={sitemap_urls_out}") + if sitemap_urls_previous := input_data.get("btrix-previous-sitemap-urls-file"): + extract_command.append( + f"--previous-sitemap-urls-file={sitemap_urls_previous}" + ) + else: extract_command.append(f"--host={input_data['oai-pmh-host']}") extract_command.append( diff --git a/lambdas/config.py b/lambdas/config.py index 7196bf9..3495a70 100644 --- a/lambdas/config.py +++ b/lambdas/config.py @@ -17,11 +17,30 @@ class Config: INDEX_ALIASES: ClassVar = { "geo": GIS_SOURCES, "rdi": ["jpal", "whoas", "zenodo"], - "timdex": ["alma", "aspace", "dspace", "libguides", "researchdatabases"], - "use": ["aspace", "dspace", *list(GIS_SOURCES), "libguides", "researchdatabases"], + "timdex": [ + "alma", + "aspace", + "dspace", + "libguides", + "mitlibwebsite", + "researchdatabases", + ], + "use": [ + "aspace", + "dspace", + *list(GIS_SOURCES), + "libguides", + "mitlibwebsite", + "researchdatabases", + ], } REQUIRED_FIELDS = ("next-step", "run-date", "run-type", "source") REQUIRED_OAI_HARVEST_FIELDS = ("oai-pmh-host", "oai-metadata-format") + REQUIRED_BTRIX_HARVEST_FIELDS = ( + "btrix-config-yaml-file", + "btrix-sitemaps", + "btrix-sitemap-urls-output-file", + ) VALID_DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%SZ") VALID_RUN_TYPES = ("full", "daily") VALID_STEPS = ("extract", "transform", "load") diff --git a/lambdas/format_input.py b/lambdas/format_input.py index c4f14c7..9d22ec8 100644 --- a/lambdas/format_input.py +++ b/lambdas/format_input.py @@ -36,6 +36,8 @@ def lambda_handler(event: dict, _context: dict) -> dict: if next_step == "extract": if source in CONFIG.GIS_SOURCES: result["harvester-type"] = "geo" + elif source == "mitlibwebsite": + result["harvester-type"] = "browsertrix" else: result["harvester-type"] = "oai" result["next-step"] = "transform" diff --git a/lambdas/helpers.py b/lambdas/helpers.py index e2b4667..7e5e931 100644 --- a/lambdas/helpers.py +++ b/lambdas/helpers.py @@ -48,19 +48,35 @@ def validate_input(input_data: dict) -> None: raise ValueError(message) # If next step is extract step, required harvest fields are present - # ruff: noqa: SIM102 if input_data["next-step"] == "extract": - if input_data["source"] not in CONFIG.GIS_SOURCES: - if missing_harvest_fields := [ - field - for field in CONFIG.REQUIRED_OAI_HARVEST_FIELDS - if field not in input_data - ]: + missing_harvest_fields = None + if input_data["source"] in CONFIG.GIS_SOURCES: + pass # Currently no specific GeoHarvester requirements + elif input_data["source"] == "mitlibwebsite": + missing_harvest_fields = set(CONFIG.REQUIRED_BTRIX_HARVEST_FIELDS).difference( + set(input_data.keys()) + ) + # require previous sitemaps URLs argument for daily runs + if ( + input_data["run-type"] == "daily" + and "btrix-previous-sitemap-urls-file" not in input_data + ): message = ( - "Input must include all required harvest fields when starting " - f"with harvest step. Missing fields: {missing_harvest_fields}" + "Field 'btrix-previous-sitemap-urls-file' " + "required when 'run-type=daily'" ) raise ValueError(message) + else: + missing_harvest_fields = set(CONFIG.REQUIRED_OAI_HARVEST_FIELDS).difference( + set(input_data.keys()) + ) + + if missing_harvest_fields: + message = ( + "Input must include all required harvest fields when starting " + f"with harvest step. Missing fields: {list(missing_harvest_fields)}" + ) + raise ValueError(message) def format_run_date(input_date: str) -> str: @@ -109,7 +125,11 @@ def generate_step_output_filename( """ sequence_suffix = f"_{sequence}" if sequence else "" if step == "extract": - file_type = "jsonl" if source in CONFIG.GIS_SOURCES else "xml" + file_type = ( + "jsonl" + if (source in CONFIG.GIS_SOURCES or source == "mitlibwebsite") + else "xml" + ) elif load_type == "delete": file_type = "txt" else: diff --git a/tests/fixtures/event_payloads/mitlibwebsite-daily-extract.json b/tests/fixtures/event_payloads/mitlibwebsite-daily-extract.json new file mode 100644 index 0000000..0520745 --- /dev/null +++ b/tests/fixtures/event_payloads/mitlibwebsite-daily-extract.json @@ -0,0 +1,16 @@ +{ + "next-step": "extract", + "run-date": "2025-10-14", + "run-type": "daily", + "source": "mitlibwebsite", + "verbose": "true", + "run-id": "abc123", + "btrix-config-yaml-file": "s3://timdex-bucket/mitlibwebsite/config/mitlibwebsite.yaml", + "btrix-sitemaps": [ + "https://libraries.mit.edu/sitemap.xml", + "https://libraries.mit.edu/news/sitemap.xml" + ], + "btrix-sitemap-from-date": "2025-10-13", + "btrix-sitemap-urls-output-file": "s3://timdex-bucket/mitlibwebsite/last-sitemaps-urls.txt", + "btrix-previous-sitemap-urls-file": "s3://timdex-bucket/mitlibwebsite/last-sitemaps-urls.txt" +} \ No newline at end of file diff --git a/tests/fixtures/event_payloads/mitlibwebsite-full-extract.json b/tests/fixtures/event_payloads/mitlibwebsite-full-extract.json new file mode 100644 index 0000000..648a0cb --- /dev/null +++ b/tests/fixtures/event_payloads/mitlibwebsite-full-extract.json @@ -0,0 +1,14 @@ +{ + "next-step": "extract", + "run-date": "2025-10-14", + "run-type": "full", + "source": "mitlibwebsite", + "verbose": "true", + "run-id": "abc123", + "btrix-config-yaml-file": "s3://timdex-bucket/mitlibwebsite/config/mitlibwebsite.yaml", + "btrix-sitemaps": [ + "https://libraries.mit.edu/sitemap.xml", + "https://libraries.mit.edu/news/sitemap.xml" + ], + "btrix-sitemap-urls-output-file": "s3://timdex-bucket/mitlibwebsite/last-sitemaps-urls.txt" +} \ No newline at end of file diff --git a/tests/test_commands.py b/tests/test_commands.py index 149dcfd..89ed356 100644 --- a/tests/test_commands.py +++ b/tests/test_commands.py @@ -77,6 +77,63 @@ def test_generate_extract_command_geoharvester(): } +def test_generate_extract_command_mitlibwebsite_full(): + input_data = { + "run-date": "2022-01-02T12:13:14Z", + "run-type": "full", + "next-step": "extract", + "source": "mitlibwebsite", + "btrix-config-yaml-file": "s3://bucket/config.yaml", + "btrix-sitemaps": [ + "https://libraries.mit.edu/sitemap.xml", + "https://libraries.mit.edu/news/sitemap.xml", + ], + "btrix-sitemap-urls-output-file": "s3://bucket/output.txt", + } + assert commands.generate_extract_command( + input_data, "2022-01-02", "test-timdex-bucket", False + ) == { + "extract-command": [ + "harvest", + "--include-fulltext", + "--config-yaml-file=s3://bucket/config.yaml", + "--metadata-output-file=s3://test-timdex-bucket/mitlibwebsite/" + "mitlibwebsite-2022-01-02-full-extracted-records-to-index.jsonl", + "--sitemap=https://libraries.mit.edu/sitemap.xml", + "--sitemap=https://libraries.mit.edu/news/sitemap.xml", + "--sitemap-urls-output-file=s3://bucket/output.txt", + ] + } + + +def test_generate_extract_command_mitlibwebsite_daily(): + input_data = { + "run-date": "2022-01-02T12:13:14Z", + "run-type": "daily", + "next-step": "extract", + "source": "mitlibwebsite", + "btrix-config-yaml-file": "s3://bucket/config.yaml", + "btrix-sitemaps": ["https://libraries.mit.edu/sitemap.xml"], + "btrix-sitemap-urls-output-file": "s3://bucket/output.txt", + "btrix-previous-sitemap-urls-file": "s3://bucket/previous.txt", + } + assert commands.generate_extract_command( + input_data, "2022-01-02", "test-timdex-bucket", False + ) == { + "extract-command": [ + "harvest", + "--include-fulltext", + "--config-yaml-file=s3://bucket/config.yaml", + "--metadata-output-file=s3://test-timdex-bucket/mitlibwebsite/" + "mitlibwebsite-2022-01-02-daily-extracted-records-to-index.jsonl", + "--sitemap=https://libraries.mit.edu/sitemap.xml", + "--sitemap-from-date=2022-01-01", + "--sitemap-urls-output-file=s3://bucket/output.txt", + "--previous-sitemap-urls-file=s3://bucket/previous.txt", + ] + } + + def test_generate_transform_commands_required_input_fields(run_id, run_timestamp): input_data = { "next-step": "transform", diff --git a/tests/test_format_input.py b/tests/test_format_input.py index bbb9382..f042f6b 100644 --- a/tests/test_format_input.py +++ b/tests/test_format_input.py @@ -34,6 +34,79 @@ def test_lambda_handler_with_next_step_extract(): } +def test_lambda_handler_with_next_step_extract_mitlibwebsite_full(): + event = { + "run-date": "2022-01-02T12:13:14Z", + "run-type": "full", + "next-step": "extract", + "source": "mitlibwebsite", + "run-id": "run-abc-123", + "btrix-config-yaml-file": "s3://bucket/config.yaml", + "btrix-sitemaps": [ + "https://libraries.mit.edu/sitemap.xml", + "https://libraries.mit.edu/news/sitemap.xml", + ], + "btrix-sitemap-urls-output-file": "s3://bucket/output.txt", + } + output = format_input.lambda_handler(event, {}) + assert output == { + "run-date": "2022-01-02", + "run-type": "full", + "source": "mitlibwebsite", + "verbose": False, + "harvester-type": "browsertrix", + "next-step": "transform", + "extract": { + "extract-command": [ + "harvest", + "--include-fulltext", + "--config-yaml-file=s3://bucket/config.yaml", + "--metadata-output-file=s3://test-timdex-bucket/mitlibwebsite/" + "mitlibwebsite-2022-01-02-full-extracted-records-to-index.jsonl", + "--sitemap=https://libraries.mit.edu/sitemap.xml", + "--sitemap=https://libraries.mit.edu/news/sitemap.xml", + "--sitemap-urls-output-file=s3://bucket/output.txt", + ] + }, + } + + +def test_lambda_handler_with_next_step_extract_mitlibwebsite_daily(): + event = { + "run-date": "2022-01-02T12:13:14Z", + "run-type": "daily", + "next-step": "extract", + "source": "mitlibwebsite", + "run-id": "run-abc-123", + "btrix-config-yaml-file": "s3://bucket/config.yaml", + "btrix-sitemaps": ["https://libraries.mit.edu/sitemap.xml"], + "btrix-sitemap-urls-output-file": "s3://bucket/output.txt", + "btrix-previous-sitemap-urls-file": "s3://bucket/previous.txt", + } + output = format_input.lambda_handler(event, {}) + assert output == { + "run-date": "2022-01-02", + "run-type": "daily", + "source": "mitlibwebsite", + "verbose": False, + "harvester-type": "browsertrix", + "next-step": "transform", + "extract": { + "extract-command": [ + "harvest", + "--include-fulltext", + "--config-yaml-file=s3://bucket/config.yaml", + "--metadata-output-file=s3://test-timdex-bucket/mitlibwebsite/" + "mitlibwebsite-2022-01-02-daily-extracted-records-to-index.jsonl", + "--sitemap=https://libraries.mit.edu/sitemap.xml", + "--sitemap-from-date=2022-01-01", + "--sitemap-urls-output-file=s3://bucket/output.txt", + "--previous-sitemap-urls-file=s3://bucket/previous.txt", + ] + }, + } + + def test_lambda_handler_with_next_step_transform_files_present(s3_client, run_timestamp): s3_client.put_object( Bucket="test-timdex-bucket", diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 1e18314..b17d8e4 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -90,6 +90,68 @@ def test_validate_input_with_all_required_harvest_fields_returns_none(): assert helpers.validate_input(event) is None +def test_validate_input_mitlibwebsite_missing_harvest_fields_raises_error(): + event = { + "next-step": "extract", + "run-date": "2022-01-02", + "run-type": "full", + "source": "mitlibwebsite", + "btrix-config-yaml-file": "s3://bucket/config.yaml", + "btrix-sitemaps": ["https://example.com/sitemap.xml"], + } + with pytest.raises(ValueError) as error: + helpers.validate_input(event) + assert "Input must include all required harvest fields when starting with " in str( + error.value + ) + assert "'btrix-sitemap-urls-output-file'" in str(error.value) + + +def test_validate_input_mitlibwebsite_daily_missing_previous_urls_raises_error(): + event = { + "next-step": "extract", + "run-date": "2022-01-02", + "run-type": "daily", + "source": "mitlibwebsite", + "btrix-config-yaml-file": "s3://bucket/config.yaml", + "btrix-sitemaps": ["https://example.com/sitemap.xml"], + "btrix-sitemap-urls-output-file": "s3://bucket/output.txt", + } + with pytest.raises(ValueError) as error: + helpers.validate_input(event) + assert ( + "Field 'btrix-previous-sitemap-urls-file' required when 'run-type=daily'" + in str(error.value) + ) + + +def test_validate_input_mitlibwebsite_full_without_previous_urls_returns_none(): + event = { + "next-step": "extract", + "run-date": "2022-01-02", + "run-type": "full", + "source": "mitlibwebsite", + "btrix-config-yaml-file": "s3://bucket/config.yaml", + "btrix-sitemaps": ["https://example.com/sitemap.xml"], + "btrix-sitemap-urls-output-file": "s3://bucket/output.txt", + } + assert helpers.validate_input(event) is None + + +def test_validate_input_mitlibwebsite_daily_with_all_required_fields_returns_none(): + event = { + "next-step": "extract", + "run-date": "2022-01-02", + "run-type": "daily", + "source": "mitlibwebsite", + "btrix-config-yaml-file": "s3://bucket/config.yaml", + "btrix-sitemaps": ["https://example.com/sitemap.xml"], + "btrix-sitemap-urls-output-file": "s3://bucket/output.txt", + "btrix-previous-sitemap-urls-file": "s3://bucket/previous.txt", + } + assert helpers.validate_input(event) is None + + def test_format_run_date_valid_run_date_string(): assert helpers.format_run_date("2022-01-02T12:13:14Z") == "2022-01-02"