diff --git a/backend/app/services/validation/__pycache__/validation_engine.cpython-313.pyc b/backend/app/services/validation/__pycache__/validation_engine.cpython-313.pyc index 92752a77..85ec5fc5 100644 Binary files a/backend/app/services/validation/__pycache__/validation_engine.cpython-313.pyc and b/backend/app/services/validation/__pycache__/validation_engine.cpython-313.pyc differ diff --git a/backend/app/services/validation/tests/__pycache__/test_validation_engine.cpython-313-pytest-8.4.2.pyc b/backend/app/services/validation/tests/__pycache__/test_validation_engine.cpython-313-pytest-8.4.2.pyc index 26219722..bc5e51c4 100644 Binary files a/backend/app/services/validation/tests/__pycache__/test_validation_engine.cpython-313-pytest-8.4.2.pyc and b/backend/app/services/validation/tests/__pycache__/test_validation_engine.cpython-313-pytest-8.4.2.pyc differ diff --git a/backend/app/services/validation/tests/test_validation_engine.py b/backend/app/services/validation/tests/test_validation_engine.py index 84e928b8..d75f5728 100644 --- a/backend/app/services/validation/tests/test_validation_engine.py +++ b/backend/app/services/validation/tests/test_validation_engine.py @@ -1,5 +1,54 @@ import pytest -from backend.app.services.validation.validation_engine import perform_cross_source_checks +from backend.app.services.validation.validation_engine import perform_cross_source_checks, normalize_missing + +def test_normalize_missing(): + data = { + "field1": "value1", + "field2": None, + "field3": "", + "nested": { + "nested_field1": "nested_value1", + "nested_field2": None, + "nested_field3": "" + }, + "list_field": [ + "item1", + None, + "item3", + "" + ] + } + + expected_normalized_data = { + "field1": "value1", + "field2": "N/A", + "field3": "N/A", + "nested": { + "nested_field1": "nested_value1", + "nested_field2": "N/A", + "nested_field3": "N/A" + }, + "list_field": [ + "item1", + "N/A", + "item3", + "N/A" + ], + "missing_data_report": { + "field2": "Missing or empty field replaced with 'N/A'.", + "field3": "Missing or empty field replaced with 'N/A'.", + "nested.nested_field2": "Missing or empty field replaced with 'N/A'.", + "nested.nested_field3": "Missing or empty field replaced with 'N/A'.", + "list_field[1]": "Missing or empty field replaced with 'N/A'.", + "list_field[3]": "Missing or empty field replaced with 'N/A'." + } + } + + import copy + original_data = copy.deepcopy(data) + normalized_data = normalize_missing(data) + assert normalized_data == expected_normalized_data + assert data == original_data def test_circulating_supply_match(): data = { diff --git a/backend/app/services/validation/validation_engine.py b/backend/app/services/validation/validation_engine.py index c0d4416a..99c4e1b1 100644 --- a/backend/app/services/validation/validation_engine.py +++ b/backend/app/services/validation/validation_engine.py @@ -2,7 +2,9 @@ Validation engine for ensuring data quality and consistency before NLG and summary generation. """ +import re from typing import Dict, Any, Optional, List +from copy import deepcopy DEFAULT_ESSENTIAL_FIELDS = ["report_id", "project_name", "summary"] # Example default essential fields @@ -119,6 +121,51 @@ def perform_cross_source_checks(data: Dict[str, Any]) -> Dict[str, Any]: "INFO: Documentation circulating supply not found." ) + if validation_results["alerts"]: + validation_results["cross_source_checks"] = "COMPLETED_WITH_ALERTS" + else: + validation_results["cross_source_checks"] = "PASSED" + return validation_results -# You can add more validation functions as needed. + +def normalize_missing(data: Dict[str, Any]) -> Dict[str, Any]: + """ + Normalizes the input data by replacing missing or empty fields with explicit placeholders + and generates a `missing_data_report` explaining the gaps. + + Args: + data: The input data dictionary to normalize. + + Returns: + A new dictionary with missing fields normalized and a `missing_data_report`.""" + normalized_data = deepcopy(data) + missing_data_report = {} + + def _traverse_and_normalize(parent, key_or_index, current_data, path): + if isinstance(current_data, dict): + for key, value in current_data.items(): + new_path = f"{path}.{key}" if path else key + if value is None or (isinstance(value, str) and value.strip() == ""): + parent[key_or_index][key] = "N/A" # Replace with placeholder + missing_data_report[new_path] = "Missing or empty field replaced with 'N/A'." + elif isinstance(value, (dict, list)): + _traverse_and_normalize(current_data, key, value, new_path) + elif isinstance(current_data, list): + for index, item in enumerate(current_data): + new_path = f"{path}[{index}]" + if item is None or (isinstance(item, str) and item.strip() == ""): + parent[key_or_index][index] = "N/A" # Replace with placeholder + missing_data_report[new_path] = "Missing or empty field replaced with 'N/A'." + elif isinstance(item, (dict, list)): + _traverse_and_normalize(current_data, index, item, new_path) + + # Initial call to _traverse_and_normalize + # We use a temporary key '__root__' to hold the original data for the initial call + temp_root = {'__root__': normalized_data} + _traverse_and_normalize(temp_root, '__root__', normalized_data, "") + normalized_data.update(temp_root['__root__']) # Update normalized_data with the modified content + normalized_data["missing_data_report"] = missing_data_report + return normalized_data + +# You can add more validation functions as needed. \ No newline at end of file