diff --git a/src/evaluation/tests/test_loader.py b/src/evaluation/tests/test_loader.py
index 24260b34..3c31bf7d 100644
--- a/src/evaluation/tests/test_loader.py
+++ b/src/evaluation/tests/test_loader.py
@@ -56,6 +56,28 @@ def test_load_scenarios_single_object(tmp_path: Path):
     assert [s.id for s in out] == ["7"]
 
 
+_SCENARIOS_LOCAL = Path(__file__).resolve().parents[2] / "scenarios" / "local"
+
+
+def test_workorder_scenarios_load_and_conform():
+    """The bundled work order scenarios parse and carry the expected schema."""
+    path = _SCENARIOS_LOCAL / "workorder_utterance.json"
+    scenarios = load_scenarios(path)
+
+    assert len(scenarios) >= 5
+    assert all(isinstance(s, Scenario) for s in scenarios)
+    # Every scenario is a WorkOrder scenario with non-empty text, category, and rubric.
+    for s in scenarios:
+        assert s.type == "WorkOrder"
+        assert s.text.strip()
+        assert s.category.strip()
+        assert s.characteristic_form and s.characteristic_form.strip()
+    # IDs are unique and at least one scenario targets failure-code categorization.
+    ids = [s.id for s in scenarios]
+    assert len(ids) == len(set(ids))
+    assert any(s.category == "Categorization" for s in scenarios)
+
+
 def test_join_drops_orphans(make_persisted_record):
     from evaluation.models import PersistedTrajectory
 
diff --git a/src/scenarios/local/workorder_utterance.json b/src/scenarios/local/workorder_utterance.json
new file mode 100644
index 00000000..abed0a32
--- /dev/null
+++ b/src/scenarios/local/workorder_utterance.json
@@ -0,0 +1,44 @@
+[
+  {
+    "id": 401,
+    "type": "WorkOrder",
+    "text": "What failure code categories are used to classify work orders?",
+    "category": "Knowledge Query",
+    "characteristic_form": "The expected response should call get_failure_codes and list the distinct top-level categories that work orders are grouped into (e.g., 'Maintenance and Routine Checks', 'Corrective'), describing what each category represents."
+  },
+  {
+    "id": 402,
+    "type": "WorkOrder",
+    "text": "List all failure codes that belong to the 'Maintenance and Routine Checks' category, with their descriptions.",
+    "category": "Knowledge Query",
+    "characteristic_form": "The expected response should call get_failure_codes, filter to the 'Maintenance and Routine Checks' category, and report the primary and secondary codes with descriptions (e.g., MT010 / Oil Analysis with secondary MT010b / Routine Oil Analysis, and MT001 / Routine Maintenance)."
+  },
+  {
+    "id": 403,
+    "type": "WorkOrder",
+    "text": "Which failure code category does the primary code MT010 belong to, and what does that code describe?",
+    "category": "Categorization",
+    "characteristic_form": "The expected response should look up MT010 via get_failure_codes and report that it falls under the 'Maintenance and Routine Checks' category with the description 'Oil Analysis', distinguishing the primary code from its secondary codes."
+  },
+  {
+    "id": 404,
+    "type": "WorkOrder",
+    "text": "A work order is described as 'Routine Oil Analysis'. Determine the most appropriate failure code and the category it should be filed under.",
+    "category": "Categorization",
+    "characteristic_form": "The expected response should map the description 'Routine Oil Analysis' to the secondary code MT010b (under primary code MT010, 'Oil Analysis') and assign it to the 'Maintenance and Routine Checks' category, using get_failure_codes to justify the mapping."
+  },
+  {
+    "id": 405,
+    "type": "WorkOrder",
+    "text": "Is failure code MT013 a preventive or a corrective maintenance type? Justify the answer using its category.",
+    "category": "Categorization",
+    "characteristic_form": "The expected response should identify MT013 via get_failure_codes as belonging to the 'Corrective' category (description 'Corrective'), and conclude it is a corrective rather than preventive maintenance type based on that category."
+  },
+  {
+    "id": 406,
+    "type": "WorkOrder",
+    "text": "For chiller CWC04013, summarize its work orders grouped by failure code category and state which category is most common.",
+    "category": "Distribution Analysis",
+    "characteristic_form": "The expected response should call get_work_order_distribution for equipment 'CWC04013' (optionally with get_failure_codes to resolve categories), aggregate the per-code counts up to the category level (e.g., 'Maintenance and Routine Checks' vs 'Corrective'), and identify the most frequent category for the asset."
+  }
+]