From c477d0c253cb617c90b86b93739c0ec479aa8a0a Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Tue, 12 May 2026 19:08:40 +0000 Subject: [PATCH 01/14] feat: add workflow chaining --- .../data_designer/config/preview_results.py | 13 + .../src/data_designer/interface/__init__.py | 10 + .../interface/composite_workflow.py | 342 +++++++++++ .../data_designer/interface/data_designer.py | 42 ++ .../src/data_designer/interface/errors.py | 4 + .../src/data_designer/interface/results.py | 11 + .../tests/interface/test_acreate.py | 85 +++ .../interface/test_composite_workflow.py | 566 ++++++++++++++++++ 8 files changed, 1073 insertions(+) create mode 100644 packages/data-designer/src/data_designer/interface/composite_workflow.py create mode 100644 packages/data-designer/tests/interface/test_acreate.py create mode 100644 packages/data-designer/tests/interface/test_composite_workflow.py diff --git a/packages/data-designer-config/src/data_designer/config/preview_results.py b/packages/data-designer-config/src/data_designer/config/preview_results.py index 5805c50d3..6ea90ab59 100644 --- a/packages/data-designer-config/src/data_designer/config/preview_results.py +++ b/packages/data-designer-config/src/data_designer/config/preview_results.py @@ -8,6 +8,7 @@ from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults from data_designer.config.config_builder import DataDesignerConfigBuilder from data_designer.config.dataset_metadata import DatasetMetadata +from data_designer.config.seed_source_dataframe import DataFrameSeedSource from data_designer.config.utils.visualization import WithRecordSamplerMixin if TYPE_CHECKING: @@ -41,3 +42,15 @@ def __init__( self.dataset_metadata: DatasetMetadata | None = dataset_metadata self.task_traces: list[Any] | None = task_traces self._config_builder = config_builder + + def to_config_builder(self, columns: list[str] | None = None) -> DataDesignerConfigBuilder: + """Create a new config builder seeded from this preview 
dataset.""" + if self.dataset is None: + raise ValueError("Preview dataset is not available.") + df = self.dataset + if columns is not None: + df = df.loc[:, columns] + return DataDesignerConfigBuilder( + model_configs=self._config_builder.model_configs, + tool_configs=self._config_builder.tool_configs, + ).with_seed_dataset(DataFrameSeedSource(df=df.copy())) diff --git a/packages/data-designer/src/data_designer/interface/__init__.py b/packages/data-designer/src/data_designer/interface/__init__.py index c5e426e35..351d023f0 100644 --- a/packages/data-designer/src/data_designer/interface/__init__.py +++ b/packages/data-designer/src/data_designer/interface/__init__.py @@ -8,21 +8,31 @@ if TYPE_CHECKING: from data_designer.engine.storage.artifact_storage import ResumeMode # noqa: F401 + from data_designer.interface.composite_workflow import ( # noqa: F401 + CompositeWorkflow, + CompositeWorkflowResults, + SkippedStageResult, + ) from data_designer.interface.data_designer import DataDesigner # noqa: F401 from data_designer.interface.errors import ( # noqa: F401 DataDesignerEarlyShutdownError, DataDesignerGenerationError, DataDesignerProfilingError, + DataDesignerWorkflowError, ) from data_designer.interface.results import DatasetCreationResults # noqa: F401 _LAZY_IMPORTS: dict[str, tuple[str, str]] = { + "CompositeWorkflow": ("data_designer.interface.composite_workflow", "CompositeWorkflow"), + "CompositeWorkflowResults": ("data_designer.interface.composite_workflow", "CompositeWorkflowResults"), "DataDesigner": ("data_designer.interface.data_designer", "DataDesigner"), "DataDesignerEarlyShutdownError": ("data_designer.interface.errors", "DataDesignerEarlyShutdownError"), "DataDesignerGenerationError": ("data_designer.interface.errors", "DataDesignerGenerationError"), "DataDesignerProfilingError": ("data_designer.interface.errors", "DataDesignerProfilingError"), + "DataDesignerWorkflowError": ("data_designer.interface.errors", "DataDesignerWorkflowError"), 
"DatasetCreationResults": ("data_designer.interface.results", "DatasetCreationResults"), "ResumeMode": ("data_designer.engine.storage.artifact_storage", "ResumeMode"), + "SkippedStageResult": ("data_designer.interface.composite_workflow", "SkippedStageResult"), } __all__ = list(_LAZY_IMPORTS.keys()) diff --git a/packages/data-designer/src/data_designer/interface/composite_workflow.py b/packages/data-designer/src/data_designer/interface/composite_workflow.py new file mode 100644 index 000000000..f39a05702 --- /dev/null +++ b/packages/data-designer/src/data_designer/interface/composite_workflow.py @@ -0,0 +1,342 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import hashlib +import json +import shutil +import time +from collections.abc import Callable, Iterator +from contextlib import contextmanager +from dataclasses import dataclass +from pathlib import Path +from typing import TYPE_CHECKING, Any + +import data_designer.lazy_heavy_imports as lazy +from data_designer.config.config_builder import BuilderConfig, DataDesignerConfigBuilder +from data_designer.config.data_designer_config import DataDesignerConfig +from data_designer.config.seed import IndexRange, PartitionBlock, SamplingStrategy +from data_designer.config.seed_source import LocalFileSeedSource +from data_designer.config.utils.constants import DEFAULT_NUM_RECORDS +from data_designer.config.version import get_library_version +from data_designer.interface.errors import DataDesignerWorkflowError +from data_designer.interface.results import DatasetCreationResults + +if TYPE_CHECKING: + from data_designer.interface.data_designer import DataDesigner + + +OnSuccessCallback = Callable[[Path], Path | str] + + +@dataclass(frozen=True) +class _WorkflowStage: + name: str + config_builder: DataDesignerConfigBuilder + depends_on: tuple[str, ...] 
+ num_records: int | None + on_success: OnSuccessCallback | None + on_success_version: str | None + allow_empty: bool + sampling_strategy: SamplingStrategy + selection_strategy: IndexRange | PartitionBlock | None + + +@dataclass(frozen=True) +class SkippedStageResult: + status: str + upstream_stage: str + + +class CompositeWorkflowResults: + def __init__( + self, + *, + name: str, + stage_results: dict[str, DatasetCreationResults | SkippedStageResult], + final_stage_name: str, + ): + self.name = name + self.stage_results = stage_results + self.final_stage_name = final_stage_name + + def __getitem__(self, stage_name: str) -> DatasetCreationResults | SkippedStageResult: + return self.stage_results[stage_name] + + def __iter__(self) -> Iterator[str]: + return iter(self.stage_results) + + def keys(self): + return self.stage_results.keys() + + def items(self): + return self.stage_results.items() + + @property + def final_result(self) -> DatasetCreationResults: + result = self.stage_results[self.final_stage_name] + if isinstance(result, SkippedStageResult): + raise DataDesignerWorkflowError(f"Final stage {self.final_stage_name!r} was skipped: {result.status}.") + return result + + def load_dataset(self): + return self.final_result.load_dataset() + + def load_analysis(self): + return self.final_result.load_analysis() + + def count_records(self) -> int: + return self.final_result.count_records() + + def export(self, *args, **kwargs): + return self.final_result.export(*args, **kwargs) + + def push_to_hub(self, *args, **kwargs): + return self.final_result.push_to_hub(*args, **kwargs) + + +class CompositeWorkflow: + def __init__(self, *, name: str, data_designer: DataDesigner): + _validate_dir_name(name, "workflow name") + self.name = name + self._data_designer = data_designer + self._stages: list[_WorkflowStage] = [] + + def add_stage( + self, + name: str, + config_builder: DataDesignerConfigBuilder, + *, + num_records: int | None = None, + on_success: OnSuccessCallback | 
None = None, + on_success_version: str | None = None, + allow_empty: bool = False, + sampling_strategy: SamplingStrategy = SamplingStrategy.ORDERED, + selection_strategy: IndexRange | PartitionBlock | None = None, + ) -> CompositeWorkflow: + _validate_dir_name(name, "stage name") + if any(stage.name == name for stage in self._stages): + raise DataDesignerWorkflowError(f"Stage name {name!r} is already used in workflow {self.name!r}.") + if num_records is not None and num_records < 1: + raise DataDesignerWorkflowError("Stage num_records must be at least 1.") + self._stages.append( + _WorkflowStage( + name=name, + config_builder=config_builder, + depends_on=(self._stages[-1].name,) if self._stages else (), + num_records=num_records, + on_success=on_success, + on_success_version=on_success_version, + allow_empty=allow_empty, + sampling_strategy=sampling_strategy, + selection_strategy=selection_strategy, + ) + ) + return self + + def run(self) -> CompositeWorkflowResults: + if not self._stages: + raise DataDesignerWorkflowError(f"Workflow {self.name!r} has no stages.") + + workflow_path = self._data_designer._artifact_path / self.name + workflow_path.mkdir(parents=True, exist_ok=True) + metadata: dict[str, Any] = { + "name": self.name, + "library_version": get_library_version(), + "stages": [], + } + stage_results: dict[str, DatasetCreationResults | SkippedStageResult] = {} + previous_seed_path: Path | None = None + previous_output_records: int | None = None + previous_stage_name: str | None = None + previous_stage_fingerprint: str | None = None + skipped_upstream_stage: str | None = None + + for index, stage in enumerate(self._stages): + stage_dir_name = _stage_dir_name(index, stage.name) + stage_metadata = _base_stage_metadata(index, stage, stage_dir_name) + metadata["stages"].append(stage_metadata) + + if skipped_upstream_stage is not None: + stage_metadata.update( + { + "status": "skipped_empty_upstream", + "upstream_stage": skipped_upstream_stage, + } + ) + 
stage_results[stage.name] = SkippedStageResult( + status="skipped_empty_upstream", + upstream_stage=skipped_upstream_stage, + ) + _write_workflow_metadata(workflow_path, metadata) + continue + + stage_builder = _clone_config_builder(stage.config_builder) + if previous_seed_path is not None: + stage_builder.with_seed_dataset( + _local_seed_source_from_path(previous_seed_path), + sampling_strategy=stage.sampling_strategy, + selection_strategy=stage.selection_strategy, + ) + + num_records = stage.num_records or previous_output_records or DEFAULT_NUM_RECORDS + stage_config = stage_builder.build() + stage_fingerprint = _stage_fingerprint( + stage_config=stage_config, + stage=stage, + num_records=num_records, + upstream_fingerprint=previous_stage_fingerprint, + ) + stage_path = workflow_path / stage_dir_name + if stage_path.exists(): + shutil.rmtree(stage_path) + + stage_metadata.update( + { + "status": "running", + "fingerprint": stage_fingerprint, + "num_records_requested": num_records, + "seeded_from_stage": previous_stage_name, + "seed_path": str(previous_seed_path) if previous_seed_path is not None else None, + "config": stage_config.model_dump(mode="json"), + } + ) + _write_workflow_metadata(workflow_path, metadata) + + start_time = time.monotonic() + try: + with _temporary_artifact_path(self._data_designer, workflow_path): + result = self._data_designer.create( + stage_builder, + num_records=num_records, + dataset_name=stage_dir_name, + ) + actual_records = result.count_records() + output_seed_path = result.artifact_storage.final_dataset_path + callback_output_path = None + output_records = actual_records + + if stage.on_success is not None: + callback_output_path = Path(stage.on_success(result.artifact_storage.base_dataset_path)) + output_seed_path = callback_output_path + output_records = _count_parquet_records(callback_output_path) + + if output_records == 0: + if not stage.allow_empty: + raise DataDesignerWorkflowError(f"Stage {stage.name!r} produced an empty 
output.") + status = "completed_empty" + skipped_upstream_stage = stage.name + else: + status = "completed" + + stage_metadata.update( + { + "status": status, + "num_records_actual": actual_records, + "output_records": output_records, + "output_seed_path": str(output_seed_path), + "callback_output_path": str(callback_output_path) if callback_output_path else None, + "duration_sec": time.monotonic() - start_time, + } + ) + except Exception: + stage_metadata.update({"status": "failed", "duration_sec": time.monotonic() - start_time}) + _write_workflow_metadata(workflow_path, metadata) + raise + + stage_results[stage.name] = result + previous_seed_path = output_seed_path + previous_output_records = output_records + previous_stage_name = stage.name + previous_stage_fingerprint = stage_fingerprint + _write_workflow_metadata(workflow_path, metadata) + + return CompositeWorkflowResults( + name=self.name, + stage_results=stage_results, + final_stage_name=self._stages[-1].name, + ) + + +def _clone_config_builder(config_builder: DataDesignerConfigBuilder) -> DataDesignerConfigBuilder: + return DataDesignerConfigBuilder.from_config(BuilderConfig(data_designer=config_builder.build())) + + +@contextmanager +def _temporary_artifact_path(data_designer: DataDesigner, artifact_path: Path): + original_artifact_path = data_designer._artifact_path + data_designer._artifact_path = artifact_path + try: + yield + finally: + data_designer._artifact_path = original_artifact_path + + +def _stage_dir_name(index: int, name: str) -> str: + return f"stage-{index}-{name}" + + +def _base_stage_metadata(index: int, stage: _WorkflowStage, stage_dir_name: str) -> dict[str, Any]: + return { + "index": index, + "name": stage.name, + "stage_dir": stage_dir_name, + "depends_on": list(stage.depends_on), + "allow_empty": stage.allow_empty, + "on_success_version": stage.on_success_version, + "sampling_strategy": stage.sampling_strategy.value, + "selection_strategy": 
_selection_strategy_payload(stage.selection_strategy), + } + + +def _stage_fingerprint( + *, + stage_config: DataDesignerConfig, + stage: _WorkflowStage, + num_records: int, + upstream_fingerprint: str | None, +) -> str: + payload = { + "config_fingerprint": stage_config.fingerprint(), + "num_records": num_records, + "sampling_strategy": stage.sampling_strategy.value, + "selection_strategy": _selection_strategy_payload(stage.selection_strategy), + "allow_empty": stage.allow_empty, + "on_success_version": stage.on_success_version, + "library_version": get_library_version(), + "upstream_fingerprint": upstream_fingerprint, + } + return hashlib.sha256(json.dumps(payload, sort_keys=True).encode("utf-8")).hexdigest() + + +def _selection_strategy_payload(selection_strategy: IndexRange | PartitionBlock | None) -> dict[str, Any] | None: + if selection_strategy is None: + return None + return selection_strategy.model_dump(mode="json") + + +def _local_seed_source_from_path(path: Path) -> LocalFileSeedSource: + if path.is_dir(): + return LocalFileSeedSource(path=str(path / "*.parquet")) + return LocalFileSeedSource(path=str(path)) + + +def _count_parquet_records(path: Path) -> int: + parquet_files = sorted(path.glob("*.parquet")) if path.is_dir() else [path] + if not parquet_files: + raise DataDesignerWorkflowError(f"No parquet files found at {str(path)!r}.") + return sum(lazy.pq.read_metadata(file_path).num_rows for file_path in parquet_files) + + +def _write_workflow_metadata(workflow_path: Path, metadata: dict[str, Any]) -> None: + path = workflow_path / "workflow-metadata.json" + path.write_text(json.dumps(metadata, indent=2, sort_keys=True), encoding="utf-8") + + +def _validate_dir_name(name: str, label: str) -> None: + if not name: + raise DataDesignerWorkflowError(f"{label} must be a non-empty string.") + invalid_chars = {"<", ">", ":", '"', "/", "\\", "|", "?", "*"} + if any(char in name for char in invalid_chars): + raise DataDesignerWorkflowError(f"{label} {name!r} 
contains invalid path characters.") diff --git a/packages/data-designer/src/data_designer/interface/data_designer.py b/packages/data-designer/src/data_designer/interface/data_designer.py index e296d756c..e1acbc341 100644 --- a/packages/data-designer/src/data_designer/interface/data_designer.py +++ b/packages/data-designer/src/data_designer/interface/data_designer.py @@ -3,7 +3,9 @@ from __future__ import annotations +import asyncio import logging +import threading import warnings from pathlib import Path from typing import TYPE_CHECKING @@ -74,6 +76,7 @@ if TYPE_CHECKING: from data_designer.engine.models.facade import ModelFacade + from data_designer.interface.composite_workflow import CompositeWorkflow logger = logging.getLogger(__name__) @@ -151,6 +154,7 @@ def __init__( self._secret_resolver = secret_resolver or DEFAULT_SECRET_RESOLVER self._artifact_path = Path(artifact_path) if artifact_path is not None else Path.cwd() / "artifacts" self._run_config = RunConfig() + self._create_lock = threading.Lock() self._managed_assets_path = Path(managed_assets_path or MANAGED_ASSETS_PATH) self._person_reader = person_reader # Only consult the YAML's `default:` key when we are also falling back to @@ -359,6 +363,39 @@ def create( task_traces=task_traces, ) + async def acreate( + self, + config_builder: DataDesignerConfigBuilder, + *, + num_records: int = DEFAULT_NUM_RECORDS, + dataset_name: str = "dataset", + resume: ResumeMode = ResumeMode.NEVER, + ) -> DatasetCreationResults: + """Async wrapper for creating a dataset without blocking the caller's event loop.""" + return await asyncio.to_thread( + self._create_threadsafe, + config_builder, + num_records=num_records, + dataset_name=dataset_name, + resume=resume, + ) + + def _create_threadsafe( + self, + config_builder: DataDesignerConfigBuilder, + *, + num_records: int, + dataset_name: str, + resume: ResumeMode, + ) -> DatasetCreationResults: + with self._create_lock: + return self.create( + config_builder, + 
num_records=num_records, + dataset_name=dataset_name, + resume=resume, + ) + def preview( self, config_builder: DataDesignerConfigBuilder, *, num_records: int = DEFAULT_NUM_RECORDS ) -> PreviewResults: @@ -445,6 +482,11 @@ def preview( task_traces=builder.task_traces or None, ) + def compose_workflow(self, *, name: str) -> CompositeWorkflow: + from data_designer.interface.composite_workflow import CompositeWorkflow + + return CompositeWorkflow(name=name, data_designer=self) + def _log_jinja_rendering_engine_mode(self) -> None: engine = JinjaRenderingEngine(self._run_config.jinja_rendering_engine) icon = "🔒" if engine == JinjaRenderingEngine.SECURE else "🏠" diff --git a/packages/data-designer/src/data_designer/interface/errors.py b/packages/data-designer/src/data_designer/interface/errors.py index 3b113ef9b..8a7fb248f 100644 --- a/packages/data-designer/src/data_designer/interface/errors.py +++ b/packages/data-designer/src/data_designer/interface/errors.py @@ -14,6 +14,10 @@ class DataDesignerGenerationError(DataDesignerError): """Raised for errors related to a Data Designer dataset generation.""" +class DataDesignerWorkflowError(DataDesignerError): + """Raised for errors related to composite workflow orchestration.""" + + class DataDesignerEarlyShutdownError(DataDesignerGenerationError): """Raised when a run terminated via early shutdown and produced no records. 
diff --git a/packages/data-designer/src/data_designer/interface/results.py b/packages/data-designer/src/data_designer/interface/results.py index 29f222279..f44a1b72a 100644 --- a/packages/data-designer/src/data_designer/interface/results.py +++ b/packages/data-designer/src/data_designer/interface/results.py @@ -11,6 +11,7 @@ from data_designer.config.config_builder import DataDesignerConfigBuilder from data_designer.config.dataset_metadata import DatasetMetadata from data_designer.config.errors import InvalidFileFormatError +from data_designer.config.seed_source_dataframe import DataFrameSeedSource from data_designer.config.utils.visualization import WithRecordSamplerMixin from data_designer.engine.dataset_builders.errors import ArtifactStorageError from data_designer.engine.storage.artifact_storage import ArtifactStorage @@ -85,6 +86,16 @@ def load_dataset(self) -> pd.DataFrame: """ return self.artifact_storage.load_dataset() + def to_config_builder(self, columns: list[str] | None = None) -> DataDesignerConfigBuilder: + """Create a new config builder seeded from this result dataset.""" + df = self.load_dataset() + if columns is not None: + df = df.loc[:, columns] + return DataDesignerConfigBuilder( + model_configs=self._config_builder.model_configs, + tool_configs=self._config_builder.tool_configs, + ).with_seed_dataset(DataFrameSeedSource(df=df.copy())) + def count_records(self) -> int: """Return the total number of records in the generated dataset. diff --git a/packages/data-designer/tests/interface/test_acreate.py b/packages/data-designer/tests/interface/test_acreate.py new file mode 100644 index 000000000..5dd008be6 --- /dev/null +++ b/packages/data-designer/tests/interface/test_acreate.py @@ -0,0 +1,85 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import asyncio +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +import data_designer.lazy_heavy_imports as lazy +from data_designer.config.column_configs import ExpressionColumnConfig +from data_designer.config.config_builder import DataDesignerConfigBuilder +from data_designer.config.dataset_metadata import DatasetMetadata +from data_designer.config.models import ModelConfig, ModelProvider +from data_designer.config.seed_source_dataframe import DataFrameSeedSource +from data_designer.engine.secret_resolver import PlaintextResolver +from data_designer.engine.storage.artifact_storage import ResumeMode +from data_designer.interface.data_designer import DataDesigner +from data_designer.interface.results import DatasetCreationResults + + +def _seeded_builder(model_configs: list[ModelConfig], names: list[str]) -> DataDesignerConfigBuilder: + builder = DataDesignerConfigBuilder(model_configs=model_configs) + builder.with_seed_dataset(DataFrameSeedSource(df=lazy.pd.DataFrame({"name": names}))) + builder.add_column(ExpressionColumnConfig(name="name_copy", expr="{{ name }}")) + return builder + + +@pytest.mark.asyncio +async def test_acreate_delegates_to_create( + tmp_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], + stub_dataset_profiler_results, +) -> None: + data_designer = DataDesigner(artifact_path=tmp_path / "artifacts", model_providers=stub_model_providers) + artifact_storage = MagicMock() + expected = DatasetCreationResults( + artifact_storage=artifact_storage, + analysis=stub_dataset_profiler_results, + config_builder=_seeded_builder(stub_model_configs, ["Ada"]), + dataset_metadata=DatasetMetadata(), + ) + data_designer.create = MagicMock(return_value=expected) + builder = _seeded_builder(stub_model_configs, ["Ada"]) + + result = await data_designer.acreate( + builder, + num_records=1, + 
dataset_name="async-dataset", + resume=ResumeMode.IF_POSSIBLE, + ) + + assert result is expected + data_designer.create.assert_called_once_with( + builder, + num_records=1, + dataset_name="async-dataset", + resume=ResumeMode.IF_POSSIBLE, + ) + + +@pytest.mark.asyncio +async def test_acreate_supports_gathered_real_async_workflows( + tmp_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], +) -> None: + data_designer = DataDesigner( + artifact_path=tmp_path / "artifacts", + model_providers=stub_model_providers, + secret_resolver=PlaintextResolver(), + ) + left = _seeded_builder(stub_model_configs, ["Ada", "Linus"]) + right = _seeded_builder(stub_model_configs, ["Grace"]) + + left_result, right_result = await asyncio.gather( + data_designer.acreate(left, num_records=2, dataset_name="left"), + data_designer.acreate(right, num_records=1, dataset_name="right"), + ) + + assert left_result.load_dataset().sort_values("name")["name_copy"].tolist() == ["Ada", "Linus"] + assert right_result.load_dataset()["name_copy"].tolist() == ["Grace"] diff --git a/packages/data-designer/tests/interface/test_composite_workflow.py b/packages/data-designer/tests/interface/test_composite_workflow.py new file mode 100644 index 000000000..ed401034e --- /dev/null +++ b/packages/data-designer/tests/interface/test_composite_workflow.py @@ -0,0 +1,566 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import json +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +import data_designer.lazy_heavy_imports as lazy +from data_designer.config.column_configs import CustomColumnConfig, ExpressionColumnConfig, SamplerColumnConfig +from data_designer.config.config_builder import DataDesignerConfigBuilder +from data_designer.config.custom_column import custom_column_generator +from data_designer.config.dataset_metadata import DatasetMetadata +from data_designer.config.models import ModelConfig, ModelProvider +from data_designer.config.preview_results import PreviewResults +from data_designer.config.processors import DropColumnsProcessorConfig, SchemaTransformProcessorConfig +from data_designer.config.seed_source import LocalFileSeedSource +from data_designer.config.seed_source_dataframe import DataFrameSeedSource +from data_designer.engine.secret_resolver import PlaintextResolver +from data_designer.engine.storage.artifact_storage import ArtifactStorage, BatchStage +from data_designer.interface.composite_workflow import SkippedStageResult +from data_designer.interface.data_designer import DataDesigner +from data_designer.interface.errors import DataDesignerWorkflowError +from data_designer.interface.results import DatasetCreationResults + + +@pytest.fixture +def stub_artifact_path(tmp_path: Path) -> Path: + return tmp_path / "artifacts" + + +def _data_designer(artifact_path: Path, model_providers: list[ModelProvider]) -> DataDesigner: + return DataDesigner(artifact_path=artifact_path, model_providers=model_providers) + + +def _real_data_designer(artifact_path: Path, model_providers: list[ModelProvider]) -> DataDesigner: + return DataDesigner( + artifact_path=artifact_path, + model_providers=model_providers, + secret_resolver=PlaintextResolver(), + ) + + +def _result_from_df( + artifact_path: Path, + dataset_name: str, + df: lazy.pd.DataFrame, + config_builder: 
DataDesignerConfigBuilder, + stub_dataset_profiler_results, +) -> DatasetCreationResults: + ArtifactStorage.mkdir_if_needed(artifact_path) + artifact_storage = ArtifactStorage(artifact_path=artifact_path, dataset_name=dataset_name) + artifact_storage.write_batch_to_parquet_file(0, df, BatchStage.FINAL_RESULT) + return DatasetCreationResults( + artifact_storage=artifact_storage, + analysis=stub_dataset_profiler_results, + config_builder=config_builder, + dataset_metadata=DatasetMetadata(), + ) + + +def _patch_create(data_designer: DataDesigner, stub_dataset_profiler_results) -> MagicMock: + def fake_create( + config_builder: DataDesignerConfigBuilder, + *, + num_records: int, + dataset_name: str, + **kwargs, + ) -> DatasetCreationResults: + del kwargs + df = lazy.pd.DataFrame({"category": ["alpha"] * num_records, "category_copy": ["alpha"] * num_records}) + return _result_from_df( + data_designer._artifact_path, + dataset_name, + df, + config_builder, + stub_dataset_profiler_results, + ) + + data_designer.create = MagicMock(side_effect=fake_create) + return data_designer.create + + +def _category_builder(model_configs: list[ModelConfig]) -> DataDesignerConfigBuilder: + builder = DataDesignerConfigBuilder(model_configs=model_configs) + builder.add_column( + SamplerColumnConfig( + name="category", + sampler_type="category", + params={"values": ["alpha", "beta", "gamma"]}, + ) + ) + return builder + + +def _copy_builder(model_configs: list[ModelConfig]) -> DataDesignerConfigBuilder: + builder = DataDesignerConfigBuilder(model_configs=model_configs) + builder.add_column(ExpressionColumnConfig(name="category_copy", expr="{{ category }}")) + return builder + + +def _seeded_builder(model_configs: list[ModelConfig], rows: list[dict]) -> DataDesignerConfigBuilder: + builder = DataDesignerConfigBuilder(model_configs=model_configs) + builder.with_seed_dataset(DataFrameSeedSource(df=lazy.pd.DataFrame(rows))) + return builder + + +def 
test_dataset_creation_results_to_config_builder_columns( + stub_model_configs: list[ModelConfig], + stub_dataset_profiler_results, + tmp_path: Path, +) -> None: + results = _result_from_df( + tmp_path / "artifacts", + "dataset", + lazy.pd.DataFrame({"category": ["alpha", "beta", "gamma"], "other": [1, 2, 3]}), + _category_builder(stub_model_configs), + stub_dataset_profiler_results, + ) + + builder = results.to_config_builder(columns=["category"]) + + seed_config = builder.get_seed_config() + assert isinstance(seed_config.source, DataFrameSeedSource) + assert builder.model_configs == stub_model_configs + assert list(seed_config.source.df.columns) == ["category"] + assert len(seed_config.source.df) == 3 + + +def test_preview_results_to_config_builder_columns( + stub_model_configs: list[ModelConfig], +) -> None: + results = PreviewResults( + config_builder=_category_builder(stub_model_configs), + dataset=lazy.pd.DataFrame({"category": ["alpha", "beta"], "other": [1, 2]}), + dataset_metadata=DatasetMetadata(), + ) + + builder = results.to_config_builder(columns=["category"]) + + seed_config = builder.get_seed_config() + assert isinstance(seed_config.source, DataFrameSeedSource) + assert list(seed_config.source.df.columns) == ["category"] + assert len(seed_config.source.df) == 2 + + +def test_composite_workflow_runs_linear_stages_with_disk_handoff( + stub_artifact_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], + stub_dataset_profiler_results, +) -> None: + data_designer = _data_designer(stub_artifact_path, stub_model_providers) + create_mock = _patch_create(data_designer, stub_dataset_profiler_results) + workflow = data_designer.compose_workflow(name="linear-chain") + workflow.add_stage("base", _category_builder(stub_model_configs), num_records=3) + workflow.add_stage("copy", _copy_builder(stub_model_configs)) + + results = workflow.run() + + assert set(results.keys()) == {"base", "copy"} + assert 
results["base"].count_records() == 3 + assert results["copy"].count_records() == 3 + final_df = results.load_dataset() + assert "category_copy" in final_df.columns + assert (stub_artifact_path / "linear-chain" / "stage-0-base").is_dir() + assert (stub_artifact_path / "linear-chain" / "stage-1-copy").is_dir() + + metadata = json.loads((stub_artifact_path / "linear-chain" / "workflow-metadata.json").read_text()) + assert [stage["status"] for stage in metadata["stages"]] == ["completed", "completed"] + assert metadata["stages"][1]["seeded_from_stage"] == "base" + assert metadata["stages"][1]["depends_on"] == ["base"] + assert metadata["stages"][1]["num_records_requested"] == 3 + assert create_mock.call_args_list[1].kwargs["num_records"] == 3 + second_stage_builder = create_mock.call_args_list[1].args[0] + seed_config = second_stage_builder.get_seed_config() + assert isinstance(seed_config.source, LocalFileSeedSource) + assert seed_config.source.path.endswith("stage-0-base/parquet-files/*.parquet") + + +def test_composite_workflow_callback_output_controls_next_stage_default_count( + stub_artifact_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], + stub_dataset_profiler_results, +) -> None: + data_designer = _data_designer(stub_artifact_path, stub_model_providers) + _patch_create(data_designer, stub_dataset_profiler_results) + + def keep_first(stage_path: Path) -> Path: + df = lazy.pd.read_parquet(stage_path / "parquet-files") + output_path = stage_path / "callback-outputs" / "first-row" + output_path.mkdir(parents=True) + df.head(1).to_parquet(output_path / "data.parquet", index=False) + return output_path + + workflow = data_designer.compose_workflow(name="callback-chain") + workflow.add_stage( + "base", + _category_builder(stub_model_configs), + num_records=3, + on_success=keep_first, + on_success_version="first-row", + ) + workflow.add_stage("copy", _copy_builder(stub_model_configs)) + + results = workflow.run() + + 
assert results["base"].count_records() == 3 + assert results["copy"].count_records() == 1 + metadata = json.loads((stub_artifact_path / "callback-chain" / "workflow-metadata.json").read_text()) + assert metadata["stages"][0]["output_records"] == 1 + assert metadata["stages"][1]["num_records_requested"] == 1 + + +def test_composite_workflow_explicit_downstream_num_records_supports_explode( + stub_artifact_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], + stub_dataset_profiler_results, +) -> None: + data_designer = _data_designer(stub_artifact_path, stub_model_providers) + create_mock = _patch_create(data_designer, stub_dataset_profiler_results) + workflow = data_designer.compose_workflow(name="explode-chain") + workflow.add_stage("base", _category_builder(stub_model_configs), num_records=2) + workflow.add_stage("expanded", _copy_builder(stub_model_configs), num_records=7) + + results = workflow.run() + + assert results["base"].count_records() == 2 + assert results["expanded"].count_records() == 7 + assert create_mock.call_args_list[1].kwargs["num_records"] == 7 + + +def test_composite_workflow_empty_callback_can_skip_downstream_stages( + stub_artifact_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], + stub_dataset_profiler_results, +) -> None: + data_designer = _data_designer(stub_artifact_path, stub_model_providers) + _patch_create(data_designer, stub_dataset_profiler_results) + + def empty_output(stage_path: Path) -> Path: + output_path = stage_path / "callback-outputs" / "empty" + output_path.mkdir(parents=True) + lazy.pd.DataFrame({"category": []}).to_parquet(output_path / "data.parquet", index=False) + return output_path + + workflow = data_designer.compose_workflow(name="empty-chain") + workflow.add_stage( + "base", + _category_builder(stub_model_configs), + num_records=2, + on_success=empty_output, + on_success_version="empty", + allow_empty=True, + ) + 
workflow.add_stage("copy", _copy_builder(stub_model_configs)) + + results = workflow.run() + + assert isinstance(results["copy"], SkippedStageResult) + assert results["copy"].upstream_stage == "base" + with pytest.raises(DataDesignerWorkflowError, match="Final stage 'copy' was skipped"): + results.load_dataset() + + +def test_composite_workflow_empty_workflow_fails_before_artifacts( + stub_artifact_path: Path, + stub_model_providers: list[ModelProvider], +) -> None: + workflow = _data_designer(stub_artifact_path, stub_model_providers).compose_workflow(name="empty-workflow") + + with pytest.raises(DataDesignerWorkflowError, match="has no stages"): + workflow.run() + + assert not (stub_artifact_path / "empty-workflow").exists() + + +@pytest.mark.parametrize("name", ["bad/name", "bad*name", ""]) +def test_composite_workflow_rejects_invalid_workflow_names( + name: str, + stub_model_providers: list[ModelProvider], +) -> None: + with pytest.raises(DataDesignerWorkflowError): + DataDesigner(model_providers=stub_model_providers).compose_workflow(name=name) + + +@pytest.mark.parametrize("name", ["bad/name", "bad*name", ""]) +def test_composite_workflow_rejects_invalid_stage_names( + name: str, + stub_artifact_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], +) -> None: + workflow = _data_designer(stub_artifact_path, stub_model_providers).compose_workflow(name="invalid-stage") + + with pytest.raises(DataDesignerWorkflowError): + workflow.add_stage(name, _category_builder(stub_model_configs)) + + +def test_composite_workflow_rejects_duplicate_stage_names( + stub_artifact_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], +) -> None: + data_designer = _data_designer(stub_artifact_path, stub_model_providers) + workflow = data_designer.compose_workflow(name="duplicate-chain") + workflow.add_stage("base", _category_builder(stub_model_configs)) + + with 
pytest.raises(DataDesignerWorkflowError, match="already used"): + workflow.add_stage("base", _copy_builder(stub_model_configs)) + + +def test_composite_workflow_runs_three_real_async_stages( + tmp_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], +) -> None: + stage_1 = _seeded_builder(stub_model_configs, [{"name": "Ada"}, {"name": "Linus"}]) + stage_1.add_column(ExpressionColumnConfig(name="persona", expr="{{ name }} persona")) + + stage_2 = DataDesignerConfigBuilder(model_configs=stub_model_configs) + stage_2.add_column(ExpressionColumnConfig(name="prompt_seed", expr="{{ persona }} prompt")) + + stage_3 = DataDesignerConfigBuilder(model_configs=stub_model_configs) + stage_3.add_column(ExpressionColumnConfig(name="final_label", expr="{{ prompt_seed }} final")) + + workflow = _real_data_designer(tmp_path / "artifacts", stub_model_providers).compose_workflow(name="three-stage") + workflow.add_stage("personas", stage_1, num_records=2) + workflow.add_stage("prompts", stage_2) + workflow.add_stage("final", stage_3) + + df = workflow.run().load_dataset().sort_values("name").reset_index(drop=True) + + assert df[["name", "persona", "prompt_seed", "final_label"]].to_dict(orient="records") == [ + { + "name": "Ada", + "persona": "Ada persona", + "prompt_seed": "Ada persona prompt", + "final_label": "Ada persona prompt final", + }, + { + "name": "Linus", + "persona": "Linus persona", + "prompt_seed": "Linus persona prompt", + "final_label": "Linus persona prompt final", + }, + ] + + +def test_composite_workflow_callback_can_expand_rows_between_real_async_stages( + tmp_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], +) -> None: + stage_1 = _seeded_builder(stub_model_configs, [{"name": "Ada"}, {"name": "Linus"}]) + stage_1.add_column(ExpressionColumnConfig(name="persona", expr="{{ name }}")) + + def expand(stage_path: Path) -> Path: + df = lazy.pd.read_parquet(stage_path / 
"parquet-files") + expanded = lazy.pd.DataFrame([{**row, "turn": turn} for row in df.to_dict(orient="records") for turn in (1, 2)]) + output_path = stage_path / "callback-outputs" / "expand-turns" + output_path.mkdir(parents=True) + expanded.to_parquet(output_path / "data.parquet", index=False) + return output_path + + stage_2 = DataDesignerConfigBuilder(model_configs=stub_model_configs) + stage_2.add_column(ExpressionColumnConfig(name="message", expr="{{ persona }} turn {{ turn }}")) + + workflow = _real_data_designer(tmp_path / "artifacts", stub_model_providers).compose_workflow(name="expand") + workflow.add_stage("personas", stage_1, num_records=2, on_success=expand, on_success_version="expand-turns") + workflow.add_stage("messages", stage_2) + + df = workflow.run().load_dataset().sort_values(["name", "turn"]).reset_index(drop=True) + + assert df["message"].tolist() == [ + "Ada turn 1", + "Ada turn 2", + "Linus turn 1", + "Linus turn 2", + ] + + +def test_composite_workflow_does_not_forward_dropped_processor_columns( + tmp_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], +) -> None: + stage_1 = _seeded_builder(stub_model_configs, [{"name": "Ada", "secret": "hidden"}]) + stage_1.add_column(ExpressionColumnConfig(name="public_name", expr="{{ name }}")) + stage_1.add_processor(DropColumnsProcessorConfig(name="drop_secret", column_names=["secret"])) + + stage_2 = DataDesignerConfigBuilder(model_configs=stub_model_configs) + stage_2.add_column(ExpressionColumnConfig(name="copied_name", expr="{{ public_name }}")) + + workflow = _real_data_designer(tmp_path / "artifacts", stub_model_providers).compose_workflow(name="drop-processor") + workflow.add_stage("redacted", stage_1, num_records=1) + workflow.add_stage("downstream", stage_2) + + df = workflow.run().load_dataset() + + assert df.to_dict(orient="records") == [{"name": "Ada", "public_name": "Ada", "copied_name": "Ada"}] + assert "secret" not in df.columns + + +def 
test_composite_workflow_can_seed_from_processor_output_callback( + tmp_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], +) -> None: + stage_1 = _seeded_builder(stub_model_configs, [{"name": "Ada"}, {"name": "Linus"}]) + stage_1.add_column(ExpressionColumnConfig(name="persona", expr="{{ name }}")) + stage_1.add_processor(SchemaTransformProcessorConfig(name="compact", template={"compact_name": "{{ persona }}"})) + + def use_processor_output(stage_path: Path) -> Path: + return stage_path / "processors-files" / "compact" + + stage_2 = DataDesignerConfigBuilder(model_configs=stub_model_configs) + stage_2.add_column(ExpressionColumnConfig(name="final", expr="{{ compact_name }} final")) + + workflow = _real_data_designer(tmp_path / "artifacts", stub_model_providers).compose_workflow( + name="processor-callback" + ) + workflow.add_stage("compact", stage_1, num_records=2, on_success=use_processor_output) + workflow.add_stage("final", stage_2) + + df = workflow.run().load_dataset().sort_values("compact_name").reset_index(drop=True) + + assert df.to_dict(orient="records") == [ + {"compact_name": "Ada", "final": "Ada final"}, + {"compact_name": "Linus", "final": "Linus final"}, + ] + + +def test_composite_workflow_runs_custom_generator_in_downstream_stage( + tmp_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], +) -> None: + @custom_column_generator(required_columns=["name"]) + def slug(row: dict) -> dict: + return {**row, "slug": row["name"].lower()} + + stage_1 = _seeded_builder(stub_model_configs, [{"name": "Ada"}, {"name": "Linus"}]) + stage_1.add_column(ExpressionColumnConfig(name="name_copy", expr="{{ name }}")) + + stage_2 = DataDesignerConfigBuilder(model_configs=stub_model_configs) + stage_2.add_column(CustomColumnConfig(name="slug", generator_function=slug)) + + workflow = _real_data_designer(tmp_path / "artifacts", stub_model_providers).compose_workflow( + 
name="custom-generator" + ) + workflow.add_stage("base", stage_1, num_records=2) + workflow.add_stage("custom", stage_2) + + df = workflow.run().load_dataset().sort_values("name").reset_index(drop=True) + + assert df[["name", "slug"]].to_dict(orient="records") == [ + {"name": "Ada", "slug": "ada"}, + {"name": "Linus", "slug": "linus"}, + ] + + +def test_composite_workflow_export_defaults_to_final_stage( + tmp_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], + stub_dataset_profiler_results, +) -> None: + data_designer = _data_designer(tmp_path / "artifacts", stub_model_providers) + + def fake_create( + config_builder: DataDesignerConfigBuilder, + *, + num_records: int, + dataset_name: str, + **kwargs, + ) -> DatasetCreationResults: + del num_records, kwargs + value = "first" if dataset_name == "stage-0-first" else "final" + return _result_from_df( + data_designer._artifact_path, + dataset_name, + lazy.pd.DataFrame({"stage": [value]}), + config_builder, + stub_dataset_profiler_results, + ) + + data_designer.create = MagicMock(side_effect=fake_create) + first = _seeded_builder(stub_model_configs, [{"stage": "first"}]) + first.add_column(ExpressionColumnConfig(name="stage_copy", expr="{{ stage }}")) + last = DataDesignerConfigBuilder(model_configs=stub_model_configs) + last.add_column(ExpressionColumnConfig(name="stage_final", expr="{{ stage }}")) + + workflow = data_designer.compose_workflow(name="export-final") + workflow.add_stage("first", first, num_records=1) + workflow.add_stage("last", last) + + output = workflow.run().export(tmp_path / "out.jsonl") + + assert output.read_text(encoding="utf-8").strip() == '{"stage":"final"}' + + +def test_composite_workflow_push_to_hub_defaults_to_final_stage( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], + stub_dataset_profiler_results, +) -> None: + data_designer = 
_data_designer(tmp_path / "artifacts", stub_model_providers) + + def fake_create( + config_builder: DataDesignerConfigBuilder, + *, + num_records: int, + dataset_name: str, + **kwargs, + ) -> DatasetCreationResults: + del num_records, kwargs + return _result_from_df( + data_designer._artifact_path, + dataset_name, + lazy.pd.DataFrame({"stage": [dataset_name]}), + config_builder, + stub_dataset_profiler_results, + ) + + upload_calls = [] + + class StubHubClient: + def __init__(self, token: str | None = None): + self.token = token + + def upload_dataset(self, **kwargs): + upload_calls.append(kwargs) + return "https://huggingface.co/datasets/user/final" + + data_designer.create = MagicMock(side_effect=fake_create) + monkeypatch.setattr("data_designer.interface.results.HuggingFaceHubClient", StubHubClient) + first = _seeded_builder(stub_model_configs, [{"stage": "first"}]) + first.add_column(ExpressionColumnConfig(name="stage_copy", expr="{{ stage }}")) + last = DataDesignerConfigBuilder(model_configs=stub_model_configs) + last.add_column(ExpressionColumnConfig(name="stage_final", expr="{{ stage }}")) + + workflow = data_designer.compose_workflow(name="push-final") + workflow.add_stage("first", first, num_records=1) + workflow.add_stage("last", last) + + url = workflow.run().push_to_hub("user/final", "description", token="token", private=True, tags=["tag"]) + + assert url == "https://huggingface.co/datasets/user/final" + assert upload_calls == [ + { + "repo_id": "user/final", + "base_dataset_path": tmp_path / "artifacts" / "push-final" / "stage-1-last", + "private": True, + "description": "description", + "tags": ["tag"], + } + ] From cdb6a763b2b8819f33cb6bf3740690e182e56cb0 Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Tue, 12 May 2026 19:19:15 +0000 Subject: [PATCH 02/14] test: tidy workflow chaining coverage --- .../interface/test_composite_workflow.py | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git 
a/packages/data-designer/tests/interface/test_composite_workflow.py b/packages/data-designer/tests/interface/test_composite_workflow.py index ed401034e..7d03690b4 100644 --- a/packages/data-designer/tests/interface/test_composite_workflow.py +++ b/packages/data-designer/tests/interface/test_composite_workflow.py @@ -96,18 +96,26 @@ def _category_builder(model_configs: list[ModelConfig]) -> DataDesignerConfigBui return builder -def _copy_builder(model_configs: list[ModelConfig]) -> DataDesignerConfigBuilder: +def _expression_builder(model_configs: list[ModelConfig], name: str, expr: str) -> DataDesignerConfigBuilder: builder = DataDesignerConfigBuilder(model_configs=model_configs) - builder.add_column(ExpressionColumnConfig(name="category_copy", expr="{{ category }}")) + builder.add_column(ExpressionColumnConfig(name=name, expr=expr)) return builder +def _copy_builder(model_configs: list[ModelConfig]) -> DataDesignerConfigBuilder: + return _expression_builder(model_configs, "category_copy", "{{ category }}") + + def _seeded_builder(model_configs: list[ModelConfig], rows: list[dict]) -> DataDesignerConfigBuilder: builder = DataDesignerConfigBuilder(model_configs=model_configs) builder.with_seed_dataset(DataFrameSeedSource(df=lazy.pd.DataFrame(rows))) return builder +def _load_workflow_metadata(artifact_path: Path, workflow_name: str) -> dict: + return json.loads((artifact_path / workflow_name / "workflow-metadata.json").read_text()) + + def test_dataset_creation_results_to_config_builder_columns( stub_model_configs: list[ModelConfig], stub_dataset_profiler_results, @@ -169,7 +177,7 @@ def test_composite_workflow_runs_linear_stages_with_disk_handoff( assert (stub_artifact_path / "linear-chain" / "stage-0-base").is_dir() assert (stub_artifact_path / "linear-chain" / "stage-1-copy").is_dir() - metadata = json.loads((stub_artifact_path / "linear-chain" / "workflow-metadata.json").read_text()) + metadata = _load_workflow_metadata(stub_artifact_path, "linear-chain") assert 
[stage["status"] for stage in metadata["stages"]] == ["completed", "completed"] assert metadata["stages"][1]["seeded_from_stage"] == "base" assert metadata["stages"][1]["depends_on"] == ["base"] @@ -211,7 +219,7 @@ def keep_first(stage_path: Path) -> Path: assert results["base"].count_records() == 3 assert results["copy"].count_records() == 1 - metadata = json.loads((stub_artifact_path / "callback-chain" / "workflow-metadata.json").read_text()) + metadata = _load_workflow_metadata(stub_artifact_path, "callback-chain") assert metadata["stages"][0]["output_records"] == 1 assert metadata["stages"][1]["num_records_requested"] == 1 @@ -324,11 +332,8 @@ def test_composite_workflow_runs_three_real_async_stages( stage_1 = _seeded_builder(stub_model_configs, [{"name": "Ada"}, {"name": "Linus"}]) stage_1.add_column(ExpressionColumnConfig(name="persona", expr="{{ name }} persona")) - stage_2 = DataDesignerConfigBuilder(model_configs=stub_model_configs) - stage_2.add_column(ExpressionColumnConfig(name="prompt_seed", expr="{{ persona }} prompt")) - - stage_3 = DataDesignerConfigBuilder(model_configs=stub_model_configs) - stage_3.add_column(ExpressionColumnConfig(name="final_label", expr="{{ prompt_seed }} final")) + stage_2 = _expression_builder(stub_model_configs, "prompt_seed", "{{ persona }} prompt") + stage_3 = _expression_builder(stub_model_configs, "final_label", "{{ prompt_seed }} final") workflow = _real_data_designer(tmp_path / "artifacts", stub_model_providers).compose_workflow(name="three-stage") workflow.add_stage("personas", stage_1, num_records=2) @@ -369,8 +374,7 @@ def expand(stage_path: Path) -> Path: expanded.to_parquet(output_path / "data.parquet", index=False) return output_path - stage_2 = DataDesignerConfigBuilder(model_configs=stub_model_configs) - stage_2.add_column(ExpressionColumnConfig(name="message", expr="{{ persona }} turn {{ turn }}")) + stage_2 = _expression_builder(stub_model_configs, "message", "{{ persona }} turn {{ turn }}") workflow = 
_real_data_designer(tmp_path / "artifacts", stub_model_providers).compose_workflow(name="expand") workflow.add_stage("personas", stage_1, num_records=2, on_success=expand, on_success_version="expand-turns") @@ -395,8 +399,7 @@ def test_composite_workflow_does_not_forward_dropped_processor_columns( stage_1.add_column(ExpressionColumnConfig(name="public_name", expr="{{ name }}")) stage_1.add_processor(DropColumnsProcessorConfig(name="drop_secret", column_names=["secret"])) - stage_2 = DataDesignerConfigBuilder(model_configs=stub_model_configs) - stage_2.add_column(ExpressionColumnConfig(name="copied_name", expr="{{ public_name }}")) + stage_2 = _expression_builder(stub_model_configs, "copied_name", "{{ public_name }}") workflow = _real_data_designer(tmp_path / "artifacts", stub_model_providers).compose_workflow(name="drop-processor") workflow.add_stage("redacted", stage_1, num_records=1) @@ -420,8 +423,7 @@ def test_composite_workflow_can_seed_from_processor_output_callback( def use_processor_output(stage_path: Path) -> Path: return stage_path / "processors-files" / "compact" - stage_2 = DataDesignerConfigBuilder(model_configs=stub_model_configs) - stage_2.add_column(ExpressionColumnConfig(name="final", expr="{{ compact_name }} final")) + stage_2 = _expression_builder(stub_model_configs, "final", "{{ compact_name }} final") workflow = _real_data_designer(tmp_path / "artifacts", stub_model_providers).compose_workflow( name="processor-callback" @@ -494,8 +496,7 @@ def fake_create( data_designer.create = MagicMock(side_effect=fake_create) first = _seeded_builder(stub_model_configs, [{"stage": "first"}]) first.add_column(ExpressionColumnConfig(name="stage_copy", expr="{{ stage }}")) - last = DataDesignerConfigBuilder(model_configs=stub_model_configs) - last.add_column(ExpressionColumnConfig(name="stage_final", expr="{{ stage }}")) + last = _expression_builder(stub_model_configs, "stage_final", "{{ stage }}") workflow = data_designer.compose_workflow(name="export-final") 
workflow.add_stage("first", first, num_records=1) @@ -545,8 +546,7 @@ def upload_dataset(self, **kwargs): monkeypatch.setattr("data_designer.interface.results.HuggingFaceHubClient", StubHubClient) first = _seeded_builder(stub_model_configs, [{"stage": "first"}]) first.add_column(ExpressionColumnConfig(name="stage_copy", expr="{{ stage }}")) - last = DataDesignerConfigBuilder(model_configs=stub_model_configs) - last.add_column(ExpressionColumnConfig(name="stage_final", expr="{{ stage }}")) + last = _expression_builder(stub_model_configs, "stage_final", "{{ stage }}") workflow = data_designer.compose_workflow(name="push-final") workflow.add_stage("first", first, num_records=1) From 2d38c887bf8d07cfef41417346da815ad89f39bf Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Tue, 12 May 2026 22:02:25 +0000 Subject: [PATCH 03/14] fix: harden workflow chaining concurrency --- .../data_designer/engine/models/factory.py | 7 +- .../engine/resources/resource_provider.py | 7 ++ .../engine/resources/seed_reader.py | 3 +- .../engine/resources/test_seed_reader.py | 4 +- .../interface/composite_workflow.py | 64 ++++++++------ .../data_designer/interface/data_designer.py | 63 +++++++------- .../tests/interface/test_acreate.py | 86 +++++++++++++++++-- .../interface/test_composite_workflow.py | 30 ++++++- 8 files changed, 194 insertions(+), 70 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/models/factory.py b/packages/data-designer-engine/src/data_designer/engine/models/factory.py index 6ef2b2727..e3836a912 100644 --- a/packages/data-designer-engine/src/data_designer/engine/models/factory.py +++ b/packages/data-designer-engine/src/data_designer/engine/models/factory.py @@ -13,6 +13,7 @@ if TYPE_CHECKING: from data_designer.config.run_config import RunConfig from data_designer.engine.mcp.registry import MCPRegistry + from data_designer.engine.models.clients.throttle_manager import ThrottleManager from data_designer.engine.models.registry import 
ModelRegistry @@ -24,6 +25,7 @@ def create_model_registry( mcp_registry: MCPRegistry | None = None, client_concurrency_mode: ClientConcurrencyMode = ClientConcurrencyMode.SYNC, run_config: RunConfig | None = None, + throttle_manager: ThrottleManager | None = None, ) -> ModelRegistry: """Factory function for creating a ModelRegistry instance. @@ -43,6 +45,8 @@ def create_model_registry( run_config: Optional runtime configuration. The nested ``run_config.throttle`` (a ``ThrottleConfig``) is forwarded to the ``ThrottleManager`` constructor. + throttle_manager: Optional shared throttle manager. When omitted, a new + manager is created for this registry. Returns: A configured ModelRegistry instance. @@ -54,7 +58,8 @@ def create_model_registry( from data_designer.engine.models.facade import ModelFacade from data_designer.engine.models.registry import ModelRegistry - throttle_manager = ThrottleManager((run_config or RunConfig()).throttle) + if throttle_manager is None: + throttle_manager = ThrottleManager((run_config or RunConfig()).throttle) def model_facade_factory( model_config: ModelConfig, diff --git a/packages/data-designer-engine/src/data_designer/engine/resources/resource_provider.py b/packages/data-designer-engine/src/data_designer/engine/resources/resource_provider.py index a3642fd9a..bb012310b 100644 --- a/packages/data-designer-engine/src/data_designer/engine/resources/resource_provider.py +++ b/packages/data-designer-engine/src/data_designer/engine/resources/resource_provider.py @@ -4,6 +4,7 @@ from __future__ import annotations import os +from typing import TYPE_CHECKING from data_designer.config.base import ConfigBase from data_designer.config.dataset_metadata import DatasetMetadata @@ -26,6 +27,9 @@ from data_designer.engine.secret_resolver import SecretResolver from data_designer.engine.storage.artifact_storage import ArtifactStorage +if TYPE_CHECKING: + from data_designer.engine.models.clients.throttle_manager import ThrottleManager + class 
ResourceType(StrEnum): PERSON_READER = "person_reader" @@ -91,6 +95,7 @@ def create_resource_provider( mcp_providers: list[MCPProviderT] | None = None, tool_configs: list[ToolConfig] | None = None, client_concurrency_mode: ClientConcurrencyMode | None = None, + throttle_manager: ThrottleManager | None = None, ) -> ResourceProvider: """Factory function for creating a ResourceProvider instance. @@ -111,6 +116,7 @@ def create_resource_provider( run_config: Optional runtime configuration. mcp_providers: Optional list of MCP provider configurations. tool_configs: Optional list of tool configurations. + throttle_manager: Optional shared throttle manager for model clients. Returns: A configured ResourceProvider instance. @@ -158,6 +164,7 @@ def create_resource_provider( mcp_registry=mcp_registry, client_concurrency_mode=client_concurrency_mode, run_config=effective_run_config, + throttle_manager=throttle_manager, ), person_reader=person_reader, mcp_registry=mcp_registry, diff --git a/packages/data-designer-engine/src/data_designer/engine/resources/seed_reader.py b/packages/data-designer-engine/src/data_designer/engine/resources/seed_reader.py index 2c1c16c78..68dece454 100644 --- a/packages/data-designer-engine/src/data_designer/engine/resources/seed_reader.py +++ b/packages/data-designer-engine/src/data_designer/engine/resources/seed_reader.py @@ -6,6 +6,7 @@ import logging from abc import ABC, abstractmethod from collections.abc import Callable, Iterable, Sequence +from copy import copy from dataclasses import dataclass from fnmatch import fnmatchcase from pathlib import Path, PurePosixPath @@ -673,7 +674,7 @@ def add_reader(self, reader: SeedReader) -> Self: return self def get_reader(self, seed_dataset_source: SeedSource, secret_resolver: SecretResolver) -> SeedReader: - reader = self._get_reader_for_source(seed_dataset_source) + reader = copy(self._get_reader_for_source(seed_dataset_source)) reader.attach(seed_dataset_source, secret_resolver) return reader diff --git 
a/packages/data-designer-engine/tests/engine/resources/test_seed_reader.py b/packages/data-designer-engine/tests/engine/resources/test_seed_reader.py index 4f504a52c..979e66a12 100644 --- a/packages/data-designer-engine/tests/engine/resources/test_seed_reader.py +++ b/packages/data-designer-engine/tests/engine/resources/test_seed_reader.py @@ -412,7 +412,9 @@ def test_get_reader_basic(): reader = registry.get_reader(local_seed_config, PlaintextResolver()) - assert reader == df_reader + assert isinstance(reader, DataFrameSeedReader) + assert reader is not df_reader + assert reader.source is local_seed_config def test_get_reader_missing(): diff --git a/packages/data-designer/src/data_designer/interface/composite_workflow.py b/packages/data-designer/src/data_designer/interface/composite_workflow.py index f39a05702..4a2a57d62 100644 --- a/packages/data-designer/src/data_designer/interface/composite_workflow.py +++ b/packages/data-designer/src/data_designer/interface/composite_workflow.py @@ -5,10 +5,10 @@ import hashlib import json +import logging import shutil import time -from collections.abc import Callable, Iterator -from contextlib import contextmanager +from collections.abc import Callable, ItemsView, Iterator, KeysView from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING, Any @@ -24,9 +24,14 @@ from data_designer.interface.results import DatasetCreationResults if TYPE_CHECKING: + import pandas as pd + + from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults from data_designer.interface.data_designer import DataDesigner +logger = logging.getLogger(__name__) + OnSuccessCallback = Callable[[Path], Path | str] @@ -50,13 +55,21 @@ class SkippedStageResult: class CompositeWorkflowResults: + """Results for a composite workflow run. + + Per-stage entries are the original ``DataDesigner.create()`` results. 
If a + stage uses ``on_success``, metadata and downstream seeding use the callback + output path while the stage result still points at the stage's generated + dataset. + """ + def __init__( self, *, name: str, stage_results: dict[str, DatasetCreationResults | SkippedStageResult], final_stage_name: str, - ): + ) -> None: self.name = name self.stage_results = stage_results self.final_stage_name = final_stage_name @@ -67,10 +80,10 @@ def __getitem__(self, stage_name: str) -> DatasetCreationResults | SkippedStageR def __iter__(self) -> Iterator[str]: return iter(self.stage_results) - def keys(self): + def keys(self) -> KeysView[str]: return self.stage_results.keys() - def items(self): + def items(self) -> ItemsView[str, DatasetCreationResults | SkippedStageResult]: return self.stage_results.items() @property @@ -80,24 +93,24 @@ def final_result(self) -> DatasetCreationResults: raise DataDesignerWorkflowError(f"Final stage {self.final_stage_name!r} was skipped: {result.status}.") return result - def load_dataset(self): + def load_dataset(self) -> pd.DataFrame: return self.final_result.load_dataset() - def load_analysis(self): + def load_analysis(self) -> DatasetProfilerResults: return self.final_result.load_analysis() def count_records(self) -> int: return self.final_result.count_records() - def export(self, *args, **kwargs): + def export(self, *args: Any, **kwargs: Any) -> Path: return self.final_result.export(*args, **kwargs) - def push_to_hub(self, *args, **kwargs): + def push_to_hub(self, *args: Any, **kwargs: Any) -> str: return self.final_result.push_to_hub(*args, **kwargs) class CompositeWorkflow: - def __init__(self, *, name: str, data_designer: DataDesigner): + def __init__(self, *, name: str, data_designer: DataDesigner) -> None: _validate_dir_name(name, "workflow name") self.name = name self._data_designer = data_designer @@ -123,7 +136,7 @@ def add_stage( self._stages.append( _WorkflowStage( name=name, - config_builder=config_builder, + 
config_builder=_clone_config_builder(config_builder), depends_on=(self._stages[-1].name,) if self._stages else (), num_records=num_records, on_success=on_success, @@ -136,6 +149,7 @@ def add_stage( return self def run(self) -> CompositeWorkflowResults: + """Run all stages from scratch, replacing deterministic stage directories.""" if not self._stages: raise DataDesignerWorkflowError(f"Workflow {self.name!r} has no stages.") @@ -174,6 +188,12 @@ def run(self) -> CompositeWorkflowResults: stage_builder = _clone_config_builder(stage.config_builder) if previous_seed_path is not None: + if stage_builder.get_seed_config() is not None: + logger.warning( + "Stage %r has a seed dataset; workflow will seed it from upstream stage %r.", + stage.name, + previous_stage_name, + ) stage_builder.with_seed_dataset( _local_seed_source_from_path(previous_seed_path), sampling_strategy=stage.sampling_strategy, @@ -206,12 +226,12 @@ def run(self) -> CompositeWorkflowResults: start_time = time.monotonic() try: - with _temporary_artifact_path(self._data_designer, workflow_path): - result = self._data_designer.create( - stage_builder, - num_records=num_records, - dataset_name=stage_dir_name, - ) + result = self._data_designer.create( + stage_builder, + num_records=num_records, + dataset_name=stage_dir_name, + artifact_path=workflow_path, + ) actual_records = result.count_records() output_seed_path = result.artifact_storage.final_dataset_path callback_output_path = None @@ -263,16 +283,6 @@ def _clone_config_builder(config_builder: DataDesignerConfigBuilder) -> DataDesi return DataDesignerConfigBuilder.from_config(BuilderConfig(data_designer=config_builder.build())) -@contextmanager -def _temporary_artifact_path(data_designer: DataDesigner, artifact_path: Path): - original_artifact_path = data_designer._artifact_path - data_designer._artifact_path = artifact_path - try: - yield - finally: - data_designer._artifact_path = original_artifact_path - - def _stage_dir_name(index: int, name: str) 
-> str: return f"stage-{index}-{name}" diff --git a/packages/data-designer/src/data_designer/interface/data_designer.py b/packages/data-designer/src/data_designer/interface/data_designer.py index e1acbc341..a397c7044 100644 --- a/packages/data-designer/src/data_designer/interface/data_designer.py +++ b/packages/data-designer/src/data_designer/interface/data_designer.py @@ -5,7 +5,6 @@ import asyncio import logging -import threading import warnings from pathlib import Path from typing import TYPE_CHECKING @@ -75,6 +74,7 @@ from data_designer.plugins.registry import PluginRegistry if TYPE_CHECKING: + from data_designer.engine.models.clients.throttle_manager import ThrottleManager from data_designer.engine.models.facade import ModelFacade from data_designer.interface.composite_workflow import CompositeWorkflow @@ -154,7 +154,7 @@ def __init__( self._secret_resolver = secret_resolver or DEFAULT_SECRET_RESOLVER self._artifact_path = Path(artifact_path) if artifact_path is not None else Path.cwd() / "artifacts" self._run_config = RunConfig() - self._create_lock = threading.Lock() + self._throttle_manager: ThrottleManager = self._create_throttle_manager() self._managed_assets_path = Path(managed_assets_path or MANAGED_ASSETS_PATH) self._person_reader = person_reader # Only consult the YAML's `default:` key when we are also falling back to @@ -222,6 +222,7 @@ def create( num_records: int = DEFAULT_NUM_RECORDS, dataset_name: str = "dataset", resume: ResumeMode = ResumeMode.NEVER, + artifact_path: Path | str | None = None, ) -> DatasetCreationResults: """Create dataset and save results to the local artifact storage. @@ -253,6 +254,8 @@ def create( In all resume modes, in-flight partial results from the interrupted run are discarded before generation continues. + artifact_path: Optional artifact root for this create call. Defaults + to the path configured on this DataDesigner instance. 
Returns: DatasetCreationResults object with methods for loading the generated dataset, @@ -265,7 +268,13 @@ def create( logger.info("🎨 Creating Data Designer dataset") self._log_jinja_rendering_engine_mode() - resource_provider = self._create_resource_provider(dataset_name, config_builder, resume=resume) + artifact_path = Path(artifact_path) if artifact_path is not None else self._artifact_path + resource_provider = self._create_resource_provider( + dataset_name, + config_builder, + resume=resume, + artifact_path=artifact_path, + ) # ``DeprecationWarning`` is re-raised before the generic wrapper so that # ``warnings.warn(..., DeprecationWarning)`` calls inside the engine — most @@ -370,31 +379,13 @@ async def acreate( num_records: int = DEFAULT_NUM_RECORDS, dataset_name: str = "dataset", resume: ResumeMode = ResumeMode.NEVER, + artifact_path: Path | str | None = None, ) -> DatasetCreationResults: """Async wrapper for creating a dataset without blocking the caller's event loop.""" - return await asyncio.to_thread( - self._create_threadsafe, - config_builder, - num_records=num_records, - dataset_name=dataset_name, - resume=resume, - ) - - def _create_threadsafe( - self, - config_builder: DataDesignerConfigBuilder, - *, - num_records: int, - dataset_name: str, - resume: ResumeMode, - ) -> DatasetCreationResults: - with self._create_lock: - return self.create( - config_builder, - num_records=num_records, - dataset_name=dataset_name, - resume=resume, - ) + kwargs = {"num_records": num_records, "dataset_name": dataset_name, "resume": resume} + if artifact_path is not None: + kwargs["artifact_path"] = artifact_path + return await asyncio.to_thread(self.create, config_builder, **kwargs) def preview( self, config_builder: DataDesignerConfigBuilder, *, num_records: int = DEFAULT_NUM_RECORDS @@ -633,18 +624,22 @@ def _create_dataset_profiler( ) def _create_resource_provider( - self, dataset_name: str, config_builder: DataDesignerConfigBuilder, *, resume: ResumeMode = 
ResumeMode.NEVER + self, + dataset_name: str, + config_builder: DataDesignerConfigBuilder, + *, + resume: ResumeMode = ResumeMode.NEVER, + artifact_path: Path | None = None, ) -> ResourceProvider: - ArtifactStorage.mkdir_if_needed(self._artifact_path) + artifact_path = artifact_path or self._artifact_path + ArtifactStorage.mkdir_if_needed(artifact_path) seed_dataset_source = None if (seed_config := config_builder.get_seed_config()) is not None: seed_dataset_source = seed_config.source return create_resource_provider( - artifact_storage=ArtifactStorage( - artifact_path=self._artifact_path, dataset_name=dataset_name, resume=resume - ), + artifact_storage=ArtifactStorage(artifact_path=artifact_path, dataset_name=dataset_name, resume=resume), model_configs=config_builder.model_configs, secret_resolver=self._secret_resolver, model_provider_registry=self._model_provider_registry, @@ -655,8 +650,14 @@ def _create_resource_provider( mcp_providers=self._mcp_providers, tool_configs=config_builder.tool_configs, client_concurrency_mode=self._resolve_client_concurrency_mode(config_builder), + throttle_manager=self._throttle_manager, ) + def _create_throttle_manager(self) -> ThrottleManager: + from data_designer.engine.models.clients.throttle_manager import ThrottleManager + + return ThrottleManager(self._run_config.throttle) + @staticmethod def _resolve_client_concurrency_mode(config_builder: DataDesignerConfigBuilder) -> ClientConcurrencyMode: """Pick the model-client mode that matches the engine the run will use. 
diff --git a/packages/data-designer/tests/interface/test_acreate.py b/packages/data-designer/tests/interface/test_acreate.py index 5dd008be6..4684ec1cc 100644 --- a/packages/data-designer/tests/interface/test_acreate.py +++ b/packages/data-designer/tests/interface/test_acreate.py @@ -4,6 +4,7 @@ from __future__ import annotations import asyncio +import threading from pathlib import Path from unittest.mock import MagicMock @@ -28,6 +29,18 @@ def _seeded_builder(model_configs: list[ModelConfig], names: list[str]) -> DataD return builder +def _creation_result( + config_builder: DataDesignerConfigBuilder, + stub_dataset_profiler_results, +) -> DatasetCreationResults: + return DatasetCreationResults( + artifact_storage=MagicMock(), + analysis=stub_dataset_profiler_results, + config_builder=config_builder, + dataset_metadata=DatasetMetadata(), + ) + + @pytest.mark.asyncio async def test_acreate_delegates_to_create( tmp_path: Path, @@ -37,12 +50,8 @@ async def test_acreate_delegates_to_create( ) -> None: data_designer = DataDesigner(artifact_path=tmp_path / "artifacts", model_providers=stub_model_providers) artifact_storage = MagicMock() - expected = DatasetCreationResults( - artifact_storage=artifact_storage, - analysis=stub_dataset_profiler_results, - config_builder=_seeded_builder(stub_model_configs, ["Ada"]), - dataset_metadata=DatasetMetadata(), - ) + expected = _creation_result(_seeded_builder(stub_model_configs, ["Ada"]), stub_dataset_profiler_results) + expected.artifact_storage = artifact_storage data_designer.create = MagicMock(return_value=expected) builder = _seeded_builder(stub_model_configs, ["Ada"]) @@ -62,6 +71,71 @@ async def test_acreate_delegates_to_create( ) +@pytest.mark.asyncio +async def test_acreate_does_not_serialize_create_calls( + tmp_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], + stub_dataset_profiler_results, +) -> None: + data_designer = DataDesigner(artifact_path=tmp_path / "artifacts", 
model_providers=stub_model_providers) + started_count = 0 + started_lock = threading.Lock() + both_started = threading.Event() + release = threading.Event() + + def fake_create( + config_builder: DataDesignerConfigBuilder, + *, + num_records: int, + dataset_name: str, + resume: ResumeMode = ResumeMode.NEVER, + ) -> DatasetCreationResults: + nonlocal started_count + del num_records, dataset_name, resume + with started_lock: + started_count += 1 + if started_count == 2: + both_started.set() + assert both_started.wait(2) + assert release.wait(2) + return _creation_result(config_builder, stub_dataset_profiler_results) + + data_designer.create = MagicMock(side_effect=fake_create) + left = _seeded_builder(stub_model_configs, ["Ada"]) + right = _seeded_builder(stub_model_configs, ["Grace"]) + + left_task = asyncio.create_task(data_designer.acreate(left, num_records=1, dataset_name="left")) + right_task = asyncio.create_task(data_designer.acreate(right, num_records=1, dataset_name="right")) + try: + assert await asyncio.to_thread(both_started.wait, 2) + finally: + release.set() + + left_result, right_result = await asyncio.gather(left_task, right_task) + + assert isinstance(left_result, DatasetCreationResults) + assert isinstance(right_result, DatasetCreationResults) + assert data_designer.create.call_count == 2 + + +def test_data_designer_reuses_throttle_manager_across_create_calls( + tmp_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], +) -> None: + data_designer = DataDesigner(artifact_path=tmp_path / "artifacts", model_providers=stub_model_providers) + left = _seeded_builder(stub_model_configs, ["Ada"]) + right = _seeded_builder(stub_model_configs, ["Grace"]) + + left_provider = data_designer._create_resource_provider("left", left) + right_provider = data_designer._create_resource_provider("right", right) + + assert left_provider.model_registry is not None + assert right_provider.model_registry is not None + assert 
left_provider.model_registry.throttle_manager is right_provider.model_registry.throttle_manager + + @pytest.mark.asyncio async def test_acreate_supports_gathered_real_async_workflows( tmp_path: Path, diff --git a/packages/data-designer/tests/interface/test_composite_workflow.py b/packages/data-designer/tests/interface/test_composite_workflow.py index 7d03690b4..d9ad5bb2e 100644 --- a/packages/data-designer/tests/interface/test_composite_workflow.py +++ b/packages/data-designer/tests/interface/test_composite_workflow.py @@ -70,10 +70,11 @@ def fake_create( dataset_name: str, **kwargs, ) -> DatasetCreationResults: + artifact_path = Path(kwargs.pop("artifact_path", data_designer._artifact_path)) del kwargs df = lazy.pd.DataFrame({"category": ["alpha"] * num_records, "category_copy": ["alpha"] * num_records}) return _result_from_df( - data_designer._artifact_path, + artifact_path, dataset_name, df, config_builder, @@ -176,6 +177,7 @@ def test_composite_workflow_runs_linear_stages_with_disk_handoff( assert "category_copy" in final_df.columns assert (stub_artifact_path / "linear-chain" / "stage-0-base").is_dir() assert (stub_artifact_path / "linear-chain" / "stage-1-copy").is_dir() + assert data_designer._artifact_path == stub_artifact_path metadata = _load_workflow_metadata(stub_artifact_path, "linear-chain") assert [stage["status"] for stage in metadata["stages"]] == ["completed", "completed"] @@ -183,6 +185,7 @@ def test_composite_workflow_runs_linear_stages_with_disk_handoff( assert metadata["stages"][1]["depends_on"] == ["base"] assert metadata["stages"][1]["num_records_requested"] == 3 assert create_mock.call_args_list[1].kwargs["num_records"] == 3 + assert create_mock.call_args_list[1].kwargs["artifact_path"] == stub_artifact_path / "linear-chain" second_stage_builder = create_mock.call_args_list[1].args[0] seed_config = second_stage_builder.get_seed_config() assert isinstance(seed_config.source, LocalFileSeedSource) @@ -324,6 +327,25 @@ def 
test_composite_workflow_rejects_duplicate_stage_names( workflow.add_stage("base", _copy_builder(stub_model_configs)) +def test_composite_workflow_clones_stage_builders_on_add( + stub_artifact_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], + stub_dataset_profiler_results, +) -> None: + data_designer = _data_designer(stub_artifact_path, stub_model_providers) + create_mock = _patch_create(data_designer, stub_dataset_profiler_results) + builder = _category_builder(stub_model_configs) + workflow = data_designer.compose_workflow(name="clone-builder") + workflow.add_stage("base", builder, num_records=1) + builder.add_column(ExpressionColumnConfig(name="late_column", expr="{{ category }}")) + + workflow.run() + + stage_builder = create_mock.call_args.args[0] + assert [column.name for column in stage_builder.get_column_configs()] == ["category"] + + def test_composite_workflow_runs_three_real_async_stages( tmp_path: Path, stub_model_providers: list[ModelProvider], @@ -483,10 +505,11 @@ def fake_create( dataset_name: str, **kwargs, ) -> DatasetCreationResults: + artifact_path = Path(kwargs.pop("artifact_path", data_designer._artifact_path)) del num_records, kwargs value = "first" if dataset_name == "stage-0-first" else "final" return _result_from_df( - data_designer._artifact_path, + artifact_path, dataset_name, lazy.pd.DataFrame({"stage": [value]}), config_builder, @@ -523,9 +546,10 @@ def fake_create( dataset_name: str, **kwargs, ) -> DatasetCreationResults: + artifact_path = Path(kwargs.pop("artifact_path", data_designer._artifact_path)) del num_records, kwargs return _result_from_df( - data_designer._artifact_path, + artifact_path, dataset_name, lazy.pd.DataFrame({"stage": [dataset_name]}), config_builder, From 4950ed55a2cd9a7c02fb8b180210712e5c072ffb Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Thu, 14 May 2026 19:58:09 +0000 Subject: [PATCH 04/14] docs: update workflow chaining plan --- 
plans/workflow-chaining/workflow-chaining.md | 89 ++++++++++++++++++-- 1 file changed, 84 insertions(+), 5 deletions(-) diff --git a/plans/workflow-chaining/workflow-chaining.md b/plans/workflow-chaining/workflow-chaining.md index 95a4468ec..68f2dc8b2 100644 --- a/plans/workflow-chaining/workflow-chaining.md +++ b/plans/workflow-chaining/workflow-chaining.md @@ -91,11 +91,30 @@ This is the *only* place in the chaining surface that uses an in-memory handoff. Each stage seeds from the **previous stage's final dataset** - the post-processor output with dropped columns excluded. This is the same DataFrame returned by `DatasetCreationResults.load_dataset()`. -Processor outputs (named processor artifacts), schema-transform artifacts, dropped columns, and media assets (images stored on disk with relative paths in the DataFrame) are NOT automatically forwarded in v1. If a downstream stage needs one of these artifacts, use an `on_success` callback to read the upstream stage artifact directory and write a `LocalFileSeedSource`-compatible dataset for the next stage. First-class seeding from named processor outputs is a future extension. +Processor outputs (named processor artifacts), schema-transform artifacts, dropped columns, and media assets (images stored on disk with relative paths in the DataFrame) are NOT automatically forwarded in v1. If a downstream stage needs one of these artifacts, use an `on_success` callback to read the upstream stage artifact directory and write a `LocalFileSeedSource`-compatible dataset for the next stage. First-class seeding from named processor outputs is a future extension. Related near-term extensions are seeded processor-only configs and structured stage `postprocessors`, so processor-expressible transforms can remain normal Data Designer processor configs rather than workflow-specific callback code. -#### Between-stage callbacks +#### Between-stage postprocessors and callbacks -Users may need to transform data between stages. 
`CompositeWorkflow.add_stage()` supports an optional `on_success` callback that transforms a completed stage's output before child stages seed from it: +Users may need to transform data between stages. The preferred future API is a `postprocessors` argument on `CompositeWorkflow.add_stage()`: + +```python +workflow.add_stage( + "generated", + config_gen, + num_records=1000, + postprocessors=[ + DropColumnsProcessorConfig( + name="drop_scratch", + column_names=["scratch_*"], + ) + ], +) +workflow.add_stage("enriched", config_enrich) +``` + +Postprocessors run after the stage's `DataDesigner.create()` completes and before child stages seed from that stage. They operate on the stage's final dataset, write a deterministic postprocessed artifact, and make that artifact the stage output for downstream seeding. They should accept existing built-in or plugin `ProcessorConfig` objects and be fingerprinted from those configs, so no callback version string is needed. + +Arbitrary Python data transforms should come through the `CustomProcessor` work tracked in #159, rather than a workflow-only callable type. Until that exists, `on_success` remains the escape hatch for arbitrary data-changing code: ```python def filter_high_quality(stage_output_path: Path) -> Path: @@ -117,12 +136,33 @@ workflow.add_stage( workflow.add_stage("enriched", config_enrich) ``` -The callback receives the path to the completed stage's artifact directory (containing `parquet-files/`, `metadata.json`, etc.) and returns a path that child stages will seed from. This keeps large DataFrames on disk and gives users full control. Callback outputs should live under a managed subdirectory such as `/callback-outputs//` so they stay deterministic and easy to clean. +The callback receives the path to the completed stage's artifact directory (containing `parquet-files/`, `metadata.json`, etc.) and returns a path that child stages will seed from. This keeps large DataFrames on disk and gives users full control. 
Callback outputs should live under a managed subdirectory such as `<workflow_dir>/callback-outputs/<stage_name>/` so they stay deterministic and easy to clean. Once postprocessors and seeded processor-only configs are valid, processor-expressible transforms should use those structured APIs instead of raw callbacks: -**Callback resume policy**: The composite workflow does not hash arbitrary Python source or bytecode in v1. `on_success_version` is the explicit callback identity recorded in `workflow-metadata.json` and included in the stage's exposed output fingerprint. If `on_success` is set without `on_success_version`, that stage's transformed output is treated as dirty on every resume so a changed callback cannot silently reuse stale transformed data. The resolved path returned by the callback is also recorded as the dependent stage's seed path; a stage seeded from callback output is skippable only if that recorded path still exists and is readable by `LocalFileSeedSource`. +```python +filter_config = DataDesignerConfigBuilder().add_processor( + DropColumnsProcessorConfig( + name="drop_scratch", + column_names=["scratch_*"], + ), +) +workflow.add_stage("generated", config_gen, num_records=1000) +workflow.add_stage("filtered", filter_config) +workflow.add_stage("enriched", config_enrich) +``` + +**Callback resume policy**: The composite workflow does not hash arbitrary Python source or bytecode in v1. `on_success_version` is the explicit callback identity recorded in `workflow-metadata.json` and included in the stage's exposed output fingerprint. If `on_success` is set without `on_success_version`, that stage's transformed output is treated as dirty on every resume so a changed callback cannot silently reuse stale transformed data. The resolved path returned by the callback is also recorded as the dependent stage's seed path; a stage seeded from callback output is skippable only if that recorded path still exists and is readable by `LocalFileSeedSource`.
Structured postprocessors should use their processor config fingerprint instead. **Empty stage policy**: If a callback filters all rows (or a stage produces zero rows), the composite workflow raises `DataDesignerWorkflowError` by default. Stages can opt in to empty output with `allow_empty=True` on `add_stage()`, in which case the workflow marks that stage as `completed_empty` and all downstream stages as `skipped_empty_upstream`. `CompositeWorkflowResults` still contains every declared stage name: executed stages map to `DatasetCreationResults`, while skipped downstream stages map to `SkippedStageResult` with `status="skipped_empty_upstream"` and `upstream_stage=<stage_name>`. This avoids `KeyError`/`None` ambiguity and gives resume a durable state distinct from normal completion. +**Stage output selection**: The default stage output is the final dataset returned by `DatasetCreationResults.load_dataset()`. A future additive option can select a named processor artifact as the stage output: + +```python +workflow.add_stage("drafts", config_drafts, num_records=1000) +workflow.add_stage("sft", sft_config, output="processor:sft") +``` + +When `output="processor:<processor_name>"` is set, child stages seed from `processors-files/<processor_name>/` instead of `parquet-files/`, and workflow metadata records the selected output source. This covers processors such as `SchemaTransformProcessorConfig`, which write side datasets while leaving the main dataset unchanged. + **Export/push policy**: `CompositeWorkflowResults` export and push helpers default to the final stage's dataset. Exporting selected stages or the full composite workflow artifact bundle is explicit future work. #### `num_records` and seed behavior @@ -361,6 +401,8 @@ result_2 = data_designer.create(config_2, num_records=200) # explode: 50 -> 200 - Add `compose_workflow(name: str)` factory method on `DataDesigner`.
- Tests: multi-stage runs, explode/filter via callbacks, num_records defaulting, duplicate stage-name rejection, artifact layout, throttle reuse across stages. +**Status after PR #636:** Implemented `CompositeWorkflow`, `compose_workflow()`, `to_config_builder()`, disk handoff, stage metadata, `acreate()`, shared throttle manager reuse, explicit stage artifact roots, cloned stage builders, and concurrent-safe seed reader/resource-provider handling. Still deferred: stage-level resume, DAG branches, `allow_resize` removal, seeded processor-only configs, postprocessors, stage output selection, config bundles, and first-class artifact seeding. + ### Sidecar: `acreate()` on `DataDesigner` (independent of chaining v1) - Add `async def acreate(...)` mirroring `create()` but returning the awaitable instead of blocking. @@ -377,6 +419,7 @@ result_2 = data_designer.create(config_2, num_records=200) # explode: 50 -> 200 - Remove the `allow_resize` field from the config schema. - Add fail-fast guard in `ProcessorRunner` for pre-batch row-count changes. - Tests: verify rejection, migration path examples. +- Migration examples should prefer `postprocessors` for inline processor-expressible transforms, or seeded processor-only configs when the transform deserves its own named stage. Raw `on_success` remains the escape hatch for arbitrary custom transforms. ### Phase 3: Stage-level resume @@ -413,6 +456,39 @@ Items not on the current roadmap but worth flagging so they don't get accidental **Failure hooks.** v1 should raise on failure by default. A future `on_failure` callback could support cleanup, custom recovery, or explicit handling of all-rows-filtered cases, but it should not obscure the default failure semantics. +**Seeded processor-only configs.** Do not introduce a separate public `stages` wrapper module or a workflow-specific `processors=[...]` stage argument. 
Keep `CompositeWorkflow.add_stage()` as the orchestration surface, and make processor-only work expressible as a normal seeded config: + +```python +workflow.add_stage("drafts", config_drafts, num_records=1000) +workflow.add_stage("filtered", DataDesignerConfigBuilder().add_processor(...)) +workflow.add_stage("enriched", config_enrich) +``` + +The same config should also work outside workflows when the user supplies a seed directly: + +```python +config = ( + DataDesignerConfigBuilder() + .with_seed_dataset(LocalFileSeedSource(path="data/*.parquet")) + .add_processor(...) +) +result = data_designer.create(config) +``` + +This requires validation to allow a seed-only config when it has processors and a seed dataset, while still rejecting processor-only configs with no seed and configs that drop all output columns. + +**Stage postprocessors.** Add `postprocessors=[ProcessorConfig, ...]` to `CompositeWorkflow.add_stage()` for compact inline transforms at a stage boundary. Internally this can reuse the seeded processor-only config path above: seed a temporary processor config from the completed stage output, run it under a deterministic subdirectory, and use its final dataset as the stage output. This keeps row-count-changing 1:N and N:1 operations out of the async generation batch path while making the common case concise. Custom arbitrary transforms should use `CustomProcessor` when #159 lands; until then, keep raw `on_success` as the escape hatch. + +**Config bundles.** Built-in or plugin-provided bundles should be reusable config fragments/factories, not execution stages. 
A bundle can return or extend a `DataDesignerConfigBuilder`, and `add_stage()` can accept either a builder or a bundle by normalizing the bundle to a builder: + +```python +workflow.add_stage("draft_qa", bundles.document_qa(model_alias="fast")) +workflow.add_stage("quality", bundles.quality_judge(model_alias="judge")) +workflow.add_stage("sft", bundles.sft_export(), output="processor:sft") +``` + +Bundles can cover DataArc-like document QA, refinement, distillation, SFT export, or GRPO export shapes without introducing another workflow abstraction. Plugins can contribute bundles alongside existing column, seed-reader, and processor plugins. + **First-class artifact seeding.** v1 can bridge named processor outputs or schema-transform artifacts through `on_success` callbacks. A later API could support `seed_from_artifact(...)` or named processor output references directly. **Auto-composition from a single config.** Auto-detecting stage boundaries in one config is possible, but likely overkill for the initial roadmap. If it returns later, it should be a convenience layer on top of `CompositeWorkflow`, not a separate execution model. @@ -433,6 +509,8 @@ These were open in earlier drafts; recording the resolutions here so the design 6. **Export and push semantics** -> v1 export/push defaults to the final stage's dataset only. Exporting selected stages or a full composite workflow artifact bundle is future work. +7. **Stage extension model** -> The public surface remains `CompositeWorkflow`, not a separate `stages` module. Near-term extensibility is seeded processor-only configs, postprocessors, and stage output selection. Shared built-in/plugin units are config bundles, not a new executor. + ## Open questions 1. **Preview support**: Should `workflow.preview()` run all stages with small `num_records`? Or just preview the last stage seeded from a prior full run? 
@@ -445,5 +523,6 @@ These were open in earlier drafts; recording the resolutions here so the design - #447 - AsyncRunController refactor (partially superseded: pre-batch resize handling moves to composite workflow level instead of controller level) - #526 / #525 - Resume interrupted runs (single-stage batch/row-group resume primitive used by composite workflow stage resume) +- #159 - Custom processors (needed for arbitrary Python postprocessors without workflow-only callback types) - #462 - Progress bar and scheduler polish (independent) - #464 - Custom column retryable errors (independent) From 18179a3921861e23b27410643b2a3cb06fb57105 Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Thu, 14 May 2026 20:12:18 +0000 Subject: [PATCH 05/14] feat: add workflow stage postprocessors --- .../config/data_designer_config.py | 4 +- .../src/data_designer/engine/validation.py | 30 +++- .../tests/engine/test_validation.py | 31 +++++ .../interface/composite_workflow.py | 99 ++++++++++++- .../data_designer/interface/data_designer.py | 15 +- .../interface/test_composite_workflow.py | 130 ++++++++++++++++++ plans/workflow-chaining/workflow-chaining.md | 12 +- 7 files changed, 301 insertions(+), 20 deletions(-) diff --git a/packages/data-designer-config/src/data_designer/config/data_designer_config.py b/packages/data-designer-config/src/data_designer/config/data_designer_config.py index 8831ced08..874079f9d 100644 --- a/packages/data-designer-config/src/data_designer/config/data_designer_config.py +++ b/packages/data-designer-config/src/data_designer/config/data_designer_config.py @@ -28,7 +28,7 @@ class DataDesignerConfig(ExportableConfigBase): Attributes: columns: Required list of column configurations defining how each column - should be generated. Must contain at least one column. + should be generated. May be empty for seeded processor-only configs. model_configs: Optional list of model configurations for LLM-based generation. 
Each model config defines the model, provider, and inference parameters. tool_configs: Optional list of tool configurations for MCP tool calling. @@ -39,7 +39,7 @@ class DataDesignerConfig(ExportableConfigBase): processors: Optional list of processor configurations for post-generation transformations. """ - columns: list[Annotated[ColumnConfigT, Field(discriminator="column_type")]] = Field(min_length=1) + columns: list[Annotated[ColumnConfigT, Field(discriminator="column_type")]] model_configs: list[ModelConfig] | None = None tool_configs: list[ToolConfig] | None = None seed_config: SeedConfig | None = None diff --git a/packages/data-designer-engine/src/data_designer/engine/validation.py b/packages/data-designer-engine/src/data_designer/engine/validation.py index 600fde233..2b62a0d42 100644 --- a/packages/data-designer-engine/src/data_designer/engine/validation.py +++ b/packages/data-designer-engine/src/data_designer/engine/validation.py @@ -70,7 +70,7 @@ def validate_data_designer_config( violations.extend(validate_code_validation(columns=columns)) violations.extend(validate_expression_references(columns=columns, allowed_references=allowed_references)) violations.extend(validate_skip_references(columns=columns, allowed_references=allowed_references)) - violations.extend(validate_columns_not_all_dropped(columns=columns)) + violations.extend(validate_columns_not_all_dropped(columns=columns, processor_configs=processor_configs)) violations.extend(validate_drop_columns_processor(columns=columns, processor_configs=processor_configs)) violations.extend(validate_schema_transform_processor(columns=columns, processor_configs=processor_configs)) if not can_run_data_designer_locally(): @@ -258,8 +258,17 @@ def validate_code_validation( def validate_columns_not_all_dropped( columns: list[ColumnConfigT], + processor_configs: list[ProcessorConfigT] | None = None, ) -> list[Violation]: - remaining_cols = [c for c in columns if c.column_type != DataDesignerColumnType.SEED_DATASET 
and not c.drop] + processor_configs = processor_configs or [] + processor_dropped_columns = _resolve_processor_dropped_columns(columns, processor_configs) + remaining_cols = [ + c + for c in columns + if not c.drop + and c.name not in processor_dropped_columns + and (c.column_type != DataDesignerColumnType.SEED_DATASET or processor_configs) + ] if len(remaining_cols) == 0: return [ @@ -277,6 +286,23 @@ def validate_columns_not_all_dropped( return [] +def _resolve_processor_dropped_columns( + columns: list[ColumnConfigT], + processor_configs: list[ProcessorConfigT], +) -> set[str]: + column_names = {c.name for c in columns} + dropped_columns: set[str] = set() + for processor_config in processor_configs: + if processor_config.processor_type != ProcessorType.DROP_COLUMNS: + continue + for name in processor_config.column_names: + if _is_glob(name): + dropped_columns.update(col for col in column_names if fnmatch(col, name)) + else: + dropped_columns.add(name) + return dropped_columns + + def _is_glob(pattern: str) -> bool: return "*" in pattern diff --git a/packages/data-designer-engine/tests/engine/test_validation.py b/packages/data-designer-engine/tests/engine/test_validation.py index 9bbbb13df..f8e91a728 100644 --- a/packages/data-designer-engine/tests/engine/test_validation.py +++ b/packages/data-designer-engine/tests/engine/test_validation.py @@ -13,6 +13,7 @@ LLMTextColumnConfig, SamplerColumnConfig, Score, + SeedDatasetColumnConfig, ValidationColumnConfig, ) from data_designer.config.models import ImageContext, ModalityDataType @@ -268,6 +269,36 @@ def test_validate_columns_not_all_dropped(): assert violations[0].type == ViolationType.ALL_COLUMNS_DROPPED +def test_validate_columns_not_all_dropped_allows_seeded_processor_only_config(): + violations = validate_columns_not_all_dropped( + [SeedDatasetColumnConfig(name="seed_text")], + processor_configs=[ + SchemaTransformProcessorConfig(name="format", template={"text": "{{ seed_text }}"}), + ], + ) + + assert 
violations == [] + + +def test_validate_columns_not_all_dropped_rejects_seeded_processor_only_config_with_no_output_columns(): + violations = validate_columns_not_all_dropped( + [SeedDatasetColumnConfig(name="seed_text")], + processor_configs=[ + DropColumnsProcessorConfig(name="drop_seed", column_names=["seed_text"]), + ], + ) + + assert len(violations) == 1 + assert violations[0].type == ViolationType.ALL_COLUMNS_DROPPED + + +def test_validate_columns_not_all_dropped_still_rejects_seed_only_config(): + violations = validate_columns_not_all_dropped([SeedDatasetColumnConfig(name="seed_text")]) + + assert len(violations) == 1 + assert violations[0].type == ViolationType.ALL_COLUMNS_DROPPED + + def test_validate_expression_references(): violations = validate_expression_references( [ diff --git a/packages/data-designer/src/data_designer/interface/composite_workflow.py b/packages/data-designer/src/data_designer/interface/composite_workflow.py index 4a2a57d62..e469b91ca 100644 --- a/packages/data-designer/src/data_designer/interface/composite_workflow.py +++ b/packages/data-designer/src/data_designer/interface/composite_workflow.py @@ -14,6 +14,7 @@ from typing import TYPE_CHECKING, Any import data_designer.lazy_heavy_imports as lazy +from data_designer.config.base import ProcessorConfig from data_designer.config.config_builder import BuilderConfig, DataDesignerConfigBuilder from data_designer.config.data_designer_config import DataDesignerConfig from data_designer.config.seed import IndexRange, PartitionBlock, SamplingStrategy @@ -43,6 +44,8 @@ class _WorkflowStage: num_records: int | None on_success: OnSuccessCallback | None on_success_version: str | None + postprocessors: tuple[ProcessorConfig, ...] + output: str allow_empty: bool sampling_strategy: SamplingStrategy selection_strategy: IndexRange | PartitionBlock | None @@ -57,10 +60,9 @@ class SkippedStageResult: class CompositeWorkflowResults: """Results for a composite workflow run. 
- Per-stage entries are the original ``DataDesigner.create()`` results. If a + Per-stage entries are the effective ``DataDesigner.create()`` results. If a stage uses ``on_success``, metadata and downstream seeding use the callback - output path while the stage result still points at the stage's generated - dataset. + output path while the stage result still points at the stage's dataset. """ def __init__( @@ -69,10 +71,12 @@ def __init__( name: str, stage_results: dict[str, DatasetCreationResults | SkippedStageResult], final_stage_name: str, + final_output_path: Path | None = None, ) -> None: self.name = name self.stage_results = stage_results self.final_stage_name = final_stage_name + self._final_output_path = final_output_path def __getitem__(self, stage_name: str) -> DatasetCreationResults | SkippedStageResult: return self.stage_results[stage_name] @@ -94,12 +98,18 @@ def final_result(self) -> DatasetCreationResults: return result def load_dataset(self) -> pd.DataFrame: + self.final_result + if self._final_output_path is not None: + return _load_parquet_dataset(self._final_output_path) return self.final_result.load_dataset() def load_analysis(self) -> DatasetProfilerResults: return self.final_result.load_analysis() def count_records(self) -> int: + self.final_result + if self._final_output_path is not None: + return _count_parquet_records(self._final_output_path) return self.final_result.count_records() def export(self, *args: Any, **kwargs: Any) -> Path: @@ -124,6 +134,8 @@ def add_stage( num_records: int | None = None, on_success: OnSuccessCallback | None = None, on_success_version: str | None = None, + postprocessors: list[ProcessorConfig] | None = None, + output: str = "final", allow_empty: bool = False, sampling_strategy: SamplingStrategy = SamplingStrategy.ORDERED, selection_strategy: IndexRange | PartitionBlock | None = None, @@ -133,6 +145,7 @@ def add_stage( raise DataDesignerWorkflowError(f"Stage name {name!r} is already used in workflow 
{self.name!r}.") if num_records is not None and num_records < 1: raise DataDesignerWorkflowError("Stage num_records must be at least 1.") + _validate_stage_output(output) self._stages.append( _WorkflowStage( name=name, @@ -141,6 +154,8 @@ def add_stage( num_records=num_records, on_success=on_success, on_success_version=on_success_version, + postprocessors=_clone_processors(postprocessors or []), + output=output, allow_empty=allow_empty, sampling_strategy=sampling_strategy, selection_strategy=selection_strategy, @@ -233,12 +248,26 @@ def run(self) -> CompositeWorkflowResults: artifact_path=workflow_path, ) actual_records = result.count_records() - output_seed_path = result.artifact_storage.final_dataset_path + output_result = result + if stage.postprocessors: + postprocessor_builder = _postprocessor_config_builder( + stage_builder=stage_builder, + seed_path=result.artifact_storage.final_dataset_path, + postprocessors=stage.postprocessors, + ) + output_result = self._data_designer.create( + postprocessor_builder, + num_records=actual_records, + dataset_name="postprocessors", + artifact_path=workflow_path / stage_dir_name, + ) + + output_seed_path = _resolve_stage_output_path(output_result, stage.output) callback_output_path = None - output_records = actual_records + output_records = _count_parquet_records(output_seed_path) if stage.on_success is not None: - callback_output_path = Path(stage.on_success(result.artifact_storage.base_dataset_path)) + callback_output_path = Path(stage.on_success(output_result.artifact_storage.base_dataset_path)) output_seed_path = callback_output_path output_records = _count_parquet_records(callback_output_path) @@ -257,6 +286,9 @@ def run(self) -> CompositeWorkflowResults: "output_records": output_records, "output_seed_path": str(output_seed_path), "callback_output_path": str(callback_output_path) if callback_output_path else None, + "postprocessor_output_path": ( + str(output_result.artifact_storage.base_dataset_path) if 
stage.postprocessors else None + ), "duration_sec": time.monotonic() - start_time, } ) @@ -265,7 +297,7 @@ def run(self) -> CompositeWorkflowResults: _write_workflow_metadata(workflow_path, metadata) raise - stage_results[stage.name] = result + stage_results[stage.name] = output_result previous_seed_path = output_seed_path previous_output_records = output_records previous_stage_name = stage.name @@ -276,6 +308,7 @@ def run(self) -> CompositeWorkflowResults: name=self.name, stage_results=stage_results, final_stage_name=self._stages[-1].name, + final_output_path=previous_seed_path, ) @@ -283,6 +316,25 @@ def _clone_config_builder(config_builder: DataDesignerConfigBuilder) -> DataDesi return DataDesignerConfigBuilder.from_config(BuilderConfig(data_designer=config_builder.build())) +def _clone_processors(processors: list[ProcessorConfig]) -> tuple[ProcessorConfig, ...]: + return tuple(processor.model_copy(deep=True) for processor in processors) + + +def _postprocessor_config_builder( + *, + stage_builder: DataDesignerConfigBuilder, + seed_path: Path, + postprocessors: tuple[ProcessorConfig, ...], +) -> DataDesignerConfigBuilder: + builder = DataDesignerConfigBuilder( + model_configs=stage_builder.model_configs, + tool_configs=stage_builder.tool_configs, + ).with_seed_dataset(_local_seed_source_from_path(seed_path)) + for processor in postprocessors: + builder.add_processor(processor.model_copy(deep=True)) + return builder + + def _stage_dir_name(index: int, name: str) -> str: return f"stage-{index}-{name}" @@ -295,6 +347,8 @@ def _base_stage_metadata(index: int, stage: _WorkflowStage, stage_dir_name: str) "depends_on": list(stage.depends_on), "allow_empty": stage.allow_empty, "on_success_version": stage.on_success_version, + "postprocessors": [processor.model_dump(mode="json") for processor in stage.postprocessors], + "output": stage.output, "sampling_strategy": stage.sampling_strategy.value, "selection_strategy": _selection_strategy_payload(stage.selection_strategy), 
} @@ -314,6 +368,8 @@ def _stage_fingerprint( "selection_strategy": _selection_strategy_payload(stage.selection_strategy), "allow_empty": stage.allow_empty, "on_success_version": stage.on_success_version, + "postprocessors": [processor.model_dump(mode="json") for processor in stage.postprocessors], + "output": stage.output, "library_version": get_library_version(), "upstream_fingerprint": upstream_fingerprint, } @@ -332,6 +388,19 @@ def _local_seed_source_from_path(path: Path) -> LocalFileSeedSource: return LocalFileSeedSource(path=str(path)) +def _resolve_stage_output_path(result: DatasetCreationResults, output: str) -> Path: + if output == "final": + return result.artifact_storage.final_dataset_path + processor_name = output.removeprefix("processor:") + processor_path = result.artifact_storage.processors_outputs_path / processor_name + if processor_path.exists(): + return processor_path + processor_file_path = result.artifact_storage.processors_outputs_path / f"{processor_name}.parquet" + if processor_file_path.exists(): + return processor_file_path + raise DataDesignerWorkflowError(f"Stage output processor {processor_name!r} did not produce artifacts.") + + def _count_parquet_records(path: Path) -> int: parquet_files = sorted(path.glob("*.parquet")) if path.is_dir() else [path] if not parquet_files: @@ -339,11 +408,27 @@ def _count_parquet_records(path: Path) -> int: return sum(lazy.pq.read_metadata(file_path).num_rows for file_path in parquet_files) +def _load_parquet_dataset(path: Path) -> pd.DataFrame: + parquet_files = sorted(path.glob("*.parquet")) if path.is_dir() else [path] + if not parquet_files: + raise DataDesignerWorkflowError(f"No parquet files found at {str(path)!r}.") + return lazy.pd.concat([lazy.pd.read_parquet(file_path) for file_path in parquet_files], ignore_index=True) + + def _write_workflow_metadata(workflow_path: Path, metadata: dict[str, Any]) -> None: path = workflow_path / "workflow-metadata.json" path.write_text(json.dumps(metadata, 
indent=2, sort_keys=True), encoding="utf-8") +def _validate_stage_output(output: str) -> None: + if output == "final": + return + if not output.startswith("processor:"): + raise DataDesignerWorkflowError("Stage output must be 'final' or 'processor:'.") + processor_name = output.removeprefix("processor:") + _validate_dir_name(processor_name, "processor output name") + + def _validate_dir_name(name: str, label: str) -> None: if not name: raise DataDesignerWorkflowError(f"{label} must be a non-empty string.") diff --git a/packages/data-designer/src/data_designer/interface/data_designer.py b/packages/data-designer/src/data_designer/interface/data_designer.py index a397c7044..7431754dd 100644 --- a/packages/data-designer/src/data_designer/interface/data_designer.py +++ b/packages/data-designer/src/data_designer/interface/data_designer.py @@ -351,7 +351,12 @@ def create( ) try: - profiler = self._create_dataset_profiler(config_builder, resource_provider) + profiler_column_configs = config_builder.get_column_configs() or builder._data_designer_config.columns + profiler = self._create_dataset_profiler( + config_builder, + resource_provider, + column_configs=profiler_column_configs, + ) analysis = profiler.profile_dataset(num_records, dataset_for_profiler) except Exception as e: raise DataDesignerProfilingError(f"🛑 Error profiling dataset: {e}") from e @@ -613,11 +618,15 @@ def _create_dataset_builder( ) def _create_dataset_profiler( - self, config_builder: DataDesignerConfigBuilder, resource_provider: ResourceProvider + self, + config_builder: DataDesignerConfigBuilder, + resource_provider: ResourceProvider, + *, + column_configs: list | None = None, ) -> DataDesignerDatasetProfiler: return DataDesignerDatasetProfiler( config=DatasetProfilerConfig( - column_configs=config_builder.get_column_configs(), + column_configs=column_configs or config_builder.get_column_configs(), column_profiler_configs=config_builder.get_profilers(), ), resource_provider=resource_provider, diff 
--git a/packages/data-designer/tests/interface/test_composite_workflow.py b/packages/data-designer/tests/interface/test_composite_workflow.py index d9ad5bb2e..6ddba0545 100644 --- a/packages/data-designer/tests/interface/test_composite_workflow.py +++ b/packages/data-designer/tests/interface/test_composite_workflow.py @@ -314,6 +314,19 @@ def test_composite_workflow_rejects_invalid_stage_names( workflow.add_stage(name, _category_builder(stub_model_configs)) +@pytest.mark.parametrize("output", ["processor:", "callback", "processor:bad/name"]) +def test_composite_workflow_rejects_invalid_stage_outputs( + output: str, + stub_artifact_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], +) -> None: + workflow = _data_designer(stub_artifact_path, stub_model_providers).compose_workflow(name="invalid-output") + + with pytest.raises(DataDesignerWorkflowError): + workflow.add_stage("base", _category_builder(stub_model_configs), output=output) + + def test_composite_workflow_rejects_duplicate_stage_names( stub_artifact_path: Path, stub_model_providers: list[ModelProvider], @@ -461,6 +474,123 @@ def use_processor_output(stage_path: Path) -> Path: ] +def test_composite_workflow_runs_seeded_processor_only_stage( + tmp_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], +) -> None: + stage_1 = _seeded_builder(stub_model_configs, [{"name": "Ada", "secret": "hidden"}]) + stage_1.add_column(ExpressionColumnConfig(name="public_name", expr="{{ name }}")) + + stage_2 = DataDesignerConfigBuilder(model_configs=stub_model_configs) + stage_2.add_processor(DropColumnsProcessorConfig(name="drop_secret", column_names=["secret"])) + + stage_3 = _expression_builder(stub_model_configs, "final", "{{ public_name }} final") + + workflow = _real_data_designer(tmp_path / "artifacts", stub_model_providers).compose_workflow( + name="processor-only-stage" + ) + workflow.add_stage("base", stage_1, num_records=1) + 
workflow.add_stage("redacted", stage_2) + workflow.add_stage("final", stage_3) + + results = workflow.run() + redacted = results["redacted"].load_dataset() + final = results.load_dataset() + + assert "secret" not in redacted.columns + assert final.to_dict(orient="records") == [{"name": "Ada", "public_name": "Ada", "final": "Ada final"}] + + +def test_composite_workflow_postprocessors_transform_stage_output( + tmp_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], +) -> None: + stage_1 = _seeded_builder(stub_model_configs, [{"name": "Ada", "secret": "hidden"}]) + stage_1.add_column(ExpressionColumnConfig(name="public_name", expr="{{ name }}")) + + stage_2 = _expression_builder(stub_model_configs, "final", "{{ public_name }} final") + + workflow = _real_data_designer(tmp_path / "artifacts", stub_model_providers).compose_workflow( + name="postprocessor-stage" + ) + workflow.add_stage( + "base", + stage_1, + num_records=1, + postprocessors=[DropColumnsProcessorConfig(name="drop_secret", column_names=["secret"])], + ) + workflow.add_stage("final", stage_2) + + results = workflow.run() + base = results["base"].load_dataset() + final = results.load_dataset() + metadata = _load_workflow_metadata(tmp_path / "artifacts", "postprocessor-stage") + + assert "secret" not in base.columns + assert final.to_dict(orient="records") == [{"name": "Ada", "public_name": "Ada", "final": "Ada final"}] + assert metadata["stages"][0]["postprocessor_output_path"].endswith("stage-0-base/postprocessors") + + +def test_composite_workflow_output_can_select_processor_artifact( + tmp_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], +) -> None: + stage_1 = _seeded_builder(stub_model_configs, [{"name": "Ada"}, {"name": "Linus"}]) + stage_1.add_column(ExpressionColumnConfig(name="persona", expr="{{ name }}")) + stage_1.add_processor(SchemaTransformProcessorConfig(name="compact", template={"compact_name": 
"{{ persona }}"})) + + stage_2 = _expression_builder(stub_model_configs, "final", "{{ compact_name }} final") + + workflow = _real_data_designer(tmp_path / "artifacts", stub_model_providers).compose_workflow( + name="processor-output" + ) + workflow.add_stage("compact", stage_1, num_records=2, output="processor:compact") + workflow.add_stage("final", stage_2) + + df = workflow.run().load_dataset().sort_values("compact_name").reset_index(drop=True) + metadata = _load_workflow_metadata(tmp_path / "artifacts", "processor-output") + + assert df.to_dict(orient="records") == [ + {"compact_name": "Ada", "final": "Ada final"}, + {"compact_name": "Linus", "final": "Linus final"}, + ] + assert metadata["stages"][0]["output"] == "processor:compact" + assert metadata["stages"][0]["output_seed_path"].endswith("stage-0-compact/processors-files/compact") + + +def test_composite_workflow_postprocessors_can_feed_from_processor_artifact( + tmp_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], +) -> None: + stage_1 = _seeded_builder(stub_model_configs, [{"name": "Ada"}, {"name": "Linus"}]) + stage_1.add_column(ExpressionColumnConfig(name="persona", expr="{{ name }}")) + + stage_2 = _expression_builder(stub_model_configs, "final", "{{ compact_name }} final") + + workflow = _real_data_designer(tmp_path / "artifacts", stub_model_providers).compose_workflow( + name="postprocessor-output" + ) + workflow.add_stage( + "compact", + stage_1, + num_records=2, + postprocessors=[SchemaTransformProcessorConfig(name="compact", template={"compact_name": "{{ persona }}"})], + output="processor:compact", + ) + workflow.add_stage("final", stage_2) + + df = workflow.run().load_dataset().sort_values("compact_name").reset_index(drop=True) + + assert df.to_dict(orient="records") == [ + {"compact_name": "Ada", "final": "Ada final"}, + {"compact_name": "Linus", "final": "Linus final"}, + ] + + def 
test_composite_workflow_runs_custom_generator_in_downstream_stage( tmp_path: Path, stub_model_providers: list[ModelProvider], diff --git a/plans/workflow-chaining/workflow-chaining.md b/plans/workflow-chaining/workflow-chaining.md index 68f2dc8b2..06c24211a 100644 --- a/plans/workflow-chaining/workflow-chaining.md +++ b/plans/workflow-chaining/workflow-chaining.md @@ -401,7 +401,7 @@ result_2 = data_designer.create(config_2, num_records=200) # explode: 50 -> 200 - Add `compose_workflow(name: str)` factory method on `DataDesigner`. - Tests: multi-stage runs, explode/filter via callbacks, num_records defaulting, duplicate stage-name rejection, artifact layout, throttle reuse across stages. -**Status after PR #636:** Implemented `CompositeWorkflow`, `compose_workflow()`, `to_config_builder()`, disk handoff, stage metadata, `acreate()`, shared throttle manager reuse, explicit stage artifact roots, cloned stage builders, and concurrent-safe seed reader/resource-provider handling. Still deferred: stage-level resume, DAG branches, `allow_resize` removal, seeded processor-only configs, postprocessors, stage output selection, config bundles, and first-class artifact seeding. +**Status after PR #636:** Implemented `CompositeWorkflow`, `compose_workflow()`, `to_config_builder()`, disk handoff, stage metadata, `acreate()`, shared throttle manager reuse, explicit stage artifact roots, cloned stage builders, concurrent-safe seed reader/resource-provider handling, seeded processor-only configs, stage postprocessors, and stage output selection. Still deferred: stage-level resume, DAG branches, `allow_resize` removal, config bundles, and broader first-class artifact seeding. ### Sidecar: `acreate()` on `DataDesigner` (independent of chaining v1) @@ -456,7 +456,7 @@ Items not on the current roadmap but worth flagging so they don't get accidental **Failure hooks.** v1 should raise on failure by default. 
A future `on_failure` callback could support cleanup, custom recovery, or explicit handling of all-rows-filtered cases, but it should not obscure the default failure semantics. -**Seeded processor-only configs.** Do not introduce a separate public `stages` wrapper module or a workflow-specific `processors=[...]` stage argument. Keep `CompositeWorkflow.add_stage()` as the orchestration surface, and make processor-only work expressible as a normal seeded config: +**Seeded processor-only configs (implemented in #636).** Do not introduce a separate public `stages` wrapper module or a workflow-specific `processors=[...]` stage argument. Keep `CompositeWorkflow.add_stage()` as the orchestration surface, and make processor-only work expressible as a normal seeded config: ```python workflow.add_stage("drafts", config_drafts, num_records=1000) @@ -475,9 +475,9 @@ config = ( result = data_designer.create(config) ``` -This requires validation to allow a seed-only config when it has processors and a seed dataset, while still rejecting processor-only configs with no seed and configs that drop all output columns. +Validation allows a seed-only config when it has processors and a seed dataset, while still rejecting processor-only configs with no seed and configs that drop all output columns. -**Stage postprocessors.** Add `postprocessors=[ProcessorConfig, ...]` to `CompositeWorkflow.add_stage()` for compact inline transforms at a stage boundary. Internally this can reuse the seeded processor-only config path above: seed a temporary processor config from the completed stage output, run it under a deterministic subdirectory, and use its final dataset as the stage output. This keeps row-count-changing 1:N and N:1 operations out of the async generation batch path while making the common case concise. Custom arbitrary transforms should use `CustomProcessor` when #159 lands; until then, keep raw `on_success` as the escape hatch. 
+**Stage postprocessors (implemented in #636).** `CompositeWorkflow.add_stage()` accepts `postprocessors=[ProcessorConfig, ...]` for compact inline transforms at a stage boundary. Internally this reuses the seeded processor-only config path above: seed a temporary processor config from the completed stage output, run it under a deterministic subdirectory, and use its final dataset as the stage output. This keeps row-count-changing 1:N and N:1 operations out of the async generation batch path while making the common case concise. Custom arbitrary transforms should use `CustomProcessor` when #159 lands; until then, keep raw `on_success` as the escape hatch. **Config bundles.** Built-in or plugin-provided bundles should be reusable config fragments/factories, not execution stages. A bundle can return or extend a `DataDesignerConfigBuilder`, and `add_stage()` can accept either a builder or a bundle by normalizing the bundle to a builder: @@ -489,7 +489,7 @@ workflow.add_stage("sft", bundles.sft_export(), output="processor:sft") Bundles can cover DataArc-like document QA, refinement, distillation, SFT export, or GRPO export shapes without introducing another workflow abstraction. Plugins can contribute bundles alongside existing column, seed-reader, and processor plugins. -**First-class artifact seeding.** v1 can bridge named processor outputs or schema-transform artifacts through `on_success` callbacks. A later API could support `seed_from_artifact(...)` or named processor output references directly. +**First-class artifact seeding.** v1 can select named processor outputs with `output="processor:<name>"`. Broader artifact seeding, such as media assets or arbitrary artifact folders, still requires `on_success` callbacks. A later API could support `seed_from_artifact(...)` references directly. **Auto-composition from a single config.** Auto-detecting stage boundaries in one config is possible, but likely overkill for the initial roadmap.
If it returns later, it should be a convenience layer on top of `CompositeWorkflow`, not a separate execution model. @@ -505,7 +505,7 @@ These were open in earlier drafts; recording the resolutions here so the design 4. **CompositeWorkflow construction** -> `CompositeWorkflow` is created via `data_designer.compose_workflow(name=...)` and reuses the parent `DataDesigner`'s `ModelRegistry` and `ThrottleManager` across all stages. The explicit name is the durable artifact identity used for resume, and the workflow does not construct its own `DataDesigner` instances. This is the throttle-coordination invariant (see Composability section). -5. **Downstream seeding scope** -> v1 seeds downstream stages from the upstream stage's final dataset only. Named processor outputs, dropped columns, schema-transform artifacts, and media forwarding require an `on_success` callback in v1 and can become first-class APIs later. +5. **Downstream seeding scope** -> v1 seeds downstream stages from the upstream stage's final dataset by default. Named processor outputs can be selected with `output="processor:<name>"`. Dropped columns, media forwarding, and arbitrary artifacts require an `on_success` callback in v1 and can become first-class APIs later. 6. **Export and push semantics** -> v1 export/push defaults to the final stage's dataset only. Exporting selected stages or a full composite workflow artifact bundle is future work.
From 4d4896edae9184d133ee820533639215afce7247 Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Thu, 14 May 2026 20:28:30 +0000 Subject: [PATCH 06/14] feat: expose workflow stage outputs --- .../interface/composite_workflow.py | 29 +++++++++++----- .../interface/test_composite_workflow.py | 34 +++++++++++++++++-- 2 files changed, 51 insertions(+), 12 deletions(-) diff --git a/packages/data-designer/src/data_designer/interface/composite_workflow.py b/packages/data-designer/src/data_designer/interface/composite_workflow.py index e469b91ca..420a0adf0 100644 --- a/packages/data-designer/src/data_designer/interface/composite_workflow.py +++ b/packages/data-designer/src/data_designer/interface/composite_workflow.py @@ -63,6 +63,7 @@ class CompositeWorkflowResults: Per-stage entries are the effective ``DataDesigner.create()`` results. If a stage uses ``on_success``, metadata and downstream seeding use the callback output path while the stage result still points at the stage's dataset. + Use ``load_stage_output()`` to load the selected output handed downstream. 
""" def __init__( @@ -71,12 +72,12 @@ def __init__( name: str, stage_results: dict[str, DatasetCreationResults | SkippedStageResult], final_stage_name: str, - final_output_path: Path | None = None, + stage_output_paths: dict[str, Path] | None = None, ) -> None: self.name = name self.stage_results = stage_results self.final_stage_name = final_stage_name - self._final_output_path = final_output_path + self._stage_output_paths = stage_output_paths or {} def __getitem__(self, stage_name: str) -> DatasetCreationResults | SkippedStageResult: return self.stage_results[stage_name] @@ -99,18 +100,26 @@ def final_result(self) -> DatasetCreationResults: def load_dataset(self) -> pd.DataFrame: self.final_result - if self._final_output_path is not None: - return _load_parquet_dataset(self._final_output_path) - return self.final_result.load_dataset() + return self.load_stage_output(self.final_stage_name) def load_analysis(self) -> DatasetProfilerResults: return self.final_result.load_analysis() def count_records(self) -> int: self.final_result - if self._final_output_path is not None: - return _count_parquet_records(self._final_output_path) - return self.final_result.count_records() + return self.count_stage_output_records(self.final_stage_name) + + def get_stage_output_path(self, stage_name: str) -> Path: + result = self.stage_results[stage_name] + if isinstance(result, SkippedStageResult): + raise DataDesignerWorkflowError(f"Stage {stage_name!r} was skipped: {result.status}.") + return self._stage_output_paths.get(stage_name, result.artifact_storage.final_dataset_path) + + def load_stage_output(self, stage_name: str) -> pd.DataFrame: + return _load_parquet_dataset(self.get_stage_output_path(stage_name)) + + def count_stage_output_records(self, stage_name: str) -> int: + return _count_parquet_records(self.get_stage_output_path(stage_name)) def export(self, *args: Any, **kwargs: Any) -> Path: return self.final_result.export(*args, **kwargs) @@ -176,6 +185,7 @@ def run(self) -> 
CompositeWorkflowResults: "stages": [], } stage_results: dict[str, DatasetCreationResults | SkippedStageResult] = {} + stage_output_paths: dict[str, Path] = {} previous_seed_path: Path | None = None previous_output_records: int | None = None previous_stage_name: str | None = None @@ -298,6 +308,7 @@ def run(self) -> CompositeWorkflowResults: raise stage_results[stage.name] = output_result + stage_output_paths[stage.name] = output_seed_path previous_seed_path = output_seed_path previous_output_records = output_records previous_stage_name = stage.name @@ -308,7 +319,7 @@ def run(self) -> CompositeWorkflowResults: name=self.name, stage_results=stage_results, final_stage_name=self._stages[-1].name, - final_output_path=previous_seed_path, + stage_output_paths=stage_output_paths, ) diff --git a/packages/data-designer/tests/interface/test_composite_workflow.py b/packages/data-designer/tests/interface/test_composite_workflow.py index 6ddba0545..775754f5f 100644 --- a/packages/data-designer/tests/interface/test_composite_workflow.py +++ b/packages/data-designer/tests/interface/test_composite_workflow.py @@ -415,7 +415,9 @@ def expand(stage_path: Path) -> Path: workflow.add_stage("personas", stage_1, num_records=2, on_success=expand, on_success_version="expand-turns") workflow.add_stage("messages", stage_2) - df = workflow.run().load_dataset().sort_values(["name", "turn"]).reset_index(drop=True) + results = workflow.run() + df = results.load_dataset().sort_values(["name", "turn"]).reset_index(drop=True) + stage_output = results.load_stage_output("personas").sort_values(["name", "turn"]).reset_index(drop=True) assert df["message"].tolist() == [ "Ada turn 1", @@ -423,6 +425,9 @@ def expand(stage_path: Path) -> Path: "Linus turn 1", "Linus turn 2", ] + assert stage_output["turn"].tolist() == [1, 2, 1, 2] + assert results["personas"].count_records() == 2 + assert results.count_stage_output_records("personas") == 4 def 
test_composite_workflow_does_not_forward_dropped_processor_columns( @@ -550,13 +555,25 @@ def test_composite_workflow_output_can_select_processor_artifact( workflow.add_stage("compact", stage_1, num_records=2, output="processor:compact") workflow.add_stage("final", stage_2) - df = workflow.run().load_dataset().sort_values("compact_name").reset_index(drop=True) + results = workflow.run() + df = results.load_dataset().sort_values("compact_name").reset_index(drop=True) + stage_output = results.load_stage_output("compact").sort_values("compact_name").reset_index(drop=True) + stage_final = results["compact"].load_dataset().sort_values("name").reset_index(drop=True) metadata = _load_workflow_metadata(tmp_path / "artifacts", "processor-output") assert df.to_dict(orient="records") == [ {"compact_name": "Ada", "final": "Ada final"}, {"compact_name": "Linus", "final": "Linus final"}, ] + assert stage_output.to_dict(orient="records") == [ + {"compact_name": "Ada"}, + {"compact_name": "Linus"}, + ] + assert stage_final[["name", "persona"]].to_dict(orient="records") == [ + {"name": "Ada", "persona": "Ada"}, + {"name": "Linus", "persona": "Linus"}, + ] + assert results.count_stage_output_records("compact") == 2 assert metadata["stages"][0]["output"] == "processor:compact" assert metadata["stages"][0]["output_seed_path"].endswith("stage-0-compact/processors-files/compact") @@ -583,12 +600,23 @@ def test_composite_workflow_postprocessors_can_feed_from_processor_artifact( ) workflow.add_stage("final", stage_2) - df = workflow.run().load_dataset().sort_values("compact_name").reset_index(drop=True) + results = workflow.run() + df = results.load_dataset().sort_values("compact_name").reset_index(drop=True) + stage_output = results.load_stage_output("compact").sort_values("compact_name").reset_index(drop=True) + stage_final = results["compact"].load_dataset().sort_values("name").reset_index(drop=True) assert df.to_dict(orient="records") == [ {"compact_name": "Ada", "final": "Ada 
final"}, {"compact_name": "Linus", "final": "Linus final"}, ] + assert stage_output.to_dict(orient="records") == [ + {"compact_name": "Ada"}, + {"compact_name": "Linus"}, + ] + assert stage_final[["name", "persona"]].to_dict(orient="records") == [ + {"name": "Ada", "persona": "Ada"}, + {"name": "Linus", "persona": "Linus"}, + ] def test_composite_workflow_runs_custom_generator_in_downstream_stage( From 558c94621611e1a8df3fcc3bdf93f7da896c3c77 Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Thu, 14 May 2026 21:32:40 +0000 Subject: [PATCH 07/14] fix: align workflow selected output export --- .../interface/composite_workflow.py | 57 +++++++++++++++++-- .../interface/test_composite_workflow.py | 49 ++++++++++++++++ 2 files changed, 102 insertions(+), 4 deletions(-) diff --git a/packages/data-designer/src/data_designer/interface/composite_workflow.py b/packages/data-designer/src/data_designer/interface/composite_workflow.py index 420a0adf0..1f8307cf0 100644 --- a/packages/data-designer/src/data_designer/interface/composite_workflow.py +++ b/packages/data-designer/src/data_designer/interface/composite_workflow.py @@ -17,12 +17,21 @@ from data_designer.config.base import ProcessorConfig from data_designer.config.config_builder import BuilderConfig, DataDesignerConfigBuilder from data_designer.config.data_designer_config import DataDesignerConfig +from data_designer.config.errors import InvalidFileFormatError from data_designer.config.seed import IndexRange, PartitionBlock, SamplingStrategy from data_designer.config.seed_source import LocalFileSeedSource from data_designer.config.utils.constants import DEFAULT_NUM_RECORDS from data_designer.config.version import get_library_version +from data_designer.engine.dataset_builders.errors import ArtifactStorageError from data_designer.interface.errors import DataDesignerWorkflowError -from data_designer.interface.results import DatasetCreationResults +from data_designer.interface.results import ( + SUPPORTED_EXPORT_FORMATS, + 
DatasetCreationResults, + ExportFormat, + _export_csv, + _export_jsonl, + _export_parquet, +) if TYPE_CHECKING: import pandas as pd @@ -121,11 +130,18 @@ def load_stage_output(self, stage_name: str) -> pd.DataFrame: def count_stage_output_records(self, stage_name: str) -> int: return _count_parquet_records(self.get_stage_output_path(stage_name)) - def export(self, *args: Any, **kwargs: Any) -> Path: - return self.final_result.export(*args, **kwargs) + def export(self, path: Path | str, *, format: ExportFormat | None = None) -> Path: + self.final_result + return _export_parquet_dataset(self.get_stage_output_path(self.final_stage_name), Path(path), format=format) def push_to_hub(self, *args: Any, **kwargs: Any) -> str: - return self.final_result.push_to_hub(*args, **kwargs) + final_result = self.final_result + if self.get_stage_output_path(self.final_stage_name) != final_result.artifact_storage.final_dataset_path: + raise DataDesignerWorkflowError( + "push_to_hub() does not support selected workflow outputs yet. " + "Use export() for the selected output, or push the stage result directly." 
+ ) + return final_result.push_to_hub(*args, **kwargs) class CompositeWorkflow: @@ -155,6 +171,7 @@ def add_stage( if num_records is not None and num_records < 1: raise DataDesignerWorkflowError("Stage num_records must be at least 1.") _validate_stage_output(output) + _validate_stage_output_processor(output, config_builder, postprocessors or []) self._stages.append( _WorkflowStage( name=name, @@ -426,6 +443,24 @@ def _load_parquet_dataset(path: Path) -> pd.DataFrame: return lazy.pd.concat([lazy.pd.read_parquet(file_path) for file_path in parquet_files], ignore_index=True) +def _export_parquet_dataset(source_path: Path, output_path: Path, *, format: ExportFormat | None = None) -> Path: + resolved_format: str = format if format is not None else output_path.suffix.lstrip(".").lower() + if resolved_format not in SUPPORTED_EXPORT_FORMATS: + raise InvalidFileFormatError( + f"Unsupported export format: {resolved_format!r}. Choose one of: {', '.join(SUPPORTED_EXPORT_FORMATS)}." + ) + parquet_files = sorted(source_path.glob("*.parquet")) if source_path.is_dir() else [source_path] + if not parquet_files: + raise ArtifactStorageError("No parquet files found to export.") + if resolved_format == "jsonl": + _export_jsonl(parquet_files, output_path) + elif resolved_format == "csv": + _export_csv(parquet_files, output_path) + elif resolved_format == "parquet": + _export_parquet(parquet_files, output_path) + return output_path + + def _write_workflow_metadata(workflow_path: Path, metadata: dict[str, Any]) -> None: path = workflow_path / "workflow-metadata.json" path.write_text(json.dumps(metadata, indent=2, sort_keys=True), encoding="utf-8") @@ -440,6 +475,20 @@ def _validate_stage_output(output: str) -> None: _validate_dir_name(processor_name, "processor output name") +def _validate_stage_output_processor( + output: str, + config_builder: DataDesignerConfigBuilder, + postprocessors: list[ProcessorConfig], +) -> None: + if not output.startswith("processor:"): + return + 
processor_name = output.removeprefix("processor:") + processor_names = {processor.name for processor in config_builder.get_processor_configs()} + processor_names.update(processor.name for processor in postprocessors) + if processor_name not in processor_names: + raise DataDesignerWorkflowError(f"Stage output processor {processor_name!r} is not configured on this stage.") + + def _validate_dir_name(name: str, label: str) -> None: if not name: raise DataDesignerWorkflowError(f"{label} must be a non-empty string.") diff --git a/packages/data-designer/tests/interface/test_composite_workflow.py b/packages/data-designer/tests/interface/test_composite_workflow.py index 775754f5f..d1c147621 100644 --- a/packages/data-designer/tests/interface/test_composite_workflow.py +++ b/packages/data-designer/tests/interface/test_composite_workflow.py @@ -327,6 +327,17 @@ def test_composite_workflow_rejects_invalid_stage_outputs( workflow.add_stage("base", _category_builder(stub_model_configs), output=output) +def test_composite_workflow_rejects_unknown_processor_stage_output( + stub_artifact_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], +) -> None: + workflow = _data_designer(stub_artifact_path, stub_model_providers).compose_workflow(name="unknown-output") + + with pytest.raises(DataDesignerWorkflowError, match="not configured"): + workflow.add_stage("base", _category_builder(stub_model_configs), output="processor:missing") + + def test_composite_workflow_rejects_duplicate_stage_names( stub_artifact_path: Path, stub_model_providers: list[ModelProvider], @@ -619,6 +630,44 @@ def test_composite_workflow_postprocessors_can_feed_from_processor_artifact( ] +def test_composite_workflow_export_uses_selected_final_output( + tmp_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], +) -> None: + stage = _seeded_builder(stub_model_configs, [{"name": "Ada"}, {"name": "Linus"}]) + 
stage.add_column(ExpressionColumnConfig(name="persona", expr="{{ name }}")) + stage.add_processor(SchemaTransformProcessorConfig(name="compact", template={"compact_name": "{{ persona }}"})) + + workflow = _real_data_designer(tmp_path / "artifacts", stub_model_providers).compose_workflow( + name="selected-export" + ) + workflow.add_stage("compact", stage, num_records=2, output="processor:compact") + + output = workflow.run().export(tmp_path / "selected.jsonl") + + assert [json.loads(line) for line in output.read_text(encoding="utf-8").splitlines()] == [ + {"compact_name": "Ada"}, + {"compact_name": "Linus"}, + ] + + +def test_composite_workflow_push_to_hub_rejects_selected_processor_output( + tmp_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], +) -> None: + stage = _seeded_builder(stub_model_configs, [{"name": "Ada"}]) + stage.add_column(ExpressionColumnConfig(name="persona", expr="{{ name }}")) + stage.add_processor(SchemaTransformProcessorConfig(name="compact", template={"compact_name": "{{ persona }}"})) + + workflow = _real_data_designer(tmp_path / "artifacts", stub_model_providers).compose_workflow(name="selected-push") + workflow.add_stage("compact", stage, num_records=1, output="processor:compact") + + with pytest.raises(DataDesignerWorkflowError, match="selected workflow outputs"): + workflow.run().push_to_hub("user/selected", "description") + + def test_composite_workflow_runs_custom_generator_in_downstream_stage( tmp_path: Path, stub_model_providers: list[ModelProvider], From 0ee4c660fec6deca63a41d153c38d190262a374e Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Thu, 14 May 2026 22:11:33 +0000 Subject: [PATCH 08/14] fix: address workflow chaining review issues --- .../dataset_builders/dataset_builder.py | 4 ++ .../interface/composite_workflow.py | 27 ++++++++-- .../data_designer/interface/data_designer.py | 3 +- .../interface/test_composite_workflow.py | 54 +++++++++++++++++++ 
.../tests/interface/test_data_designer.py | 6 ++- 5 files changed, 89 insertions(+), 5 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py index 3b15ed96d..d1d0c3c0b 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py @@ -163,6 +163,10 @@ def __init__( def artifact_storage(self) -> ArtifactStorage: return self._resource_provider.artifact_storage + @property + def data_designer_config(self) -> DataDesignerConfig: + return self._data_designer_config + @property def processors(self) -> tuple[Processor, ...]: return self._processor_runner.processors diff --git a/packages/data-designer/src/data_designer/interface/composite_workflow.py b/packages/data-designer/src/data_designer/interface/composite_workflow.py index 1f8307cf0..13e07043e 100644 --- a/packages/data-designer/src/data_designer/interface/composite_workflow.py +++ b/packages/data-designer/src/data_designer/interface/composite_workflow.py @@ -276,6 +276,7 @@ def run(self) -> CompositeWorkflowResults: ) actual_records = result.count_records() output_result = result + output_source_result = result if stage.postprocessors: postprocessor_builder = _postprocessor_config_builder( stage_builder=stage_builder, @@ -288,13 +289,14 @@ def run(self) -> CompositeWorkflowResults: dataset_name="postprocessors", artifact_path=workflow_path / stage_dir_name, ) + output_source_result = _select_output_result(stage, result, output_result) - output_seed_path = _resolve_stage_output_path(output_result, stage.output) + output_seed_path = _resolve_stage_output_path(output_source_result, stage.output) callback_output_path = None output_records = _count_parquet_records(output_seed_path) if stage.on_success is not None: - 
callback_output_path = Path(stage.on_success(output_result.artifact_storage.base_dataset_path)) + callback_output_path = Path(stage.on_success(result.artifact_storage.base_dataset_path)) output_seed_path = callback_output_path output_records = _count_parquet_records(callback_output_path) @@ -429,6 +431,19 @@ def _resolve_stage_output_path(result: DatasetCreationResults, output: str) -> P raise DataDesignerWorkflowError(f"Stage output processor {processor_name!r} did not produce artifacts.") +def _select_output_result( + stage: _WorkflowStage, + result: DatasetCreationResults, + output_result: DatasetCreationResults, +) -> DatasetCreationResults: + if stage.output == "final": + return output_result + processor_name = stage.output.removeprefix("processor:") + if processor_name in {processor.name for processor in stage.postprocessors}: + return output_result + return result + + def _count_parquet_records(path: Path) -> int: parquet_files = sorted(path.glob("*.parquet")) if path.is_dir() else [path] if not parquet_files: @@ -449,7 +464,7 @@ def _export_parquet_dataset(source_path: Path, output_path: Path, *, format: Exp raise InvalidFileFormatError( f"Unsupported export format: {resolved_format!r}. Choose one of: {', '.join(SUPPORTED_EXPORT_FORMATS)}." 
) - parquet_files = sorted(source_path.glob("*.parquet")) if source_path.is_dir() else [source_path] + parquet_files = _export_parquet_files(source_path) if not parquet_files: raise ArtifactStorageError("No parquet files found to export.") if resolved_format == "jsonl": @@ -461,6 +476,12 @@ def _export_parquet_dataset(source_path: Path, output_path: Path, *, format: Exp return output_path +def _export_parquet_files(source_path: Path) -> list[Path]: + if not source_path.is_dir(): + return [source_path] + return sorted(source_path.glob("batch_*.parquet")) or sorted(source_path.glob("*.parquet")) + + def _write_workflow_metadata(workflow_path: Path, metadata: dict[str, Any]) -> None: path = workflow_path / "workflow-metadata.json" path.write_text(json.dumps(metadata, indent=2, sort_keys=True), encoding="utf-8") diff --git a/packages/data-designer/src/data_designer/interface/data_designer.py b/packages/data-designer/src/data_designer/interface/data_designer.py index 7431754dd..8fb7492d0 100644 --- a/packages/data-designer/src/data_designer/interface/data_designer.py +++ b/packages/data-designer/src/data_designer/interface/data_designer.py @@ -351,7 +351,7 @@ def create( ) try: - profiler_column_configs = config_builder.get_column_configs() or builder._data_designer_config.columns + profiler_column_configs = config_builder.get_column_configs() or builder.data_designer_config.columns profiler = self._create_dataset_profiler( config_builder, resource_provider, @@ -571,6 +571,7 @@ def set_run_config(self, run_config: RunConfig) -> None: due to error-rate thresholds. Errors are still tracked for reporting. """ self._run_config = run_config + self._throttle_manager = self._create_throttle_manager() def get_models(self, model_aliases: list[str]) -> dict[str, ModelFacade]: """Get a dict of ModelFacade instances for custom column development. 
diff --git a/packages/data-designer/tests/interface/test_composite_workflow.py b/packages/data-designer/tests/interface/test_composite_workflow.py index d1c147621..32aa38ee7 100644 --- a/packages/data-designer/tests/interface/test_composite_workflow.py +++ b/packages/data-designer/tests/interface/test_composite_workflow.py @@ -630,6 +630,60 @@ def test_composite_workflow_postprocessors_can_feed_from_processor_artifact( ] +def test_composite_workflow_output_can_select_main_processor_with_postprocessors( + tmp_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], +) -> None: + stage = _seeded_builder(stub_model_configs, [{"name": "Ada", "scratch": "drop me"}]) + stage.add_column(ExpressionColumnConfig(name="persona", expr="{{ name }}")) + stage.add_processor(SchemaTransformProcessorConfig(name="compact", template={"compact_name": "{{ persona }}"})) + + workflow = _real_data_designer(tmp_path / "artifacts", stub_model_providers).compose_workflow( + name="main-processor-output" + ) + workflow.add_stage( + "compact", + stage, + num_records=1, + postprocessors=[DropColumnsProcessorConfig(name="drop_scratch", column_names=["scratch"])], + output="processor:compact", + ) + + assert workflow.run().load_dataset().to_dict(orient="records") == [{"compact_name": "Ada"}] + + +def test_composite_workflow_callback_receives_main_stage_artifact_with_postprocessors( + tmp_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], +) -> None: + stage = _seeded_builder(stub_model_configs, [{"name": "Ada", "scratch": "drop me"}]) + stage.add_column(ExpressionColumnConfig(name="persona", expr="{{ name }}")) + callback_paths = [] + + def keep_postprocessed(stage_path: Path) -> Path: + callback_paths.append(stage_path) + assert (stage_path / "postprocessors" / "parquet-files").is_dir() + return stage_path / "postprocessors" / "parquet-files" + + workflow = _real_data_designer(tmp_path / "artifacts", 
stub_model_providers).compose_workflow( + name="postprocessor-callback" + ) + workflow.add_stage( + "base", + stage, + num_records=1, + postprocessors=[DropColumnsProcessorConfig(name="drop_scratch", column_names=["scratch"])], + on_success=keep_postprocessed, + ) + + result = workflow.run() + + assert callback_paths == [tmp_path / "artifacts" / "postprocessor-callback" / "stage-0-base"] + assert "scratch" not in result.load_dataset().columns + + def test_composite_workflow_export_uses_selected_final_output( tmp_path: Path, stub_model_providers: list[ModelProvider], diff --git a/packages/data-designer/tests/interface/test_data_designer.py b/packages/data-designer/tests/interface/test_data_designer.py index c98987103..e1a7a9bcf 100644 --- a/packages/data-designer/tests/interface/test_data_designer.py +++ b/packages/data-designer/tests/interface/test_data_designer.py @@ -23,7 +23,7 @@ from data_designer.config.errors import InvalidConfigError from data_designer.config.models import ModelProvider from data_designer.config.processors import DropColumnsProcessorConfig -from data_designer.config.run_config import JinjaRenderingEngine, RunConfig +from data_designer.config.run_config import JinjaRenderingEngine, RunConfig, ThrottleConfig from data_designer.config.sampler_params import CategorySamplerParams, DatetimeSamplerParams, SamplerType from data_designer.config.seed import IndexRange, PartitionBlock, SamplingStrategy from data_designer.config.seed_source import ( @@ -657,6 +657,7 @@ def test_init_yaml_default_emits_single_deprecation_warning( def test_run_config_setting_persists(stub_artifact_path, stub_model_providers): """Test that run config setting persists across multiple calls.""" data_designer = DataDesigner(artifact_path=stub_artifact_path, model_providers=stub_model_providers) + original_throttle_manager = data_designer._throttle_manager # Test default values assert data_designer.run_config.disable_early_shutdown is False @@ -675,6 +676,7 @@ def 
test_run_config_setting_persists(stub_artifact_path, stub_model_providers): buffer_size=500, max_conversation_restarts=7, max_conversation_correction_steps=2, + throttle=ThrottleConfig(success_window=7), ) ) assert data_designer.run_config.disable_early_shutdown is True @@ -683,6 +685,8 @@ def test_run_config_setting_persists(stub_artifact_path, stub_model_providers): assert data_designer.run_config.buffer_size == 500 assert data_designer.run_config.max_conversation_restarts == 7 assert data_designer.run_config.max_conversation_correction_steps == 2 + assert data_designer._throttle_manager is not original_throttle_manager + assert data_designer._throttle_manager._success_window == 7 # Test updating values data_designer.set_run_config( From 77e72974ded11885795ffa4439b2704b67ef1be4 Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Thu, 14 May 2026 23:08:00 +0000 Subject: [PATCH 09/14] fix: align workflow parquet export selection Signed-off-by: Andre Manoel --- .../interface/composite_workflow.py | 16 +++--- .../data_designer/interface/data_designer.py | 3 +- .../interface/test_composite_workflow.py | 54 ++++++++++++++++++- 3 files changed, 63 insertions(+), 10 deletions(-) diff --git a/packages/data-designer/src/data_designer/interface/composite_workflow.py b/packages/data-designer/src/data_designer/interface/composite_workflow.py index 13e07043e..f35d56b8c 100644 --- a/packages/data-designer/src/data_designer/interface/composite_workflow.py +++ b/packages/data-designer/src/data_designer/interface/composite_workflow.py @@ -445,14 +445,14 @@ def _select_output_result( def _count_parquet_records(path: Path) -> int: - parquet_files = sorted(path.glob("*.parquet")) if path.is_dir() else [path] + parquet_files = _parquet_files(path) if not parquet_files: raise DataDesignerWorkflowError(f"No parquet files found at {str(path)!r}.") return sum(lazy.pq.read_metadata(file_path).num_rows for file_path in parquet_files) def _load_parquet_dataset(path: Path) -> pd.DataFrame: - 
parquet_files = sorted(path.glob("*.parquet")) if path.is_dir() else [path] + parquet_files = _parquet_files(path) if not parquet_files: raise DataDesignerWorkflowError(f"No parquet files found at {str(path)!r}.") return lazy.pd.concat([lazy.pd.read_parquet(file_path) for file_path in parquet_files], ignore_index=True) @@ -464,7 +464,7 @@ def _export_parquet_dataset(source_path: Path, output_path: Path, *, format: Exp raise InvalidFileFormatError( f"Unsupported export format: {resolved_format!r}. Choose one of: {', '.join(SUPPORTED_EXPORT_FORMATS)}." ) - parquet_files = _export_parquet_files(source_path) + parquet_files = _parquet_files(source_path) if not parquet_files: raise ArtifactStorageError("No parquet files found to export.") if resolved_format == "jsonl": @@ -476,10 +476,10 @@ def _export_parquet_dataset(source_path: Path, output_path: Path, *, format: Exp return output_path -def _export_parquet_files(source_path: Path) -> list[Path]: - if not source_path.is_dir(): - return [source_path] - return sorted(source_path.glob("batch_*.parquet")) or sorted(source_path.glob("*.parquet")) +def _parquet_files(path: Path) -> list[Path]: + if not path.is_dir(): + return [path] + return sorted(path.glob("*.parquet")) def _write_workflow_metadata(workflow_path: Path, metadata: dict[str, Any]) -> None: @@ -513,6 +513,8 @@ def _validate_stage_output_processor( def _validate_dir_name(name: str, label: str) -> None: if not name: raise DataDesignerWorkflowError(f"{label} must be a non-empty string.") + if name in {".", ".."}: + raise DataDesignerWorkflowError(f"{label} {name!r} is not allowed.") invalid_chars = {"<", ">", ":", '"', "/", "\\", "|", "?", "*"} if any(char in name for char in invalid_chars): raise DataDesignerWorkflowError(f"{label} {name!r} contains invalid path characters.") diff --git a/packages/data-designer/src/data_designer/interface/data_designer.py b/packages/data-designer/src/data_designer/interface/data_designer.py index f615b0c73..202eb6ac2 100644 --- 
a/packages/data-designer/src/data_designer/interface/data_designer.py +++ b/packages/data-designer/src/data_designer/interface/data_designer.py @@ -11,6 +11,7 @@ import data_designer.lazy_heavy_imports as lazy from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults +from data_designer.config.column_types import ColumnConfigT from data_designer.config.config_builder import DataDesignerConfigBuilder from data_designer.config.data_designer_config import DataDesignerConfig from data_designer.config.default_model_settings import ( @@ -641,7 +642,7 @@ def _create_dataset_profiler( config_builder: DataDesignerConfigBuilder, resource_provider: ResourceProvider, *, - column_configs: list | None = None, + column_configs: list[ColumnConfigT] | None = None, ) -> DataDesignerDatasetProfiler: return DataDesignerDatasetProfiler( config=DatasetProfilerConfig( diff --git a/packages/data-designer/tests/interface/test_composite_workflow.py b/packages/data-designer/tests/interface/test_composite_workflow.py index 32aa38ee7..efb0e1994 100644 --- a/packages/data-designer/tests/interface/test_composite_workflow.py +++ b/packages/data-designer/tests/interface/test_composite_workflow.py @@ -280,6 +280,28 @@ def empty_output(stage_path: Path) -> Path: results.load_dataset() +def test_composite_workflow_empty_callback_fails_by_default( + stub_artifact_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], + stub_dataset_profiler_results, +) -> None: + data_designer = _data_designer(stub_artifact_path, stub_model_providers) + _patch_create(data_designer, stub_dataset_profiler_results) + + def empty_output(stage_path: Path) -> Path: + output_path = stage_path / "callback-outputs" / "empty" + output_path.mkdir(parents=True) + lazy.pd.DataFrame({"category": []}).to_parquet(output_path / "data.parquet", index=False) + return output_path + + workflow = data_designer.compose_workflow(name="empty-default") + 
workflow.add_stage("base", _category_builder(stub_model_configs), num_records=2, on_success=empty_output) + + with pytest.raises(DataDesignerWorkflowError, match="produced an empty output"): + workflow.run() + + def test_composite_workflow_empty_workflow_fails_before_artifacts( stub_artifact_path: Path, stub_model_providers: list[ModelProvider], @@ -292,7 +314,7 @@ def test_composite_workflow_empty_workflow_fails_before_artifacts( assert not (stub_artifact_path / "empty-workflow").exists() -@pytest.mark.parametrize("name", ["bad/name", "bad*name", ""]) +@pytest.mark.parametrize("name", ["bad/name", "bad*name", "", ".", ".."]) def test_composite_workflow_rejects_invalid_workflow_names( name: str, stub_model_providers: list[ModelProvider], @@ -301,7 +323,7 @@ def test_composite_workflow_rejects_invalid_workflow_names( DataDesigner(model_providers=stub_model_providers).compose_workflow(name=name) -@pytest.mark.parametrize("name", ["bad/name", "bad*name", ""]) +@pytest.mark.parametrize("name", ["bad/name", "bad*name", "", ".", ".."]) def test_composite_workflow_rejects_invalid_stage_names( name: str, stub_artifact_path: Path, @@ -706,6 +728,34 @@ def test_composite_workflow_export_uses_selected_final_output( ] +def test_composite_workflow_export_matches_selected_output_files( + tmp_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], +) -> None: + stage = _seeded_builder(stub_model_configs, [{"name": "Ada"}]) + stage.add_column(ExpressionColumnConfig(name="persona", expr="{{ name }}")) + stage.add_processor(SchemaTransformProcessorConfig(name="compact", template={"compact_name": "{{ persona }}"})) + + workflow = _real_data_designer(tmp_path / "artifacts", stub_model_providers).compose_workflow(name="mixed-export") + workflow.add_stage("compact", stage, num_records=1, output="processor:compact") + results = workflow.run() + + lazy.pd.DataFrame({"compact_name": ["Linus"]}).to_parquet( + results.get_stage_output_path("compact") 
/ "extra.parquet", + index=False, + ) + + output = results.export(tmp_path / "mixed.jsonl") + + rows = [json.loads(line) for line in output.read_text(encoding="utf-8").splitlines()] + assert sorted(rows, key=lambda row: row["compact_name"]) == [ + {"compact_name": "Ada"}, + {"compact_name": "Linus"}, + ] + assert results.count_records() == 2 + + def test_composite_workflow_push_to_hub_rejects_selected_processor_output( tmp_path: Path, stub_model_providers: list[ModelProvider], From 9d9d1d1eba999f68970529b6aef85d8232224f8a Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Thu, 14 May 2026 23:38:48 +0000 Subject: [PATCH 10/14] fix: preserve generated columns in drop validation Signed-off-by: Andre Manoel --- .../src/data_designer/engine/validation.py | 6 ++++-- .../tests/engine/test_validation.py | 15 +++++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/validation.py b/packages/data-designer-engine/src/data_designer/engine/validation.py index 2b62a0d42..ed166577b 100644 --- a/packages/data-designer-engine/src/data_designer/engine/validation.py +++ b/packages/data-designer-engine/src/data_designer/engine/validation.py @@ -266,8 +266,10 @@ def validate_columns_not_all_dropped( c for c in columns if not c.drop - and c.name not in processor_dropped_columns - and (c.column_type != DataDesignerColumnType.SEED_DATASET or processor_configs) + and ( + c.column_type != DataDesignerColumnType.SEED_DATASET + or (processor_configs and c.name not in processor_dropped_columns) + ) ] if len(remaining_cols) == 0: diff --git a/packages/data-designer-engine/tests/engine/test_validation.py b/packages/data-designer-engine/tests/engine/test_validation.py index f8e91a728..38af52947 100644 --- a/packages/data-designer-engine/tests/engine/test_validation.py +++ b/packages/data-designer-engine/tests/engine/test_validation.py @@ -292,6 +292,21 @@ def 
test_validate_columns_not_all_dropped_rejects_seeded_processor_only_config_w assert violations[0].type == ViolationType.ALL_COLUMNS_DROPPED +def test_validate_columns_not_all_dropped_allows_generated_columns_dropped_by_processors(): + violations = validate_columns_not_all_dropped( + [ + LLMTextColumnConfig(name="question", prompt="Generate a question.", model_alias=STUB_MODEL_ALIAS), + LLMTextColumnConfig(name="answer", prompt="Answer {{ question }}.", model_alias=STUB_MODEL_ALIAS), + ], + processor_configs=[ + DropColumnsProcessorConfig(name="drop_raw", column_names=["question", "answer"]), + SchemaTransformProcessorConfig(name="format", template={"messages": "{{ question }} {{ answer }}"}), + ], + ) + + assert violations == [] + + def test_validate_columns_not_all_dropped_still_rejects_seed_only_config(): violations = validate_columns_not_all_dropped([SeedDatasetColumnConfig(name="seed_text")]) From 881aba09dabfb6518ab16681a872639e0bad973c Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Fri, 15 May 2026 19:12:38 +0000 Subject: [PATCH 11/14] fix: clarify workflow output processors Signed-off-by: Andre Manoel --- .../data_designer/config/preview_results.py | 5 +- .../engine/resources/seed_reader.py | 2 + .../src/data_designer/interface/__init__.py | 2 + .../interface/composite_workflow.py | 110 ++++++++++++------ .../data_designer/interface/data_designer.py | 4 +- .../src/data_designer/interface/results.py | 6 +- .../interface/test_composite_workflow.py | 102 +++++++++++++--- plans/workflow-chaining/workflow-chaining.md | 24 ++-- 8 files changed, 183 insertions(+), 72 deletions(-) diff --git a/packages/data-designer-config/src/data_designer/config/preview_results.py b/packages/data-designer-config/src/data_designer/config/preview_results.py index 6ea90ab59..65880dccb 100644 --- a/packages/data-designer-config/src/data_designer/config/preview_results.py +++ b/packages/data-designer-config/src/data_designer/config/preview_results.py @@ -44,7 +44,10 @@ def __init__( 
self._config_builder = config_builder def to_config_builder(self, columns: list[str] | None = None) -> DataDesignerConfigBuilder: - """Create a new config builder seeded from this preview dataset.""" + """Create a new config builder seeded from this preview dataset. + + Copies the full preview dataset in memory; intended for interactive use. + """ if self.dataset is None: raise ValueError("Preview dataset is not available.") df = self.dataset diff --git a/packages/data-designer-engine/src/data_designer/engine/resources/seed_reader.py b/packages/data-designer-engine/src/data_designer/engine/resources/seed_reader.py index 68dece454..8f2574cf2 100644 --- a/packages/data-designer-engine/src/data_designer/engine/resources/seed_reader.py +++ b/packages/data-designer-engine/src/data_designer/engine/resources/seed_reader.py @@ -674,6 +674,8 @@ def add_reader(self, reader: SeedReader) -> Self: return self def get_reader(self, seed_dataset_source: SeedSource, secret_resolver: SecretResolver) -> SeedReader: + # attach() mutates top-level source/resolver state. Reader subclasses must + # not keep nested mutable state shared across attaches. 
reader = copy(self._get_reader_for_source(seed_dataset_source)) reader.attach(seed_dataset_source, secret_resolver) return reader diff --git a/packages/data-designer/src/data_designer/interface/__init__.py b/packages/data-designer/src/data_designer/interface/__init__.py index 351d023f0..febf02f55 100644 --- a/packages/data-designer/src/data_designer/interface/__init__.py +++ b/packages/data-designer/src/data_designer/interface/__init__.py @@ -12,6 +12,7 @@ CompositeWorkflow, CompositeWorkflowResults, SkippedStageResult, + SkippedStageStatus, ) from data_designer.interface.data_designer import DataDesigner # noqa: F401 from data_designer.interface.errors import ( # noqa: F401 @@ -33,6 +34,7 @@ "DatasetCreationResults": ("data_designer.interface.results", "DatasetCreationResults"), "ResumeMode": ("data_designer.engine.storage.artifact_storage", "ResumeMode"), "SkippedStageResult": ("data_designer.interface.composite_workflow", "SkippedStageResult"), + "SkippedStageStatus": ("data_designer.interface.composite_workflow", "SkippedStageStatus"), } __all__ = list(_LAZY_IMPORTS.keys()) diff --git a/packages/data-designer/src/data_designer/interface/composite_workflow.py b/packages/data-designer/src/data_designer/interface/composite_workflow.py index f35d56b8c..44f80999d 100644 --- a/packages/data-designer/src/data_designer/interface/composite_workflow.py +++ b/packages/data-designer/src/data_designer/interface/composite_workflow.py @@ -21,6 +21,7 @@ from data_designer.config.seed import IndexRange, PartitionBlock, SamplingStrategy from data_designer.config.seed_source import LocalFileSeedSource from data_designer.config.utils.constants import DEFAULT_NUM_RECORDS +from data_designer.config.utils.type_helpers import StrEnum from data_designer.config.version import get_library_version from data_designer.engine.dataset_builders.errors import ArtifactStorageError from data_designer.interface.errors import DataDesignerWorkflowError @@ -53,26 +54,30 @@ class _WorkflowStage: 
num_records: int | None on_success: OnSuccessCallback | None on_success_version: str | None - postprocessors: tuple[ProcessorConfig, ...] + output_processors: tuple[ProcessorConfig, ...] output: str allow_empty: bool sampling_strategy: SamplingStrategy selection_strategy: IndexRange | PartitionBlock | None +class SkippedStageStatus(StrEnum): + SKIPPED_EMPTY_UPSTREAM = "skipped_empty_upstream" + + @dataclass(frozen=True) class SkippedStageResult: - status: str + status: SkippedStageStatus upstream_stage: str class CompositeWorkflowResults: """Results for a composite workflow run. - Per-stage entries are the effective ``DataDesigner.create()`` results. If a - stage uses ``on_success``, metadata and downstream seeding use the callback - output path while the stage result still points at the stage's dataset. - Use ``load_stage_output()`` to load the selected output handed downstream. + Per-stage entries are the effective ``DataDesigner.create()`` results. For + stages with ``output_processors``, this is the output-processor create + result. Use ``load_stage_output()`` to load the selected output handed + downstream; it follows ``output="processor:"`` and ``on_success``. """ def __init__( @@ -102,26 +107,31 @@ def items(self) -> ItemsView[str, DatasetCreationResults | SkippedStageResult]: @property def final_result(self) -> DatasetCreationResults: + return self._require_final_result() + + def _require_final_result(self) -> DatasetCreationResults: result = self.stage_results[self.final_stage_name] if isinstance(result, SkippedStageResult): - raise DataDesignerWorkflowError(f"Final stage {self.final_stage_name!r} was skipped: {result.status}.") + raise DataDesignerWorkflowError( + f"Final stage {self.final_stage_name!r} was skipped: {result.status.value}." 
+ ) return result def load_dataset(self) -> pd.DataFrame: - self.final_result + self._require_final_result() return self.load_stage_output(self.final_stage_name) def load_analysis(self) -> DatasetProfilerResults: return self.final_result.load_analysis() def count_records(self) -> int: - self.final_result + self._require_final_result() return self.count_stage_output_records(self.final_stage_name) def get_stage_output_path(self, stage_name: str) -> Path: result = self.stage_results[stage_name] if isinstance(result, SkippedStageResult): - raise DataDesignerWorkflowError(f"Stage {stage_name!r} was skipped: {result.status}.") + raise DataDesignerWorkflowError(f"Stage {stage_name!r} was skipped: {result.status.value}.") return self._stage_output_paths.get(stage_name, result.artifact_storage.final_dataset_path) def load_stage_output(self, stage_name: str) -> pd.DataFrame: @@ -131,7 +141,7 @@ def count_stage_output_records(self, stage_name: str) -> int: return _count_parquet_records(self.get_stage_output_path(stage_name)) def export(self, path: Path | str, *, format: ExportFormat | None = None) -> Path: - self.final_result + self._require_final_result() return _export_parquet_dataset(self.get_stage_output_path(self.final_stage_name), Path(path), format=format) def push_to_hub(self, *args: Any, **kwargs: Any) -> str: @@ -159,19 +169,29 @@ def add_stage( num_records: int | None = None, on_success: OnSuccessCallback | None = None, on_success_version: str | None = None, - postprocessors: list[ProcessorConfig] | None = None, + output_processors: list[ProcessorConfig] | None = None, output: str = "final", allow_empty: bool = False, sampling_strategy: SamplingStrategy = SamplingStrategy.ORDERED, selection_strategy: IndexRange | PartitionBlock | None = None, ) -> CompositeWorkflow: + """Add a stage to the workflow. + + Processors on ``config_builder`` run inside the main stage. 
Use + ``output_processors`` for stage-boundary transforms whose output should + feed downstream stages by default. ``output="processor:"`` selects + a named processor artifact, and ``on_success`` can override the selected + output by returning a parquet file or directory. + """ _validate_dir_name(name, "stage name") if any(stage.name == name for stage in self._stages): raise DataDesignerWorkflowError(f"Stage name {name!r} is already used in workflow {self.name!r}.") if num_records is not None and num_records < 1: raise DataDesignerWorkflowError("Stage num_records must be at least 1.") _validate_stage_output(output) - _validate_stage_output_processor(output, config_builder, postprocessors or []) + output_processors = output_processors or [] + _validate_distinct_output_processors(config_builder, output_processors) + _validate_stage_output_processor(output, config_builder, output_processors) self._stages.append( _WorkflowStage( name=name, @@ -180,7 +200,7 @@ def add_stage( num_records=num_records, on_success=on_success, on_success_version=on_success_version, - postprocessors=_clone_processors(postprocessors or []), + output_processors=_clone_processors(output_processors), output=output, allow_empty=allow_empty, sampling_strategy=sampling_strategy, @@ -222,7 +242,7 @@ def run(self) -> CompositeWorkflowResults: } ) stage_results[stage.name] = SkippedStageResult( - status="skipped_empty_upstream", + status=SkippedStageStatus.SKIPPED_EMPTY_UPSTREAM, upstream_stage=skipped_upstream_stage, ) _write_workflow_metadata(workflow_path, metadata) @@ -277,28 +297,27 @@ def run(self) -> CompositeWorkflowResults: actual_records = result.count_records() output_result = result output_source_result = result - if stage.postprocessors: - postprocessor_builder = _postprocessor_config_builder( + if stage.output_processors: + output_processor_builder = _output_processor_config_builder( stage_builder=stage_builder, seed_path=result.artifact_storage.final_dataset_path, - 
postprocessors=stage.postprocessors, + output_processors=stage.output_processors, ) output_result = self._data_designer.create( - postprocessor_builder, + output_processor_builder, num_records=actual_records, - dataset_name="postprocessors", + dataset_name="output-processors", artifact_path=workflow_path / stage_dir_name, ) output_source_result = _select_output_result(stage, result, output_result) - output_seed_path = _resolve_stage_output_path(output_source_result, stage.output) callback_output_path = None - output_records = _count_parquet_records(output_seed_path) - if stage.on_success is not None: callback_output_path = Path(stage.on_success(result.artifact_storage.base_dataset_path)) output_seed_path = callback_output_path - output_records = _count_parquet_records(callback_output_path) + else: + output_seed_path = _resolve_stage_output_path(output_source_result, stage.output) + output_records = _count_parquet_records(output_seed_path) if output_records == 0: if not stage.allow_empty: @@ -315,8 +334,8 @@ def run(self) -> CompositeWorkflowResults: "output_records": output_records, "output_seed_path": str(output_seed_path), "callback_output_path": str(callback_output_path) if callback_output_path else None, - "postprocessor_output_path": ( - str(output_result.artifact_storage.base_dataset_path) if stage.postprocessors else None + "output_processor_output_path": ( + str(output_result.artifact_storage.base_dataset_path) if stage.output_processors else None ), "duration_sec": time.monotonic() - start_time, } @@ -350,17 +369,17 @@ def _clone_processors(processors: list[ProcessorConfig]) -> tuple[ProcessorConfi return tuple(processor.model_copy(deep=True) for processor in processors) -def _postprocessor_config_builder( +def _output_processor_config_builder( *, stage_builder: DataDesignerConfigBuilder, seed_path: Path, - postprocessors: tuple[ProcessorConfig, ...], + output_processors: tuple[ProcessorConfig, ...], ) -> DataDesignerConfigBuilder: builder = 
DataDesignerConfigBuilder( model_configs=stage_builder.model_configs, tool_configs=stage_builder.tool_configs, ).with_seed_dataset(_local_seed_source_from_path(seed_path)) - for processor in postprocessors: + for processor in output_processors: builder.add_processor(processor.model_copy(deep=True)) return builder @@ -377,7 +396,7 @@ def _base_stage_metadata(index: int, stage: _WorkflowStage, stage_dir_name: str) "depends_on": list(stage.depends_on), "allow_empty": stage.allow_empty, "on_success_version": stage.on_success_version, - "postprocessors": [processor.model_dump(mode="json") for processor in stage.postprocessors], + "output_processors": [processor.model_dump(mode="json") for processor in stage.output_processors], "output": stage.output, "sampling_strategy": stage.sampling_strategy.value, "selection_strategy": _selection_strategy_payload(stage.selection_strategy), @@ -398,7 +417,7 @@ def _stage_fingerprint( "selection_strategy": _selection_strategy_payload(stage.selection_strategy), "allow_empty": stage.allow_empty, "on_success_version": stage.on_success_version, - "postprocessors": [processor.model_dump(mode="json") for processor in stage.postprocessors], + "output_processors": [processor.model_dump(mode="json") for processor in stage.output_processors], "output": stage.output, "library_version": get_library_version(), "upstream_fingerprint": upstream_fingerprint, @@ -439,7 +458,7 @@ def _select_output_result( if stage.output == "final": return output_result processor_name = stage.output.removeprefix("processor:") - if processor_name in {processor.name for processor in stage.postprocessors}: + if processor_name in {processor.name for processor in stage.output_processors}: return output_result return result @@ -448,14 +467,20 @@ def _count_parquet_records(path: Path) -> int: parquet_files = _parquet_files(path) if not parquet_files: raise DataDesignerWorkflowError(f"No parquet files found at {str(path)!r}.") - return 
sum(lazy.pq.read_metadata(file_path).num_rows for file_path in parquet_files) + try: + return sum(lazy.pq.read_metadata(file_path).num_rows for file_path in parquet_files) + except Exception as e: + raise DataDesignerWorkflowError(f"Failed to read parquet files at {str(path)!r}: {e}") from e def _load_parquet_dataset(path: Path) -> pd.DataFrame: parquet_files = _parquet_files(path) if not parquet_files: raise DataDesignerWorkflowError(f"No parquet files found at {str(path)!r}.") - return lazy.pd.concat([lazy.pd.read_parquet(file_path) for file_path in parquet_files], ignore_index=True) + try: + return lazy.pd.concat([lazy.pd.read_parquet(file_path) for file_path in parquet_files], ignore_index=True) + except Exception as e: + raise DataDesignerWorkflowError(f"Failed to read parquet files at {str(path)!r}: {e}") from e def _export_parquet_dataset(source_path: Path, output_path: Path, *, format: ExportFormat | None = None) -> Path: @@ -477,6 +502,8 @@ def _export_parquet_dataset(source_path: Path, output_path: Path, *, format: Exp def _parquet_files(path: Path) -> list[Path]: + if not path.exists(): + return [] if not path.is_dir(): return [path] return sorted(path.glob("*.parquet")) @@ -499,17 +526,28 @@ def _validate_stage_output(output: str) -> None: def _validate_stage_output_processor( output: str, config_builder: DataDesignerConfigBuilder, - postprocessors: list[ProcessorConfig], + output_processors: list[ProcessorConfig], ) -> None: if not output.startswith("processor:"): return processor_name = output.removeprefix("processor:") processor_names = {processor.name for processor in config_builder.get_processor_configs()} - processor_names.update(processor.name for processor in postprocessors) + processor_names.update(processor.name for processor in output_processors) if processor_name not in processor_names: raise DataDesignerWorkflowError(f"Stage output processor {processor_name!r} is not configured on this stage.") +def _validate_distinct_output_processors( + 
config_builder: DataDesignerConfigBuilder, + output_processors: list[ProcessorConfig], +) -> None: + stage_processor_names = {processor.name for processor in config_builder.get_processor_configs()} + duplicate_names = stage_processor_names.intersection(processor.name for processor in output_processors) + if duplicate_names: + names = ", ".join(sorted(duplicate_names)) + raise DataDesignerWorkflowError(f"Output processor names must be distinct from stage processor names: {names}.") + + def _validate_dir_name(name: str, label: str) -> None: if not name: raise DataDesignerWorkflowError(f"{label} must be a non-empty string.") diff --git a/packages/data-designer/src/data_designer/interface/data_designer.py b/packages/data-designer/src/data_designer/interface/data_designer.py index 202eb6ac2..e8ba704f0 100644 --- a/packages/data-designer/src/data_designer/interface/data_designer.py +++ b/packages/data-designer/src/data_designer/interface/data_designer.py @@ -64,6 +64,7 @@ SecretResolver, ) from data_designer.engine.storage.artifact_storage import ArtifactStorage, ResumeMode +from data_designer.interface.composite_workflow import CompositeWorkflow from data_designer.interface.errors import ( DataDesignerEarlyShutdownError, DataDesignerGenerationError, @@ -77,7 +78,6 @@ if TYPE_CHECKING: from data_designer.engine.models.clients.throttle_manager import ThrottleManager from data_designer.engine.models.facade import ModelFacade - from data_designer.interface.composite_workflow import CompositeWorkflow logger = logging.getLogger(__name__) @@ -498,8 +498,6 @@ def preview( ) def compose_workflow(self, *, name: str) -> CompositeWorkflow: - from data_designer.interface.composite_workflow import CompositeWorkflow - return CompositeWorkflow(name=name, data_designer=self) def _log_jinja_rendering_engine_mode(self) -> None: diff --git a/packages/data-designer/src/data_designer/interface/results.py b/packages/data-designer/src/data_designer/interface/results.py index 
f44a1b72a..a7038866c 100644 --- a/packages/data-designer/src/data_designer/interface/results.py +++ b/packages/data-designer/src/data_designer/interface/results.py @@ -87,7 +87,11 @@ def load_dataset(self) -> pd.DataFrame: return self.artifact_storage.load_dataset() def to_config_builder(self, columns: list[str] | None = None) -> DataDesignerConfigBuilder: - """Create a new config builder seeded from this result dataset.""" + """Create a new config builder seeded from this result dataset. + + Loads the full dataset into memory; intended for interactive use. For + production pipelines, prefer ``CompositeWorkflow``. + """ df = self.load_dataset() if columns is not None: df = df.loc[:, columns] diff --git a/packages/data-designer/tests/interface/test_composite_workflow.py b/packages/data-designer/tests/interface/test_composite_workflow.py index efb0e1994..6af00073b 100644 --- a/packages/data-designer/tests/interface/test_composite_workflow.py +++ b/packages/data-designer/tests/interface/test_composite_workflow.py @@ -21,7 +21,7 @@ from data_designer.config.seed_source_dataframe import DataFrameSeedSource from data_designer.engine.secret_resolver import PlaintextResolver from data_designer.engine.storage.artifact_storage import ArtifactStorage, BatchStage -from data_designer.interface.composite_workflow import SkippedStageResult +from data_designer.interface.composite_workflow import SkippedStageResult, SkippedStageStatus from data_designer.interface.data_designer import DataDesigner from data_designer.interface.errors import DataDesignerWorkflowError from data_designer.interface.results import DatasetCreationResults @@ -275,6 +275,7 @@ def empty_output(stage_path: Path) -> Path: results = workflow.run() assert isinstance(results["copy"], SkippedStageResult) + assert results["copy"].status == SkippedStageStatus.SKIPPED_EMPTY_UPSTREAM assert results["copy"].upstream_stage == "base" with pytest.raises(DataDesignerWorkflowError, match="Final stage 'copy' was skipped"): 
results.load_dataset() @@ -360,6 +361,23 @@ def test_composite_workflow_rejects_unknown_processor_stage_output( workflow.add_stage("base", _category_builder(stub_model_configs), output="processor:missing") +def test_composite_workflow_rejects_duplicate_output_processor_names( + stub_artifact_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], +) -> None: + stage = _category_builder(stub_model_configs) + stage.add_processor(SchemaTransformProcessorConfig(name="compact", template={"category": "{{ category }}"})) + workflow = _data_designer(stub_artifact_path, stub_model_providers).compose_workflow(name="duplicate-processor") + + with pytest.raises(DataDesignerWorkflowError, match="distinct"): + workflow.add_stage( + "base", + stage, + output_processors=[SchemaTransformProcessorConfig(name="compact", template={"text": "{{ category }}"})], + ) + + def test_composite_workflow_rejects_duplicate_stage_names( stub_artifact_path: Path, stub_model_providers: list[ModelProvider], @@ -512,6 +530,52 @@ def use_processor_output(stage_path: Path) -> Path: ] +def test_composite_workflow_missing_callback_output_raises_workflow_error( + tmp_path: Path, + stub_model_providers: list[ModelProvider], + stub_model_configs: list[ModelConfig], +) -> None: + stage = _seeded_builder(stub_model_configs, [{"name": "Ada"}]) + stage.add_column(ExpressionColumnConfig(name="persona", expr="{{ name }}")) + + def missing_output(stage_path: Path) -> Path: + return stage_path / "callback-outputs" / "missing" + + workflow = _real_data_designer(tmp_path / "artifacts", stub_model_providers).compose_workflow( + name="missing-callback" + ) + workflow.add_stage("base", stage, num_records=1, on_success=missing_output) + + with pytest.raises(DataDesignerWorkflowError, match="No parquet files found"): + workflow.run() + + +def test_composite_workflow_callback_replaces_selected_output_before_counting( + tmp_path: Path, + stub_model_providers: list[ModelProvider], + 
stub_model_configs: list[ModelConfig], +) -> None: + stage = _seeded_builder(stub_model_configs, [{"name": "Ada", "secret": "hidden"}]) + stage.add_column(ExpressionColumnConfig(name="persona", expr="{{ name }}")) + stage.add_processor(DropColumnsProcessorConfig(name="drop_secret", column_names=["secret"])) + + def use_main_output(stage_path: Path) -> Path: + return stage_path / "parquet-files" + + workflow = _real_data_designer(tmp_path / "artifacts", stub_model_providers).compose_workflow( + name="callback-replaces-output" + ) + workflow.add_stage( + "base", + stage, + num_records=1, + output="processor:drop_secret", + on_success=use_main_output, + ) + + assert workflow.run().load_dataset().to_dict(orient="records") == [{"name": "Ada", "persona": "Ada"}] + + def test_composite_workflow_runs_seeded_processor_only_stage( tmp_path: Path, stub_model_providers: list[ModelProvider], @@ -540,7 +604,7 @@ def test_composite_workflow_runs_seeded_processor_only_stage( assert final.to_dict(orient="records") == [{"name": "Ada", "public_name": "Ada", "final": "Ada final"}] -def test_composite_workflow_postprocessors_transform_stage_output( +def test_composite_workflow_output_processors_transform_stage_output( tmp_path: Path, stub_model_providers: list[ModelProvider], stub_model_configs: list[ModelConfig], @@ -551,24 +615,24 @@ def test_composite_workflow_postprocessors_transform_stage_output( stage_2 = _expression_builder(stub_model_configs, "final", "{{ public_name }} final") workflow = _real_data_designer(tmp_path / "artifacts", stub_model_providers).compose_workflow( - name="postprocessor-stage" + name="output_processor-stage" ) workflow.add_stage( "base", stage_1, num_records=1, - postprocessors=[DropColumnsProcessorConfig(name="drop_secret", column_names=["secret"])], + output_processors=[DropColumnsProcessorConfig(name="drop_secret", column_names=["secret"])], ) workflow.add_stage("final", stage_2) results = workflow.run() base = results["base"].load_dataset() final = 
results.load_dataset() - metadata = _load_workflow_metadata(tmp_path / "artifacts", "postprocessor-stage") + metadata = _load_workflow_metadata(tmp_path / "artifacts", "output_processor-stage") assert "secret" not in base.columns assert final.to_dict(orient="records") == [{"name": "Ada", "public_name": "Ada", "final": "Ada final"}] - assert metadata["stages"][0]["postprocessor_output_path"].endswith("stage-0-base/postprocessors") + assert metadata["stages"][0]["output_processor_output_path"].endswith("stage-0-base/output-processors") def test_composite_workflow_output_can_select_processor_artifact( @@ -611,7 +675,7 @@ def test_composite_workflow_output_can_select_processor_artifact( assert metadata["stages"][0]["output_seed_path"].endswith("stage-0-compact/processors-files/compact") -def test_composite_workflow_postprocessors_can_feed_from_processor_artifact( +def test_composite_workflow_output_processors_can_feed_from_processor_artifact( tmp_path: Path, stub_model_providers: list[ModelProvider], stub_model_configs: list[ModelConfig], @@ -622,13 +686,13 @@ def test_composite_workflow_postprocessors_can_feed_from_processor_artifact( stage_2 = _expression_builder(stub_model_configs, "final", "{{ compact_name }} final") workflow = _real_data_designer(tmp_path / "artifacts", stub_model_providers).compose_workflow( - name="postprocessor-output" + name="output_processor-output" ) workflow.add_stage( "compact", stage_1, num_records=2, - postprocessors=[SchemaTransformProcessorConfig(name="compact", template={"compact_name": "{{ persona }}"})], + output_processors=[SchemaTransformProcessorConfig(name="compact", template={"compact_name": "{{ persona }}"})], output="processor:compact", ) workflow.add_stage("final", stage_2) @@ -652,7 +716,7 @@ def test_composite_workflow_postprocessors_can_feed_from_processor_artifact( ] -def test_composite_workflow_output_can_select_main_processor_with_postprocessors( +def 
test_composite_workflow_output_can_select_main_processor_with_output_processors( tmp_path: Path, stub_model_providers: list[ModelProvider], stub_model_configs: list[ModelConfig], @@ -668,14 +732,14 @@ def test_composite_workflow_output_can_select_main_processor_with_postprocessors "compact", stage, num_records=1, - postprocessors=[DropColumnsProcessorConfig(name="drop_scratch", column_names=["scratch"])], + output_processors=[DropColumnsProcessorConfig(name="drop_scratch", column_names=["scratch"])], output="processor:compact", ) assert workflow.run().load_dataset().to_dict(orient="records") == [{"compact_name": "Ada"}] -def test_composite_workflow_callback_receives_main_stage_artifact_with_postprocessors( +def test_composite_workflow_callback_receives_main_stage_artifact_with_output_processors( tmp_path: Path, stub_model_providers: list[ModelProvider], stub_model_configs: list[ModelConfig], @@ -684,25 +748,25 @@ def test_composite_workflow_callback_receives_main_stage_artifact_with_postproce stage.add_column(ExpressionColumnConfig(name="persona", expr="{{ name }}")) callback_paths = [] - def keep_postprocessed(stage_path: Path) -> Path: + def keep_output_processed(stage_path: Path) -> Path: callback_paths.append(stage_path) - assert (stage_path / "postprocessors" / "parquet-files").is_dir() - return stage_path / "postprocessors" / "parquet-files" + assert (stage_path / "output-processors" / "parquet-files").is_dir() + return stage_path / "output-processors" / "parquet-files" workflow = _real_data_designer(tmp_path / "artifacts", stub_model_providers).compose_workflow( - name="postprocessor-callback" + name="output_processor-callback" ) workflow.add_stage( "base", stage, num_records=1, - postprocessors=[DropColumnsProcessorConfig(name="drop_scratch", column_names=["scratch"])], - on_success=keep_postprocessed, + output_processors=[DropColumnsProcessorConfig(name="drop_scratch", column_names=["scratch"])], + on_success=keep_output_processed, ) result = 
workflow.run() - assert callback_paths == [tmp_path / "artifacts" / "postprocessor-callback" / "stage-0-base"] + assert callback_paths == [tmp_path / "artifacts" / "output_processor-callback" / "stage-0-base"] assert "scratch" not in result.load_dataset().columns diff --git a/plans/workflow-chaining/workflow-chaining.md b/plans/workflow-chaining/workflow-chaining.md index 06c24211a..dc7358511 100644 --- a/plans/workflow-chaining/workflow-chaining.md +++ b/plans/workflow-chaining/workflow-chaining.md @@ -91,18 +91,18 @@ This is the *only* place in the chaining surface that uses an in-memory handoff. Each stage seeds from the **previous stage's final dataset** - the post-processor output with dropped columns excluded. This is the same DataFrame returned by `DatasetCreationResults.load_dataset()`. -Processor outputs (named processor artifacts), schema-transform artifacts, dropped columns, and media assets (images stored on disk with relative paths in the DataFrame) are NOT automatically forwarded in v1. If a downstream stage needs one of these artifacts, use an `on_success` callback to read the upstream stage artifact directory and write a `LocalFileSeedSource`-compatible dataset for the next stage. First-class seeding from named processor outputs is a future extension. Related near-term extensions are seeded processor-only configs and structured stage `postprocessors`, so processor-expressible transforms can remain normal Data Designer processor configs rather than workflow-specific callback code. +Processor outputs (named processor artifacts), schema-transform artifacts, dropped columns, and media assets (images stored on disk with relative paths in the DataFrame) are NOT automatically forwarded in v1. If a downstream stage needs one of these artifacts, use an `on_success` callback to read the upstream stage artifact directory and write a `LocalFileSeedSource`-compatible dataset for the next stage. 
First-class seeding from named processor outputs is a future extension. Related near-term extensions are seeded processor-only configs and structured stage output processors, so processor-expressible transforms can remain normal Data Designer processor configs rather than workflow-specific callback code. -#### Between-stage postprocessors and callbacks +#### Between-stage output processors and callbacks -Users may need to transform data between stages. The preferred future API is a `postprocessors` argument on `CompositeWorkflow.add_stage()`: +Users may need to transform data between stages. The preferred future API is an `output_processors` argument on `CompositeWorkflow.add_stage()`: ```python workflow.add_stage( "generated", config_gen, num_records=1000, - postprocessors=[ + output_processors=[ DropColumnsProcessorConfig( name="drop_scratch", column_names=["scratch_*"], @@ -112,7 +112,7 @@ workflow.add_stage( workflow.add_stage("enriched", config_enrich) ``` -Postprocessors run after the stage's `DataDesigner.create()` completes and before child stages seed from that stage. They operate on the stage's final dataset, write a deterministic postprocessed artifact, and make that artifact the stage output for downstream seeding. They should accept existing built-in or plugin `ProcessorConfig` objects and be fingerprinted from those configs, so no callback version string is needed. +Output processors run after the stage's `DataDesigner.create()` completes and before child stages seed from that stage. They operate on the stage's final dataset, write a deterministic output-processor artifact, and make that artifact the stage output for downstream seeding. They should accept existing built-in or plugin `ProcessorConfig` objects and be fingerprinted from those configs, so no callback version string is needed. Arbitrary Python data transforms should come through the `CustomProcessor` work tracked in #159, rather than a workflow-only callable type.
Until that exists, `on_success` remains the escape hatch for arbitrary data-changing code: @@ -136,7 +136,7 @@ workflow.add_stage( workflow.add_stage("enriched", config_enrich) ``` -The callback receives the path to the completed stage's artifact directory (containing `parquet-files/`, `metadata.json`, etc.) and returns a path that child stages will seed from. This keeps large DataFrames on disk and gives users full control. Callback outputs should live under a managed subdirectory such as `/callback-outputs//` so they stay deterministic and easy to clean. Once postprocessors and seeded processor-only configs are valid, processor-expressible transforms should use those structured APIs instead of raw callbacks: +The callback receives the path to the completed stage's artifact directory (containing `parquet-files/`, `metadata.json`, etc.) and returns a path that child stages will seed from. This keeps large DataFrames on disk and gives users full control. Callback outputs should live under a managed subdirectory such as `/callback-outputs//` so they stay deterministic and easy to clean. Once output processors and seeded processor-only configs are valid, processor-expressible transforms should use those structured APIs instead of raw callbacks: ```python filter_config = DataDesignerConfigBuilder().add_processor( @@ -150,7 +150,7 @@ workflow.add_stage("filtered", filter_config) workflow.add_stage("enriched", config_enrich) ``` -**Callback resume policy**: The composite workflow does not hash arbitrary Python source or bytecode in v1. `on_success_version` is the explicit callback identity recorded in `workflow-metadata.json` and included in the stage's exposed output fingerprint. If `on_success` is set without `on_success_version`, that stage's transformed output is treated as dirty on every resume so a changed callback cannot silently reuse stale transformed data. 
The resolved path returned by the callback is also recorded as the dependent stage's seed path; a stage seeded from callback output is skippable only if that recorded path still exists and is readable by `LocalFileSeedSource`. Structured postprocessors should use their processor config fingerprint instead. +**Callback resume policy**: The composite workflow does not hash arbitrary Python source or bytecode in v1. `on_success_version` is the explicit callback identity recorded in `workflow-metadata.json` and included in the stage's exposed output fingerprint. If `on_success` is set without `on_success_version`, that stage's transformed output is treated as dirty on every resume so a changed callback cannot silently reuse stale transformed data. The resolved path returned by the callback is also recorded as the dependent stage's seed path; a stage seeded from callback output is skippable only if that recorded path still exists and is readable by `LocalFileSeedSource`. Structured output processors should use their processor config fingerprint instead. **Empty stage policy**: If a callback filters all rows (or a stage produces zero rows), the composite workflow raises `DataDesignerWorkflowError` by default. Stages can opt in to empty output with `allow_empty=True` on `add_stage()`, in which case the workflow marks that stage as `completed_empty` and all downstream stages as `skipped_empty_upstream`. `CompositeWorkflowResults` still contains every declared stage name: executed stages map to `DatasetCreationResults`, while skipped downstream stages map to `SkippedStageResult` with `status="skipped_empty_upstream"` and `upstream_stage=`. This avoids `KeyError`/`None` ambiguity and gives resume a durable state distinct from normal completion. @@ -401,7 +401,7 @@ result_2 = data_designer.create(config_2, num_records=200) # explode: 50 -> 200 - Add `compose_workflow(name: str)` factory method on `DataDesigner`. 
- Tests: multi-stage runs, explode/filter via callbacks, num_records defaulting, duplicate stage-name rejection, artifact layout, throttle reuse across stages. -**Status after PR #636:** Implemented `CompositeWorkflow`, `compose_workflow()`, `to_config_builder()`, disk handoff, stage metadata, `acreate()`, shared throttle manager reuse, explicit stage artifact roots, cloned stage builders, concurrent-safe seed reader/resource-provider handling, seeded processor-only configs, stage postprocessors, and stage output selection. Still deferred: stage-level resume, DAG branches, `allow_resize` removal, config bundles, and broader first-class artifact seeding. +**Status after PR #636:** Implemented `CompositeWorkflow`, `compose_workflow()`, `to_config_builder()`, disk handoff, stage metadata, `acreate()`, shared throttle manager reuse, explicit stage artifact roots, cloned stage builders, concurrent-safe seed reader/resource-provider handling, seeded processor-only configs, stage output processors, and stage output selection. Still deferred: stage-level resume, DAG branches, `allow_resize` removal, config bundles, and broader first-class artifact seeding. ### Sidecar: `acreate()` on `DataDesigner` (independent of chaining v1) @@ -419,7 +419,7 @@ result_2 = data_designer.create(config_2, num_records=200) # explode: 50 -> 200 - Remove the `allow_resize` field from the config schema. - Add fail-fast guard in `ProcessorRunner` for pre-batch row-count changes. - Tests: verify rejection, migration path examples. -- Migration examples should prefer `postprocessors` for inline processor-expressible transforms, or seeded processor-only configs when the transform deserves its own named stage. Raw `on_success` remains the escape hatch for arbitrary custom transforms. +- Migration examples should prefer `output_processors` for inline processor-expressible transforms, or seeded processor-only configs when the transform deserves its own named stage. 
Raw `on_success` remains the escape hatch for arbitrary custom transforms. ### Phase 3: Stage-level resume @@ -477,7 +477,7 @@ result = data_designer.create(config) Validation allows a seed-only config when it has processors and a seed dataset, while still rejecting processor-only configs with no seed and configs that drop all output columns. -**Stage postprocessors (implemented in #636).** `CompositeWorkflow.add_stage()` accepts `postprocessors=[ProcessorConfig, ...]` for compact inline transforms at a stage boundary. Internally this reuses the seeded processor-only config path above: seed a temporary processor config from the completed stage output, run it under a deterministic subdirectory, and use its final dataset as the stage output. This keeps row-count-changing 1:N and N:1 operations out of the async generation batch path while making the common case concise. Custom arbitrary transforms should use `CustomProcessor` when #159 lands; until then, keep raw `on_success` as the escape hatch. +**Stage output processors (implemented in #636).** `CompositeWorkflow.add_stage()` accepts `output_processors=[ProcessorConfig, ...]` for compact inline transforms at a stage boundary. Internally this reuses the seeded processor-only config path above: seed a temporary processor config from the completed stage output, run it under a deterministic subdirectory, and use its final dataset as the stage output. This keeps row-count-changing 1:N and N:1 operations out of the async generation batch path while making the common case concise. Custom arbitrary transforms should use `CustomProcessor` when #159 lands; until then, keep raw `on_success` as the escape hatch. **Config bundles.** Built-in or plugin-provided bundles should be reusable config fragments/factories, not execution stages. 
A bundle can return or extend a `DataDesignerConfigBuilder`, and `add_stage()` can accept either a builder or a bundle by normalizing the bundle to a builder: @@ -509,7 +509,7 @@ These were open in earlier drafts; recording the resolutions here so the design 6. **Export and push semantics** -> v1 export/push defaults to the final stage's dataset only. Exporting selected stages or a full composite workflow artifact bundle is future work. -7. **Stage extension model** -> The public surface remains `CompositeWorkflow`, not a separate `stages` module. Near-term extensibility is seeded processor-only configs, postprocessors, and stage output selection. Shared built-in/plugin units are config bundles, not a new executor. +7. **Stage extension model** -> The public surface remains `CompositeWorkflow`, not a separate `stages` module. Near-term extensibility is seeded processor-only configs, output processors, and stage output selection. Shared built-in/plugin units are config bundles, not a new executor. 
## Open questions @@ -523,6 +523,6 @@ These were open in earlier drafts; recording the resolutions here so the design - #447 - AsyncRunController refactor (partially superseded: pre-batch resize handling moves to composite workflow level instead of controller level) - #526 / #525 - Resume interrupted runs (single-stage batch/row-group resume primitive used by composite workflow stage resume) -- #159 - Custom processors (needed for arbitrary Python postprocessors without workflow-only callback types) +- #159 - Custom processors (needed for arbitrary Python output processors without workflow-only callback types) - #462 - Progress bar and scheduler polish (independent) - #464 - Custom column retryable errors (independent) From 7d4a1ca438702bf6d5a0ec7a6ec567294594231d Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Mon, 18 May 2026 19:45:07 +0000 Subject: [PATCH 12/14] docs: add workflow chaining page --- docs/concepts/workflow-chaining.md | 107 +++++++++++++++++ fern/versions/latest.yml | 2 + .../pages/concepts/workflow-chaining.mdx | 111 ++++++++++++++++++ mkdocs.yml | 1 + 4 files changed, 221 insertions(+) create mode 100644 docs/concepts/workflow-chaining.md create mode 100644 fern/versions/latest/pages/concepts/workflow-chaining.mdx diff --git a/docs/concepts/workflow-chaining.md b/docs/concepts/workflow-chaining.md new file mode 100644 index 000000000..959dd688a --- /dev/null +++ b/docs/concepts/workflow-chaining.md @@ -0,0 +1,107 @@ +# Workflow Chaining + +!!! warning "Experimental" + Workflow chaining is experimental. The API, metadata schema, and artifact layout may change before this becomes a stable interface. Use it for early testing and feedback, not as a long-term compatibility boundary yet. + +Workflow chaining lets you split a dataset build into named stages. Each stage runs a normal `DataDesigner.create()` call, writes its own artifact directory, and hands a selected parquet output to the next stage as a `LocalFileSeedSource`. 
+ +Use it when one generation step naturally depends on the cleaned or reshaped output of another step, especially when a processor-only stage is clearer than mixing all transformations into one config. + +## Basic shape + +```python +import data_designer.config as dd +from data_designer.interface import DataDesigner + +data_designer = DataDesigner() + +drafts = ( + dd.DataDesignerConfigBuilder(model_configs=[fast_model]) + .with_seed_dataset(dd.LocalFileSeedSource(path="parsed_docs/*.parquet")) + .add_column( + name="chunk_summary", + column_type="llm_text", + model_alias="fast", + prompt="Summarize this passage:\n\n{{ text }}", + ) + .add_column( + name="question", + column_type="llm_text", + model_alias="fast", + prompt="Write a question about this passage:\n\n{{ chunk_summary }}", + ) + .add_column( + name="answer", + column_type="llm_text", + model_alias="fast", + prompt="Answer {{ question }} using this passage:\n\n{{ text }}", + ) +) + +chatml = dd.DataDesignerConfigBuilder().add_processor( + dd.SchemaTransformProcessorConfig( + name="chatml", + template={ + "messages": [ + {"role": "user", "content": "{{ question }}"}, + {"role": "assistant", "content": "{{ answer }}"}, + ], + }, + ) +) + +workflow = data_designer.compose_workflow(name="doc-qa") +workflow.add_stage( + "drafts", + drafts, + num_records=100, + output_processors=[ + dd.DropColumnsProcessorConfig( + name="drop_scratch", + column_names=["text", "chunk_summary"], + ) + ], +) +workflow.add_stage("chatml", chatml, output="processor:chatml") + +results = workflow.run() +training_rows = results.load_dataset() +results.export("chatml.jsonl") +``` + +## Stage outputs + +A stage can expose different views of its data: + +| Surface | What it returns | +|---------|-----------------| +| `results["stage_name"]` | The effective `DatasetCreationResults` for that stage. If the stage uses `output_processors`, this points at the output-processor run. 
| +| `results.load_stage_output("stage_name")` | The selected output handed to downstream stages. This follows `output="processor:<name>"` and `on_success`. | +| `results.load_dataset()` | The selected output from the final stage. | + +Processors added with `config_builder.add_processor(...)` run inside the stage and usually create side artifacts. They do not automatically change what the next stage receives. Use `output_processors=[...]` when a processor should define the stage boundary output. + +## Processor-only stages + +Stages can be processor-only when they receive seed data from an upstream stage: + +```python +cleanup = dd.DataDesignerConfigBuilder().add_processor( + dd.DropColumnsProcessorConfig( + name="drop_private_fields", + column_names=["email", "raw_notes"], + ) +) + +workflow.add_stage("cleanup", cleanup) +``` + +This is useful for final cleanup, schema transforms, and format-specific export preparation. + +## Current limits + +- Stages are linear. DAGs, parallel branches, and joins are planned separately. +- Stage-level resume is not implemented yet. +- `push_to_hub()` does not support selected processor or callback outputs yet. Use `export()` for the selected workflow output. +- `on_success` callbacks are trusted user code. If a callback returns a path, Data Designer reads that path as the next stage input. +- The artifact layout is intended for inspection, but it is not yet a stable public contract. 
diff --git a/fern/versions/latest.yml b/fern/versions/latest.yml index 9854f922c..314f0a640 100644 --- a/fern/versions/latest.yml +++ b/fern/versions/latest.yml @@ -35,6 +35,8 @@ navigation: path: ./v0.5.8/pages/concepts/validators.mdx - page: Processors path: ./v0.5.8/pages/concepts/processors.mdx + - page: Workflow Chaining + path: ./latest/pages/concepts/workflow-chaining.mdx - page: Person Sampling path: ./v0.5.8/pages/concepts/person_sampling.mdx - page: Traces diff --git a/fern/versions/latest/pages/concepts/workflow-chaining.mdx b/fern/versions/latest/pages/concepts/workflow-chaining.mdx new file mode 100644 index 000000000..ead5b6b2b --- /dev/null +++ b/fern/versions/latest/pages/concepts/workflow-chaining.mdx @@ -0,0 +1,111 @@ +--- +title: "Workflow Chaining" +description: "" +position: 8 +--- +<Warning title="Experimental"> + Workflow chaining is experimental. The API, metadata schema, and artifact layout may change before this becomes a stable interface. Use it for early testing and feedback, not as a long-term compatibility boundary yet. +</Warning> + +Workflow chaining lets you split a dataset build into named stages. Each stage runs a normal `DataDesigner.create()` call, writes its own artifact directory, and hands a selected parquet output to the next stage as a `LocalFileSeedSource`. 
+ +## Basic shape + +```python +import data_designer.config as dd +from data_designer.interface import DataDesigner + +data_designer = DataDesigner() + +drafts = ( + dd.DataDesignerConfigBuilder(model_configs=[fast_model]) + .with_seed_dataset(dd.LocalFileSeedSource(path="parsed_docs/*.parquet")) + .add_column( + name="chunk_summary", + column_type="llm_text", + model_alias="fast", + prompt="Summarize this passage:\n\n{{ text }}", + ) + .add_column( + name="question", + column_type="llm_text", + model_alias="fast", + prompt="Write a question about this passage:\n\n{{ chunk_summary }}", + ) + .add_column( + name="answer", + column_type="llm_text", + model_alias="fast", + prompt="Answer {{ question }} using this passage:\n\n{{ text }}", + ) +) + +chatml = dd.DataDesignerConfigBuilder().add_processor( + dd.SchemaTransformProcessorConfig( + name="chatml", + template={ + "messages": [ + {"role": "user", "content": "{{ question }}"}, + {"role": "assistant", "content": "{{ answer }}"}, + ], + }, + ) +) + +workflow = data_designer.compose_workflow(name="doc-qa") +workflow.add_stage( + "drafts", + drafts, + num_records=100, + output_processors=[ + dd.DropColumnsProcessorConfig( + name="drop_scratch", + column_names=["text", "chunk_summary"], + ) + ], +) +workflow.add_stage("chatml", chatml, output="processor:chatml") + +results = workflow.run() +training_rows = results.load_dataset() +results.export("chatml.jsonl") +``` + +## Stage outputs + +A stage can expose different views of its data: + +| Surface | What it returns | +|---------|-----------------| +| `results["stage_name"]` | The effective `DatasetCreationResults` for that stage. If the stage uses `output_processors`, this points at the output-processor run. | +| `results.load_stage_output("stage_name")` | The selected output handed to downstream stages. This follows `output="processor:<name>"` and `on_success`. | +| `results.load_dataset()` | The selected output from the final stage. 
| + +Processors added with `config_builder.add_processor(...)` run inside the stage and usually create side artifacts. They do not automatically change what the next stage receives. Use `output_processors=[...]` when a processor should define the stage boundary output. + +## Processor-only stages + +Stages can be processor-only when they receive seed data from an upstream stage: + +```python +cleanup = dd.DataDesignerConfigBuilder().add_processor( + dd.DropColumnsProcessorConfig( + name="drop_private_fields", + column_names=["email", "raw_notes"], + ) +) + +workflow.add_stage("cleanup", cleanup) +``` + +This is useful for final cleanup, schema transforms, and format-specific export preparation. + +## Current limits + +- Stages are linear. DAGs, parallel branches, and joins are planned separately. +- Stage-level resume is not implemented yet. +- `push_to_hub()` does not support selected processor or callback outputs yet. Use `export()` for the selected workflow output. +- `on_success` callbacks are trusted user code. If a callback returns a path, Data Designer reads that path as the next stage input. +- The artifact layout is intended for inspection, but it is not yet a stable public contract. 
diff --git a/mkdocs.yml b/mkdocs.yml index 89d425dfd..44612da0c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -20,6 +20,7 @@ nav: - Custom Columns: concepts/custom_columns.md - Validators: concepts/validators.md - Processors: concepts/processors.md + - Workflow Chaining: concepts/workflow-chaining.md - Person Sampling: concepts/person_sampling.md - Traces: concepts/traces.md - Tool Use & MCP: From 475b16e5979130822805a55caecca6a6c5fdedec Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Mon, 18 May 2026 20:05:17 +0000 Subject: [PATCH 13/14] docs: align workflow chaining warning --- docs/concepts/workflow-chaining.md | 4 ++-- fern/versions/latest/pages/concepts/workflow-chaining.mdx | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/concepts/workflow-chaining.md b/docs/concepts/workflow-chaining.md index 959dd688a..2b8c3cc3f 100644 --- a/docs/concepts/workflow-chaining.md +++ b/docs/concepts/workflow-chaining.md @@ -1,7 +1,7 @@ # Workflow Chaining -!!! warning "Experimental" - Workflow chaining is experimental. The API, metadata schema, and artifact layout may change before this becomes a stable interface. Use it for early testing and feedback, not as a long-term compatibility boundary yet. +!!! warning "Experimental Feature" + Workflow chaining is currently **experimental** and under active development. The documentation, examples, workflow API, metadata schema, and artifact layout are subject to significant changes in future releases. If you encounter any issues, have questions, or have ideas for improvement, please consider starting [a discussion on GitHub](https://github.com/NVIDIA-NeMo/DataDesigner/discussions). Workflow chaining lets you split a dataset build into named stages. Each stage runs a normal `DataDesigner.create()` call, writes its own artifact directory, and hands a selected parquet output to the next stage as a `LocalFileSeedSource`. 
diff --git a/fern/versions/latest/pages/concepts/workflow-chaining.mdx b/fern/versions/latest/pages/concepts/workflow-chaining.mdx index ead5b6b2b..998d0dfd0 100644 --- a/fern/versions/latest/pages/concepts/workflow-chaining.mdx +++ b/fern/versions/latest/pages/concepts/workflow-chaining.mdx @@ -3,8 +3,8 @@ title: "Workflow Chaining" description: "" position: 8 --- -<Warning title="Experimental"> - Workflow chaining is experimental. The API, metadata schema, and artifact layout may change before this becomes a stable interface. Use it for early testing and feedback, not as a long-term compatibility boundary yet. +<Warning title="Experimental Feature"> + Workflow chaining is currently **experimental** and under active development. The documentation, examples, workflow API, metadata schema, and artifact layout are subject to significant changes in future releases. If you encounter any issues, have questions, or have ideas for improvement, please consider starting [a discussion on GitHub](https://github.com/NVIDIA-NeMo/DataDesigner/discussions). </Warning> Workflow chaining lets you split a dataset build into named stages. Each stage runs a normal `DataDesigner.create()` call, writes its own artifact directory, and hands a selected parquet output to the next stage as a `LocalFileSeedSource`. 
From ae70ca81c79e4dedb1f1298b327e3a8b1ec603ca Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Mon, 18 May 2026 21:49:44 +0000 Subject: [PATCH 14/14] fix: address workflow review nits --- .../interface/composite_workflow.py | 21 +++++++++++++++++-- .../data_designer/interface/data_designer.py | 16 ++++++++++++++ .../tests/interface/test_acreate.py | 6 +++--- .../interface/test_composite_workflow.py | 8 +++---- .../tests/interface/test_data_designer.py | 3 ++- 5 files changed, 44 insertions(+), 10 deletions(-) diff --git a/packages/data-designer/src/data_designer/interface/composite_workflow.py b/packages/data-designer/src/data_designer/interface/composite_workflow.py index 44f80999d..98128efb6 100644 --- a/packages/data-designer/src/data_designer/interface/composite_workflow.py +++ b/packages/data-designer/src/data_designer/interface/composite_workflow.py @@ -107,6 +107,7 @@ def items(self) -> ItemsView[str, DatasetCreationResults | SkippedStageResult]: @property def final_result(self) -> DatasetCreationResults: + """Return the final stage result, or raise if it was skipped.""" return self._require_final_result() def _require_final_result(self) -> DatasetCreationResults: @@ -118,33 +119,41 @@ def _require_final_result(self) -> DatasetCreationResults: return result def load_dataset(self) -> pd.DataFrame: + """Load the selected output from the final workflow stage.""" self._require_final_result() return self.load_stage_output(self.final_stage_name) def load_analysis(self) -> DatasetProfilerResults: + """Load analysis from the final stage result.""" return self.final_result.load_analysis() def count_records(self) -> int: + """Count records in the selected output from the final workflow stage.""" self._require_final_result() return self.count_stage_output_records(self.final_stage_name) def get_stage_output_path(self, stage_name: str) -> Path: + """Return the selected output path handed downstream for a stage.""" result = self.stage_results[stage_name] if 
isinstance(result, SkippedStageResult): raise DataDesignerWorkflowError(f"Stage {stage_name!r} was skipped: {result.status.value}.") return self._stage_output_paths.get(stage_name, result.artifact_storage.final_dataset_path) def load_stage_output(self, stage_name: str) -> pd.DataFrame: + """Load the selected output handed downstream for a stage.""" return _load_parquet_dataset(self.get_stage_output_path(stage_name)) def count_stage_output_records(self, stage_name: str) -> int: + """Count records in the selected output handed downstream for a stage.""" return _count_parquet_records(self.get_stage_output_path(stage_name)) def export(self, path: Path | str, *, format: ExportFormat | None = None) -> Path: + """Export the selected output from the final workflow stage.""" self._require_final_result() return _export_parquet_dataset(self.get_stage_output_path(self.final_stage_name), Path(path), format=format) def push_to_hub(self, *args: Any, **kwargs: Any) -> str: + """Push the final stage result to Hugging Face Hub when no output override is selected.""" final_result = self.final_result if self.get_stage_output_path(self.final_stage_name) != final_result.artifact_storage.final_dataset_path: raise DataDesignerWorkflowError( @@ -155,7 +164,10 @@ def push_to_hub(self, *args: Any, **kwargs: Any) -> str: class CompositeWorkflow: + """Experimental linear workflow for chaining Data Designer stages.""" + def __init__(self, *, name: str, data_designer: DataDesigner) -> None: + """Create a workflow bound to a parent Data Designer instance.""" _validate_dir_name(name, "workflow name") self.name = name self._data_designer = data_designer @@ -210,11 +222,16 @@ def add_stage( return self def run(self) -> CompositeWorkflowResults: - """Run all stages from scratch, replacing deterministic stage directories.""" + """Run all stages from scratch. + + Each stage writes a deterministic artifact directory under the parent + Data Designer artifact path. 
Downstream stages are seeded from the + selected output of the previous stage. + """ if not self._stages: raise DataDesignerWorkflowError(f"Workflow {self.name!r} has no stages.") - workflow_path = self._data_designer._artifact_path / self.name + workflow_path = self._data_designer.artifact_path / self.name workflow_path.mkdir(parents=True, exist_ok=True) metadata: dict[str, Any] = { "name": self.name, diff --git a/packages/data-designer/src/data_designer/interface/data_designer.py b/packages/data-designer/src/data_designer/interface/data_designer.py index e8ba704f0..de97b47ab 100644 --- a/packages/data-designer/src/data_designer/interface/data_designer.py +++ b/packages/data-designer/src/data_designer/interface/data_designer.py @@ -215,6 +215,11 @@ def info(self) -> InterfaceInfo: """ return self._get_interface_info(self._model_providers) + @property + def artifact_path(self) -> Path: + """Directory where Data Designer writes artifacts by default.""" + return self._artifact_path + def list_mcp_tool_names(self, mcp_provider_name: str, *, timeout_sec: float = 10.0) -> list[str]: """Connect to a configured MCP provider and return the names of its available tools. @@ -498,6 +503,17 @@ def preview( ) def compose_workflow(self, *, name: str) -> CompositeWorkflow: + """Create an experimental composite workflow. + + Workflow chaining is experimental and its API, metadata schema, and + artifact layout may change in future releases. + + Args: + name: Workflow name used for the artifact directory. + + Returns: + A composite workflow that can run named stages in sequence. 
+ """ return CompositeWorkflow(name=name, data_designer=self) def _log_jinja_rendering_engine_mode(self) -> None: diff --git a/packages/data-designer/tests/interface/test_acreate.py b/packages/data-designer/tests/interface/test_acreate.py index 4684ec1cc..b87d8eeba 100644 --- a/packages/data-designer/tests/interface/test_acreate.py +++ b/packages/data-designer/tests/interface/test_acreate.py @@ -97,8 +97,8 @@ def fake_create( started_count += 1 if started_count == 2: both_started.set() - assert both_started.wait(2) - assert release.wait(2) + assert both_started.wait(5) + assert release.wait(5) return _creation_result(config_builder, stub_dataset_profiler_results) data_designer.create = MagicMock(side_effect=fake_create) @@ -108,7 +108,7 @@ def fake_create( left_task = asyncio.create_task(data_designer.acreate(left, num_records=1, dataset_name="left")) right_task = asyncio.create_task(data_designer.acreate(right, num_records=1, dataset_name="right")) try: - assert await asyncio.to_thread(both_started.wait, 2) + assert await asyncio.to_thread(both_started.wait, 5) finally: release.set() diff --git a/packages/data-designer/tests/interface/test_composite_workflow.py b/packages/data-designer/tests/interface/test_composite_workflow.py index 6af00073b..aea4ec20f 100644 --- a/packages/data-designer/tests/interface/test_composite_workflow.py +++ b/packages/data-designer/tests/interface/test_composite_workflow.py @@ -70,7 +70,7 @@ def fake_create( dataset_name: str, **kwargs, ) -> DatasetCreationResults: - artifact_path = Path(kwargs.pop("artifact_path", data_designer._artifact_path)) + artifact_path = Path(kwargs.pop("artifact_path", data_designer.artifact_path)) del kwargs df = lazy.pd.DataFrame({"category": ["alpha"] * num_records, "category_copy": ["alpha"] * num_records}) return _result_from_df( @@ -177,7 +177,7 @@ def test_composite_workflow_runs_linear_stages_with_disk_handoff( assert "category_copy" in final_df.columns assert (stub_artifact_path / "linear-chain" / 
"stage-0-base").is_dir() assert (stub_artifact_path / "linear-chain" / "stage-1-copy").is_dir() - assert data_designer._artifact_path == stub_artifact_path + assert data_designer.artifact_path == stub_artifact_path metadata = _load_workflow_metadata(stub_artifact_path, "linear-chain") assert [stage["status"] for stage in metadata["stages"]] == ["completed", "completed"] @@ -880,7 +880,7 @@ def fake_create( dataset_name: str, **kwargs, ) -> DatasetCreationResults: - artifact_path = Path(kwargs.pop("artifact_path", data_designer._artifact_path)) + artifact_path = Path(kwargs.pop("artifact_path", data_designer.artifact_path)) del num_records, kwargs value = "first" if dataset_name == "stage-0-first" else "final" return _result_from_df( @@ -921,7 +921,7 @@ def fake_create( dataset_name: str, **kwargs, ) -> DatasetCreationResults: - artifact_path = Path(kwargs.pop("artifact_path", data_designer._artifact_path)) + artifact_path = Path(kwargs.pop("artifact_path", data_designer.artifact_path)) del num_records, kwargs return _result_from_df( artifact_path, diff --git a/packages/data-designer/tests/interface/test_data_designer.py b/packages/data-designer/tests/interface/test_data_designer.py index 5c17645bd..a9c07eb86 100644 --- a/packages/data-designer/tests/interface/test_data_designer.py +++ b/packages/data-designer/tests/interface/test_data_designer.py @@ -491,7 +491,8 @@ def test_init_with_string_path(stub_artifact_path, stub_model_providers): """Test DataDesigner accepts string paths.""" designer = DataDesigner(artifact_path=str(stub_artifact_path), model_providers=stub_model_providers) assert designer is not None - assert isinstance(designer._artifact_path, Path) + assert isinstance(designer.artifact_path, Path) + assert designer.artifact_path == stub_artifact_path def test_init_with_path_object(stub_artifact_path, stub_model_providers):