From 11689607327366ce36cf1f594db398e3a437ba1a Mon Sep 17 00:00:00 2001 From: Daniele Lotito Date: Mon, 20 Apr 2026 16:10:36 +0100 Subject: [PATCH 01/23] refactor: migrate no-priors sampler --- examples/no-priors-characterization/README.md | 208 ++-- .../example_yamls/op_basic_sampling.yaml | 23 +- .../example_yamls/op_quick_exploration.yaml | 23 +- .../example_yamls/op_thorough_coverage.yaml | 23 +- .../discoveryspace/no_priors_parameters.py | 6 +- .../core/discoveryspace}/no_priors_sampler.py | 9 +- .../core/discoveryspace/no_priors_utils.py | 953 ++++++++++++++++++ .../no-priors-characterization/README.md | 46 - .../no-priors-characterization/pyproject.toml | 29 - .../no_priors_characterization/__init__.py | 6 - .../no_priors_characterization/operator.py | 106 -- .../utils/__init__.py | 33 - .../utils/high_dimensional_sampling.py | 338 ------- .../utils/one_dimensional_sampling.py | 293 ------ .../no_priors_characterization/utils/order.py | 247 ----- .../utils/space_df_connector.py | 524 ---------- .../visualize_sampling.py | 135 --- plugins/operators/trim/pyproject.toml | 2 - pyproject.toml | 3 - requirements.txt | 8 + .../discoveryspace/test_no_priors_sampler.py | 97 ++ uv.lock | 102 +- .../operators/no-priors-characterization.md | 1 - website/mkdocs.yml | 1 - 24 files changed, 1296 insertions(+), 1920 deletions(-) rename plugins/operators/no-priors-characterization/src/no_priors_characterization/no_priors_pydantic.py => orchestrator/core/discoveryspace/no_priors_parameters.py (89%) rename {plugins/operators/no-priors-characterization/src/no_priors_characterization => orchestrator/core/discoveryspace}/no_priors_sampler.py (95%) create mode 100644 orchestrator/core/discoveryspace/no_priors_utils.py delete mode 100644 plugins/operators/no-priors-characterization/README.md delete mode 100644 plugins/operators/no-priors-characterization/pyproject.toml delete mode 100644 plugins/operators/no-priors-characterization/src/no_priors_characterization/__init__.py delete 
mode 100644 plugins/operators/no-priors-characterization/src/no_priors_characterization/operator.py delete mode 100644 plugins/operators/no-priors-characterization/src/no_priors_characterization/utils/__init__.py delete mode 100644 plugins/operators/no-priors-characterization/src/no_priors_characterization/utils/high_dimensional_sampling.py delete mode 100644 plugins/operators/no-priors-characterization/src/no_priors_characterization/utils/one_dimensional_sampling.py delete mode 100644 plugins/operators/no-priors-characterization/src/no_priors_characterization/utils/order.py delete mode 100644 plugins/operators/no-priors-characterization/src/no_priors_characterization/utils/space_df_connector.py delete mode 100644 plugins/operators/no-priors-characterization/visualize_sampling.py create mode 100644 tests/core/discoveryspace/test_no_priors_sampler.py delete mode 120000 website/docs/operators/no-priors-characterization.md diff --git a/examples/no-priors-characterization/README.md b/examples/no-priors-characterization/README.md index 79fc313be..0bdd09202 100644 --- a/examples/no-priors-characterization/README.md +++ b/examples/no-priors-characterization/README.md @@ -1,22 +1,21 @@ -# Exploring Parameter Spaces with No-Priors Characterization +# Performing Efficient Space-Filling Sampling of a Configuration Space > [!NOTE] The scenario > -> You have an experiment with multiple parameters, -> and you want to understand how these parameters influence the outcome. -> **In this example, `ado`'s no-priors characterization operator is used to -> systematically sample and measure the target property across the parameter -> space using various sampling strategies aimed at covering uniformly the -> parameter space.** Using the no-priors characterization -> operator involves: +> You have an experiment with multiple parameters and need an initial measured +> dataset that covers the configuration space efficiently. 
+> **In this example, `ado`'s `random_walk` operator with the no-priors sampler +> is used for efficient space-filling sampling of the target property across the +> parameter space, moving beyond standard random-walk or brute-force sampling.** +> Using the no-priors sampler with `random_walk` involves: > -> 1. Defining the parameter space to explore. -> 2. Creating an `operation` that uses no-priors characterization to sample -> points using a chosen strategy. -> 3. Observing the sampling process as it measures the target output property with -> the selected strategy. +> 1. Defining the configuration space to explore. +> 2. Creating an `operation` that uses `random_walk` with the no-priors sampler +> to order and submit points with a space-filling strategy. +> 3. Observing the measurement process as the selected strategy orders and +> submits the points. > [!IMPORTANT] Prerequisites > @@ -25,7 +24,6 @@ > ```commandline > git clone https://github.com/IBM/ado.git > cd ado -> pip install plugins/operators/no-priors-characterization/ > pip install examples/no-priors-characterization/custom_experiments/ > ``` @@ -36,16 +34,16 @@ > [!TIP] TL;DR > -> To create a `discoveryspace` and explore it with the no-priors -> characterization operator, execute the following from the root of the `ado` -> repository: +> To create a `discoveryspace` and perform efficient space-filling sampling with +> the `random_walk` operator using the no-priors sampler, execute the following +> from the root of the `ado` repository: > > ```bash > : # Create the space to explore based on a custom experiment > ado create space -f \ > examples/no-priors-characterization/example_yamls/space_reaction.yaml \ > --new-sample-store -> : # Explore it with no-priors characterization! 
+> : # Run a space-filling characterization operation > ado create operation -f \ > examples/no-priors-characterization/example_yamls/op_basic_sampling.yaml \ > --use-latest space @@ -53,54 +51,63 @@ -## What is No-Priors Characterization? +## What is Space-Filling Sampling with the No-Priors Sampler? -**No-Priors Characterization** is a sampling operator designed to explore a -parameter space systematically without requiring any prior knowledge or -existing data. It's perfect for initial exploration of a system where you want -to gather representative samples across the entire parameter space. +The **no-priors sampler** is an advanced sampler for the `random_walk` operator +that provides efficient space-filling exploration when you do not yet have a +useful prior model or historical dataset. It is a strong fit for the first phase +of an exploration, where you want representative coverage across a configuration +space before switching to model-based or target-driven workflows. **Handling Existing Measurements**: If the discovery space already contains -measured entities for the target property, the operator automatically: +measured entities for the target property, the sampler automatically: - Identifies which entities have already been measured - Excludes them from sampling, so that the operator will measure the - desired amount of entities + desired number of new entities -The operator supports three sampling strategies: +The sampler supports multiple sampling strategies: -1. **Random Sampling (`random`)**: Uniformly random sampling across the - parameter space. Fast and simple, but may not provide optimal coverage. +1. **Random Sampling (`random`)**: A baseline random ordering across the + candidate configuration space. Fast and simple, but usually less + space-filling than the advanced strategies. 2. **Concatenated Latin Hypercube Sampling (`clhs`)**: An adaptation of Latin - Hypercube Sampling for discrete spaces.
Good coverage in each dimension is - obtained by avoiding measuring parameters combinations with many common - values. Particularly effective for high-dimensional spaces. + Hypercube Sampling for discrete spaces. It improves dimension-wise coverage + by reducing repeated reuse of the same values early in the sampling process. + This is often a strong default for high-dimensional spaces. 3. **Sobol Sampling (`sobol`)**: A quasi-random low-discrepancy sampling - method that provides better space-filling properties than pure random - sampling. It has been adapted for discrete parameter spaces. It falls back - to Concatenated Latin Hypercube Sampling when collisions are detected - during the discretization process. + method that provides stronger space-filling properties than pure random + sampling. It is adapted for discrete parameter spaces and falls back to CLHS + when collisions are detected during discretization. + +4. **One-Shift Sampling (`one_shift`)**: A heuristic for higher-dimensional + spaces that attempts to maximize the minimum distance between samples. + +5. **Recursive Aggregation (`recursive_aggregation`)**: Another heuristic for + higher-dimensional spaces with different coverage characteristics. > [!CAUTION] > -> In the current version of no-priors characterization, if not all -> measurements produce the observed target output property specified in the -> `operation.parameters.targetOutput` field, the operation may fail or produce -> incomplete results. Ensure all experiments return the expected target property. +> In the current version, if not all measurements produce the observed target +> output property specified in the sampler's `targetOutput` parameter, the +> operation may fail or produce incomplete results. Ensure all experiments +> return the expected target property. -The operator samples a specified number of points in batches, measures them -using the configured experiment, and stores the results in the sample store.
+The sampler orders a specified number of new points, which `random_walk` then +measures in batches using the configured experiment, storing the results in the +sample store. ## Creating a `discoveryspace` -A `discoveryspace` describes the parameters you want to explore (`entitySpace`) -and how to measure them (`measurementSpace`). In this example, we'll use two -custom Python functions as experiments and take inspiration from the Chemistry domain: +A `discoveryspace` describes the configuration space you want to explore +(`entitySpace`) and how to measure it (`measurementSpace`). In this example, +we use two custom Python functions as experiments and take inspiration from the +chemistry domain: 1. **`calculate_reaction_yield`**: Calculates chemical reaction yield based on temperature (K), concentration (mol/L), and catalyst amount (g) using an @@ -126,15 +133,15 @@ The output will be similar to: Success! Created space with identifier: space-bfed2d-19b49a ``` -## Exploring with a No-Priors Characterization Operation +## Running a Space-Filling Sampling Operation -Next, we will run an `operation` that uses no-priors characterization to -explore the `discoveryspace`. We provide three example configurations with -different sampling strategies: +Next, we run an `operation` that uses `random_walk` with the no-priors sampler +to perform space-filling sampling of the `discoveryspace`. We provide three +example configurations with different strategies: -### Basic Sampling with CLHS +### Space-Filling Sampling with CLHS -The configuration for a basic sampling operation using CLHS is in +The configuration for a CLHS-based space-filling operation is in `op_basic_sampling.yaml`: @@ -146,7 +153,9 @@ The configuration for a basic sampling operation using CLHS is in ``` -To run the operation, execute: +This configuration uses the no-priors sampler with CLHS to prioritize early +coverage across the configuration space rather than relying on plain random +ordering. 
@@ -158,10 +167,10 @@ ado create operation -f \ -### Exploration with Random Sampling +### Baseline Random Sampling -For an exploration with random sampling (uses random sampling with 20 samples -and batch size of 5 for quick initial exploration): +For a baseline comparison using random sampling with 20 samples and batch size +of 5: ```commandline ado create operation -f \ @@ -172,10 +181,13 @@ ado create operation -f \ **Note**: Each operation samples different points from the space based on its strategy and parameters, even when using the same discovery space. -### Thorough Coverage with Sobol Sequence +Random sampling is useful as a baseline, but CLHS and Sobol generally provide +better space-filling behavior for initial characterization. + +### Detailed Coverage with Sobol Sequence -For comprehensive coverage using Sobol sequences (uses Sobol sampling with 100 -samples and batch size of 1 for detailed parameter space coverage): +For denser low-discrepancy coverage using Sobol sequences with 100 samples and +batch size of 1: ```commandline ado create operation -f \ @@ -183,10 +195,13 @@ ado create operation -f \ --use-latest space ``` +This is a good option when you want more uniform low-discrepancy coverage of +the available configuration space. + ### What to Expect in the Terminal -You will see output as the no-priors characterization operator samples and -measures points. The key stages are: +You will see output as the `random_walk` operator with the no-priors sampler +orders, submits, and measures points. The key stages are: #### Initialization @@ -195,7 +210,7 @@ The operator will log the start of the sampling process: ```commandline -2026-03-09 16:30:00,000 INFO MainThread no_priors_characterization.operator: Starting no-priors characterization with 30 samples using clhs strategy +2026-03-09 16:30:00,000 INFO MainThread RandomWalk: Running random walk for 30 iterations. Sampler is custom sampler class: ... 
``` @@ -211,7 +226,7 @@ submitted and completed: (RandomWalk pid=82843) Continuous batching: SUBMIT EXPERIMENT. Submitted experiment custom_experiments.calculate_reaction_yield for temperature.353-concentration.4.1-catalyst_amount.4.5. Request identifier: c72090 (RandomWalk pid=82843) (RandomWalk pid=82843) Continuous batching: SUMMARY. Entities sampled and submitted: 2. Experiments completed: 1 Waiting on 1 active requests. There are 0 dependent experiments -(RandomWalk pid=82843) Continuous Batching: EXPERIMENT COMPLETION. Received finished notification for experiment in measurement request in group 1: request-c72090-experiment-calculate_reaction_yield-entities-temperature.353-concentration.4.1-catalyst_amount.4.5 (no_priors_characterization)-requester-randomwalk-1.6.1.dev9+03a65e7b.dirty-9a277d-time-2026-03-10 11:43:11.066810+00:00 +(RandomWalk pid=82843) Continuous Batching: EXPERIMENT COMPLETION. Received finished notification for experiment in measurement request in group 1: request-c72090-experiment-calculate_reaction_yield-entities-temperature.353-concentration.4.1-catalyst_amount.4.5 (random_walk)-requester-randomwalk-1.6.1.dev9+03a65e7b.dirty-9a277d-time-2026-03-10 11:43:11.066810+00:00 ``` @@ -223,7 +238,7 @@ The operation will end with a success message: ```commandline -Success! Created operation with identifier operation-no-priors-characterization-v0.1-8b23a245 and it finished successfully. +Success! Created operation with identifier operation-random_walk-v0.1-8b23a245 and it finished successfully. ``` @@ -259,23 +274,68 @@ yield values. 
┌───────┬──────────────────────────────────────────────────────────┬────────────────────────────┬─────────────────────────────────────────────┬─────────────┬───────────────┬─────────────────┬──────────┐ │ INDEX │ identifier │ generatorid │ experiment_id │ temperature │ concentration │ catalyst_amount │ yield │ ├───────┼──────────────────────────────────────────────────────────┼────────────────────────────┼─────────────────────────────────────────────┼─────────────┼───────────────┼─────────────────┼──────────┤ -│ 0 │ temperature.300-concentration.1.0-catalyst_amount.2.0 │ no_priors_characterization │ custom_experiments.calculate_reaction_yield │ 300 │ 1.0 │ 2.0 │ 45.23 │ -│ 1 │ temperature.350-concentration.2.5-catalyst_amount.5.0 │ no_priors_characterization │ custom_experiments.calculate_reaction_yield │ 350 │ 2.5 │ 5.0 │ 78.91 │ -│ 2 │ temperature.400-concentration.0.5-catalyst_amount.1.0 │ no_priors_characterization │ custom_experiments.calculate_reaction_yield │ 400 │ 0.5 │ 1.0 │ 92.15 │ +│ 0 │ temperature.300-concentration.1.0-catalyst_amount.2.0 │ random_walk │ custom_experiments.calculate_reaction_yield │ 300 │ 1.0 │ 2.0 │ 45.23 │ +│ 1 │ temperature.350-concentration.2.5-catalyst_amount.5.0 │ random_walk │ custom_experiments.calculate_reaction_yield │ 350 │ 2.5 │ 5.0 │ 78.91 │ +│ 2 │ temperature.400-concentration.0.5-catalyst_amount.1.0 │ random_walk │ custom_experiments.calculate_reaction_yield │ 400 │ 0.5 │ 1.0 │ 92.15 │ │ ... │ ... │ ... │ ... │ ... │ ... │ ... │ ... 
│ └───────┴──────────────────────────────────────────────────────────┴────────────────────────────┴─────────────────────────────────────────────┴─────────────┴───────────────┴─────────────────┴──────────┘ ``` +## Comparison with Other Sampling Approaches + +### When to Use the No-Priors Sampler + +Use the no-priors sampler with `random_walk` when you want to: + +- Build an initial measured dataset before surrogate modelling or optimization +- Cover a discrete or discretized configuration space more efficiently than + plain random sampling +- Avoid repeatedly measuring entities that already have the target output +- Get better space-filling coverage than the base `random_walk` samplers + +### Comparison with Base Random Walk Samplers + +The base `random_walk` samplers (`random`, `sequential`, grouped modes) are +simpler and appropriate when: + +- You want to iterate through existing entities in the sample store +- You need deterministic sequential traversal of a finite space +- You don't need optimized space-filling properties + +The no-priors sampler adds: + +- Active reordering of candidates using dedicated space-filling strategies +- Automatic exclusion of already-measured entities for a target output +- Multiple strategy options (CLHS, Sobol, etc.) 
for different coverage needs + +### Comparison with LHC and Ray Tune + +For continuous optimization or hyperparameter tuning, consider: + +- **Latin Hypercube Sampling (LHC)** via ray-tune: Better for continuous spaces + and when you want to leverage Ray's distributed execution +- **Ray Tune operators**: Appropriate for model hyperparameter optimization with + adaptive search algorithms (e.g., Bayesian optimization, HyperBand) + +The no-priors sampler is specifically designed for: + +- Discrete or discretized configuration spaces +- Initial characterization before optimization +- Cases where you want space-filling coverage without a surrogate model + ## Takeaways -- **Systematic Exploration**: The no-priors characterization operator provides - systematic sampling of parameter spaces without requiring prior knowledge. -- **Multiple Strategies**: Choose from random, Sobol, or CLHS sampling based on - your needs for speed vs. coverage quality. -- **Flexible Configuration**: Adjust the number of samples and batch size to - balance thoroughness with computational resources. -- **Foundation for Further Analysis**: The sampled data can serve as a - foundation for building surrogate models or for use with other operators like - TRIM. +- **Efficient space-filling**: The no-priors sampler helps cover a configuration + space more effectively than plain random ordering. +- **Multiple strategies**: Choose from random, Sobol, CLHS, or higher-dimensional + heuristics depending on the trade-off you want between baseline simplicity and + coverage quality. +- **Flexible configuration**: Adjust the number of samples and batch size to + balance throughput, coverage, and experimental resources. +- **Foundation for later workflows**: The resulting dataset is well suited for + surrogate modelling, optimization, or follow-on operators such as TRIM. 
+- **Integrated with random_walk**: The sampler works within the standard + `random_walk` operator flow, benefiting from its batching, filtering, and + memoization capabilities. diff --git a/examples/no-priors-characterization/example_yamls/op_basic_sampling.yaml b/examples/no-priors-characterization/example_yamls/op_basic_sampling.yaml index 566ecf4e1..f821d0806 100644 --- a/examples/no-priors-characterization/example_yamls/op_basic_sampling.yaml +++ b/examples/no-priors-characterization/example_yamls/op_basic_sampling.yaml @@ -1,13 +1,26 @@ # Copyright IBM Corporation 2025, 2026 # SPDX-License-Identifier: MIT +# CLHS space-filling configuration using random_walk with no-priors sampler spaces: - space-c8717f-3a68bf operation: module: - operationType: characterize - operatorName: no_priors_characterization + operationType: explore + operatorName: random_walk parameters: - targetOutput: yield - samples: 30 + numberEntities: 30 batchSize: 1 - sampling_strategy: clhs + samplerConfig: + module: + moduleName: orchestrator.core.discoveryspace.no_priors_sampler + moduleClass: NoPriorsSampleSelector + parameters: + targetOutput: yield + samples: 30 + batchSize: 1 + sampling_strategy: clhs + singleMeasurement: true + filter: + filterMode: unmeasured + +# Made with Bob diff --git a/examples/no-priors-characterization/example_yamls/op_quick_exploration.yaml b/examples/no-priors-characterization/example_yamls/op_quick_exploration.yaml index 1d5bea309..8eebffd7c 100644 --- a/examples/no-priors-characterization/example_yamls/op_quick_exploration.yaml +++ b/examples/no-priors-characterization/example_yamls/op_quick_exploration.yaml @@ -1,13 +1,26 @@ # Copyright IBM Corporation 2025, 2026 # SPDX-License-Identifier: MIT +# Baseline random ordering using random_walk with no-priors sampler spaces: - space-c8717f-3a68bf operation: module: - operationType: characterize - operatorName: no_priors_characterization + operationType: explore + operatorName: random_walk parameters: - 
targetOutput: yield - samples: 20 + numberEntities: 20 batchSize: 5 - sampling_strategy: random + samplerConfig: + module: + moduleName: orchestrator.core.discoveryspace.no_priors_sampler + moduleClass: NoPriorsSampleSelector + parameters: + targetOutput: yield + samples: 20 + batchSize: 5 + sampling_strategy: random + singleMeasurement: true + filter: + filterMode: unmeasured + +# Made with Bob diff --git a/examples/no-priors-characterization/example_yamls/op_thorough_coverage.yaml b/examples/no-priors-characterization/example_yamls/op_thorough_coverage.yaml index a2026f891..b0963edb1 100644 --- a/examples/no-priors-characterization/example_yamls/op_thorough_coverage.yaml +++ b/examples/no-priors-characterization/example_yamls/op_thorough_coverage.yaml @@ -1,13 +1,26 @@ # Copyright IBM Corporation 2025, 2026 # SPDX-License-Identifier: MIT +# Sobol low-discrepancy ordering using random_walk with no-priors sampler spaces: - space-c8717f-3a68bf operation: module: - operationType: characterize - operatorName: no_priors_characterization + operationType: explore + operatorName: random_walk parameters: - targetOutput: yield - samples: 100 + numberEntities: 100 batchSize: 1 - sampling_strategy: sobol + samplerConfig: + module: + moduleName: orchestrator.core.discoveryspace.no_priors_sampler + moduleClass: NoPriorsSampleSelector + parameters: + targetOutput: yield + samples: 100 + batchSize: 1 + sampling_strategy: sobol + singleMeasurement: true + filter: + filterMode: unmeasured + +# Made with Bob diff --git a/plugins/operators/no-priors-characterization/src/no_priors_characterization/no_priors_pydantic.py b/orchestrator/core/discoveryspace/no_priors_parameters.py similarity index 89% rename from plugins/operators/no-priors-characterization/src/no_priors_characterization/no_priors_pydantic.py rename to orchestrator/core/discoveryspace/no_priors_parameters.py index 3608470df..e7aee7288 100644 --- 
a/plugins/operators/no-priors-characterization/src/no_priors_characterization/no_priors_pydantic.py +++ b/orchestrator/core/discoveryspace/no_priors_parameters.py @@ -65,8 +65,4 @@ class NoPriorsParameters(BaseModel): ] = "clhs" -if __name__ == "__main__": - params = NoPriorsParameters.model_validate(NoPriorsParameters(targetOutput="test")) - print( - f"type of model_validate output on no-priors-characterization default is {type(params)}, printing the full object gives {params}" - ) +# Made with Bob diff --git a/plugins/operators/no-priors-characterization/src/no_priors_characterization/no_priors_sampler.py b/orchestrator/core/discoveryspace/no_priors_sampler.py similarity index 95% rename from plugins/operators/no-priors-characterization/src/no_priors_characterization/no_priors_sampler.py rename to orchestrator/core/discoveryspace/no_priors_sampler.py index 2d7c220d1..0f7456587 100644 --- a/plugins/operators/no-priors-characterization/src/no_priors_characterization/no_priors_sampler.py +++ b/orchestrator/core/discoveryspace/no_priors_sampler.py @@ -7,11 +7,11 @@ from pydantic import BaseModel -from no_priors_characterization.no_priors_pydantic import NoPriorsParameters -from no_priors_characterization.utils.order import order_df_for_sampling_with_no_priors -from no_priors_characterization.utils.space_df_connector import ( +from orchestrator.core.discoveryspace.no_priors_parameters import NoPriorsParameters +from orchestrator.core.discoveryspace.no_priors_utils import ( get_list_of_entities_from_df_and_space, get_source_and_target, + order_df_for_sampling_with_no_priors, ) from orchestrator.core.discoveryspace.samplers import BaseSampler from orchestrator.core.discoveryspace.space import DiscoverySpace, Entity @@ -137,3 +137,6 @@ def parameters_model(cls) -> type[BaseModel] | None: def __init__(self, parameters: NoPriorsParameters) -> None: self.params = parameters + + +# Made with Bob diff --git a/orchestrator/core/discoveryspace/no_priors_utils.py 
b/orchestrator/core/discoveryspace/no_priors_utils.py new file mode 100644 index 000000000..36a06e829 --- /dev/null +++ b/orchestrator/core/discoveryspace/no_priors_utils.py @@ -0,0 +1,953 @@ +# Copyright IBM Corporation 2025, 2026 +# SPDX-License-Identifier: MIT + +""" +Utility functions for no-priors sampling, including: +- High-dimensional sampling strategies (CLHS, Sobol, random) +- DataFrame ordering and index mapping +- Entity/point conversion and validation +- Discovery space data extraction +""" + +from __future__ import annotations + +import itertools +import logging +import math +import random +from typing import TYPE_CHECKING, Any, Literal + +import numpy as np +import pandas as pd +from scipy.stats.qmc import Sobol + +from orchestrator.core.discoveryspace.space import DiscoverySpace +from orchestrator.schema.virtual_property import PropertyAggregationMethodEnum + +if TYPE_CHECKING: + from collections.abc import Hashable + + from orchestrator.metastore.project import ProjectContext + from orchestrator.schema.entity import Entity + +logger = logging.getLogger(__name__) + + +# ============================================================================ +# 1D Sampling Functions +# ============================================================================ + + +def get_index_list_van_der_corput( + length_segment: int, + tot_points_to_sample: int, + sampled_indices: list[int] | None = None, + sort: bool = False, + verbose: bool = False, +) -> list[int]: + """ + Selects indices from a 1D segment using a modified Van der Corput sequence. 
+ + Args: + length_segment: Total number of units in the 1D segment + tot_points_to_sample: Total number of indices to sample + sampled_indices: List of indices already sampled + sort: If True, returns the final list sorted + verbose: If True, prints debug information + + Returns: + List of sampled indices + + Raises: + ValueError: If tot_points_to_sample exceeds length_segment + """ + if tot_points_to_sample == 0: + return [] + + if tot_points_to_sample > length_segment: + raise ValueError( + "ValueError: You are trying to sample more points than those that are available" + ) + + if sampled_indices is None: + sampled_indices = [] + + if len(sampled_indices) == length_segment: + maximal_indices_list = list(range(length_segment)) + if sampled_indices.sort() != maximal_indices_list: + logging.error( + "Sampled indices do not correspond to [0,..., max_n_indices -1]" + "Returning list(range(max_n_indices)" + ) + return maximal_indices_list + + if len(sampled_indices) > tot_points_to_sample: + logging.warning( + "Number of sampled indices is greater than the number of indices you want to sample" + "Returning sampled indices" + ) + return sampled_indices + + index_list = list(sampled_indices) + sampled_set = set(index_list) + + for point in [0, length_segment - 1]: + if point not in sampled_set: + index_list.append(point) + sampled_set.add(point) + if len(index_list) == tot_points_to_sample: + return sorted(index_list) + + def build_prefix_and_len(index_list: list[int]) -> tuple[list[int], int]: + if not index_list: + return [0], 0 + + M = max(index_list) + 1 + sampled_set = set(index_list) + prefix = [0] * (M + 1) + s = 0 + + for i in range(M): + s += 1 if i in sampled_set else 0 + prefix[i + 1] = s + + return prefix, M + + def get_list_min_weight( + prefix: list[int], M: int, d: int, selectable_indices: list[int] + ) -> list[int]: + vals = {} + for i in selectable_indices: + if i >= M: + break + left = max(0, i - d) + right = min(M - 1, i + d) + total = prefix[right + 
1] - prefix[left] + denom = right - left + 1 + mean = total / denom + vals[i] = mean + + if not vals: + return [] + + min_val = min(vals.values()) + out = [] + for i in selectable_indices: + if i >= M: + break + if vals.get(i) == min_val: + out.append(i) + return out + + def get_selectable_indices() -> list[int]: + return [i for i in range(length_segment) if i not in sampled_set] + + max_d = length_segment + + while len(index_list) < tot_points_to_sample: + selection = 0 + selectable_indices = get_selectable_indices() + prefix, M = build_prefix_and_len(index_list=index_list) + d = 1 + previous_set = selectable_indices + + while selection == 0: + indices = get_list_min_weight(prefix, M, d, selectable_indices) + + if not indices: + if not previous_set: + raise ValueError( + "Previous candidate set should not be empty or None" + ) + if verbose: + logger.info( + f"No intersection found with d={d}. Using the previous set " + f"Appending to {index_list} the first element of {previous_set}" + ) + chosen = previous_set[0] + index_list.append(chosen) + sampled_set.add(chosen) + selection = 1 + else: + previous_set = selectable_indices + selectable_indices = indices + + if len(selectable_indices) == 1 or d == max_d: + if verbose: + logger.info( + f"Appending to {index_list} the first element of {selectable_indices}" + ) + chosen = selectable_indices[0] + index_list.append(chosen) + sampled_set.add(chosen) + selection = 1 + + d += 1 + + if sort: + return sorted(index_list) + return index_list + + +# ============================================================================ +# High-Dimensional Sampling Functions +# ============================================================================ + + +def concatenated_latin_hypercube_sampling( + dimensions: list[int], + final_sample_size: int, + seed: int | None = None, +) -> list[list[int]]: + """ + Generates samples using Concatenated Latin Hypercube Sampling. 
+ + Args: + dimensions: Cardinality (size) of each dimension + final_sample_size: Total number of points to sample + seed: Optional PRNG seed for reproducibility + + Returns: + List of sampled points + + Raises: + ValueError: If any dimension size is less than 1 + """ + if any(d <= 0 for d in dimensions): + raise ValueError( + f"All dimensions must be >= 1, received dimensions={dimensions}" + ) + + if final_sample_size <= 0: + return [] + + rng = random.Random() if seed is None else random.Random(seed) # noqa: S311 + pools: list[list[int]] = [list(range(d)) for d in dimensions] + samples: list[list[int]] = [] + + for _ in range(final_sample_size): + point: list[int] = [] + for j, d in enumerate(dimensions): + if not pools[j]: + pools[j] = list(range(d)) + k = rng.randrange(len(pools[j])) + value = pools[j].pop(k) + point.append(value) + samples.append(point) + + return samples + + +def sobol_sampling( + dimensions: list[int], final_sample_size: int, seed: int | None = None +) -> list[list[int]]: + """ + Generates Sobol sampled points scaled to integer dimensions. + + Falls back to CLHS if collisions are detected. 
+ + Args: + dimensions: Size of each dimension + final_sample_size: Number of points to sample + seed: Random seed for the Sobol scrambler + + Returns: + List of sampled points + """ + sampler = Sobol(d=len(dimensions), scramble=True, rng=seed) + points = sampler.random(final_sample_size) + + discrete_points = [ + [int(val * d) for val, d in zip(p, dimensions, strict=True)] for p in points + ] + + unique_points = {tuple(p) for p in discrete_points} + n_collisions = final_sample_size - len(unique_points) + + if n_collisions > 0: + logger.error( + f"Sobol sampling failed, {n_collisions} collisions detected, defaulting to clhs sampling" + ) + return concatenated_latin_hypercube_sampling( + dimensions=dimensions, final_sample_size=final_sample_size, seed=seed + ) + + return discrete_points + + +def random_high_dimensional_sampling( + dimensions: list[int], final_sample_size: int, seed: int | None = None +) -> list[list[int]]: + """ + Generate unique random samples from a high-dimensional space. + + Args: + dimensions: Cardinality of each dimension + final_sample_size: Total number of points to sample + seed: Optional PRNG seed + + Returns: + List of sampled points + + Raises: + ValueError: If final_sample_size exceeds total configurations + """ + if seed is not None: + random.seed(seed) + + num_configs = math.prod(dimensions) + if final_sample_size > num_configs: + raise ValueError( + f"Cannot generate {final_sample_size} unique samples. " + f"The sample space only contains {num_configs} possibilities." + ) + + configs = list(itertools.product(*[range(d) for d in dimensions])) + actual_sample_size = min(final_sample_size, len(configs)) + + if actual_sample_size < final_sample_size: + logger.warning( + f"Requested {final_sample_size} samples but only {len(configs)} unique " + f"configurations available. Sampling {actual_sample_size} instead." 
+ ) + + samples = random.sample(configs, actual_sample_size) + return [list(s) for s in samples] + + +def get_sampling_indices_multi_dimensional( + dimensions: list[int], + n: int | Literal["all", "max"], + space: dict[str, int] | None = None, + strategy: Literal["random", "clhs", "sobol"] = "clhs", + seed: int | None = None, +) -> list[list[int]]: + """ + Generate sampling indices for a high-dimensional space. + + Args: + dimensions: Sizes of each dimension + n: Number of points to sample ('all', 'max', or integer) + space: Optional mapping of dimension names to sizes + strategy: Sampling strategy ('random', 'clhs', or 'sobol') + seed: Controls randomness + + Returns: + List of sampled multi-dimensional coordinates + """ + if seed is not None: + random.seed(seed) + + if space: + indices_dict = { + k: get_index_list_van_der_corput(v, v) for k, v in space.items() + } + if [len(indices) for indices in list(indices_dict.values())] != dimensions: + logger.error( + f"A space dict has been provided ->{space}. It is inconsistent with dimensions={dimensions}" + ) + raise ValueError("Space has inconsistent dimensions!") + logger.info( + "Sampling indices for each named dimension (ordered low to high): %s", + indices_dict, + ) + + orders = [get_index_list_van_der_corput(v, v) for v in dimensions] + + if logger.isEnabledFor(logging.DEBUG): + logger.debug("Dimensions: %s", dimensions) + logger.debug("Sampling orders for each dimension:") + for i, o in enumerate(orders): + logger.debug("Dimension %d order: %s", i, o) + + maximum_n = math.prod(dimensions) + lcm = math.lcm(*dimensions) + + if lcm != maximum_n: + logger.debug( + "Periodicity detected, the sampling subroutine will ensure that you will not sample" + "the same configuration more than once." 
+ ) + + if isinstance(n, str): + if n == "all": + n = maximum_n + elif n == "max": + n = max(dimensions) + else: + raise ValueError(f"Unrecognized string for n: {n}") + + if n > maximum_n: + logger.warning( + f"Maximal sample size is {maximum_n}, you requested {n} sampling prescriptions." + f"Elaborating prescription for n_samples = {maximum_n}" + ) + + logger.debug("Preparing to sample %d out of %d possible points.", n, maximum_n) + + match strategy: + case "random": + return random_high_dimensional_sampling(dimensions, n, seed=seed) + case "clhs": + return concatenated_latin_hypercube_sampling( + dimensions=dimensions, final_sample_size=n, seed=seed + ) + case "sobol": + return sobol_sampling(dimensions=dimensions, final_sample_size=n, seed=seed) + case _: + raise NotImplementedError(f"Strategy {strategy} is unknown") + + +# ============================================================================ +# DataFrame Ordering and Index Mapping +# ============================================================================ + + +def get_index_list_nn_high_dimensional( + orders_to_sample: list[list[int]], dimensions: list[int] +) -> list[int]: + """ + Map high-dimensional sampling orders to linear (flattened) indices. 
+ + Args: + orders_to_sample: List of multi-dimensional coordinates + dimensions: Size of each dimension + + Returns: + List of linear indices + + Warns: + If duplicate or out-of-bounds indices are detected + """ + indices = [] + cprod = np.cumprod(np.array(dimensions), dtype=int).tolist() + maximum_n = cprod[-1] + + for order in orders_to_sample: + index = 0 + multiplier = 1 + for i in reversed(range(len(dimensions))): + index += order[i] * multiplier + multiplier *= dimensions[i] + + if index > maximum_n: + logging.warning( + f"Out of bound index {index} computed from order {order}, dimensions are {dimensions}" + ) + indices.append(index) + + if len(set(indices)) != len(indices): + logger.error(f"{len(indices) - len(set(indices))} Duplicated indices!") + + out_of_bounds_list = [i for i in indices if i > maximum_n] + if out_of_bounds_list: + logger.error( + f"The following indices are out of bound: {out_of_bounds_list}, maximum admissible value is {maximum_n-1}" + ) + + return indices + + +def order_df_for_get_index_list_nn_high_dimensional( + df: pd.DataFrame, constitutive_properties: list[str], dimensions: list[int] +) -> pd.DataFrame: + """ + Ensure DataFrame is ordered and complete for high-dimensional index generation. + + Args: + df: Input DataFrame + constitutive_properties: Column names defining the space + dimensions: Expected cardinality for each property + + Returns: + DataFrame sorted and augmented with missing combinations + """ + df = df.sort_values(by=constitutive_properties).reset_index(drop=True) + expected_len = math.prod(dimensions) + + if len(df) == expected_len: + return df + + unique_values = [ + sorted(df[prop].dropna().unique()) for prop in constitutive_properties + ] + all_combinations = list(itertools.product(*unique_values)) + actual_expected_len = len(all_combinations) + + logger.warning( + f"DataFrame length mismatch: expected {expected_len} (product of {dimensions}), " + f"but got {len(df)}. 
Actual unique combinations: {actual_expected_len}." + ) + + existing_combinations = { + tuple(row[prop] for prop in constitutive_properties) for _, row in df.iterrows() + } + + missing_combinations = [ + comb for comb in all_combinations if comb not in existing_combinations + ] + + if missing_combinations: + logger.info( + f"Injecting {len(missing_combinations)} missing rows to satisfy the property." + ) + injected_rows = [] + for comb in missing_combinations: + row_data = dict(zip(constitutive_properties, comb, strict=False)) + for col in df.columns: + if col not in constitutive_properties: + row_data[col] = pd.NA + injected_rows.append(row_data) + + df = pd.concat([df, pd.DataFrame(injected_rows)], ignore_index=True) + df = df.sort_values(by=constitutive_properties).reset_index(drop=True) + logger.info(f"Injected rows: {injected_rows}") + + return df + + +def order_df_for_sampling_with_no_priors( + df: pd.DataFrame, + constitutive_properties: list[str], + n: int, + strategy: Literal["random", "clhs", "sobol"], +) -> pd.DataFrame: + """ + Orders a DataFrame for high-dimensional sampling without prior knowledge. + + Args: + df: Input dataset + constitutive_properties: Column names defining the configuration space + n: Number of samples to generate + strategy: Sampling strategy + + Returns: + DataFrame with n sampled rows + + Raises: + ValueError: If n <= 0 after adjustment or no samples available + """ + len_original = len(df) + df_unique = df.drop_duplicates(subset=constitutive_properties).reset_index( + drop=True + ) + delta_len = len_original - len(df_unique) + if delta_len > 0: + logging.warning( + f"Removing {delta_len} duplicate configurations." + f"They are characterized by the same combination of constitutive properties = {constitutive_properties}" + ) + + if n > len(df_unique): + logging.warning( + f"Requested {n} samples, but DataFrame has only {len(df_unique)} rows. Adjusting n to {len(df_unique)}." 
+ ) + n = len(df_unique) + + if n <= 0: + logging.error( + f"No samples available to select. DataFrame has {len(df_unique)} rows and {n} samples were requested." + ) + return pd.DataFrame(columns=df_unique.columns) + + def _get_sorted_uniques(prop: str) -> list: + vals = df_unique[prop].unique() + try: + return sorted(vals) + except TypeError: + logging.warning( + f"Cannot sort mixed types for property '{prop}'. " + "Keeping original order." + ) + return list(vals) + + value_dict = {prop: _get_sorted_uniques(prop) for prop in constitutive_properties} + space_dict = {prop: len(vals) for prop, vals in value_dict.items()} + dimensions = list(space_dict.values()) + + df_unique = order_df_for_get_index_list_nn_high_dimensional( + df_unique, constitutive_properties, dimensions=dimensions + ).reset_index(drop=True) + + orders_to_sample = get_sampling_indices_multi_dimensional( + dimensions=dimensions, space=space_dict, n=n, strategy=strategy + ) + + indices_to_sample = get_index_list_nn_high_dimensional(orders_to_sample, dimensions) + + logger.info(f"Indexes are:\n {indices_to_sample}") + try: + return df_unique.iloc[indices_to_sample] + except IndexError: + logging.error( + f"Index Error detected. Length of the dataframe is {len(df_unique)}." 
+ "The indices that cause the error are:" + ) + max_len = len(df_unique) + out_of_bounds_list = [i for i in indices_to_sample if i < 0 or i >= max_len] + logging.error(out_of_bounds_list) + logging.error("Returning empty dataset") + return pd.DataFrame({}) + + +# ============================================================================ +# Discovery Space Data Extraction +# ============================================================================ + + +def get_project_context() -> ProjectContext: + """Retrieve the current ADO project context from configuration.""" + import orchestrator.cli.core.config + + ado_configuration = orchestrator.cli.core.config.AdoConfiguration.load() + return ado_configuration.project_context # type: ignore[name-defined] + + +def get_space( + space_or_space_id: DiscoverySpace | str, +) -> DiscoverySpace: + """Get a DiscoverySpace object from either a space object or identifier string.""" + if isinstance(space_or_space_id, DiscoverySpace): + return space_or_space_id + + return DiscoverySpace.from_stored_configuration( + project_context=get_project_context(), + space_identifier=space_or_space_id, + ) + + +def get_df_all_entities_no_measurements( + discoverySpace: DiscoverySpace | str, +) -> pd.DataFrame: + """ + Return a DataFrame of all entities in the Discovery Space. 
+ + Returns: + DataFrame with columns: ['identifier', ] + """ + space = get_space(space_or_space_id=discoverySpace) + entity_space = space.entitySpace + cp_ids = [cp.identifier for cp in entity_space.constitutiveProperties] + + list_of_dicts_to_convert = [] + for point_values in entity_space.sequential_point_iterator(): + point_dict = dict(zip(cp_ids, point_values, strict=True)) + entity = entity_space.entity_for_point(point_dict) + ed = {"identifier": entity.identifier} + ed.update(point_dict) + list_of_dicts_to_convert.append(ed) + + return pd.DataFrame(list_of_dicts_to_convert) + + +def get_df_at_least_one_measured_value( + discoverySpace: DiscoverySpace | str, + targetOutput_list: list[str] | None = None, + add_measurement_id: bool = False, +) -> pd.DataFrame: + """ + Return a DataFrame of entities with at least one measured target output. + + Returns: + DataFrame with columns: ['identifier' (optional), , ] + """ + if not targetOutput_list: + targetOutput_list = [] + space = get_space(space_or_space_id=discoverySpace) + col_list = [cp.identifier for cp in space.entitySpace.constitutiveProperties] + if add_measurement_id: + col_list = ["identifier", *col_list] + + discoverySpace.sample_store.refresh() + + df = pd.DataFrame( + space.matchingEntitiesTable( + property_type="target", + aggregationMethod=PropertyAggregationMethodEnum.mean, + ) + ) + + if df.empty: + logger.warning( + "No measured properties found in the discovery space\nReturning empty DataFrame\n " + ) + return df + + all_df_cols = list(df.columns) + valid_targetOutput_list = [] + for el in targetOutput_list: + if el in all_df_cols: + valid_targetOutput_list.append(el) + elif f"{el}-mean" in all_df_cols and el not in all_df_cols: + logger.warning( + f"Column named '{el}-mean' (instead of '{el}', which is not present)" + "found in the DataFrame obtained through matchingEntitiesTable. " + f"Renaming it to '{el}'." 
+ ) + df.rename(columns={f"{el}-mean": el}, inplace=True) + valid_targetOutput_list += [el] + elif f"{el}-mean" in all_df_cols and el in all_df_cols: + logger.warning( + f"Columns named '{el}-mean' and '{el}'" + "found in the DataFrame obtained through matchingEntitiesTable. " + f"Renaming it to '{el}'." + ) + logger.error("Unexpected behavior can happen!") + df.rename(columns={f"{el}-mean": el}, inplace=True) + valid_targetOutput_list += [el] + col_list += valid_targetOutput_list + + if valid_targetOutput_list != targetOutput_list: + if len(valid_targetOutput_list) == 0: + logger.error( + "No valid target in the columns of the DataFrame." + f"columns are:\t{list(df.columns)}." + f"First rows are:\n{df.head(5)}" + ) + else: + not_found = [ + t for t in targetOutput_list if t not in valid_targetOutput_list + ] + logger.error( + f"Found measurements for the following valid targets:\t{valid_targetOutput_list}" + ) + logger.error( + f"No measurement found for the following valid targets:\t{not_found}" + ) + + removed_cols = [c for c in list(df.columns) if c not in col_list] + logger.debug( + "Obtaining df with at least one measured target." + f"Removed columns: {removed_cols}" + ) + + df = df[col_list] + df.dropna(inplace=True) + + if df.empty: + logger.warning( + "Although there were some measured properties in the discovery space." + ) + logger.warning( + "All measured properties in the discovery space" + f"are different from the desired outputs {targetOutput_list}.Returning empty DataFrame\n " + ) + + return df + + +def get_source_and_target( + discoverySpace: DiscoverySpace | str, + targetOutput: str, + log_string: str = "", +) -> tuple[pd.DataFrame, pd.DataFrame]: + """ + Build source (labeled) and target (unlabeled) DataFrames for a target output. 
+ + Returns: + Tuple of (source_df, target_df) + """ + dfm = get_df_at_least_one_measured_value(discoverySpace, [targetOutput]) + dfu = get_df_all_entities_no_measurements(discoverySpace) + keys = [c for c in dfu.columns if c in dfm.columns and c != "identifier"] + + if dfm.empty: + logger.warning("The source space is empty") + return dfm, dfu + + df = dfu.merge(dfm, on=keys, how="left") + + if targetOutput not in list(df.columns): + logger.info( + f"""The target output was not present in the columns of the measured+unmeasured DataFrame,' \ + meaning that '{targetOutput}' has never been measured in this space. + dfm.empty = {df.empty}. Adding an empty column to the DataFrame. + """ + ) + logger.debug("Adding an empty column to the DataFrame.") + df[targetOutput] = pd.NA + + if targetOutput in list(df.columns): + df_measured_drop_na = df.dropna(subset=[targetOutput]) + df_unmeasured_drop_na = df[df[targetOutput].isna()].drop(columns=[targetOutput]) + n_rows_dropped = len(df) - len(df_measured_drop_na) + logger.debug( + f"Dropped {n_rows_dropped} rows. 
Function called with log_string={log_string}" + ) + if df_measured_drop_na.empty: + logger.warning( + f"Empty source after dropping rows that contain Nan in {targetOutput} column" + ) + if df_unmeasured_drop_na.empty: + logger.warning( + f"Empty target after filtering rows that contain Nan in {targetOutput} column" + ) + return df_measured_drop_na, df_unmeasured_drop_na + + save_path = "df_with_no_targetOutput_columns.csv" + logger.error( + f"'{targetOutput}' column is missing, saving df in {save_path}, returning unmerged DataFrames" + ) + df.to_csv(save_path) + return dfm, dfu + + +# ============================================================================ +# Entity/Point Conversion +# ============================================================================ + + +def validate_points_in_space( + points: list[dict], + space: DiscoverySpace, +) -> tuple[list[dict], list[int]]: + """ + Validate point dictionaries against a Discovery Space. + + Returns: + Tuple of (valid_points, invalid_indices) + """ + valid_points: list[dict] = [] + invalid_indices: list[int] = [] + + for i, p in enumerate(points): + if space.entitySpace.isPointInSpace(p): + valid_points.append(p) + else: + invalid_indices.append(i) + return valid_points, invalid_indices + + +def df_to_points( + df: pd.DataFrame, + cols: list[str] | None = None, + dropna: bool = True, + drop_duplicates: bool = False, +) -> list[dict[Hashable, Any]]: + """ + Convert DataFrame rows to list of point dictionaries. 
+ + Args: + df: Input DataFrame + cols: Columns to include + dropna: If True, drop rows containing NaN + drop_duplicates: If True, drop duplicate rows + + Returns: + List of point dictionaries + """ + if cols is None: + cols = list(df.columns) + missing = set(cols) - set(df.columns) + if missing: + raise KeyError(f"Requested columns not present in DataFrame: {missing}") + + sub = df[cols].copy() + if dropna: + sub = sub.dropna(how="any") + if drop_duplicates: + sub = sub.drop_duplicates() + + def to_py(x: object) -> object: + if isinstance(x, (np.generic)): + return x.item() + return x + + for c in sub.columns: + sub[c] = sub[c].map(to_py) + + return sub.to_dict(orient="records") + + +def df_to_points_parsing( + df: pd.DataFrame, + cols: list[str] | None = None, + dropna: bool = True, + parse_values: bool = False, +) -> list[dict]: + """Convert DataFrame to points with optional string value parsing.""" + import ast + + points = df_to_points(df, cols=cols, dropna=dropna) + if not parse_values: + return points + + parsed = [] + for p in points: + newp = {} + for k, v in p.items(): + if isinstance(v, str): + try: + newp[k] = ast.literal_eval(v) + except Exception: + newp[k] = v + else: + newp[k] = v + parsed.append(newp) + return parsed + + +def make_points_from_df( + df: pd.DataFrame, + space: DiscoverySpace, + cols: list[str] | None = None, + dropna: bool = True, + parse_values: bool = True, +) -> list[dict]: + """ + Convert DataFrame of constitutive properties into point dictionaries. 
+ + Args: + df: Input DataFrame + space: Discovery Space providing canonical order + cols: Explicit list of columns to use + dropna: If True, drop rows with NaN + parse_values: If True, parse string values + + Returns: + List of point dictionaries + """ + if cols is None: + cols = [cp.identifier for cp in space.entitySpace.constitutiveProperties] + + missing = set(cols) - set(df.columns) + if missing: + raise KeyError(f"Requested columns not present in DataFrame: {missing}") + + return df_to_points_parsing(df, cols=cols, dropna=dropna, parse_values=parse_values) + + +def get_list_of_entities_from_df_and_space( + df: pd.DataFrame, space: DiscoverySpace +) -> list[Entity]: + """ + Convert DataFrame rows to Entity objects validated against a discovery space. + + Args: + df: DataFrame containing constitutive property values + space: DiscoverySpace defining the entity space constraints + + Returns: + List of valid Entity objects + """ + points = make_points_from_df(df=df, space=space) + valid_points, __ = validate_points_in_space(points, space) + + list_of_entities = [] + from orchestrator.schema.point import SpacePoint + + for p in valid_points: + sp = SpacePoint(entity=p) + entity = sp.to_entity(generatorid="no_priors_characterization") + list_of_entities.append(entity) + + numberEntities = len(list_of_entities) + if numberEntities != len(df): + numberEntities_log = f"""Warning: number of valid entities {numberEntities} is different from the number of rows in the ordered df {len(df)}. + This means that some rows in the ordered df did not correspond to valid entities in the discovery space. 
+ """ + logging.warning(numberEntities_log) + return list_of_entities + + +# Made with Bob diff --git a/plugins/operators/no-priors-characterization/README.md b/plugins/operators/no-priors-characterization/README.md deleted file mode 100644 index f7bdf3c03..000000000 --- a/plugins/operators/no-priors-characterization/README.md +++ /dev/null @@ -1,46 +0,0 @@ -# ADO No-Priors Characterization Operator - -`ado-no-priors-characterization` is an operator plugin for the -[Accelerated Discovery Orchestrator (ADO)](https://github.com/IBM/ado), -providing initial exploration of discovery spaces using high-dimensional -sampling strategies. - -**No-Priors Characterization** is designed for unbiased exploration when no -measured data exists yet, establishing an initial dataset for subsequent -model-based exploration. - -## How it Works - -The `No-Priors Characterization` operator uses different sampling strategies -to ensure good coverage of the discovery space: - -- **`random`**: Random sampling across the space for unbiased exploration. - This provides the baseline sampling approach. -- **`clhs`** (Concatenated Latin Hypercube Sampling): Ensures uniform coverage - by enforcing stratification in each dimension independently. Each dimension - cycles through all possible values before repeating. -- **`sobol`**: Sobol sequence sampling for quasi-random low-discrepancy coverage - -The operator retrieves already-measured entities from the discovery space, -orders the unmeasured entities using the specified sampling strategy, -and yields entities in batches -for measurement. 
- -## Installation - -You can install the `No-Priors Characterization` operator and its dependencies -(including `ado-core`) directly from PyPI: - -```bash -pip install ado-no-priors-characterization -``` - -## More Information - -To learn more about No-Priors Characterization and explore the full -capabilities of ADO, including detailed documentation, configuration guides, and -additional examples, visit the official ADO website: - -- **No-Priors Quickstart**: -- **Configuring No-Priors**: -- **ADO Documentation**: diff --git a/plugins/operators/no-priors-characterization/pyproject.toml b/plugins/operators/no-priors-characterization/pyproject.toml deleted file mode 100644 index 691d60f55..000000000 --- a/plugins/operators/no-priors-characterization/pyproject.toml +++ /dev/null @@ -1,29 +0,0 @@ -[project] -name = "ado-no-priors-characterization" -description = "No-priors characterization operator for sampling discovery spaces using high-dimensional sampling strategies" -readme = "README.md" -requires-python = ">=3.10,<3.14" -dependencies = [ - "ado-core", - "numpy", - "pandas>=2.2.0", - "scipy", -] -dynamic = ["version"] - -[project.entry-points."ado.operators"] -no-priors-characterization = "no_priors_characterization.operator" - -[build-system] -requires = ["setuptools", "setuptools_scm"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.packages.find] -where = ["src"] -include = ["no_priors_characterization*"] - -[tool.setuptools_scm] -root = "../../../" - -[tool.uv.sources] -ado-core = { workspace = true } diff --git a/plugins/operators/no-priors-characterization/src/no_priors_characterization/__init__.py b/plugins/operators/no-priors-characterization/src/no_priors_characterization/__init__.py deleted file mode 100644 index ff303fb0e..000000000 --- a/plugins/operators/no-priors-characterization/src/no_priors_characterization/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright IBM Corporation 2025, 2026 -# SPDX-License-Identifier: MIT - -from 
no_priors_characterization.operator import no_priors_characterization - -__all__ = ["no_priors_characterization"] diff --git a/plugins/operators/no-priors-characterization/src/no_priors_characterization/operator.py b/plugins/operators/no-priors-characterization/src/no_priors_characterization/operator.py deleted file mode 100644 index cf0840bc5..000000000 --- a/plugins/operators/no-priors-characterization/src/no_priors_characterization/operator.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright IBM Corporation 2025, 2026 -# SPDX-License-Identifier: MIT - -import logging -from importlib.metadata import version - -from no_priors_characterization.no_priors_pydantic import NoPriorsParameters -from orchestrator.core.discoveryspace.space import DiscoverySpace -from orchestrator.core.operation.config import FunctionOperationInfo -from orchestrator.core.operation.operation import OperationOutput -from orchestrator.modules.operators.collections import characterize_operation - -logger = logging.getLogger(__name__) - - -@characterize_operation( - name="no_priors_characterization", - configuration_model=NoPriorsParameters, - configuration_model_default=NoPriorsParameters(targetOutput="default_target"), - description=""" - No-priors characterization samples a discovery space using high-dimensional - sampling strategies (random, CLHS, Sobol, etc.) without relying on prior - model knowledge or feature importance. This operator is useful for initial - exploration of discovery spaces when no training data exists yet. - """, - version=version("ado-no-priors-characterization"), -) -def no_priors_characterization( - discoverySpace: DiscoverySpace = None, # type: ignore[name-defined] - operationInfo: FunctionOperationInfo | None = None, - **kwargs: object, -) -> OperationOutput: - """ - Execute no-priors characterization on a discovery space. - - Samples entities using high-dimensional sampling strategies without requiring - prior model training or feature importance information. 
Useful for initial - characterization when no measured data exists. - - Args: - discoverySpace: The discovery space to characterize - operationInfo: Optional operation metadata - **kwargs: Additional parameters validated against NoPriorsParameters model - - Returns: - OperationOutput containing the operation resources and metadata - """ - # Lazy import to avoid circular import issues during plugin loading - from orchestrator.modules.operators.randomwalk import ( - CustomSamplerConfiguration, - RandomWalkParameters, - SamplerModuleConf, - random_walk, - ) - - params = NoPriorsParameters.model_validate(kwargs) - logger.info( - f"No-priors characterization starts. Target variable = {params.targetOutput}" - ) - logger.info(f"Parameters: {params}") - - # Configure the no-priors sampler - no_priors_module = SamplerModuleConf( - moduleClass="NoPriorsSampleSelector", - moduleName="no_priors_characterization.no_priors_sampler", - ) - - no_priors_sampler_config = CustomSamplerConfiguration( - module=no_priors_module, parameters=params - ) - - no_priors_random_walk_params = RandomWalkParameters( - samplerConfig=no_priors_sampler_config, - batchSize=params.batchSize, - numberEntities=params.samples, - singleMeasurement=True, - ) - - # Execute the random walk with the no-priors sampler - from orchestrator.core.metadata import ConfigurationMetadata - - # Create metadata with custom fields for tracking no-priors parameters - metadata = ConfigurationMetadata( - name="No-priors characterization", - description=f"No-priors characterization using {params.sampling_strategy} strategy with {params.samples} samples", - ) - # Add custom fields using extra="allow" in ConfigurationMetadata - metadata.sampling_strategy = params.sampling_strategy # type: ignore[attr-defined] - metadata.samples = params.samples # type: ignore[attr-defined] - - updated_operation_info = FunctionOperationInfo( - metadata=metadata, - actuatorConfigurationIdentifiers=( - 
operationInfo.actuatorConfigurationIdentifiers if operationInfo else [] - ), - ) - - op_output = random_walk( - discoverySpace=discoverySpace, - operationInfo=updated_operation_info, - **no_priors_random_walk_params.model_dump(), - ) - - logger.info("No-priors characterization completed") - - return op_output diff --git a/plugins/operators/no-priors-characterization/src/no_priors_characterization/utils/__init__.py b/plugins/operators/no-priors-characterization/src/no_priors_characterization/utils/__init__.py deleted file mode 100644 index deb2683da..000000000 --- a/plugins/operators/no-priors-characterization/src/no_priors_characterization/utils/__init__.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright IBM Corporation 2025, 2026 -# SPDX-License-Identifier: MIT - -# Export commonly used utilities for easier imports -from no_priors_characterization.utils.high_dimensional_sampling import ( - concatenated_latin_hypercube_sampling, - get_sampling_indices_multi_dimensional, -) -from no_priors_characterization.utils.one_dimensional_sampling import ( - get_index_list_ordered_partitions, - get_index_list_van_der_corput, -) -from no_priors_characterization.utils.space_df_connector import ( - get_df_all_entities_no_measurements, - get_list_of_entities_from_df_and_space, - get_project_context, - get_source_and_target, - get_space, -) -from orchestrator.utilities.pandas import sort_rows_by_column_names - -__all__ = [ - "concatenated_latin_hypercube_sampling", - "get_df_all_entities_no_measurements", - "get_index_list_ordered_partitions", - "get_index_list_van_der_corput", - "get_list_of_entities_from_df_and_space", - "get_project_context", - "get_sampling_indices_multi_dimensional", - "get_source_and_target", - "get_space", - "sort_rows_by_column_names", -] diff --git a/plugins/operators/no-priors-characterization/src/no_priors_characterization/utils/high_dimensional_sampling.py 
b/plugins/operators/no-priors-characterization/src/no_priors_characterization/utils/high_dimensional_sampling.py deleted file mode 100644 index a77bbabb4..000000000 --- a/plugins/operators/no-priors-characterization/src/no_priors_characterization/utils/high_dimensional_sampling.py +++ /dev/null @@ -1,338 +0,0 @@ -# Copyright IBM Corporation 2025, 2026 -# SPDX-License-Identifier: MIT - -import logging -import math -import random -from typing import Literal - -import numpy as np -from scipy.stats.qmc import Sobol - -from no_priors_characterization.utils.one_dimensional_sampling import ( - get_index_list_van_der_corput, -) - -logger_high_dimensional = logging.getLogger(__name__) - - -def concatenated_latin_hypercube_sampling( - dimensions: list[int], - final_sample_size: int, - seed: int | None = None, -) -> list[list[int]]: - """ - Generates samples using a Concatenated Latin Hypercube Sampling strategy. - - For each dimension independently, this method enforces a 1D stratification - (Latin Hypercube property) by generating random permutations of the - possible values. If the number of requested samples 'final_sample_size' exceeds the cardinality - of a dimension, new random permutations are concatenated to the sequence. - - This guarantees that for any dimension j with size d_j, every sequence - of d_j samples contains exactly one instance of every value in range(d_j). - - Args: - dimensions (List[int]): Cardinality (size) of each dimension. Must be positive. - final_sample_size (int): Total number of points to sample. - seed (Optional[int]): Optional PRNG seed for reproducibility. - - Returns: - List[List[int]]: A list of final_sample_size sampled points, where each point is a - list of indices corresponding to the dimensions. - - Raises: - ValueError: If any dimension size is less than 1. 
- """ - if any(d <= 0 for d in dimensions): - raise ValueError( - f"All dimensions must be >= 1, received dimensions={dimensions}" - ) - - if final_sample_size <= 0: - return [] - - # Use default RNG when seed is not provided, otherwise create seeded instance - rng = random.Random() if seed is None else random.Random(seed) # noqa: S311 - - # Per-dimension pools: active permutation for the current block. - # We maintain the Latin Hypercube property by sampling without replacement. - pools: list[list[int]] = [list(range(d)) for d in dimensions] - samples: list[list[int]] = [] - - for _ in range(final_sample_size): - point: list[int] = [] - for j, d in enumerate(dimensions): - # If the current permutation block is exhausted, start a new one (new cycle). - if not pools[j]: - pools[j] = list(range(d)) - - # Select a random element from the remaining pool for this block. - k = rng.randrange(len(pools[j])) - value = pools[j].pop(k) - point.append(value) - - samples.append(point) - - return samples - - -# NOTE: preliminary tests on collision reveal that if final_sample_size is half of the product of dimensions collisions are rare -def sobol_sampling( - dimensions: list[int], final_sample_size: int, seed: int | None = None -) -> list[list[int]]: - """ - Generates Sobol sampled points scaled to integer dimensions. - - This function uses a Sobol sequence to generate points in the unit hypercube [0, 1)^d, - scales them to the specified integer dimensions, and checks for collisions. If collisions - occur (duplicate points), it falls back to Concatenated Latin Hypercube Sampling. - - Args: - dimensions (list[int]): A list of integers representing the size (cardinality) of each dimension. - final_sample_size (int): The number of points to sample. - seed (int | None, optional): Random seed for the Sobol scrambler. Defaults to None. - - Returns: - list[list[int]]: A list of final_sample_size points, where each point is a list of integer coordinates. 
- """ - # Sobol generates points in [0, 1). We scale them to the integer dimensions. - - sampler = Sobol(d=len(dimensions), scramble=True, rng=seed) - points = sampler.random(final_sample_size) - - # Scale and floor to get integer indices - discrete_points = [ - [int(val * d) for val, d in zip(p, dimensions, strict=True)] for p in points - ] - - # Check for collisions - # Convert inner lists to tuples because lists are unhashable and cannot be used in a set - unique_points = {tuple(p) for p in discrete_points} - n_collisions = final_sample_size - len(unique_points) - - if n_collisions > 0: - logger_high_dimensional.error( - f"Sobol sampling failed, {n_collisions} collisions detected, defaulting to clhs sampling" - ) - return concatenated_latin_hypercube_sampling( - dimensions=dimensions, final_sample_size=final_sample_size, seed=seed - ) - - return discrete_points - - -# TODO: test this function -def distinct_sobol_sampling( - dimensions: list[int], final_sample_size: int, seed: int | None = None -) -> list[list[int]]: - """ - Generates 'n' distinct points on a grid of size 'dimensions' using a Sobol sequence. - Guarantees no collisions by skipping duplicates in the sequence. - """ - # 1. Safety Check: Is the grid big enough? - total_capacity = np.prod(dimensions) - if final_sample_size > total_capacity: - raise ValueError( - f"Cannot generate {final_sample_size} distinct points: Grid only has {total_capacity} cells." - ) - - # 2. Setup Sobol - # We scramble to get better coverage. - sampler = Sobol(d=len(dimensions), scramble=True, rng=seed) - - unique_points = set() - results = [] - - # 3. Iterative Generation - # We generate in batches to be efficient. - # Start with a batch larger than N to account for potential rejections. 
- batch_size = max(final_sample_size * 2, 64) - - while len(results) < final_sample_size: - # Draw a batch of float points [0, 1) - raw_points = sampler.random(batch_size) - - for p in raw_points: - # Discretize: Map [0, 1) -> Integer coordinates - # Using int(x * dim) scales it to the grid index [0, dim-1] - coord = tuple([int(p[i] * dimensions[i]) for i in range(len(dimensions))]) - - # Check Uniqueness - if coord in unique_points: - continue - - unique_points.add(coord) - results.append(list(coord)) - - # Stop immediately if we have enough - if len(results) == final_sample_size: - return results - - # If we need more points, increase batch size for next iteration - # (helpful if the grid is nearly full and collisions are frequent) - batch_size *= 2 - - return results - - -def random_high_dimensional_sampling( - dimensions: list[int], final_sample_size: int, seed: int | None = None -) -> list[list[int]]: - """ - Generate n unique random samples from a high-dimensional space. - - Args: - dimensions: Cardinality (size) of each dimension. Must be positive. - final_sample_size: Total number of points to sample. - seed: Optional PRNG seed for reproducibility. - - Returns: - List of final_sample_size sampled points, each point is a list of indices - - Raises: - ValueError: If final_sample_size exceeds the total number of possible configurations - """ - import itertools - import random - from math import prod - - # Set the seed for the random number generator - if seed is not None: - random.seed(seed) - - # Check if the number of requested samples is valid - num_configs = prod(dimensions) - if final_sample_size > num_configs: - raise ValueError( - f"Cannot generate {final_sample_size} unique samples. " - f"The sample space only contains {num_configs} possibilities." - ) - - # This still creates all combinations in memory, which is a limitation - # for extremely large dimensional spaces. 
- configs = list(itertools.product(*[range(d) for d in dimensions])) - - # Ensure we don't try to sample more than available - actual_sample_size = min(final_sample_size, len(configs)) - if actual_sample_size < final_sample_size: - import logging - - logger = logging.getLogger(__name__) - logger.warning( - f"Requested {final_sample_size} samples but only {len(configs)} unique " - f"configurations available. Sampling {actual_sample_size} instead." - ) - - # random.sample is highly optimized for this task. - # It's much faster than manually choosing and removing. - samples = random.sample(configs, actual_sample_size) - - return [list(s) for s in samples] - - -def get_sampling_indices_multi_dimensional( - dimensions: list[int], - n: int | Literal["all", "max"], - space: dict[str, int] | None = None, - strategy: Literal["random", "clhs", "sobol"] = "clhs", - seed: int | None = None, -) -> list[list[int]]: - """ - Generate sampling indices for a high-dimensional space using `get_index_list_van_der_corput` for each dimension. - - Args: - dimensions (List[int]): Sizes of each dimension (e.g., [8, 5]). - n (int | str): Number of points to sample: - - 'all': sample all possible combinations (product of dimensions) - - 'max': sample up to max(dimensions) - strategy (str): sampling subroutine: - - 'random': selects random points from the beginning - - 'clhs': refer to concatenated_latin_hypercube_sampling - - 'sobol': sobol sampling - - space (Optional[Dict[str, int]]): Optional mapping of dimension names to sizes (used only for logging/debug purposes). - Example: - space = {'batch_size': 8, 'model_name': 5} - seed (Optional[int]): controls the randomness - - note: strategies may have an upper bound on the number of elements that respect the strategy that they can return - if this number is exceeded, they resort to random sampling. - - Returns: - List[List[int]]: Outer list length = n (or product of dimensions if n='all'). 
- Each inner list contains one sampled combination across dimensions. - """ - - # Set the seed for the random number generator - if seed is not None: - random.seed(seed) - - # Log space details if provided - if space: - indices_dict = { - k: get_index_list_van_der_corput(v, v) for k, v in space.items() - } - if [len(indices) for indices in list(indices_dict.values())] != dimensions: - logger_high_dimensional.error( - f"A space dict has been provided ->{space}. It is inconsistent with dimensions={dimensions}" - ) - logger_high_dimensional.warning( - f"list(indices_dict.values()) = {list(indices_dict.values())}" - ) - raise ValueError("Space has inconsistent dimensions!") - logger_high_dimensional.info( - "Sampling indices for each named dimension (ordered low to high): %s", - indices_dict, - ) - - # Compute sampling orders for each dimension - orders = [get_index_list_van_der_corput(v, v) for v in dimensions] - - if logger_high_dimensional.isEnabledFor(logging.DEBUG): - logger_high_dimensional.debug("Dimensions: %s", dimensions) - logger_high_dimensional.debug("Sampling orders for each dimension:") - for i, o in enumerate(orders): - logger_high_dimensional.debug("Dimension %d order: %s", i, o) - - # Calculate maximum possible samples - maximum_n = 1 - for d in dimensions: - maximum_n *= d - lcm = math.lcm(*dimensions) - - if lcm != maximum_n: - logger_high_dimensional.debug( - "Periodicity detected, the sampling subroutine will ensure that you will not sampple" - "the same configuration more than once." - ) - - if isinstance(n, str): - if n == "all": - n = maximum_n - elif n == "max": - n = max(dimensions) - else: - raise ValueError(f"Unrecognized string for n: {n}") - - if n > maximum_n: - logger_high_dimensional.warning( - f"Maximal sample size is {maximum_n}, you requested {n} sampling presciptions." 
- f"Elaborating prescription for n_samples = {maximum_n}" - ) - - logger_high_dimensional.debug( - "Preparing to sample %d out of %d possible points.", n, maximum_n - ) - - match strategy: - case "random": - return random_high_dimensional_sampling(dimensions, n, seed=seed) - case "clhs": - return concatenated_latin_hypercube_sampling( - dimensions=dimensions, final_sample_size=n, seed=seed - ) - case "sobol": - return sobol_sampling(dimensions=dimensions, final_sample_size=n, seed=seed) - case _: - raise NotImplementedError(f"Strategy {strategy} is unknown") diff --git a/plugins/operators/no-priors-characterization/src/no_priors_characterization/utils/one_dimensional_sampling.py b/plugins/operators/no-priors-characterization/src/no_priors_characterization/utils/one_dimensional_sampling.py deleted file mode 100644 index e5b28e625..000000000 --- a/plugins/operators/no-priors-characterization/src/no_priors_characterization/utils/one_dimensional_sampling.py +++ /dev/null @@ -1,293 +0,0 @@ -# Copyright IBM Corporation 2025, 2026 -# SPDX-License-Identifier: MIT -# -import logging - -logger = logging.getLogger(__name__) - - -def get_index_list_van_der_corput( - length_segment: int, - tot_points_to_sample: int, - sampled_indices: list[int] | None = None, - sort: bool = False, - verbose: bool = False, -) -> list[int]: - """ - Selects a set of indices from a 1D segment using a deterministic sampling strategy. - It is a modified Van der Corput Sequence - - :param length_segment: Total number of units in the 1D segment. - :type length_segment: int - :param tot_points_to_sample: Total number of indices to sample. - :type tot_points_to_sample: int - :param sampled_indices: List of indices already sampled. Defaults to an empty list. - :type sampled_indices: list[int], optional - :param sort: If True, returns the final list sorted in ascending order. Defaults to False. - :type sort: bool, optional - :param verbose: If True, prints debug information during sampling. 
Defaults to False. - :type verbose: bool, optional - - :raises ValueError: If `tot_points_to_sample` exceeds `length_segment`. - - :return: A list of sampled indices satisfying the distribution strategy. - :rtype: list[int] - - ## Additional Observations and examples - This function assumes that the data has been projected into a 1D segment based on feature importance, - making it isomorphic to a 1d segment. The goal is to sample `tot_points_to_sample` indices from this segment, - optionally considering a set of already sampled indices (`sampled_indices`). The strategy ensures that the - selected points are well-distributed and structurally balanced, akin to placing support ropes on a beam to - prevent collapse. - - The metaphor used is that of a beam suspended by ropes. Initially, ropes are placed at the extremities (indices 0 and `length_segment - 1`) - to ensure boundary support. Additional ropes (sampled points) are added iteratively at the midpoint of the longest unsampled intervals. - In cases of symmetry or multiple equally sparse regions, the algorithm evaluates local neighborhood density to prioritize selection. - - - For example, consider a segment of 14 elements (get_index_list_van_der_corput(14,8)): - - :: - - Index: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 - Sample: 1 - 8 5 - 7 3 - - 4 - 6 - 2 - - Here, numbers in the bottom row represent the order in which each point is added, and `-` indicates unsampled positions. - The algorithm ensures that each new point is placed where it maximally improves the balance of the structure, - often targeting the midpoint of the largest gaps. - - :examples: - - >>> get_index_list_van_der_corput(5, 3, sampled_indices=[0, 4]) - [0, 2, 4] - - >>> get_index_list_van_der_corput(10, 4, sampled_indices=[0, 4, 9]) - [0, 4, 6, 9] - - This strategy is particularly useful in optimization settings where boundary coverage and balanced sampling are important. 
- """ - - if tot_points_to_sample == 0: - return [] - - if tot_points_to_sample > length_segment: - raise ValueError( - "ValueError: You are trying to sample more points than those that are available" - ) - - if sampled_indices is None: - sampled_indices = [] - - if len(sampled_indices) == length_segment: - maximal_indices_list = list(range(length_segment)) - if sampled_indices.sort() != maximal_indices_list: - logging.error( - "Sampled indices do not correspond to [0,..., max_n_indices -1]" - "Returning list(range(max_n_indices)" - ) - return maximal_indices_list - - if len(sampled_indices) > tot_points_to_sample: - logging.warning( - "Number of sampled indices is greater than the number of indices you want to sample" - "Returning sampled indices" - ) - return sampled_indices - - index_list = list(sampled_indices) - sampled_set = set(index_list) - - for point in [0, length_segment - 1]: - if point not in sampled_set: - index_list.append(point) - sampled_set.add(point) - if len(index_list) == tot_points_to_sample: - return sorted(index_list) - - def build_prefix_and_len(index_list: list[int]) -> tuple[list[int], int]: - """ - Builds prefix sums over a truncated mask: M = max(index_list)+1. - prefix[j] = sum(mask[0:j]) with prefix length M+1. - """ - if not index_list: - return [0], 0 - - M = max(index_list) + 1 - - # You must define sampled_set based on the input list - sampled_set = set(index_list) - - prefix = [0] * (M + 1) - s = 0 - - for i in range(M): - # i represents the current index in the imaginary mask array - s += 1 if i in sampled_set else 0 - prefix[i + 1] = s - - return prefix, M - - def get_list_min_weight( - prefix: list[int], M: int, d: int, selectable_indices: list[int] - ) -> list[int]: - """ - uses prefix sums instead of numpy.mean. - Only considers indices i in selectable_indices intersected with [0, M-1], - and preserves ascending order for ties exactly like the OG. 
- """ - # cmpute mean densities and track min - # We must preserve order: OG loops i = 0..M-1 and filters by membership. - # Achieve the same by iterating selectable_indices (which we build in ascending order) - # but breaking when i >= M. - vals = {} - for i in selectable_indices: - if i >= M: - break - left = i - d - right = i + d - if left < 0: - left = 0 - if right >= M: - right = M - 1 - total = prefix[right + 1] - prefix[left] - denom = right - left + 1 - mean = total / denom # float64-equivalent - matches numpy.mean on booleans - vals[i] = mean - - if not vals: - return [] - - min_val = min(vals.values()) - # preserving order of candidates as OG: ascending index order - out = [] - for i in selectable_indices: - if i >= M: - break - if vals.get(i) == min_val: - out.append(i) - return out - - def get_selectable_indices() -> list[int]: - # OG did O(N*m) with "i not in list", but we do O(N) with a set, but order identical. - return [i for i in range(length_segment) if i not in sampled_set] - - max_d = length_segment - - # main loop - while len(index_list) < tot_points_to_sample: - selection = 0 - selectable_indices = get_selectable_indices() - - # prefix sums for the current (truncated) mask once per outer iteration - prefix, M = build_prefix_and_len(index_list=index_list) - - d = 1 - # keeping "previous set" semantics exactly (used when l becomes empty) - previous_set = selectable_indices - - while selection == 0: - indices = get_list_min_weight(prefix, M, d, selectable_indices) - - if not indices: - # Exact OG behavior: pick first element of the previous set - # when the intersection is empty at this d. - if not previous_set: - raise ValueError( - "Previous candidate set should not be empty or None" - ) - if verbose: - logger.info( - f"No intersection found with d={d}. 
Using the previous set " - f"Appending to {index_list} the first element of {previous_set}" - ) - chosen = previous_set[0] - index_list.append(chosen) - sampled_set.add(chosen) - selection = 1 - - else: - # narrowing minimal-density set - previous_set = selectable_indices - selectable_indices = indices - - if len(selectable_indices) == 1 or d == max_d: - # pick the first element (ascending order preserved) - if verbose: - logger.info( - f"Appending to {index_list} the first element of {selectable_indices}" - ) - chosen = selectable_indices[0] - index_list.append(chosen) - sampled_set.add(chosen) - selection = 1 - - # OG increments d regardless it's immaterial after selection, but we mirror it - d += 1 - - if sort: - return sorted(index_list) - return index_list - - -def get_index_list_ordered_partitions(n: int, tot_points: int) -> list[int]: - """ - Select indices from a 1D segment using a partition-based sampling strategy. - - The data is treated as isomorphic to a 1D segment ordered by feature importance. - Points are selected by iteratively finding midpoints of the largest gaps. 
- - Args: - n: Total length of the segment (len(df)), valid indices are 0 to n-1 - tot_points: Number of points to sample - - Returns: - Sorted list of sampled indices - - Raises: - ValueError: If tot_points exceeds n - """ - if tot_points == 0: - logger.debug("No points selected from the list, return empty list") - return [] - if tot_points > n: - raise ValueError - if tot_points == 1: - return [0] - index_list = [n - 1, 0] - number_of_inner_points_sampled = 0 - while number_of_inner_points_sampled + 2 < tot_points: - l_copy_sorted = index_list.copy() - l_copy_sorted.sort() - l_copy = index_list.copy() - for _i, el in enumerate(l_copy[1:]): - start = el - index_seen = l_copy_sorted.index(el) - end = l_copy_sorted[index_seen + 1] - mid = midpoint(start=start, end=end) - if mid in index_list: - continue - number_of_inner_points_sampled += 1 - index_list.append(mid) - if number_of_inner_points_sampled + 2 == tot_points: - break - index_list.sort() - return index_list - - -def midpoint(start: int, end: int) -> int: - """ - Calculate the midpoint between two indices. 
- - Args: - start: Starting index - end: Ending index - - Returns: - Integer midpoint index - - Raises: - ValueError: If start is greater than end - """ - if end - start < 0: - raise ValueError("Start is greater than end!") - return start + ((end - start) // 2) diff --git a/plugins/operators/no-priors-characterization/src/no_priors_characterization/utils/order.py b/plugins/operators/no-priors-characterization/src/no_priors_characterization/utils/order.py deleted file mode 100644 index 5ff4e320e..000000000 --- a/plugins/operators/no-priors-characterization/src/no_priors_characterization/utils/order.py +++ /dev/null @@ -1,247 +0,0 @@ -# Copyright IBM Corporation 2025, 2026 -# SPDX-License-Identifier: MIT - -import itertools -import logging -import math -from typing import Literal - -import numpy as np -import pandas as pd - -from no_priors_characterization.utils.high_dimensional_sampling import ( - get_sampling_indices_multi_dimensional, -) - -logger = logging.getLogger(__name__) - - -def order_df_for_sampling_with_no_priors( - df: pd.DataFrame, - constitutive_properties: list[str], - n: int, - strategy: Literal["random", "clhs", "sobol"], -) -> pd.DataFrame: - """ - Orders a DataFrame for high-dimensional sampling without prior knowledge. - - Deduplicates rows based on constitutive properties, orders them for sampling, - and returns a subset of n samples using the specified strategy. - - Args: - df: Input dataset containing at least the columns specified in - constitutive_properties. May contain duplicate configurations. - constitutive_properties: Column names defining the configuration space. - Uniqueness is enforced over the Cartesian product of these properties. - n: Number of samples to generate. Adjusted if larger than available - unique configurations. - strategy: Sampling strategy - "random", "clhs", or "sobol". - - Returns: - DataFrame with n sampled rows, preserving the original column schema. - Index is positional (0..n-1). 
- - Raises: - ValueError: If n <= 0 after adjustment or no samples are available. - """ - - # Filtering - len_original = len(df) - df_unique = df.drop_duplicates(subset=constitutive_properties).reset_index( - drop=True - ) - delta_len = len_original - len(df_unique) - if delta_len > 0: - logging.warning( - f"Removing {delta_len} duplicate configurations." - f"They are characterized by the same combination of constitutive properties = {constitutive_properties}" - ) - - if n > len(df_unique): - logging.warning( - f"Requested {n} samples, but DataFrame has only {len(df_unique)} rows. Adjusting n to {len(df_unique)}." - ) - n = len(df_unique) - - if n <= 0: - logging.error( - f"No samples available to select. DataFrame has {len(df_unique)} rows and {n} samples were requested." - ) - # Return empty DataFrame with same columns as input - return pd.DataFrame(columns=df_unique.columns) - - # Build dictionaries - def _get_sorted_uniques(prop: str) -> list: - """Helper to safely sort unique values for a property.""" - vals = df_unique[prop].unique() - try: - return sorted(vals) - except TypeError: - logging.warning( - f"Cannot sort mixed types for property '{prop}'. " - "Keeping original order." 
- ) - return list(vals) - - value_dict = {prop: _get_sorted_uniques(prop) for prop in constitutive_properties} - - space_dict = {prop: len(vals) for prop, vals in value_dict.items()} - - dimensions = list(space_dict.values()) - - # Order DataFrame for index mapping - df_unique = order_df_for_get_index_list_nn_high_dimensional( - df_unique, constitutive_properties, dimensions=dimensions - ).reset_index(drop=True) - - # Generate sampling orders - orders_to_sample = get_sampling_indices_multi_dimensional( - dimensions=dimensions, space=space_dict, n=n, strategy=strategy - ) - - # Map orders to DataFrame indices - indices_to_sample = get_index_list_nn_high_dimensional(orders_to_sample, dimensions) - - logger.info(f"Indexes are:\n {indices_to_sample}") - try: - return df_unique.iloc[indices_to_sample] - except IndexError: - logging.error( - f"Index Error detected. Length of the dataframe is {len(df_unique)}." - "The indices that cause the error are:" - ) - max_len = len(df_unique) - out_of_bounds_list = [i for i in indices_to_sample if i < 0 or i >= max_len] - - logging.error(out_of_bounds_list) - logging.error("Returning empty dataset") - return pd.DataFrame({}) - - -def order_df_for_get_index_list_nn_high_dimensional( - df: pd.DataFrame, constitutive_properties: list[str], dimensions: list[int] -) -> pd.DataFrame: - """ - Ensure DataFrame is ordered and complete for high-dimensional index generation. - - Prepares the DataFrame so rows align with the Cartesian product implied by - constitutive_properties and dimensions. Sorts rows, validates completeness, - and injects missing combinations if needed. - - Args: - df: Input DataFrame containing at least the columns in constitutive_properties. - constitutive_properties: Column names defining the high-dimensional space. - Order determines sort priority. - dimensions: Expected cardinality for each constitutive property. - Used to compute expected_len = product(dimensions). 
- - Returns: - DataFrame sorted by constitutive_properties and augmented with any missing - combinations. Injected rows have NaN for non-constitutive columns. - - Notes: - If dimensions and actual unique values disagree, uses observed unique - values to generate combinations. - """ - # Sort by constitutive properties - df = df.sort_values(by=constitutive_properties).reset_index(drop=True) - - expected_len = math.prod(dimensions) - - # Return early if already complete - if len(df) == expected_len: - return df - - # Generate all possible combinations based on actual unique values - unique_values = [ - sorted(df[prop].dropna().unique()) for prop in constitutive_properties - ] - all_combinations = list(itertools.product(*unique_values)) - actual_expected_len = len(all_combinations) - - logger.warning( - f"DataFrame length mismatch: expected {expected_len} (product of {dimensions}), " - f"but got {len(df)}. Actual unique combinations: {actual_expected_len}." - ) - - # Identify existing combinations - existing_combinations = { - tuple(row[prop] for prop in constitutive_properties) for _, row in df.iterrows() - } - - # Find missing combinations - missing_combinations = [ - comb for comb in all_combinations if comb not in existing_combinations - ] - - if missing_combinations: - logger.info( - f"Injecting {len(missing_combinations)} missing rows to satisfy the property." 
- ) - injected_rows = [] - for comb in missing_combinations: - row_data = dict(zip(constitutive_properties, comb, strict=False)) - # Fill other columns with NaN - for col in df.columns: - if col not in constitutive_properties: - row_data[col] = pd.NA - injected_rows.append(row_data) - - # Append missing rows - df = pd.concat([df, pd.DataFrame(injected_rows)], ignore_index=True) - - # Sort again after injection - df = df.sort_values(by=constitutive_properties).reset_index(drop=True) - - logger.info(f"Injected rows: {injected_rows}") - - return df - - -def get_index_list_nn_high_dimensional( - orders_to_sample: list[list[int]], dimensions: list[int] -) -> list[int]: - """ - Map high-dimensional sampling orders to linear (flattened) indices. - - Converts multi-dimensional coordinates to linear indices using row-major ordering, - where the last dimension varies fastest. - - Args: - orders_to_sample: List of multi-dimensional coordinates [i0, i1, ..., ik] - dimensions: Size of each dimension [d0, d1, ..., dk] - - Returns: - List of linear indices corresponding to the input coordinates - - Warns: - If duplicate or out-of-bounds indices are detected - """ - indices = [] - cprod = np.cumprod(np.array(dimensions), dtype=int).tolist() - maximum_n = cprod[-1] - - for order in orders_to_sample: - index = 0 - multiplier = 1 - # Iterate reversed so last dimension varies fastest - for i in reversed(range(len(dimensions))): - index += order[i] * multiplier - multiplier *= dimensions[i] - - if index > maximum_n: - logging.warning( - f"Out of bound index {index} computed from order {order}, dimensions are {dimensions}" - ) - indices.append(index) - - if len(set(indices)) != len(indices): - logger.error(f"{len(indices) - len(set(indices))} Duplicated indices!") - - out_of_bounds_list = [i for i in indices if i > maximum_n] - if out_of_bounds_list: - logger.error( - f"The following indices are out of bound: {out_of_bounds_list}, maximum admissible value is {maximum_n-1}" - ) - - 
return indices diff --git a/plugins/operators/no-priors-characterization/src/no_priors_characterization/utils/space_df_connector.py b/plugins/operators/no-priors-characterization/src/no_priors_characterization/utils/space_df_connector.py deleted file mode 100644 index 9c29c2fa3..000000000 --- a/plugins/operators/no-priors-characterization/src/no_priors_characterization/utils/space_df_connector.py +++ /dev/null @@ -1,524 +0,0 @@ -# Copyright IBM Corporation 2025, 2026 -# SPDX-License-Identifier: MIT - -from __future__ import annotations - -import logging -from typing import TYPE_CHECKING, Any - -import pandas as pd - -from orchestrator.core.discoveryspace.space import DiscoverySpace -from orchestrator.schema.virtual_property import PropertyAggregationMethodEnum - -if TYPE_CHECKING: - from collections.abc import Hashable - - from orchestrator.metastore.project import ProjectContext - from orchestrator.schema.entity import Entity - -logger = logging.getLogger(__name__) - - -def get_project_context() -> ProjectContext: - """ - Retrieve the current ADO project context from configuration. - - Returns: - ProjectContext object for the active project - """ - import orchestrator.cli.core.config - - ado_configuration = orchestrator.cli.core.config.AdoConfiguration.load() - return ado_configuration.project_context # type: ignore[name-defined] - - -def get_space( - space_or_space_id: DiscoverySpace | str, -) -> DiscoverySpace: - """ - Get a DiscoverySpace object from either a space object or identifier string. 
- - Args: - space_or_space_id: Either a DiscoverySpace object or its string identifier - - Returns: - DiscoverySpace object - """ - - if isinstance(space_or_space_id, DiscoverySpace): - return space_or_space_id - - return DiscoverySpace.from_stored_configuration( - project_context=get_project_context(), - space_identifier=space_or_space_id, - ) - - -# %% - - -def get_df_all_entities_no_measurements( - discoverySpace: DiscoverySpace | str, -) -> pd.DataFrame: - """ - Return a DataFrame of all entities in the given Discovery Space, regardless of whether - they have any mea sured target outputs. - - - Each row represents an entity from the entity space. - - Includes the entity identifier and all constitutive property values. - - Does NOT include any measured target outputs (only features). - - Useful for generating the full feature set for prediction or backfilling missing measurements. - - Parameters - ---------- - discoverySpace : DiscoverySpace | str - The Discovery Space object or its identifier. - targetOutput_list : list, optional - List of target output names (ignored in this function, included for API consistency). - - Returns - ------- - pd.DataFrame - DataFrame with columns: ['identifier', ]. 
- """ - - space = get_space(space_or_space_id=discoverySpace) - - entity_space = space.entitySpace - cp_ids = [cp.identifier for cp in entity_space.constitutiveProperties] - - list_of_dicts_to_convert = [] - for point_values in entity_space.sequential_point_iterator(): - point_dict = dict(zip(cp_ids, point_values, strict=True)) - entity = entity_space.entity_for_point(point_dict) - ed = {"identifier": entity.identifier} - ed.update(point_dict) - list_of_dicts_to_convert.append(ed) - - return pd.DataFrame(list_of_dicts_to_convert) - - -def get_df_at_least_one_measured_value( - discoverySpace: DiscoverySpace | str, - targetOutput_list: list[str] | None = None, - add_measurement_id: bool = False, -) -> pd.DataFrame: - """ - Return a DataFrame of entities that have at least one measured target output from the - provided list, aggregated across all experiments in the Discovery Space. - - - Each row represents an entity with measurements. - - Includes identifier (optional), constitutive properties, and the requested target outputs. - - Drops rows with missing values for the selected targets. - - May Return an empty DataFrame - - Parameters - ---------- - discoverySpace : DiscoverySpace | str - The Discovery Space object or its identifier. - targetOutput_list : list - List of target output names to include in the DataFrame. - add_measurement_id : bool - If True, include the entity identifier column in the output. - - Returns - ------- - pd.DataFrame - DataFrame with columns: ['identifier' (optional), , ]. 
- """ - - if not targetOutput_list: - targetOutput_list = [] - space = get_space(space_or_space_id=discoverySpace) - col_list = [cp.identifier for cp in space.entitySpace.constitutiveProperties] - if add_measurement_id: - col_list = ["identifier", *col_list] - - discoverySpace.sample_store.refresh() - - df = pd.DataFrame( - space.matchingEntitiesTable( - property_type="target", - aggregationMethod=PropertyAggregationMethodEnum.mean, - ) - ) - - if df.empty: - # NOTE: this condition is hit when there are no measurements at all existing in the space - logger.warning( - "No measured properties found in the discovery space\nReturning empty DataFrame\n " - ) - return df - - all_df_cols = list(df.columns) - valid_targetOutput_list = [] - for el in targetOutput_list: - if el in all_df_cols: - valid_targetOutput_list.append(el) - elif f"{el}-mean" in all_df_cols and el not in all_df_cols: - logger.warning( - f"Column named '{el}-mean' (instead of '{el}', which is not present)" - "found in the DataFrame obtained through matchingEntitiesTable. " - f"Renaming it to '{el}'." - ) - # Rename the column in the DataFrame - df.rename(columns={f"{el}-mean": el}, inplace=True) - valid_targetOutput_list += [el] - elif f"{el}-mean" in all_df_cols and el in all_df_cols: - logger.warning( - f"Columns named '{el}-mean' and '{el}'" - "found in the DataFrame obtained through matchingEntitiesTable. " - f"Renaming it to '{el}'." - ) - logger.error("Unexpected behavior can happen!") - # Rename the column in the DataFrame - df.rename(columns={f"{el}-mean": el}, inplace=True) - valid_targetOutput_list += [el] - col_list += valid_targetOutput_list - - # Something unexpected happened: log here about it - if valid_targetOutput_list != targetOutput_list: - if len(valid_targetOutput_list) == 0: - logger.error( - "No valid target in the columns of the DataFrame." - f"columns are:\t{list(df.columns)}." 
- f"First rows are:\n{df.head(5)}" - ) - else: - not_found = [ - t for t in targetOutput_list if t not in valid_targetOutput_list - ] - logger.error( - f"Found measurements for the following valid targets:\t{valid_targetOutput_list}" - ) - logger.error( - f"No measurement found for the following valid targets:\t{not_found}" - ) - - removed_cols = [c for c in list(df.columns) if c not in col_list] - logger.debug( - "Obtaining df with at least one measured target." - f"Removed columns: {removed_cols}" - ) - - df = df[col_list] - - # I can still have Nans here for cols in targetOutput_list, - # because I am taking points for which I have at least one of the measured properties of the experiment - df.dropna(inplace=True) - - # The resulting DataFrame can be empty - if df.empty: - logger.warning( - "Although there were some measured properties in the discovery space." - ) - logger.warning( - "All measured properties in the discovery space" - f"are different from the desired outputs {targetOutput_list}.Returning empty DataFrame\n " - ) - - return df - - -def get_source_and_target( - discoverySpace: DiscoverySpace | str, - targetOutput: str, - log_string: str = "", -) -> tuple[pd.DataFrame, pd.DataFrame]: - """ - Build source (labeled) and target (unlabeled) DataFrames for a given target output `t`. - Note, source can be empty - - - Retrieves measured entities for `t` and all entities without measurements. - - Merges on common feature columns (excluding 'identifier'). - - Splits into: - source_df: rows with non-null `t` (features + target). - target_df: rows with null `t` (features only). - - Parameters - ---------- - discoverySpace : str - Discovery Space identifier (e.g., 'space-1a2469-6a3ed5'). - t : str - Target output column name. 
- - Returns - ------- - tuple - (source_df, target_df) - """ - - dfm = get_df_at_least_one_measured_value(discoverySpace, [targetOutput]) - dfu = get_df_all_entities_no_measurements(discoverySpace) - keys = [c for c in dfu.columns if c in dfm.columns and c != "identifier"] - - if dfm.empty: - logger.warning("The source space is empty") - return dfm, dfu - - df = dfu.merge(dfm, on=keys, how="left") - - # If nothing is measured you do not have the columns, so I add the column as empty to run the - # following logic safely - if targetOutput not in list(df.columns): - logger.info( - f"""The target output was not present in the columns of the measured+unmeasured DataFrame,' \ - meaning that '{targetOutput}' has never been measured in this space. - dfm.empty = {df.empty}. Adding an empty column to the DataFrame. - """ - ) - logger.debug("Adding an empty column to the DataFrame.") - df[targetOutput] = pd.NA - - if targetOutput in list(df.columns): - df_measured_drop_na = df.dropna(subset=[targetOutput]) - df_unmeasured_drop_na = df[df[targetOutput].isna()].drop(columns=[targetOutput]) - n_rows_dropped = len(df) - len(df_measured_drop_na) - logger.debug( - f"Dropped {n_rows_dropped} rows. Function called with log_string={log_string}" - ) - if df_measured_drop_na.empty: - logger.warning( - f"Empty source after dropping rows that contain Nan in {targetOutput} column" - ) - if df_unmeasured_drop_na.empty: - logger.warning( - f"Empty target after filtering rows that contain Nan in {targetOutput} column" - ) - return df_measured_drop_na, df_unmeasured_drop_na - save_path = "df_with_no_targetOutput_columns.csv" - logger.error( - f"'{targetOutput}' column is missing, saving df in {save_path}, returning unmerged DataFrames" - ) - df.to_csv(save_path) - return dfm, dfu - - -def validate_points_in_space( - points: list[dict], - space: DiscoverySpace, -) -> tuple[list[dict], list[int]]: - """ - Validate a list of point dictionaries against a Discovery Space entity space. 
- - A point is considered valid if `space.entitySpace.isPointInSpace(point)` returns True. - This function returns both the subset of valid points (in original order) and - the indices of invalid points for diagnostics. - - Parameters - ---------- - points : list[dict] - List of point dicts `{constitutive_property_id: value}` to validate. - space : DiscoverySpace - The Discovery Space whose entity space defines the validity constraints. - - Returns - ------- - (valid_points, invalid_indices) : tuple[list[dict], list[int]] - valid_points : - The points that are valid under `space.entitySpace.isPointInSpace`. - invalid_indices : - The zero-based indices (relative to the input `points`) that were invalid. - - Examples - -------- - >>> points = make_points_from_df(df, space) - >>> valid_points, invalid_idx = validate_points_in_space(points, space) - >>> if invalid_idx: - ... print(f"Warning: {len(invalid_idx)} invalid rows at indices {invalid_idx}") - """ - valid_points: list[dict] = [] - invalid_indices: list[int] = [] - - for i, p in enumerate(points): - if space.entitySpace.isPointInSpace(p): - valid_points.append(p) - else: - invalid_indices.append(i) - return valid_points, invalid_indices - - -def df_to_points( - df: pd.DataFrame, - cols: list[str] | None = None, - dropna: bool = True, - drop_duplicates: bool = False, -) -> list[dict[Hashable, Any]]: - """ - Convert DataFrame rows to list of point dictionaries. - - Args: - df: Input DataFrame - cols: Columns to include. 
If None, uses all columns - dropna: If True, drop rows containing any NaN values - drop_duplicates: If True, drop duplicate rows - - Returns: - List of dictionaries, each representing a point {property_id: value} - - Raises: - KeyError: If requested columns are not present in DataFrame - """ - - if cols is None: - cols = list(df.columns) - missing = set(cols) - set(df.columns) - if missing: - raise KeyError(f"Requested columns not present in DataFrame: {missing}") - - sub = df[cols].copy() - if dropna: - sub = sub.dropna(how="any") - if drop_duplicates: - sub = sub.drop_duplicates() - - # Convert numpy scalars to python builtins for safety - def to_py(x: object) -> object: - import numpy as np - - if isinstance(x, (np.generic)): - return x.item() - return x - - # apply conversion (only if needed) - for c in sub.columns: - sub[c] = sub[c].map(to_py) - - return sub.to_dict(orient="records") - - -# TODO: check if these are actually needed -def df_to_points_parsing( - df: pd.DataFrame, - cols: list[str] | None = None, - dropna: bool = True, - parse_values: bool = False, -) -> list[dict]: - """ - Convert DataFrame to points with optional string value parsing. 
- - Args: - df: Input DataFrame - cols: Columns to include - dropna: If True, drop rows with NaN values - parse_values: If True, parse string values using ast.literal_eval - - Returns: - List of point dictionaries with parsed values - """ - import ast - - points = df_to_points(df, cols=cols, dropna=dropna) - if not parse_values: - return points - - parsed = [] - for p in points: - newp = {} - for k, v in p.items(): - if isinstance(v, str): - try: - newp[k] = ast.literal_eval(v) - except Exception: - newp[k] = v - else: - newp[k] = v - parsed.append(newp) - return parsed - - -def make_points_from_df( - df: pd.DataFrame, - space: DiscoverySpace, - cols: list[str] | None = None, - dropna: bool = True, - parse_values: bool = True, -) -> list[dict]: - """ - Convert a DataFrame of constitutive properties into a list of point dictionaries, - using the entity-space canonical column order by default. - - Each point is a mapping {constitutive_property_id: value}. By default, rows with - any NaN across the selected columns are dropped, and string values are parsed - into Python literals where possible (e.g., "[1, 2]" -> [1, 2]) via `ast.literal_eval`. - - Parameters - ---------- - df : pd.DataFrame - Input DataFrame whose columns correspond to constitutive property identifiers. - space : DiscoverySpace - The Discovery Space providing the canonical order of constitutive properties. - cols : list[str], optional - Explicit list of columns to use. If None, uses the canonical order: - `[cp.identifier for cp in space.entitySpace.constitutiveProperties]`. - dropna : bool, default True - If True, drop rows containing any NaN in the selected columns. - parse_values : bool, default True - If True, attempt to parse string values into Python objects using `ast.literal_eval`. - - Returns - ------- - list[dict] - A list of point dicts, one per retained row: `[{prop_id: value, ...}, ...]`. - - Raises - ------ - KeyError - If any of the requested `cols` are not present in `df`. 
- - Examples - -------- - >>> space_cols = [cp.identifier for cp in space.entitySpace.constitutiveProperties] - >>> points = make_points_from_df(df, space, cols=space_cols, dropna=True, parse_values=True) - """ - # Determine canonical order if cols not provided - if cols is None: - cols = [cp.identifier for cp in space.entitySpace.constitutiveProperties] - - # Validate requested columns exist - missing = set(cols) - set(df.columns) - if missing: - raise KeyError(f"Requested columns not present in DataFrame: {missing}") - - # Convert rows -> point dicts, with optional parsing - return df_to_points_parsing(df, cols=cols, dropna=dropna, parse_values=parse_values) - - -def get_list_of_entities_from_df_and_space( - df: pd.DataFrame, space: DiscoverySpace -) -> list[Entity]: - """ - Convert DataFrame rows to Entity objects validated against a discovery space. - - Args: - df: DataFrame containing constitutive property values - space: DiscoverySpace defining the entity space constraints - - Returns: - List of valid Entity objects - - Warns: - If number of valid entities differs from DataFrame row count - """ - points = make_points_from_df(df=df, space=space) - valid_points, __ = validate_points_in_space(points, space) - - list_of_entities = [] - from orchestrator.schema.point import SpacePoint - - for p in valid_points: - # p is a dict mapping constitutive property id -> value - sp = SpacePoint(entity=p) - entity = sp.to_entity( - generatorid="no_priors_characterization" - ) # builds an Entity from the dict without touching the sample store - list_of_entities.append(entity) - - numberEntities = len(list_of_entities) - if numberEntities != len(df): - numberEntities_log = f"""Warning: number of valid entities {numberEntities} is different from the number of rows in the ordered df {len(df)}. - This means that some rows in the ordered df did not correspond to valid entities in the discovery space. 
- """ - logging.warning(numberEntities_log) - return list_of_entities diff --git a/plugins/operators/no-priors-characterization/visualize_sampling.py b/plugins/operators/no-priors-characterization/visualize_sampling.py deleted file mode 100644 index 275f7e7a0..000000000 --- a/plugins/operators/no-priors-characterization/visualize_sampling.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright IBM Corporation 2025, 2026 -# SPDX-License-Identifier: MIT - -""" -Visualization script for comparing sampling strategies. - -This script demonstrates the distribution patterns of different sampling -strategies (random, CLHS, Sobol) in a 2D grid space. -""" - -import sys - -try: - import matplotlib.pyplot as plt - import numpy as np - from matplotlib.axes import Axes -except ModuleNotFoundError: - print("matplotlib not found. Please install it to run the visualization.") - print("pip install matplotlib") - sys.exit(1) - -from no_priors_characterization.utils.high_dimensional_sampling import ( - concatenated_latin_hypercube_sampling, - random_high_dimensional_sampling, - sobol_sampling, -) - - -def plot_grid( - ax: Axes, - dimensions: list[int] | tuple[int, int], - points: np.ndarray | list[list[int]], - title: str, -) -> None: - """ - Plot a 2D grid visualization of sampled points with overlap detection. - - Args: - ax: Matplotlib axes object to draw on. - dimensions: Dimensions of the grid [width, height]. - points: List of sampled points as [x, y] coordinates. - title: Title for the plot. 
- """ - from collections import defaultdict - - import matplotlib.patches as patches - - nx, ny = dimensions[0], dimensions[1] - - # Setup grid - ax.set_xlim(0, nx) - ax.set_ylim(0, ny) - ax.set_xticks(range(nx + 1)) - ax.set_yticks(range(ny + 1)) - ax.grid(True, color="black", linewidth=1) - ax.set_aspect("equal") - ax.set_title(title, fontsize=12, pad=10) - - # Track points in each cell to handle overlaps - # Maps (x, y) -> list of time indices (1-based) - grid_content = defaultdict(list) - - # points is a list of [x, y], enumerate gives us the time index (0-based) - for time, point in enumerate(points): - x, y = int(point[0]), int(point[1]) # Ensure integers - if 0 <= x < nx and 0 <= y < ny: - # Store t + 1 so the first sample is '1' - grid_content[(x, y)].append(time + 1) - - # Draw squares and text - for (x, y), indices in grid_content.items(): - count = len(indices) - # Darker alpha if multiple points hit the same square - alpha = min(0.4 + 0.2 * count, 1.0) - rect = patches.Rectangle( - (x, y), 1, 1, linewidth=0, facecolor="#ff0000", alpha=alpha - ) - ax.add_patch(rect) - - # Label is the comma-separated list of indices - label = ",".join(map(str, indices)) - - # Add text with shadow effect - ax.text( - x + 0.52, - y + 0.52, - label, - ha="center", - va="center", - color="#D4FF00", - fontweight="bold", - ) - ax.text( - x + 0.5, - y + 0.5, - label, - ha="center", - va="center", - color="#000000", - fontweight="bold", - ) - - -def main() -> None: - """Run the sampling visualization comparison.""" - # Configuration - dimensions = [20, 6] # 20 columns, 6 rows (Total 120 cells) - N = 30 # Number of samples to draw - SEED = 42 - - # Plotting - _fig, axes = plt.subplots(1, 3, figsize=(15, 5)) - - # 1. Random Sampling - pts_rnd = random_high_dimensional_sampling(dimensions, N, seed=SEED) - plot_grid(axes[0], dimensions, pts_rnd, f"Random Sampling (N={N})\n(Clumps & Gaps)") - - # 2. 
Concatenated LHS - pts_lhs = concatenated_latin_hypercube_sampling(dimensions, N, seed=SEED) - plot_grid( - axes[1], dimensions, pts_lhs, f"Concatenated LHS (N={N})\n(Uniform Rows/Cols)" - ) - - # 3. Sobol Sequence - pts_sobol = sobol_sampling(dimensions, N, seed=SEED) - plot_grid( - axes[2], dimensions, pts_sobol, f"Sobol Sequence (N={N})\n(Maximal Spreading)" - ) - - plt.tight_layout() - plt.show() - - -if __name__ == "__main__": - main() diff --git a/plugins/operators/trim/pyproject.toml b/plugins/operators/trim/pyproject.toml index 8806d6751..e75dc2357 100644 --- a/plugins/operators/trim/pyproject.toml +++ b/plugins/operators/trim/pyproject.toml @@ -5,7 +5,6 @@ readme = "README.md" requires-python = ">=3.10,<3.14" dependencies = [ "ado-core", - "ado-no-priors-characterization", "autogluon-tabular[catboost,xgboost]==1.5", "numpy", "pandas>=2.2.0", @@ -29,4 +28,3 @@ root = "../../../" [tool.uv.sources] ado-core = { workspace = true } -ado-no-priors-characterization = { workspace = true } diff --git a/pyproject.toml b/pyproject.toml index 127fa4e0c..24a564a0c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -88,7 +88,6 @@ resolution-helpers = [ # cuda dependencies. 
test = [ "ado-autoconf", - "ado-no-priors-characterization", "ado-ray-tune", "ado-sfttrainer; python_version < '3.13'", "ado-trim", @@ -137,12 +136,10 @@ members = [ "plugins/operators/profile_space", "plugins/actuators/example_actuator", "plugins/operators/trim", - "plugins/operators/no-priors-characterization" ] [tool.uv.sources] ado-autoconf = { workspace = true, editable = true } -ado-no-priors-characterization = { workspace = true, editable = true } ado-ray-tune = { workspace = true, editable = true } ado-sfttrainer = { path = "plugins/actuators/sfttrainer", editable = true } ado-trim = { workspace = true, editable = true } diff --git a/requirements.txt b/requirements.txt index c27629cd8..b12fa0cea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -422,6 +422,7 @@ googleapis-common-protos==1.74.0 \ # via google-api-core greenlet==3.4.0 ; platform_machine == 'AMD64' or platform_machine == 'WIN32' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'ppc64le' or platform_machine == 'win32' or platform_machine == 'x86_64' \ --hash=sha256:04403ac74fe295a361f650818de93be11b5038a78f49ccfb64d3b1be8fbf1267 \ + --hash=sha256:0e1254cf0cbaa17b04320c3a78575f29f3c161ef38f59c977108f19ffddaf077 \ --hash=sha256:1054c5a3c78e2ab599d452f23f7adafef55062a783a8e241d24f3b633ba6ff82 \ --hash=sha256:16dec271460a9a2b154e3b1c2fa1050ce6280878430320e85e08c166772e3f97 \ --hash=sha256:1a54a921561dd9518d31d2d3db4d7f80e589083063ab4d3e2e950756ef809e1a \ @@ -435,20 +436,27 @@ greenlet==3.4.0 ; platform_machine == 'AMD64' or platform_machine == 'WIN32' or --hash=sha256:5b99e87be7eba788dd5b75ba1cde5639edffdec5f91fe0d734a249535ec3408c \ --hash=sha256:5cb614ace7c27571270354e9c9f696554d073f8aa9319079dcba466bbdead711 \ --hash=sha256:636d2f95c309e35f650e421c23297d5011716be15d966e6328b367c9fc513a82 \ + --hash=sha256:6f0def07ec9a71d72315cf26c061aceee53b306c36ed38c35caba952ea1b319d \ --hash=sha256:805bebb4945094acbab757d34d6e1098be6de8966009ab9ca54f06ff492def58 \ 
--hash=sha256:8424683caf46eb0eb6f626cb95e008e8cc30d0cb675bdfa48200925c79b38a08 \ --hash=sha256:849f8bc17acd6295fcb5de8e46d55cc0e52381c56eaf50a2afd258e97bc65940 \ + --hash=sha256:89995ce5ddcd2896d89615116dd39b9703bfa0c07b583b85b89bf1b5d6eddf81 \ + --hash=sha256:8c5696c42e6bb5cfb7c6ff4453789081c66b9b91f061e5e9367fa15792644e76 \ --hash=sha256:90036ce224ed6fe75508c1907a77e4540176dcf0744473627785dd519c6f9996 \ --hash=sha256:9390ad88b652b1903814eaabd629ca184db15e0eeb6fe8a390bbf8b9106ae15a \ --hash=sha256:956215d5e355fffa7c021d168728321fd4d31fd730ac609b1653b450f6a4bc71 \ + --hash=sha256:98eedd1803353daf1cd9ef23eef23eda5a4d22f99b1f998d273a8b78b70dd47f \ --hash=sha256:9b2d9a138ffa0e306d0e2b72976d2fb10b97e690d40ab36a472acaab0838e2de \ --hash=sha256:a0a53fb071531d003b075c444014ff8f8b1a9898d36bb88abd9ac7b3524648a2 \ --hash=sha256:a19093fbad824ed7c0f355b5ff4214bffda5f1a7f35f29b31fcaa240cc0135ab \ --hash=sha256:a1c4f6b453006efb8310affb2d132832e9bbb4fc01ce6df6b70d810d38f1f6dc \ --hash=sha256:a70ed1cb0295bee1df57b63bf7f46b4e56a5c93709eea769c1fec1bb23a95875 \ + --hash=sha256:ac6a5f618be581e1e0713aecec8e54093c235e5fa17d6d8eb7ffc487e2300508 \ --hash=sha256:b45e45fe47a19051a396abb22e19e7836a59ee6c5a90f3be427343c37908d65b \ + --hash=sha256:b7857e2202aae67bc5725e0c1f6403c20a8ff46094ece015e7d474f5f7020b55 \ --hash=sha256:c660bce1940a1acae5f51f0a064f1bc785d07ea16efcb4bc708090afc4d69e83 \ --hash=sha256:d18eae9a7fb0f499efcd146b8c9750a2e1f6e0e93b5a382b3481875354a430e6 \ + --hash=sha256:d336d46878e486de7d9458653c722875547ac8d36a1cff9ffaf4a74a3c1f62eb \ --hash=sha256:ee407d4d1ca9dc632265aee1c8732c4a2d60adff848057cdebfe5fe94eb2c8a2 \ --hash=sha256:f38b81880ba28f232f1f675893a39cf7b6db25b31cc0a09bb50787ecf957e85e \ --hash=sha256:f50a96b64dafd6169e595a5c56c9146ef80333e67d4476a65a9c55f400fc22ff \ diff --git a/tests/core/discoveryspace/test_no_priors_sampler.py b/tests/core/discoveryspace/test_no_priors_sampler.py new file mode 100644 index 000000000..a31730426 --- /dev/null +++ 
b/tests/core/discoveryspace/test_no_priors_sampler.py @@ -0,0 +1,97 @@ +# Copyright IBM Corporation 2025, 2026 +# SPDX-License-Identifier: MIT + +"""Tests for the no-priors sampler in core discoveryspace.""" + +import pytest +from pydantic import ValidationError + +from orchestrator.core.discoveryspace.no_priors_parameters import NoPriorsParameters +from orchestrator.core.discoveryspace.no_priors_sampler import NoPriorsSampleSelector + + +class TestNoPriorsParameters: + """Test NoPriorsParameters model.""" + + def test_default_parameters(self) -> None: + """Test default parameter values.""" + params = NoPriorsParameters(targetOutput="test_target") + assert params.targetOutput == "test_target" + assert params.samples == 20 + assert params.batchSize == 1 + assert params.sampling_strategy == "clhs" + + def test_custom_parameters(self) -> None: + """Test custom parameter values.""" + params = NoPriorsParameters( + targetOutput="custom_target", + samples=50, + batchSize=5, + sampling_strategy="sobol", + ) + assert params.targetOutput == "custom_target" + assert params.samples == 50 + assert params.batchSize == 5 + assert params.sampling_strategy == "sobol" + + def test_case_insensitive_strategy(self) -> None: + """Test that sampling_strategy is case-insensitive.""" + params = NoPriorsParameters(targetOutput="test", sampling_strategy="CLHS") + assert params.sampling_strategy == "clhs" + + params = NoPriorsParameters(targetOutput="test", sampling_strategy="Sobol") + assert params.sampling_strategy == "sobol" + + def test_invalid_strategy(self) -> None: + """Test that invalid strategy raises validation error.""" + with pytest.raises(ValidationError, match="sampling_strategy"): + NoPriorsParameters(targetOutput="test", sampling_strategy="invalid") + + def test_samples_validation(self) -> None: + """Test that samples must be >= 1.""" + with pytest.raises(ValidationError, match="samples"): + NoPriorsParameters(targetOutput="test", samples=0) + + with 
pytest.raises(ValidationError, match="samples"): + NoPriorsParameters(targetOutput="test", samples=-1) + + def test_batch_size_validation(self) -> None: + """Test that batchSize must be >= 1.""" + with pytest.raises(ValidationError, match="batchSize"): + NoPriorsParameters(targetOutput="test", batchSize=0) + + +class TestNoPriorsSampleSelector: + """Test NoPriorsSampleSelector sampler.""" + + def test_sampler_initialization(self) -> None: + """Test sampler can be initialized with parameters.""" + params = NoPriorsParameters(targetOutput="test_target", samples=10) + sampler = NoPriorsSampleSelector(parameters=params) + assert sampler.params == params + assert sampler.params.targetOutput == "test_target" + assert sampler.params.samples == 10 + + def test_parameters_model(self) -> None: + """Test that parameters_model returns correct type.""" + assert NoPriorsSampleSelector.parameters_model() == NoPriorsParameters + + def test_sampler_compatible_with_discovery_space_remote(self) -> None: + """Test that sampler reports compatibility with any discovery space.""" + # This is a simple compatibility check - always returns True + # We don't need a real DiscoverySpaceManager for this test + assert NoPriorsSampleSelector.samplerCompatibleWithDiscoverySpaceRemote(None) + + def test_entity_iterator_not_implemented(self) -> None: + """Test that entityIterator raises NotImplementedError.""" + params = NoPriorsParameters(targetOutput="test_target") + sampler = NoPriorsSampleSelector(parameters=params) + + # entityIterator is not implemented for this sampler + # The NotImplementedError is raised when the iterator is called + iterator = sampler.entityIterator(discoverySpace=None, batchsize=1) + with pytest.raises(NotImplementedError): + next(iterator) + + +# Made with Bob diff --git a/uv.lock b/uv.lock index 0e3c3a92c..4732221ac 100644 --- a/uv.lock +++ b/uv.lock @@ -16,7 +16,6 @@ required-markers = [ members = [ "ado-autoconf", "ado-core", - "ado-no-priors-characterization", 
"ado-ray-tune", "ado-trim", "ado-vllm-performance", @@ -120,7 +119,6 @@ resolution-helpers = [ ] test = [ { name = "ado-autoconf" }, - { name = "ado-no-priors-characterization" }, { name = "ado-ray-tune" }, { name = "ado-sfttrainer", marker = "python_full_version < '3.13'" }, { name = "ado-trim" }, @@ -182,7 +180,6 @@ docs = [ resolution-helpers = [{ name = "urllib3", specifier = ">=2.5.0" }] test = [ { name = "ado-autoconf", editable = "plugins/custom_experiments/autoconf" }, - { name = "ado-no-priors-characterization", editable = "plugins/operators/no-priors-characterization" }, { name = "ado-ray-tune", editable = "plugins/operators/ray_tune" }, { name = "ado-sfttrainer", marker = "python_full_version < '3.13'", editable = "plugins/actuators/sfttrainer" }, { name = "ado-trim", editable = "plugins/operators/trim" }, @@ -195,25 +192,6 @@ test = [ { name = "robotic-lab", editable = "plugins/actuators/example_actuator" }, ] -[[package]] -name = "ado-no-priors-characterization" -source = { editable = "plugins/operators/no-priors-characterization" } -dependencies = [ - { name = "ado-core" }, - { name = "numpy" }, - { name = "pandas" }, - { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "scipy", version = "1.16.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, -] - -[package.metadata] -requires-dist = [ - { name = "ado-core", editable = "." 
}, - { name = "numpy" }, - { name = "pandas", specifier = ">=2.2.0" }, - { name = "scipy" }, -] - [[package]] name = "ado-ray-tune" source = { editable = "plugins/operators/ray_tune" } @@ -244,10 +222,10 @@ requires-dist = [ name = "ado-sfttrainer" source = { editable = "plugins/actuators/sfttrainer" } dependencies = [ - { name = "aim" }, - { name = "jwt" }, - { name = "psutil" }, - { name = "transformers" }, + { name = "aim", marker = "python_full_version < '3.13'" }, + { name = "jwt", marker = "python_full_version < '3.13'" }, + { name = "psutil", marker = "python_full_version < '3.13'" }, + { name = "transformers", marker = "python_full_version < '3.13'" }, ] [package.metadata] @@ -263,7 +241,6 @@ name = "ado-trim" source = { editable = "plugins/operators/trim" } dependencies = [ { name = "ado-core" }, - { name = "ado-no-priors-characterization" }, { name = "autogluon-tabular", extra = ["catboost", "xgboost"] }, { name = "numpy" }, { name = "pandas" }, @@ -273,7 +250,6 @@ dependencies = [ [package.metadata] requires-dist = [ { name = "ado-core", editable = "." 
}, - { name = "ado-no-priors-characterization", editable = "plugins/operators/no-priors-characterization" }, { name = "autogluon-tabular", extras = ["catboost", "xgboost"], specifier = "==1.5" }, { name = "numpy" }, { name = "pandas", specifier = ">=2.2.0" }, @@ -312,31 +288,31 @@ name = "aim" version = "3.29.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "aim-ui" }, - { name = "aimrecords" }, - { name = "aimrocks" }, - { name = "aiofiles" }, - { name = "alembic" }, - { name = "boto3" }, - { name = "cachetools" }, - { name = "click" }, - { name = "cryptography" }, - { name = "fastapi" }, - { name = "filelock" }, - { name = "jinja2" }, - { name = "numpy" }, - { name = "packaging" }, - { name = "pillow" }, - { name = "psutil" }, - { name = "python-dateutil" }, - { name = "pytz" }, - { name = "requests" }, - { name = "restrictedpython" }, - { name = "sqlalchemy" }, - { name = "tqdm" }, - { name = "uvicorn" }, - { name = "watchdog" }, - { name = "websockets" }, + { name = "aim-ui", marker = "python_full_version < '3.13'" }, + { name = "aimrecords", marker = "python_full_version < '3.13'" }, + { name = "aimrocks", marker = "python_full_version < '3.13'" }, + { name = "aiofiles", marker = "python_full_version < '3.13'" }, + { name = "alembic", marker = "python_full_version < '3.13'" }, + { name = "boto3", marker = "python_full_version < '3.13'" }, + { name = "cachetools", marker = "python_full_version < '3.13'" }, + { name = "click", marker = "python_full_version < '3.13'" }, + { name = "cryptography", marker = "python_full_version < '3.13'" }, + { name = "fastapi", marker = "python_full_version < '3.13'" }, + { name = "filelock", marker = "python_full_version < '3.13'" }, + { name = "jinja2", marker = "python_full_version < '3.13'" }, + { name = "numpy", marker = "python_full_version < '3.13'" }, + { name = "packaging", marker = "python_full_version < '3.13'" }, + { name = "pillow", marker = "python_full_version < '3.13'" }, + { name = 
"psutil", marker = "python_full_version < '3.13'" }, + { name = "python-dateutil", marker = "python_full_version < '3.13'" }, + { name = "pytz", marker = "python_full_version < '3.13'" }, + { name = "requests", marker = "python_full_version < '3.13'" }, + { name = "restrictedpython", marker = "python_full_version < '3.13'" }, + { name = "sqlalchemy", marker = "python_full_version < '3.13'" }, + { name = "tqdm", marker = "python_full_version < '3.13'" }, + { name = "uvicorn", marker = "python_full_version < '3.13'" }, + { name = "watchdog", marker = "python_full_version < '3.13'" }, + { name = "websockets", marker = "python_full_version < '3.13'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/da/25/c825c73ec2f48c93324f631dba6e4cdac3bb60a7fde36e0b916820ae62a5/aim-3.29.1.tar.gz", hash = "sha256:30fb70f983844eebd270049206c839e6dc09ce9de500048dc97a7a8b22ed83fb", size = 1660733, upload-time = "2025-05-08T09:51:58.892Z" } wheels = [ @@ -366,7 +342,7 @@ name = "aimrecords" version = "0.0.7" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "base58" }, + { name = "base58", marker = "python_full_version < '3.13'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/c9/10/21182ef96acbd9a5ca4008556ba8590cbc1af833eccbf59d53308fa6d928/aimrecords-0.0.7.tar.gz", hash = "sha256:9b562fa5b5109b4b3dd4f83be0061cadbac63fa8031f281b8b5c8ae29967072f", size = 12667, upload-time = "2020-11-09T13:29:29.071Z" } wheels = [ @@ -548,10 +524,10 @@ name = "alembic" version = "1.18.4" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "mako" }, - { name = "sqlalchemy" }, + { name = "mako", marker = "python_full_version < '3.13'" }, + { name = "sqlalchemy", marker = "python_full_version < '3.13'" }, { name = "tomli", marker = "python_full_version < '3.11'" }, - { name = "typing-extensions" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/94/13/8b084e0f2efb0275a1d534838844926f798bd766566b1375174e2448cd31/alembic-1.18.4.tar.gz", hash = "sha256:cb6e1fd84b6174ab8dbb2329f86d631ba9559dd78df550b57804d607672cedbc", size = 2056725, upload-time = "2026-02-10T16:00:47.195Z" } wheels = [ @@ -2612,14 +2588,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0c/bc/e30e1e3d5e8860b0e0ce4d2b16b2681b77fd13542fc0d72f7e3c22d16eff/greenlet-3.4.0-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:d18eae9a7fb0f499efcd146b8c9750a2e1f6e0e93b5a382b3481875354a430e6", size = 284315, upload-time = "2026-04-08T17:02:52.322Z" }, { url = "https://files.pythonhosted.org/packages/5b/cc/e023ae1967d2a26737387cac083e99e47f65f58868bd155c4c80c01ec4e0/greenlet-3.4.0-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:636d2f95c309e35f650e421c23297d5011716be15d966e6328b367c9fc513a82", size = 601916, upload-time = "2026-04-08T16:24:35.533Z" }, { url = "https://files.pythonhosted.org/packages/67/32/5be1677954b6d8810b33abe94e3eb88726311c58fa777dc97e390f7caf5a/greenlet-3.4.0-cp310-cp310-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:234582c20af9742583c3b2ddfbdbb58a756cfff803763ffaae1ac7990a9fac31", size = 616399, upload-time = "2026-04-08T16:30:54.536Z" }, + { url = "https://files.pythonhosted.org/packages/82/0a/3a4af092b09ea02bcda30f33fd7db397619132fe52c6ece24b9363130d34/greenlet-3.4.0-cp310-cp310-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ac6a5f618be581e1e0713aecec8e54093c235e5fa17d6d8eb7ffc487e2300508", size = 621077, upload-time = "2026-04-08T16:40:34.946Z" }, { url = "https://files.pythonhosted.org/packages/74/bf/2d58d5ea515704f83e34699128c9072a34bea27d2b6a556e102105fe62a5/greenlet-3.4.0-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:523677e69cd4711b5a014e37bc1fb3a29947c3e3a5bb6a527e1cc50312e5a398", size = 611978, upload-time = "2026-04-08T15:56:31.335Z" }, + { url = 
"https://files.pythonhosted.org/packages/8c/39/3786520a7d5e33ee87b3da2531f589a3882abf686a42a3773183a41ef010/greenlet-3.4.0-cp310-cp310-manylinux_2_39_riscv64.whl", hash = "sha256:d336d46878e486de7d9458653c722875547ac8d36a1cff9ffaf4a74a3c1f62eb", size = 416893, upload-time = "2026-04-08T16:43:02.392Z" }, { url = "https://files.pythonhosted.org/packages/bd/69/6525049b6c179d8a923256304d8387b8bdd4acab1acf0407852463c6d514/greenlet-3.4.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b45e45fe47a19051a396abb22e19e7836a59ee6c5a90f3be427343c37908d65b", size = 1571957, upload-time = "2026-04-08T16:26:17.041Z" }, { url = "https://files.pythonhosted.org/packages/4e/6c/bbfb798b05fec736a0d24dc23e81b45bcee87f45a83cfb39db031853bddc/greenlet-3.4.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:5434271357be07f3ad0936c312645853b7e689e679e29310e2de09a9ea6c3adf", size = 1637223, upload-time = "2026-04-08T15:57:27.556Z" }, { url = "https://files.pythonhosted.org/packages/b7/7d/981fe0e7c07bd9d5e7eb18decb8590a11e3955878291f7a7de2e9c668eb7/greenlet-3.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:a19093fbad824ed7c0f355b5ff4214bffda5f1a7f35f29b31fcaa240cc0135ab", size = 237902, upload-time = "2026-04-08T17:03:14.16Z" }, { url = "https://files.pythonhosted.org/packages/fb/c6/dba32cab7e3a625b011aa5647486e2d28423a48845a2998c126dd69c85e1/greenlet-3.4.0-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:805bebb4945094acbab757d34d6e1098be6de8966009ab9ca54f06ff492def58", size = 285504, upload-time = "2026-04-08T15:52:14.071Z" }, { url = "https://files.pythonhosted.org/packages/54/f4/7cb5c2b1feb9a1f50e038be79980dfa969aa91979e5e3a18fdbcfad2c517/greenlet-3.4.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:439fc2f12b9b512d9dfa681c5afe5f6b3232c708d13e6f02c845e0d9f4c2d8c6", size = 605476, upload-time = "2026-04-08T16:24:37.064Z" }, { url = 
"https://files.pythonhosted.org/packages/d6/af/b66ab0b2f9a4c5a867c136bf66d9599f34f21a1bcca26a2884a29c450bd9/greenlet-3.4.0-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a70ed1cb0295bee1df57b63bf7f46b4e56a5c93709eea769c1fec1bb23a95875", size = 618336, upload-time = "2026-04-08T16:30:56.59Z" }, + { url = "https://files.pythonhosted.org/packages/6d/31/56c43d2b5de476f77d36ceeec436328533bff960a4cba9a07616e93063ab/greenlet-3.4.0-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8c5696c42e6bb5cfb7c6ff4453789081c66b9b91f061e5e9367fa15792644e76", size = 625045, upload-time = "2026-04-08T16:40:37.111Z" }, { url = "https://files.pythonhosted.org/packages/e5/5c/8c5633ece6ba611d64bf2770219a98dd439921d6424e4e8cf16b0ac74ea5/greenlet-3.4.0-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c660bce1940a1acae5f51f0a064f1bc785d07ea16efcb4bc708090afc4d69e83", size = 613515, upload-time = "2026-04-08T15:56:32.478Z" }, + { url = "https://files.pythonhosted.org/packages/80/ca/704d4e2c90acb8bdf7ae593f5cbc95f58e82de95cc540fb75631c1054533/greenlet-3.4.0-cp311-cp311-manylinux_2_39_riscv64.whl", hash = "sha256:89995ce5ddcd2896d89615116dd39b9703bfa0c07b583b85b89bf1b5d6eddf81", size = 419745, upload-time = "2026-04-08T16:43:04.022Z" }, { url = "https://files.pythonhosted.org/packages/a9/df/950d15bca0d90a0e7395eb777903060504cdb509b7b705631e8fb69ff415/greenlet-3.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ee407d4d1ca9dc632265aee1c8732c4a2d60adff848057cdebfe5fe94eb2c8a2", size = 1574623, upload-time = "2026-04-08T16:26:18.596Z" }, { url = "https://files.pythonhosted.org/packages/1a/e7/0839afab829fcb7333c9ff6d80c040949510055d2d4d63251f0d1c7c804e/greenlet-3.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:956215d5e355fffa7c021d168728321fd4d31fd730ac609b1653b450f6a4bc71", size = 1639579, upload-time = "2026-04-08T15:57:29.231Z" }, { url = 
"https://files.pythonhosted.org/packages/d9/2b/b4482401e9bcaf9f5c97f67ead38db89c19520ff6d0d6699979c6efcc200/greenlet-3.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:5cb614ace7c27571270354e9c9f696554d073f8aa9319079dcba466bbdead711", size = 238233, upload-time = "2026-04-08T17:02:54.286Z" }, @@ -2627,7 +2607,9 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/65/8b/3669ad3b3f247a791b2b4aceb3aa5a31f5f6817bf547e4e1ff712338145a/greenlet-3.4.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:1a54a921561dd9518d31d2d3db4d7f80e589083063ab4d3e2e950756ef809e1a", size = 286902, upload-time = "2026-04-08T15:52:12.138Z" }, { url = "https://files.pythonhosted.org/packages/38/3e/3c0e19b82900873e2d8469b590a6c4b3dfd2b316d0591f1c26b38a4879a5/greenlet-3.4.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:16dec271460a9a2b154e3b1c2fa1050ce6280878430320e85e08c166772e3f97", size = 606099, upload-time = "2026-04-08T16:24:38.408Z" }, { url = "https://files.pythonhosted.org/packages/b5/33/99fef65e7754fc76a4ed14794074c38c9ed3394a5bd129d7f61b705f3168/greenlet-3.4.0-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:90036ce224ed6fe75508c1907a77e4540176dcf0744473627785dd519c6f9996", size = 618837, upload-time = "2026-04-08T16:30:58.298Z" }, + { url = "https://files.pythonhosted.org/packages/44/57/eae2cac10421feae6c0987e3dc106c6d86262b1cb379e171b017aba893a6/greenlet-3.4.0-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6f0def07ec9a71d72315cf26c061aceee53b306c36ed38c35caba952ea1b319d", size = 624901, upload-time = "2026-04-08T16:40:38.981Z" }, { url = "https://files.pythonhosted.org/packages/36/f7/229f3aed6948faa20e0616a0b8568da22e365ede6a54d7d369058b128afd/greenlet-3.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a1c4f6b453006efb8310affb2d132832e9bbb4fc01ce6df6b70d810d38f1f6dc", size = 615062, upload-time = "2026-04-08T15:56:33.766Z" }, + { url = 
"https://files.pythonhosted.org/packages/6a/8a/0e73c9b94f31d1cc257fe79a0eff621674141cdae7d6d00f40de378a1e42/greenlet-3.4.0-cp312-cp312-manylinux_2_39_riscv64.whl", hash = "sha256:0e1254cf0cbaa17b04320c3a78575f29f3c161ef38f59c977108f19ffddaf077", size = 423927, upload-time = "2026-04-08T16:43:05.293Z" }, { url = "https://files.pythonhosted.org/packages/08/97/d988180011aa40135c46cd0d0cf01dd97f7162bae14139b4a3ef54889ba5/greenlet-3.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:9b2d9a138ffa0e306d0e2b72976d2fb10b97e690d40ab36a472acaab0838e2de", size = 1573511, upload-time = "2026-04-08T16:26:20.058Z" }, { url = "https://files.pythonhosted.org/packages/d4/0f/a5a26fe152fb3d12e6a474181f6e9848283504d0afd095f353d85726374b/greenlet-3.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:8424683caf46eb0eb6f626cb95e008e8cc30d0cb675bdfa48200925c79b38a08", size = 1640396, upload-time = "2026-04-08T15:57:30.88Z" }, { url = "https://files.pythonhosted.org/packages/42/cf/bb2c32d9a100e36ee9f6e38fad6b1e082b8184010cb06259b49e1266ca01/greenlet-3.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:a0a53fb071531d003b075c444014ff8f8b1a9898d36bb88abd9ac7b3524648a2", size = 238892, upload-time = "2026-04-08T17:03:10.094Z" }, @@ -2635,7 +2617,9 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7a/75/7e9cd1126a1e1f0cd67b0eda02e5221b28488d352684704a78ed505bd719/greenlet-3.4.0-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:43748988b097f9c6f09364f260741aa73c80747f63389824435c7a50bfdfd5c1", size = 285856, upload-time = "2026-04-08T15:52:45.82Z" }, { url = "https://files.pythonhosted.org/packages/9d/c4/3e2df392e5cb199527c4d9dbcaa75c14edcc394b45040f0189f649631e3c/greenlet-3.4.0-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5566e4e2cd7a880e8c27618e3eab20f3494452d12fd5129edef7b2f7aa9a36d1", size = 610208, upload-time = "2026-04-08T16:24:39.674Z" }, { url = 
"https://files.pythonhosted.org/packages/da/af/750cdfda1d1bd30a6c28080245be8d0346e669a98fdbae7f4102aa95fff3/greenlet-3.4.0-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:1054c5a3c78e2ab599d452f23f7adafef55062a783a8e241d24f3b633ba6ff82", size = 621269, upload-time = "2026-04-08T16:30:59.767Z" }, + { url = "https://files.pythonhosted.org/packages/e0/93/c8c508d68ba93232784bbc1b5474d92371f2897dfc6bc281b419f2e0d492/greenlet-3.4.0-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:98eedd1803353daf1cd9ef23eef23eda5a4d22f99b1f998d273a8b78b70dd47f", size = 628455, upload-time = "2026-04-08T16:40:40.698Z" }, { url = "https://files.pythonhosted.org/packages/54/78/0cbc693622cd54ebe25207efbb3a0eb07c2639cb8594f6e3aaaa0bb077a8/greenlet-3.4.0-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f82cb6cddc27dd81c96b1506f4aa7def15070c3b2a67d4e46fd19016aacce6cf", size = 617549, upload-time = "2026-04-08T15:56:34.893Z" }, + { url = "https://files.pythonhosted.org/packages/7f/46/cfaaa0ade435a60550fd83d07dfd5c41f873a01da17ede5c4cade0b9bab8/greenlet-3.4.0-cp313-cp313-manylinux_2_39_riscv64.whl", hash = "sha256:b7857e2202aae67bc5725e0c1f6403c20a8ff46094ece015e7d474f5f7020b55", size = 426238, upload-time = "2026-04-08T16:43:06.865Z" }, { url = "https://files.pythonhosted.org/packages/ba/c0/8966767de01343c1ff47e8b855dc78e7d1a8ed2b7b9c83576a57e289f81d/greenlet-3.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:227a46251ecba4ff46ae742bc5ce95c91d5aceb4b02f885487aff269c127a729", size = 1575310, upload-time = "2026-04-08T16:26:21.671Z" }, { url = "https://files.pythonhosted.org/packages/b8/38/bcdc71ba05e9a5fda87f63ffc2abcd1f15693b659346df994a48c968003d/greenlet-3.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5b99e87be7eba788dd5b75ba1cde5639edffdec5f91fe0d734a249535ec3408c", size = 1640435, upload-time = "2026-04-08T15:57:32.572Z" }, { url = 
"https://files.pythonhosted.org/packages/a1/c2/19b664b7173b9e4ef5f77e8cef9f14c20ec7fce7920dc1ccd7afd955d093/greenlet-3.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:849f8bc17acd6295fcb5de8e46d55cc0e52381c56eaf50a2afd258e97bc65940", size = 238760, upload-time = "2026-04-08T17:04:03.878Z" }, @@ -3174,7 +3158,7 @@ name = "jwt" version = "1.4.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "cryptography" }, + { name = "cryptography", marker = "python_full_version < '3.13'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/7f/20/21254c9e601e6c29445d1e8854c2a81bdb554e07a82fb1f9846137a6965c/jwt-1.4.0.tar.gz", hash = "sha256:f6f789128ac247142c79ee10f3dba6e366ec4e77c9920d18c1592e28aa0a7952", size = 24911, upload-time = "2025-06-23T13:28:38.289Z" } wheels = [ @@ -3394,7 +3378,7 @@ name = "mako" version = "1.3.10" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "markupsafe" }, + { name = "markupsafe", marker = "python_full_version < '3.13'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/9e/38/bd5b78a920a64d708fe6bc8e0a2c075e1389d53bef8413725c63ba041535/mako-1.3.10.tar.gz", hash = "sha256:99579a6f39583fa7e5630a28c3c1f440e4e97a414b80372649c0ce338da2ea28", size = 392474, upload-time = "2025-04-10T12:44:31.16Z" } wheels = [ diff --git a/website/docs/operators/no-priors-characterization.md b/website/docs/operators/no-priors-characterization.md deleted file mode 120000 index dee9ca30f..000000000 --- a/website/docs/operators/no-priors-characterization.md +++ /dev/null @@ -1 +0,0 @@ -../../../plugins/operators/no-priors-characterization/README.md \ No newline at end of file diff --git a/website/mkdocs.yml b/website/mkdocs.yml index 707547925..ecc746b06 100644 --- a/website/mkdocs.yml +++ b/website/mkdocs.yml @@ -194,4 +194,3 @@ nav: - The Random Walk Operator: operators/random-walk.md - The Ray Tune Operator: operators/optimisation-with-ray-tune.md - The TRIM Operator: operators/trim.md - - The 
No-Priors Characterization Operator: operators/no-priors-characterization.md From d656e13ab6d35db8622ed3f7174bc8b4f4fb5d64 Mon Sep 17 00:00:00 2001 From: Daniele Lotito Date: Mon, 20 Apr 2026 16:42:31 +0100 Subject: [PATCH 02/23] chore: delete legacy example, not needed because now the new sampling is integrated in random walk --- examples/no-priors-characterization/README.md | 341 ------------------ .../no_priors_custom_experiments/__init__.py | 2 - .../experiments.py | 176 --------- .../custom_experiments/pyproject.toml | 19 - .../example_yamls/op_basic_sampling.yaml | 26 -- .../example_yamls/op_quick_exploration.yaml | 26 -- .../example_yamls/op_thorough_coverage.yaml | 26 -- .../example_yamls/space_reaction.yaml | 21 -- 8 files changed, 637 deletions(-) delete mode 100644 examples/no-priors-characterization/README.md delete mode 100644 examples/no-priors-characterization/custom_experiments/no_priors_custom_experiments/__init__.py delete mode 100644 examples/no-priors-characterization/custom_experiments/no_priors_custom_experiments/experiments.py delete mode 100644 examples/no-priors-characterization/custom_experiments/pyproject.toml delete mode 100644 examples/no-priors-characterization/example_yamls/op_basic_sampling.yaml delete mode 100644 examples/no-priors-characterization/example_yamls/op_quick_exploration.yaml delete mode 100644 examples/no-priors-characterization/example_yamls/op_thorough_coverage.yaml delete mode 100644 examples/no-priors-characterization/example_yamls/space_reaction.yaml diff --git a/examples/no-priors-characterization/README.md b/examples/no-priors-characterization/README.md deleted file mode 100644 index 0bdd09202..000000000 --- a/examples/no-priors-characterization/README.md +++ /dev/null @@ -1,341 +0,0 @@ -# Performing Efficient Space-Filling Sampling of a Configuration Space - - - -> [!NOTE] The scenario -> -> You have an experiment with multiple parameters and need an initial measured -> dataset that covers the configuration 
space efficiently. -> **In this example, `ado`'s `random_walk` operator with the no-priors sampler -> is used for efficient space-filling sampling of the target property across the -> parameter space, moving beyond standard random-walk or brute-force sampling.** -> Using the no-priors sampler with `random_walk` involves: -> -> 1. Defining the configuration space to explore. -> 2. Creating an `operation` that uses `random_walk` with the no-priors sampler -> to order and submit points with a space-filling strategy. -> 3. Observing the measurement process as the selected strategy orders and -> submits the points. - -> [!IMPORTANT] Prerequisites -> -> Get the example files and install dependencies: -> -> ```commandline -> git clone https://github.com/IBM/ado.git -> cd ado -> pip install examples/no-priors-characterization/custom_experiments/ -> ``` - -> [!CAUTION] -> -> All commands below assume you are running them from the -> **top-level of the `ado` repository**. - -> [!TIP] TL;DR -> -> To create a `discoveryspace` and perform efficient space-filling sampling with -> the `random_walk` operator using the no-priors sampler, execute the following -> from the root of the `ado` repository: -> -> ```bash -> : # Create the space to explore based on a custom experiment -> ado create space -f \ -> examples/no-priors-characterization/example_yamls/space_reaction.yaml \ -> --new-sample-store -> : # Run a space-filling characterization operation -> ado create operation -f \ -> examples/no-priors-characterization/example_yamls/op_basic_sampling.yaml \ -> --use-latest space -> ``` - - - -## What is Space-Filling Sampling with the No-Priors Sampler? - -The **no-priors sampler** is an advanced sampler for the `random_walk` operator -that provides efficient space-filling exploration when you do not yet have a -useful prior model or historical dataset. 
It is a strong fit for the first phase -of an exploration, where you want representative coverage across a configuration -space before switching to model-based or target-driven workflows. - -**Handling Existing Measurements**: If the discovery space already contains -measured entities for the target property, the sampler automatically: - -- Identifies which entities have already been measured -- Excludes them from sampling, so that the operator will measure the - desired amount of new entities - -The sampler supports multiple sampling strategies: - -1. **Random Sampling (`random`)**: A baseline random ordering across the - candidate configuration space. Fast and simple, but usually less - space-filling than the advanced strategies. - -2. **Concatenated Latin Hypercube Sampling (`clhs`)**: An adaptation of Latin - Hypercube Sampling for discrete spaces. It improves dimension-wise coverage - by reducing repeated reuse of the same values early in the sampling process. - This is often a strong default for high-dimensional spaces. - -3. **Sobol Sampling (`sobol`)**: A quasi-random low-discrepancy sampling - method that provides stronger space-filling properties than pure random - sampling. It is adapted for discrete parameter spaces and falls back to CLHS - when collisions are detected during discretization. - -4. **One-Shift Sampling (`one_shift`)**: A heuristic for higher-dimensional - spaces that attempts to maximize minimum distance between samples. - -5. **Recursive Aggregation (`recursive_aggregation`)**: Another heuristic for - higher-dimensional spaces with different coverage characteristics. - - -> [!CAUTION] -> -> In the current version, if not all measurements produce the observed target -> output property specified in the sampler's `targetOutput` parameter, the -> operation may fail or produce incomplete results. Ensure all experiments -> return the expected target property. 
- - - -The sampler orders a specified number of new points, which `random_walk` then -measures in batches using the configured experiment, storing the results in the -sample store. - -## Creating a `discoveryspace` - -A `discoveryspace` describes the configuration space you want to explore -(`entitySpace`) and how to measure it (`measurementSpace`). In this example, -we use two custom Python functions as experiments and take inspiration from the -chemistry domain: - -1. **`calculate_reaction_yield`**: Calculates chemical reaction yield based on - temperature (K), concentration (mol/L), and catalyst amount (g) using an - Arrhenius-like equation. - -2. **`calculate_material_strength`**: Calculates material tensile strength (MPa) - based on composition percentages, temperature (°C), and grain size (μm) using - a Hall-Petch relationship. - -First, create the `discoveryspace` by executing this command from the repository -root: - -```commandline -ado create space -f \ - examples/no-priors-characterization/example_yamls/space_reaction.yaml \ - --new-sample-store -``` - -This will create a new space and a sample store to hold the measurement results. -The output will be similar to: - -```terminaloutput -Success! Created space with identifier: space-bfed2d-19b49a -``` - -## Running a Space-Filling Sampling Operation - -Next, we run an `operation` that uses `random_walk` with the no-priors sampler -to perform space-filling sampling of the `discoveryspace`. We provide three -example configurations with different strategies: - -### Space-Filling Sampling with CLHS - -The configuration for a CLHS-based space-filling operation is in -`op_basic_sampling.yaml`: - - - -```yaml -{% - include-markdown "./example_yamls/op_basic_sampling.yaml" -%} -``` - - -This configuration uses the no-priors sampler with CLHS to prioritize early -coverage across the configuration space rather than relying on plain random -ordering. 
- - - -```commandline -ado create operation -f \ - examples/no-priors-characterization/example_yamls/op_basic_sampling.yaml \ - --use-latest space -``` - - - -### Baseline Random Sampling - -For a baseline comparison using random sampling with 20 samples and batch size -of 5: - -```commandline -ado create operation -f \ - examples/no-priors-characterization/example_yamls/op_quick_exploration.yaml \ - --use-latest space -``` - -**Note**: Each operation samples different points from the space based on its -strategy and parameters, even when using the same discovery space. - -Random sampling is useful as a baseline, but CLHS and Sobol generally provide -better space-filling behavior for initial characterization. - -### Detailed Coverage with Sobol Sequence - -For denser low-discrepancy coverage using Sobol sequences with 100 samples and -batch size of 1: - -```commandline -ado create operation -f \ - examples/no-priors-characterization/example_yamls/op_thorough_coverage.yaml \ - --use-latest space -``` - -This is a good option when you want more uniform low-discrepancy coverage of -the available configuration space. - -### What to Expect in the Terminal - -You will see output as the `random_walk` operator with the no-priors sampler -orders, submits, and measures points. The key stages are: - -#### Initialization - -The operator will log the start of the sampling process: - - - -```commandline -2026-03-09 16:30:00,000 INFO MainThread RandomWalk: Running random walk for 30 iterations. Sampler is custom sampler class: ... -``` - - - -#### Sampling and Measurement - -For each batch of points, you will see output indicating the experiments being -submitted and completed: - - - -```commandline -(RandomWalk pid=82843) Continuous batching: SUBMIT EXPERIMENT. Submitted experiment custom_experiments.calculate_reaction_yield for temperature.353-concentration.4.1-catalyst_amount.4.5. 
Request identifier: c72090 -(RandomWalk pid=82843) -(RandomWalk pid=82843) Continuous batching: SUMMARY. Entities sampled and submitted: 2. Experiments completed: 1 Waiting on 1 active requests. There are 0 dependent experiments -(RandomWalk pid=82843) Continuous Batching: EXPERIMENT COMPLETION. Received finished notification for experiment in measurement request in group 1: request-c72090-experiment-calculate_reaction_yield-entities-temperature.353-concentration.4.1-catalyst_amount.4.5 (random_walk)-requester-randomwalk-1.6.1.dev9+03a65e7b.dirty-9a277d-time-2026-03-10 11:43:11.066810+00:00 -``` - - - -#### Completion - -The operation will end with a success message: - - - -```commandline -Success! Created operation with identifier operation-random_walk-v0.1-8b23a245 and it finished successfully. -``` - - - -## Looking at the `operation` output - -After the operation completes, you can view the sampled entities and their -measured values. - -You can see the relationship between the space and operations with: - -```commandline -ado show related space --use-latest -``` - -This will show the `discoveryspace` and the operations that were run. -To see the entities of the space that have been measured, you can run: - - - -```commandline -ado show entities space --use-latest -``` - - - -This will display a table of the entities sampled and their measured reaction -yield values. 
- - - -```text -┌───────┬──────────────────────────────────────────────────────────┬────────────────────────────┬─────────────────────────────────────────────┬─────────────┬───────────────┬─────────────────┬──────────┐ -│ INDEX │ identifier │ generatorid │ experiment_id │ temperature │ concentration │ catalyst_amount │ yield │ -├───────┼──────────────────────────────────────────────────────────┼────────────────────────────┼─────────────────────────────────────────────┼─────────────┼───────────────┼─────────────────┼──────────┤ -│ 0 │ temperature.300-concentration.1.0-catalyst_amount.2.0 │ random_walk │ custom_experiments.calculate_reaction_yield │ 300 │ 1.0 │ 2.0 │ 45.23 │ -│ 1 │ temperature.350-concentration.2.5-catalyst_amount.5.0 │ random_walk │ custom_experiments.calculate_reaction_yield │ 350 │ 2.5 │ 5.0 │ 78.91 │ -│ 2 │ temperature.400-concentration.0.5-catalyst_amount.1.0 │ random_walk │ custom_experiments.calculate_reaction_yield │ 400 │ 0.5 │ 1.0 │ 92.15 │ -│ ... │ ... │ ... │ ... │ ... │ ... │ ... │ ... 
│ -└───────┴──────────────────────────────────────────────────────────┴────────────────────────────┴─────────────────────────────────────────────┴─────────────┴───────────────┴─────────────────┴──────────┘ -``` - - - -## Comparison with Other Sampling Approaches - -### When to Use the No-Priors Sampler - -Use the no-priors sampler with `random_walk` when you want to: - -- Build an initial measured dataset before surrogate modelling or optimization -- Cover a discrete or discretized configuration space more efficiently than - plain random sampling -- Avoid repeatedly measuring entities that already have the target output -- Get better space-filling coverage than the base `random_walk` samplers - -### Comparison with Base Random Walk Samplers - -The base `random_walk` samplers (`random`, `sequential`, grouped modes) are -simpler and appropriate when: - -- You want to iterate through existing entities in the sample store -- You need deterministic sequential traversal of a finite space -- You don't need optimized space-filling properties - -The no-priors sampler adds: - -- Active reordering of candidates using dedicated space-filling strategies -- Automatic exclusion of already-measured entities for a target output -- Multiple strategy options (CLHS, Sobol, etc.) 
for different coverage needs - -### Comparison with LHC and Ray Tune - -For continuous optimization or hyperparameter tuning, consider: - -- **Latin Hypercube Sampling (LHC)** via ray-tune: Better for continuous spaces - and when you want to leverage Ray's distributed execution -- **Ray Tune operators**: Appropriate for model hyperparameter optimization with - adaptive search algorithms (e.g., Bayesian optimization, HyperBand) - -The no-priors sampler is specifically designed for: - -- Discrete or discretized configuration spaces -- Initial characterization before optimization -- Cases where you want space-filling coverage without a surrogate model - -## Takeaways - -- **Efficient space-filling**: The no-priors sampler helps cover a configuration - space more effectively than plain random ordering. -- **Multiple strategies**: Choose from random, Sobol, CLHS, or higher-dimensional - heuristics depending on the trade-off you want between baseline simplicity and - coverage quality. -- **Flexible configuration**: Adjust the number of samples and batch size to - balance throughput, coverage, and experimental resources. -- **Foundation for later workflows**: The resulting dataset is well suited for - surrogate modelling, optimization, or follow-on operators such as TRIM. -- **Integrated with random_walk**: The sampler works within the standard - `random_walk` operator flow, benefiting from its batching, filtering, and - memoization capabilities. 
diff --git a/examples/no-priors-characterization/custom_experiments/no_priors_custom_experiments/__init__.py b/examples/no-priors-characterization/custom_experiments/no_priors_custom_experiments/__init__.py deleted file mode 100644 index 2a4c79e8b..000000000 --- a/examples/no-priors-characterization/custom_experiments/no_priors_custom_experiments/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright IBM Corporation 2025, 2026 -# SPDX-License-Identifier: MIT diff --git a/examples/no-priors-characterization/custom_experiments/no_priors_custom_experiments/experiments.py b/examples/no-priors-characterization/custom_experiments/no_priors_custom_experiments/experiments.py deleted file mode 100644 index 8d426ef38..000000000 --- a/examples/no-priors-characterization/custom_experiments/no_priors_custom_experiments/experiments.py +++ /dev/null @@ -1,176 +0,0 @@ -# Copyright IBM Corporation 2025, 2026 -# SPDX-License-Identifier: MIT - -from typing import Literal - -import numpy as np - -from orchestrator.modules.actuators.custom_experiments import custom_experiment -from orchestrator.schema.domain import PropertyDomain, VariableTypeEnum -from orchestrator.schema.property import ConstitutiveProperty - -# --------------------------- -# Properties for Reaction Yield -# --------------------------- - -temperature = ConstitutiveProperty( - identifier="temperature", - propertyDomain=PropertyDomain( - variableType=VariableTypeEnum.CONTINUOUS_VARIABLE_TYPE, - domainRange=[273, 473], # 0-200°C in Kelvin - ), -) - -concentration = ConstitutiveProperty( - identifier="concentration", - propertyDomain=PropertyDomain( - variableType=VariableTypeEnum.CONTINUOUS_VARIABLE_TYPE, - domainRange=[0.1, 5.0], # mol/L - ), -) - -catalyst_amount = ConstitutiveProperty( - identifier="catalyst_amount", - propertyDomain=PropertyDomain( - variableType=VariableTypeEnum.CONTINUOUS_VARIABLE_TYPE, - domainRange=[0.0, 10.0], # grams - ), -) - -# --------------------------- -# Properties for Material Strength 
-# --------------------------- - -composition_a = ConstitutiveProperty( - identifier="composition_a", - propertyDomain=PropertyDomain( - variableType=VariableTypeEnum.CONTINUOUS_VARIABLE_TYPE, - domainRange=[0, 100], # percentage - ), -) - -composition_b = ConstitutiveProperty( - identifier="composition_b", - propertyDomain=PropertyDomain( - variableType=VariableTypeEnum.CONTINUOUS_VARIABLE_TYPE, - domainRange=[0, 100], # percentage - ), -) - -temperature_celsius = ConstitutiveProperty( - identifier="temperature_celsius", - propertyDomain=PropertyDomain( - variableType=VariableTypeEnum.CONTINUOUS_VARIABLE_TYPE, - domainRange=[-50, 200], # Celsius - ), -) - -grain_size = ConstitutiveProperty( - identifier="grain_size", - propertyDomain=PropertyDomain( - variableType=VariableTypeEnum.CONTINUOUS_VARIABLE_TYPE, - domainRange=[1, 100], # micrometers - ), -) - -# --------------------------- -# Reaction Yield Experiment -# --------------------------- - - -@custom_experiment( - required_properties=[temperature, concentration, catalyst_amount], - output_property_identifiers=["yield"], -) -def calculate_reaction_yield( - temperature: float, concentration: float, catalyst_amount: float -) -> dict[Literal["yield"], float]: - """ - Calculate chemical reaction yield using Arrhenius-like equation with catalyst effect. 
- - The yield is calculated using: - k = A * exp(-Ea / (R * T)) * (1 + 0.1 * catalyst_amount) - yield = 100 * (1 - exp(-k * concentration * time)) - - where: - A = 1e10 (pre-exponential factor) - Ea = 50000 J/mol (activation energy) - R = 8.314 J/(mol·K) (gas constant) - time = 3600 s (reaction time) - - Args: - temperature: Reaction temperature in Kelvin - concentration: Reactant concentration in mol/L - catalyst_amount: Catalyst amount in grams - - Returns: - dict: Dictionary containing the calculated yield as a percentage (0-100) - """ - A = 1e10 # pre-exponential factor - Ea = 50000 # J/mol, activation energy - R = 8.314 # J/(mol·K), gas constant - time = 3600 # seconds, reaction time - - # Calculate rate constant with catalyst effect - k = A * np.exp(-Ea / (R * temperature)) * (1 + 0.1 * catalyst_amount) - - # Calculate yield - reaction_yield = 100 * (1 - np.exp(-k * concentration * time)) - - # Ensure yield is between 0 and 100 - reaction_yield = np.clip(reaction_yield, 0, 100) - - return {"yield": float(reaction_yield)} - - -# --------------------------- -# Material Strength Experiment -# --------------------------- - - -@custom_experiment( - required_properties=[composition_a, composition_b, temperature_celsius, grain_size], - output_property_identifiers=["tensile_strength"], -) -def calculate_material_strength( - composition_a: float, - composition_b: float, - temperature_celsius: float, - grain_size: float, -) -> dict[Literal["tensile_strength"], float]: - """ - Calculate material tensile strength using Hall-Petch relationship with composition effects. 
- - The strength is calculated using: - base_strength = composition_a * 500 + composition_b * 300 + (100 - composition_a - composition_b) * 200 - temp_factor = 1 - 0.002 * (temperature_celsius - 20) - grain_factor = 1 + 100 / sqrt(grain_size) - tensile_strength = base_strength * temp_factor * grain_factor / 1000 - - Args: - composition_a: Percentage of component A (0-100) - composition_b: Percentage of component B (0-100) - temperature_celsius: Testing temperature in Celsius - grain_size: Grain size in micrometers - - Returns: - dict: Dictionary containing the calculated tensile strength in MPa - """ - # Calculate base strength from composition - composition_c = 100 - composition_a - composition_b - base_strength = composition_a * 500 + composition_b * 300 + composition_c * 200 - - # Temperature effect (strength decreases with temperature) - temp_factor = 1 - 0.002 * (temperature_celsius - 20) - temp_factor = np.clip(temp_factor, 0.1, 2.0) # Prevent unrealistic values - - # Hall-Petch relationship (strength increases with smaller grain size) - grain_factor = 1 + 100 / np.sqrt(grain_size) - - # Calculate final tensile strength in MPa - tensile_strength = base_strength * temp_factor * grain_factor / 1000 - - # Ensure positive strength - tensile_strength = np.maximum(tensile_strength, 0) - - return {"tensile_strength": float(tensile_strength)} diff --git a/examples/no-priors-characterization/custom_experiments/pyproject.toml b/examples/no-priors-characterization/custom_experiments/pyproject.toml deleted file mode 100644 index 2c21edf05..000000000 --- a/examples/no-priors-characterization/custom_experiments/pyproject.toml +++ /dev/null @@ -1,19 +0,0 @@ -[project] -name = "no_priors_custom_experiments" -description = "A set of custom experiments used to test No-Priors Characterization Operation" -dependencies = [ - "ado-core", - "numpy", -] -dynamic = ["version"] - -[project.entry-points."ado.custom_experiments"] -# This should be python file with your decorated 
function(s). -no_priors_experiments = "no_priors_custom_experiments.experiments" - -[build-system] -requires = ["setuptools", "setuptools_scm"] -build-backend = "setuptools.build_meta" - -[tool.setuptools_scm] -root = "../../../" diff --git a/examples/no-priors-characterization/example_yamls/op_basic_sampling.yaml b/examples/no-priors-characterization/example_yamls/op_basic_sampling.yaml deleted file mode 100644 index f821d0806..000000000 --- a/examples/no-priors-characterization/example_yamls/op_basic_sampling.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright IBM Corporation 2025, 2026 -# SPDX-License-Identifier: MIT -# CLHS space-filling configuration using random_walk with no-priors sampler -spaces: - - space-c8717f-3a68bf -operation: - module: - operationType: explore - operatorName: random_walk - parameters: - numberEntities: 30 - batchSize: 1 - samplerConfig: - module: - moduleName: orchestrator.core.discoveryspace.no_priors_sampler - moduleClass: NoPriorsSampleSelector - parameters: - targetOutput: yield - samples: 30 - batchSize: 1 - sampling_strategy: clhs - singleMeasurement: true - filter: - filterMode: unmeasured - -# Made with Bob diff --git a/examples/no-priors-characterization/example_yamls/op_quick_exploration.yaml b/examples/no-priors-characterization/example_yamls/op_quick_exploration.yaml deleted file mode 100644 index 8eebffd7c..000000000 --- a/examples/no-priors-characterization/example_yamls/op_quick_exploration.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright IBM Corporation 2025, 2026 -# SPDX-License-Identifier: MIT -# Baseline random ordering using random_walk with no-priors sampler -spaces: - - space-c8717f-3a68bf -operation: - module: - operationType: explore - operatorName: random_walk - parameters: - numberEntities: 20 - batchSize: 5 - samplerConfig: - module: - moduleName: orchestrator.core.discoveryspace.no_priors_sampler - moduleClass: NoPriorsSampleSelector - parameters: - targetOutput: yield - samples: 20 - batchSize: 5 - 
sampling_strategy: random - singleMeasurement: true - filter: - filterMode: unmeasured - -# Made with Bob diff --git a/examples/no-priors-characterization/example_yamls/op_thorough_coverage.yaml b/examples/no-priors-characterization/example_yamls/op_thorough_coverage.yaml deleted file mode 100644 index b0963edb1..000000000 --- a/examples/no-priors-characterization/example_yamls/op_thorough_coverage.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright IBM Corporation 2025, 2026 -# SPDX-License-Identifier: MIT -# Sobol low-discrepancy ordering using random_walk with no-priors sampler -spaces: - - space-c8717f-3a68bf -operation: - module: - operationType: explore - operatorName: random_walk - parameters: - numberEntities: 100 - batchSize: 1 - samplerConfig: - module: - moduleName: orchestrator.core.discoveryspace.no_priors_sampler - moduleClass: NoPriorsSampleSelector - parameters: - targetOutput: yield - samples: 100 - batchSize: 1 - sampling_strategy: sobol - singleMeasurement: true - filter: - filterMode: unmeasured - -# Made with Bob diff --git a/examples/no-priors-characterization/example_yamls/space_reaction.yaml b/examples/no-priors-characterization/example_yamls/space_reaction.yaml deleted file mode 100644 index eea63704b..000000000 --- a/examples/no-priors-characterization/example_yamls/space_reaction.yaml +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright IBM Corporation 2025, 2026 -# SPDX-License-Identifier: MIT -sampleStoreIdentifier: 3a68bf -metadata: - name: reaction_yield_space -entitySpace: - - identifier: temperature - propertyDomain: - domainRange: [273, 473] - interval: 10 - - identifier: concentration - propertyDomain: - domainRange: [0.1, 5.0] - interval: 0.2 - - identifier: catalyst_amount - propertyDomain: - domainRange: [0.0, 10.0] - interval: 0.5 -experiments: - - actuatorIdentifier: custom_experiments - experimentIdentifier: calculate_reaction_yield From 3b74762cd01e35fcb0a91f8d9cfa1b91180eeece Mon Sep 17 00:00:00 2001 From: Daniele Lotito Date: Mon, 20 
Apr 2026 16:43:03 +0100 Subject: [PATCH 03/23] docs: add samplers --- website/docs/operators/random-walk.md | 41 +++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/website/docs/operators/random-walk.md b/website/docs/operators/random-walk.md index 507bd16c7..5d2568ac8 100644 --- a/website/docs/operators/random-walk.md +++ b/website/docs/operators/random-walk.md @@ -279,6 +279,47 @@ spaces: - your-spaces ``` +### Advanced Samplers + +When the base samplers are not enough, `random_walk` can also use more +specialized samplers that still integrate with its normal batching, filtering, and +memoization. + +#### Quasi-Random Sampling Strategies + +The `NoPriorsSampleSelector` provides quasi-random sampling strategies designed +for high-dimensional discrete spaces. These strategies produce sequences where +consecutive elements are maximally dispersed, favoring uniform coverage of the +space: + +- **`sobol`**: Sobol sequences are low-discrepancy quasi-random sequences widely + used for space-filling designs. They provide better coverage than pure random + sampling by ensuring points are well-distributed across all dimensions. +- **`clhs`**: Concatenated Latin Hypercube Sampling (CLHS) samples each dimension + independently without replacement, cycling through all values before repeating. + This ensures each dimension is uniformly covered. + +**Collision Handling**: Sobol sampling may produce collisions (duplicate points) +when mapping continuous Sobol sequences to discrete integer coordinates. When +collisions are detected, the sampler automatically falls back to CLHS to ensure +the requested number of unique samples. 
+ +#### Example: Sobol Sampling + +Example using Sobol ordering for quasi-random low-discrepancy coverage: + +```yaml +samplerConfig: + module: + moduleName: orchestrator.core.discoveryspace.no_priors_sampler + moduleClass: NoPriorsSampleSelector + parameters: + targetOutput: yield + samples: 100 + batchSize: 1 + sampling_strategy: sobol +``` + ### Custom Samplers It is also possible to specify that `random_walk` uses a custom sampler. This is From b804c1e1133a31738e0d9ef800798c6e0d32d590 Mon Sep 17 00:00:00 2001 From: Daniele Lotito Date: Mon, 20 Apr 2026 16:48:31 +0100 Subject: [PATCH 04/23] chore: remove LLM attribution since it has just copy-pasted files --- orchestrator/core/discoveryspace/no_priors_parameters.py | 3 --- orchestrator/core/discoveryspace/no_priors_sampler.py | 3 --- 2 files changed, 6 deletions(-) diff --git a/orchestrator/core/discoveryspace/no_priors_parameters.py b/orchestrator/core/discoveryspace/no_priors_parameters.py index e7aee7288..1271ac769 100644 --- a/orchestrator/core/discoveryspace/no_priors_parameters.py +++ b/orchestrator/core/discoveryspace/no_priors_parameters.py @@ -63,6 +63,3 @@ class NoPriorsParameters(BaseModel): ), ), ] = "clhs" - - -# Made with Bob diff --git a/orchestrator/core/discoveryspace/no_priors_sampler.py b/orchestrator/core/discoveryspace/no_priors_sampler.py index 0f7456587..f84da20e9 100644 --- a/orchestrator/core/discoveryspace/no_priors_sampler.py +++ b/orchestrator/core/discoveryspace/no_priors_sampler.py @@ -137,6 +137,3 @@ def parameters_model(cls) -> type[BaseModel] | None: def __init__(self, parameters: NoPriorsParameters) -> None: self.params = parameters - - -# Made with Bob From 98d7d9602be391216bd745737b16e202dbc66fcb Mon Sep 17 00:00:00 2001 From: Daniele Lotito Date: Tue, 21 Apr 2026 09:24:01 +0100 Subject: [PATCH 05/23] refactor(needs fix): update trim to use no priors sampler from random walk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Operation 
Creation: ❌ FAILED with recursion error Command: uv run ado create operation -f examples/trim/example_yamls/op_pressure.yaml --use-latest space Exit code: 133 The operation started and displayed the discovery space details correctly Ray cluster initialized successfully Failure Details Immediate Failure Symptom: RecursionError: maximum recursion depth exceeded Precise Location: orchestrator/modules/operators/_orchestrate_core.py:43 in log_space_details() Call Stack: File "orchestrator/modules/operators/_orchestrate_core.py", line 117, in _run_operation_harness operation_output: OperationOutput | None = run_closure() File "orchestrator/modules/operators/_general_orchestration.py", line 32, in _run_general_operation_core return operation_function( File "orchestrator/modules/operators/collections.py", line 153, in wrapper return orchestrate_general_operation( File "orchestrator/modules/operators/_general_orchestration.py", line 101, in orchestrate_general_operation log_space_details(discovery_space) File "orchestrator/modules/operators/_orchestrate_core.py", line 43, in log_space_details console.print(discovery_space) Root Cause: The recursion occurs in the Rich library's rendering chain when attempting to print the discovery_space object. The stack trace shows infinite recursion through: rich/console.py → rich/panel.py → rich/padding.py → rich/pretty.py Specifically in pretty.py:489 where repr_str = "".join(str(line) for line in lines) creates a circular reference Additional Observations: The operation created multiple nested sub-operations (visible in the deeply nested error message showing operation identifiers like operation-trim-1.7.1.dev72+gb804c1e11.d20260420-1e3afcbc, operation-trim-1.7.1.dev72+gb804c1e11.d20260420-9a0c5225, etc.) 
Each sub-operation encountered the same recursion error when trying to log space details The error cascaded through multiple operation levels before the final SIGTRAP signal Conclusion The previous recursion failure still reproduces exactly. The issue is not intermittent—it consistently occurs at the same location (log_space_details()) when the TRIM operator attempts to print the discovery space object using Rich's console rendering. --- plugins/operators/trim/src/trim/operator.py | 26 ++++++++++++++----- .../operators/trim/src/trim/trim_pydantic.py | 3 ++- .../operators/trim/src/trim/trim_sampler.py | 10 +++---- .../operators/trim/src/trim/utils/order.py | 4 ++- .../tests/test_high_dimensional_sampling.py | 5 ++-- plugins/operators/trim/tests/test_sampling.py | 24 +++-------------- 6 files changed, 35 insertions(+), 37 deletions(-) diff --git a/plugins/operators/trim/src/trim/operator.py b/plugins/operators/trim/src/trim/operator.py index cbde836ce..093183cbb 100644 --- a/plugins/operators/trim/src/trim/operator.py +++ b/plugins/operators/trim/src/trim/operator.py @@ -5,8 +5,7 @@ import logging from importlib.metadata import version -from no_priors_characterization.utils import get_source_and_target - +from orchestrator.core.discoveryspace.no_priors_utils import get_source_and_target from orchestrator.core.discoveryspace.space import DiscoverySpace from orchestrator.core.operation.config import FunctionOperationInfo from orchestrator.core.operation.operation import OperationOutput @@ -55,7 +54,6 @@ def trim( OperationOutput containing the operation resources and metadata """ # Lazy import to avoid circular import issues during plugin loading - from orchestrator.modules.operators.collections import characterize from orchestrator.modules.operators.randomwalk import ( CustomSamplerConfiguration, RandomWalkParameters, @@ -95,9 +93,23 @@ def trim( f"Note: Trim sampler has been called with a minimum budget of {params.samplingBudget.minPoints} points." 
) - # Call the no-priors-characterization operator directly - no_priors_operator = characterize.no_priors_characterization - op_output_characterization_no_prior = no_priors_operator( + # Use random-walk with no-priors sampler instead of direct operator call + no_priors_module = SamplerModuleConf( + moduleClass="NoPriorsSampleSelector", + moduleName="orchestrator.core.discoveryspace.no_priors_sampler", + ) + no_priors_sampler_config = CustomSamplerConfiguration( + module=no_priors_module, + parameters=params.noPriorParameters, + ) + no_priors_rwparams = RandomWalkParameters( + samplerConfig=no_priors_sampler_config, + batchSize=params.noPriorParameters.batchSize, + numberEntities=params.samplingBudget.minPoints - len(source_df), + singleMeasurement=True, + ) + + op_output_characterization_no_prior = random_walk( discoverySpace=discoverySpace, operationInfo=FunctionOperationInfo.model_validate( { @@ -112,7 +124,7 @@ def trim( ), } ), - **params.noPriorParameters.model_dump(), + **no_priors_rwparams.model_dump(), ) source_df, target_df = get_source_and_target( diff --git a/plugins/operators/trim/src/trim/trim_pydantic.py b/plugins/operators/trim/src/trim/trim_pydantic.py index 0010d297b..aac460b3d 100644 --- a/plugins/operators/trim/src/trim/trim_pydantic.py +++ b/plugins/operators/trim/src/trim/trim_pydantic.py @@ -5,9 +5,10 @@ from typing import Annotated import pydantic -from no_priors_characterization.no_priors_pydantic import NoPriorsParameters from pydantic import BaseModel, ConfigDict, Field, model_validator +from orchestrator.core.discoveryspace.no_priors_parameters import NoPriorsParameters + class SamplingBudget(pydantic.BaseModel): minPoints: Annotated[ diff --git a/plugins/operators/trim/src/trim/trim_sampler.py b/plugins/operators/trim/src/trim/trim_sampler.py index c22b83bea..116434931 100644 --- a/plugins/operators/trim/src/trim/trim_sampler.py +++ b/plugins/operators/trim/src/trim/trim_sampler.py @@ -19,6 +19,11 @@ import pandas as pd from 
autogluon.tabular import TabularDataset, TabularPredictor +from orchestrator.core.discoveryspace.no_priors_utils import ( + get_index_list_van_der_corput, + get_list_of_entities_from_df_and_space, + get_source_and_target, +) from orchestrator.core.discoveryspace.samplers import BaseSampler from trim.trim_pydantic import TrimParameters @@ -29,11 +34,6 @@ from orchestrator.modules.operators.discovery_space_manager import ( DiscoverySpaceManager, ) -from no_priors_characterization.utils import ( - get_index_list_van_der_corput, - get_list_of_entities_from_df_and_space, - get_source_and_target, -) from orchestrator.utilities.pandas import sort_rows_by_column_names from trim.utils.exceptions import InsufficientDataError diff --git a/plugins/operators/trim/src/trim/utils/order.py b/plugins/operators/trim/src/trim/utils/order.py index 459657ade..1a8779de7 100644 --- a/plugins/operators/trim/src/trim/utils/order.py +++ b/plugins/operators/trim/src/trim/utils/order.py @@ -9,8 +9,10 @@ import numpy as np import pandas as pd from autogluon.tabular import TabularPredictor -from no_priors_characterization.utils import get_sampling_indices_multi_dimensional +from orchestrator.core.discoveryspace.no_priors_utils import ( + get_sampling_indices_multi_dimensional, +) from trim.trim_pydantic import AutoGluonArgs from trim.utils.miscellaneous import delete_dir diff --git a/plugins/operators/trim/tests/test_high_dimensional_sampling.py b/plugins/operators/trim/tests/test_high_dimensional_sampling.py index 0b2c6457c..724a86d53 100644 --- a/plugins/operators/trim/tests/test_high_dimensional_sampling.py +++ b/plugins/operators/trim/tests/test_high_dimensional_sampling.py @@ -13,10 +13,11 @@ from typing import Any import pytest -from no_priors_characterization.utils.high_dimensional_sampling import ( +from test_data_documentation import TEST_DATAFRAMES + +from orchestrator.core.discoveryspace.no_priors_utils import ( concatenated_latin_hypercube_sampling, ) -from test_data_documentation 
import TEST_DATAFRAMES class TestConcatenatedLatinHypercubeSampling: diff --git a/plugins/operators/trim/tests/test_sampling.py b/plugins/operators/trim/tests/test_sampling.py index a0113b1ae..f381e1a9f 100644 --- a/plugins/operators/trim/tests/test_sampling.py +++ b/plugins/operators/trim/tests/test_sampling.py @@ -2,10 +2,10 @@ # SPDX-License-Identifier: MIT import pytest -from no_priors_characterization.utils.one_dimensional_sampling import ( - get_index_list_ordered_partitions, + +from orchestrator.core.discoveryspace.no_priors_utils import ( get_index_list_van_der_corput, -) # Replace with actual module name +) # --- Error Handling Tests --- @@ -36,21 +36,3 @@ def test_get_index_list_nn_full_sampling() -> None: def test_get_index_list_nn_sorted_sampling(points: int, expected: list[int]) -> None: """Should return sorted sampling for segment of length 17.""" assert get_index_list_van_der_corput(17, points, sort=True) == expected - - -# --- Functional Tests for get_index_list_ordered_partitions --- - - -@pytest.mark.parametrize( - ("points", "expected"), - [ - (7, [0, 2, 4, 8, 10, 12, 16]), - (8, [0, 2, 4, 6, 8, 10, 12, 16]), - (9, [0, 2, 4, 6, 8, 10, 12, 14, 16]), - ], -) -def test_get_index_list_ordered_partitions_sampling( - points: int, expected: list[int] -) -> None: - """Should return correct partition-based sampling for segment of length 17.""" - assert get_index_list_ordered_partitions(17, points) == expected From 3c2ce55e97402064aac8ae92e4ec8ff1381fde0a Mon Sep 17 00:00:00 2001 From: Daniele Lotito Date: Wed, 22 Apr 2026 11:03:15 +0100 Subject: [PATCH 06/23] chore: remove legacy docs about no priors characterization --- website/docs/examples/example_yamls/op_basic_sampling.yaml | 1 - website/docs/examples/example_yamls/op_quick_exploration.yaml | 1 - website/docs/examples/example_yamls/op_thorough_coverage.yaml | 1 - website/docs/examples/example_yamls/space_reaction.yaml | 1 - website/docs/examples/no-priors-characterization.md | 1 - website/mkdocs.yml 
| 1 - 6 files changed, 6 deletions(-) delete mode 120000 website/docs/examples/example_yamls/op_basic_sampling.yaml delete mode 120000 website/docs/examples/example_yamls/op_quick_exploration.yaml delete mode 120000 website/docs/examples/example_yamls/op_thorough_coverage.yaml delete mode 120000 website/docs/examples/example_yamls/space_reaction.yaml delete mode 120000 website/docs/examples/no-priors-characterization.md diff --git a/website/docs/examples/example_yamls/op_basic_sampling.yaml b/website/docs/examples/example_yamls/op_basic_sampling.yaml deleted file mode 120000 index e01111145..000000000 --- a/website/docs/examples/example_yamls/op_basic_sampling.yaml +++ /dev/null @@ -1 +0,0 @@ -../../../../examples/no-priors-characterization/example_yamls/op_basic_sampling.yaml \ No newline at end of file diff --git a/website/docs/examples/example_yamls/op_quick_exploration.yaml b/website/docs/examples/example_yamls/op_quick_exploration.yaml deleted file mode 120000 index ee9e2d0c6..000000000 --- a/website/docs/examples/example_yamls/op_quick_exploration.yaml +++ /dev/null @@ -1 +0,0 @@ -../../../../examples/no-priors-characterization/example_yamls/op_quick_exploration.yaml \ No newline at end of file diff --git a/website/docs/examples/example_yamls/op_thorough_coverage.yaml b/website/docs/examples/example_yamls/op_thorough_coverage.yaml deleted file mode 120000 index c38ecaf28..000000000 --- a/website/docs/examples/example_yamls/op_thorough_coverage.yaml +++ /dev/null @@ -1 +0,0 @@ -../../../../examples/no-priors-characterization/example_yamls/op_thorough_coverage.yaml \ No newline at end of file diff --git a/website/docs/examples/example_yamls/space_reaction.yaml b/website/docs/examples/example_yamls/space_reaction.yaml deleted file mode 120000 index 48a189ac5..000000000 --- a/website/docs/examples/example_yamls/space_reaction.yaml +++ /dev/null @@ -1 +0,0 @@ -../../../../examples/no-priors-characterization/example_yamls/space_reaction.yaml \ No newline at end of 
file diff --git a/website/docs/examples/no-priors-characterization.md b/website/docs/examples/no-priors-characterization.md deleted file mode 120000 index 7daf43406..000000000 --- a/website/docs/examples/no-priors-characterization.md +++ /dev/null @@ -1 +0,0 @@ -../../../examples/no-priors-characterization/README.md \ No newline at end of file diff --git a/website/mkdocs.yml b/website/mkdocs.yml index 1b9e3c72d..3e0adc9d9 100644 --- a/website/mkdocs.yml +++ b/website/mkdocs.yml @@ -158,7 +158,6 @@ nav: - Space Characterization: - Identify the important dimensions of a space: examples/lhu.md - Quickly building a predictive model for a configuration space: examples/trim.md - - Characterizing Spaces Without Prior Knowledge: examples/no-priors-characterization.md - Fine-Tuning Throughput: - Measure throughput of fine-tuning locally: examples/finetune-locally.md - Measure throughput of fine-tuning on a remote RayCluster: examples/finetune-remotely.md From ec7cabbc1f2220a83176aa7d2a4490516e316560 Mon Sep 17 00:00:00 2001 From: Daniele Lotito Date: Wed, 22 Apr 2026 11:18:47 +0100 Subject: [PATCH 07/23] refactor: remove legacy operator from tests --- tests/fixtures/modules/operators.py | 2 +- tests/operators/test_general_orchestration.py | 2 +- tests/operators/test_trim_example_integration.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/fixtures/modules/operators.py b/tests/fixtures/modules/operators.py index 7e3320bc9..557613eba 100644 --- a/tests/fixtures/modules/operators.py +++ b/tests/fixtures/modules/operators.py @@ -17,7 +17,7 @@ @pytest.fixture def expected_characterize_operators() -> list[str]: - return ["profile", "detect_anomalous_series", "trim", "no_priors_characterization"] + return ["profile", "detect_anomalous_series", "trim"] @pytest.fixture diff --git a/tests/operators/test_general_orchestration.py b/tests/operators/test_general_orchestration.py index 4db4c904a..86c250800 100644 --- 
a/tests/operators/test_general_orchestration.py +++ b/tests/operators/test_general_orchestration.py @@ -14,7 +14,7 @@ @pytest.mark.parametrize( "operator_name", - ["profile", "no_priors_characterization"], + ["profile"], ) def test_operator_callable_for_harness_unwraps_decorated_operator( operator_name: str, diff --git a/tests/operators/test_trim_example_integration.py b/tests/operators/test_trim_example_integration.py index 4e05f9eb5..7c75471d0 100644 --- a/tests/operators/test_trim_example_integration.py +++ b/tests/operators/test_trim_example_integration.py @@ -9,11 +9,11 @@ import pytest import trim_custom_experiments.experiments # noqa: F401 — registers ideal-gas experiment import yaml -from no_priors_characterization.no_priors_pydantic import NoPriorsParameters from testcontainers.mysql import MySqlContainer import orchestrator.modules.operators.randomwalk # noqa: F401 from orchestrator.core.discoveryspace.config import DiscoverySpaceConfiguration +from orchestrator.core.discoveryspace.no_priors_parameters import NoPriorsParameters from orchestrator.core.discoveryspace.space import DiscoverySpace from orchestrator.core.operation.resource import ( OperationExitStateEnum, From 6ffef168c98099d822810866e0e098360b2d894b Mon Sep 17 00:00:00 2001 From: Daniele Lotito Date: Wed, 22 Apr 2026 11:25:04 +0100 Subject: [PATCH 08/23] build: add scipy --- requirements.txt | 92 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/requirements.txt b/requirements.txt index 7614c34cf..a518082bd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -766,6 +766,7 @@ numpy==2.2.6 \ # via # ado-core # pandas + # scipy opencensus==0.11.4 \ --hash=sha256:a18487ce68bc19900336e0ff4655c5a116daf10c1b3685ece8d971bddad6a864 \ --hash=sha256:cbef87d8b8773064ab60e5c2a1ced58bbaa38a6d052c41aec224958ce544eff2 @@ -1256,6 +1257,97 @@ rpds-py==0.30.0 \ # via # jsonschema # referencing +scipy==1.15.3 ; python_full_version < '3.11' \ + 
--hash=sha256:05dc6abcd105e1a29f95eada46d4a3f251743cfd7d3ae8ddb4088047f24ea477 \ + --hash=sha256:06efcba926324df1696931a57a176c80848ccd67ce6ad020c810736bfd58eb1c \ + --hash=sha256:0a769105537aa07a69468a0eefcd121be52006db61cdd8cac8a0e68980bbb723 \ + --hash=sha256:0bdd905264c0c9cfa74a4772cdb2070171790381a5c4d312c973382fc6eaf730 \ + --hash=sha256:0ff17c0bb1cb32952c09217d8d1eed9b53d1463e5f1dd6052c7857f83127d539 \ + --hash=sha256:14ed70039d182f411ffc74789a16df3835e05dc469b898233a245cdfd7f162cb \ + --hash=sha256:185cd3d6d05ca4b44a8f1595af87f9c372bb6acf9c808e99aa3e9aa03bd98cf6 \ + --hash=sha256:18aaacb735ab38b38db42cb01f6b92a2d0d4b6aabefeb07f02849e47f8fb3594 \ + --hash=sha256:1c832e1bd78dea67d5c16f786681b28dd695a8cb1fb90af2e27580d3d0967e92 \ + --hash=sha256:263961f658ce2165bbd7b99fa5135195c3a12d9bef045345016b8b50c315cb82 \ + --hash=sha256:271e3713e645149ea5ea3e97b57fdab61ce61333f97cfae392c28ba786f9bb49 \ + --hash=sha256:2c620736bcc334782e24d173c0fdbb7590a0a436d2fdf39310a8902505008759 \ + --hash=sha256:34716e281f181a02341ddeaad584205bd2fd3c242063bd3423d61ac259ca7eba \ + --hash=sha256:39cb9c62e471b1bb3750066ecc3a3f3052b37751c7c3dfd0fd7e48900ed52982 \ + --hash=sha256:3ac07623267feb3ae308487c260ac684b32ea35fd81e12845039952f558047b8 \ + --hash=sha256:3b0334816afb8b91dab859281b1b9786934392aa3d527cd847e41bb6f45bee65 \ + --hash=sha256:40e54d5c7e7ebf1aa596c374c49fa3135f04648a0caabcb66c52884b943f02b4 \ + --hash=sha256:50f9e62461c95d933d5c5ef4a1f2ebf9a2b4e83b0db374cb3f1de104d935922e \ + --hash=sha256:52092bc0472cfd17df49ff17e70624345efece4e1a12b23783a1ac59a1b728ed \ + --hash=sha256:5380741e53df2c566f4d234b100a484b420af85deb39ea35a1cc1be84ff53a5c \ + --hash=sha256:5e721fed53187e71d0ccf382b6bf977644c533e506c4d33c3fb24de89f5c3ed5 \ + --hash=sha256:6487aa99c2a3d509a5227d9a5e889ff05830a06b2ce08ec30df6d79db5fcd5c5 \ + --hash=sha256:6ac6310fdbfb7aa6612408bd2f07295bcbd3fda00d2d702178434751fe48e019 \ + --hash=sha256:6cfd56fc1a8e53f6e89ba3a7a7251f7396412d655bca2aa5611c8ec9a6784a1e \ + 
--hash=sha256:6db907c7368e3092e24919b5e31c76998b0ce1684d51a90943cb0ed1b4ffd6c1 \ + --hash=sha256:721d6b4ef5dc82ca8968c25b111e307083d7ca9091bc38163fb89243e85e3889 \ + --hash=sha256:76ad1fb5f8752eabf0fa02e4cc0336b4e8f021e2d5f061ed37d6d264db35e3ca \ + --hash=sha256:79167bba085c31f38603e11a267d862957cbb3ce018d8b38f79ac043bc92d825 \ + --hash=sha256:795c46999bae845966368a3c013e0e00947932d68e235702b5c3f6ea799aa8c9 \ + --hash=sha256:7e11270a000969409d37ed399585ee530b9ef6aa99d50c019de4cb01e8e54e62 \ + --hash=sha256:8c9ed3ba2c8a2ce098163a9bdb26f891746d02136995df25227a20e71c396ebb \ + --hash=sha256:993439ce220d25e3696d1b23b233dd010169b62f6456488567e830654ee37a6b \ + --hash=sha256:9d61e97b186a57350f6d6fd72640f9e99d5a4a2b8fbf4b9ee9a841eab327dc13 \ + --hash=sha256:9db984639887e3dffb3928d118145ffe40eff2fa40cb241a306ec57c219ebbbb \ + --hash=sha256:9e2abc762b0811e09a0d3258abee2d98e0c703eee49464ce0069590846f31d40 \ + --hash=sha256:a345928c86d535060c9c2b25e71e87c39ab2f22fc96e9636bd74d1dbf9de448c \ + --hash=sha256:ad3432cb0f9ed87477a8d97f03b763fd1d57709f1bbde3c9369b1dff5503b253 \ + --hash=sha256:ae48a786a28412d744c62fd7816a4118ef97e5be0bee968ce8f0a2fba7acf3bb \ + --hash=sha256:aef683a9ae6eb00728a542b796f52a5477b78252edede72b8327a886ab63293f \ + --hash=sha256:b90ab29d0c37ec9bf55424c064312930ca5f4bde15ee8619ee44e69319aab163 \ + --hash=sha256:c05045d8b9bfd807ee1b9f38761993297b10b245f012b11b13b91ba8945f7e45 \ + --hash=sha256:c9deabd6d547aee2c9a81dee6cc96c6d7e9a9b1953f74850c179f91fdc729cb7 \ + --hash=sha256:dde4fc32993071ac0c7dd2d82569e544f0bdaff66269cb475e0f369adad13f11 \ + --hash=sha256:eae3cf522bc7df64b42cad3925c876e1b0b6c35c1337c93e12c0f366f55b0eaf \ + --hash=sha256:ed7284b21a7a0c8f1b6e5977ac05396c0d008b89e05498c8b7e8f4a1423bba0e \ + --hash=sha256:f77f853d584e72e874d87357ad70f44b437331507d1c311457bed8ed2b956126 + # via ado-core +scipy==1.16.3 ; python_full_version >= '3.11' \ + --hash=sha256:0151a0749efeaaab78711c78422d413c583b8cdd2011a3c1d6c794938ee9fdb2 \ + 
--hash=sha256:01e87659402762f43bd2fee13370553a17ada367d42e7487800bf2916535aecb \ + --hash=sha256:03192a35e661470197556de24e7cb1330d84b35b94ead65c46ad6f16f6b28f2a \ + --hash=sha256:0553371015692a898e1aa858fed67a3576c34edefa6b7ebdb4e9dde49ce5c203 \ + --hash=sha256:062246acacbe9f8210de8e751b16fc37458213f124bef161a5a02c7a39284304 \ + --hash=sha256:0c3b4dd3d9b08dbce0f3440032c52e9e2ab9f96ade2d3943313dfe51a7056959 \ + --hash=sha256:0c623a54f7b79dd88ef56da19bc2873afec9673a48f3b85b18e4d402bdd29a5a \ + --hash=sha256:16b8bc35a4cc24db80a0ec836a9286d0e31b2503cb2fd7ff7fb0e0374a97081d \ + --hash=sha256:1fb2472e72e24d1530debe6ae078db70fb1605350c88a3d14bc401d6306dbffe \ + --hash=sha256:2a207a6ce9c24f1951241f4693ede2d393f59c07abc159b2cb2be980820e01fb \ + --hash=sha256:2b71d93c8a9936046866acebc915e2af2e292b883ed6e2cbe5c34beb094b82d9 \ + --hash=sha256:2d1ae2cf0c350e7705168ff2429962a89ad90c2d49d1dd300686d8b2a5af22fc \ + --hash=sha256:3d4a07a8e785d80289dfe66b7c27d8634a773020742ec7187b85ccc4b0e7b686 \ + --hash=sha256:40be6cf99e68b6c4321e9f8782e7d5ff8265af28ef2cd56e9c9b2638fa08ad97 \ + --hash=sha256:50a3dbf286dbc7d84f176f9a1574c705f277cb6565069f88f60db9eafdbe3ee2 \ + --hash=sha256:532fb5ad6a87e9e9cd9c959b106b73145a03f04c7d57ea3e6f6bb60b86ab0876 \ + --hash=sha256:53c3844d527213631e886621df5695d35e4f6a75f620dca412bcd292f6b87d78 \ + --hash=sha256:56edc65510d1331dae01ef9b658d428e33ed48b4f77b1d51caf479a0253f96dc \ + --hash=sha256:57d01cb6f85e34f0946b33caa66e892aae072b64b034183f3d87c4025802a119 \ + --hash=sha256:5803c5fadd29de0cf27fa08ccbfe7a9e5d741bf63e4ab1085437266f12460ff9 \ + --hash=sha256:63d3cdacb8a824a295191a723ee5e4ea7768ca5ca5f2838532d9f2e2b3ce2135 \ + --hash=sha256:663b8d66a8748051c3ee9c96465fb417509315b99c71550fda2591d7dd634234 \ + --hash=sha256:72d1717fd3b5e6ec747327ce9bda32d5463f472c9dce9f54499e81fbd50245a1 \ + --hash=sha256:7dc1360c06535ea6116a2220f760ae572db9f661aba2d88074fe30ec2aa1ff88 \ + --hash=sha256:81fc5827606858cf71446a5e98715ba0e11f0dbc83d71c7409d05486592a45d6 \ + 
--hash=sha256:8be1ca9170fcb6223cc7c27f4305d680ded114a1567c0bd2bfcbf947d1b17511 \ + --hash=sha256:8d09d72dc92742988b0e7750bddb8060b0c7079606c0d24a8cc8e9c9c11f9079 \ + --hash=sha256:9452781bd879b14b6f055b26643703551320aa8d79ae064a71df55c00286a184 \ + --hash=sha256:96491a6a54e995f00a28a3c3badfff58fd093bf26cd5fb34a2188c8c756a3a2c \ + --hash=sha256:a8a26c78ef223d3e30920ef759e25625a0ecdd0d60e5a8818b7513c3e5384cf2 \ + --hash=sha256:b7180967113560cca57418a7bc719e30366b47959dd845a93206fbed693c867e \ + --hash=sha256:b81c27fc41954319a943d43b20e07c40bdcd3ff7cf013f4fb86286faefe546c4 \ + --hash=sha256:bea0a62734d20d67608660f69dcda23e7f90fb4ca20974ab80b6ed40df87a005 \ + --hash=sha256:c5192722cffe15f9329a3948c4b1db789fbb1f05c97899187dcf009b283aea70 \ + --hash=sha256:c97176013d404c7346bf57874eaac5187d969293bf40497140b0a2b2b7482e07 \ + --hash=sha256:cd13e354df9938598af2be05822c323e97132d5e6306b83a3b4ee6724c6e522e \ + --hash=sha256:d2ec56337675e61b312179a1ad124f5f570c00f920cc75e1000025451b88241c \ + --hash=sha256:deb3841c925eeddb6afc1e4e4a45e418d19ec7b87c5df177695224078e8ec733 \ + --hash=sha256:e7efa2681ea410b10dde31a52b18b0154d66f2485328830e45fdf183af5aefc6 \ + --hash=sha256:eab43fae33a0c39006a88096cd7b4f4ef545ea0447d250d5ac18202d40b6611d \ + --hash=sha256:fb4b29f4cf8cc5a8d628bc8d8e26d12d7278cd1f219f22698a378c3d67db5e4b + # via ado-core shellingham==1.5.4 \ --hash=sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686 \ --hash=sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de From 843eb4115c385a3a652468f5eab9b7e120698f26 Mon Sep 17 00:00:00 2001 From: Daniele Lotito Date: Wed, 22 Apr 2026 11:26:39 +0100 Subject: [PATCH 09/23] build: add scipy pt 2 --- pyproject.toml | 1 + uv.lock | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0923d6933..47703c217 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,7 @@ dependencies = [ "pymysql[rsa]>=1.1.1", "pyyaml>=6.0.2", 
"ray[serve]>=2.9", + "scipy", "sqlalchemy>2", "typer>=0.22.0", "uv>=0.10.4", diff --git a/uv.lock b/uv.lock index 0f835af53..b57f24c79 100644 --- a/uv.lock +++ b/uv.lock @@ -79,6 +79,8 @@ dependencies = [ { name = "pymysql", extra = ["rsa"] }, { name = "pyyaml" }, { name = "ray", extra = ["serve"] }, + { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "scipy", version = "1.16.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "sqlalchemy" }, { name = "typer" }, { name = "uv" }, @@ -143,6 +145,7 @@ requires-dist = [ { name = "pymysql", extras = ["rsa"], specifier = ">=1.1.1" }, { name = "pyyaml", specifier = ">=6.0.2" }, { name = "ray", extras = ["serve"], specifier = ">=2.9" }, + { name = "scipy" }, { name = "sqlalchemy", specifier = ">2" }, { name = "typer", specifier = ">=0.22.0" }, { name = "uv", specifier = ">=0.10.4" }, @@ -8611,7 +8614,7 @@ wheels = [ [[package]] name = "ydata-profiling" -version = "4.18.3" +version = "4.18.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "dacite" }, @@ -8637,9 +8640,9 @@ dependencies = [ { name = "visions", extra = ["type-image-path"] }, { name = "wordcloud" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/1b/90/fecdf08148ce205075eae12380726655fd3c52f323940d90abe07f137471/ydata_profiling-4.18.3.tar.gz", hash = "sha256:9817caa784b4065455c2f527a106edd74719b2dfb3a69c0b8760d5ed4ee27613", size = 331603, upload-time = "2026-04-16T23:49:07.871Z" } +sdist = { url = "https://files.pythonhosted.org/packages/40/83/03011a81f7148a71b81dfd9afda9c497de9801a3af24a2f636e47701b052/ydata_profiling-4.18.1.tar.gz", hash = "sha256:6dc529893c84abecda2e6038564cabcd69a6b801a730e077b61b6d500a98f30c", size = 331100, upload-time = "2026-01-13T18:28:59.076Z" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/f3/0b/da00faee551dfc0b984b0b6b4fc049aa15e18dfdb987e94281d8c79f3112/ydata_profiling-4.18.3-py2.py3-none-any.whl", hash = "sha256:3482932fb6f49c182391bcdc564541406ffde8ead5c15305242e7dad419a34b2", size = 400782, upload-time = "2026-04-16T23:49:06.048Z" }, + { url = "https://files.pythonhosted.org/packages/64/2c/992c6ef2677b182b86157eef2124a0e837aa3d128b5b09f6fb03cd51bb0a/ydata_profiling-4.18.1-py2.py3-none-any.whl", hash = "sha256:cd816865930f090cc71bbd5c2045b90c727e7448e3402d22f397f7ed3d4e2927", size = 400396, upload-time = "2026-01-13T18:28:57.08Z" }, ] [[package]] From 42c086df376f25833a68723a428728c4f995571b Mon Sep 17 00:00:00 2001 From: Daniele Lotito Date: Wed, 22 Apr 2026 12:21:19 +0100 Subject: [PATCH 10/23] chore: remove unused import --- plugins/operators/trim/src/trim/operator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/plugins/operators/trim/src/trim/operator.py b/plugins/operators/trim/src/trim/operator.py index 84e17f32c..594b27aa9 100644 --- a/plugins/operators/trim/src/trim/operator.py +++ b/plugins/operators/trim/src/trim/operator.py @@ -54,7 +54,6 @@ def trim( OperationOutput containing the operation resources and metadata """ # Lazy import to avoid circular import issues during plugin loading - import orchestrator.modules.operators.randomwalk # noqa: F401 - registers explore.random_walk from orchestrator.modules.operators.collections import explore from orchestrator.modules.operators.randomwalk import ( CustomSamplerConfiguration, From 9b57782669c054f8e43dba74ac19e779ce1ac192 Mon Sep 17 00:00:00 2001 From: Daniele Lotito Date: Wed, 22 Apr 2026 12:44:06 +0100 Subject: [PATCH 11/23] docs: rephrase a sentence --- website/docs/operators/random-walk.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/website/docs/operators/random-walk.md b/website/docs/operators/random-walk.md index 5d2568ac8..4767f4702 100644 --- a/website/docs/operators/random-walk.md +++ 
b/website/docs/operators/random-walk.md @@ -299,9 +299,8 @@ space: independently without replacement, cycling through all values before repeating. This ensures each dimension is uniformly covered. -**Collision Handling**: Sobol sampling may produce collisions (duplicate points) -when mapping continuous Sobol sequences to discrete integer coordinates. When -collisions are detected, the sampler automatically falls back to CLHS to ensure +**Collision Handling**: Sobol sampling may produce collisions (duplicate points), +when this happens the sampler automatically falls back to CLHS to ensure the requested number of unique samples. #### Example: Sobol Sampling From c869839d10c4ecddece0d62dcdbcde68acbd0c8e Mon Sep 17 00:00:00 2001 From: Daniele Lotito Date: Thu, 23 Apr 2026 11:22:13 +0100 Subject: [PATCH 12/23] refactor: migrate no_priors modules from orchestrator to trim plugin - Move no_priors_parameters.py, no_priors_sampler.py, no_priors_utils.py from orchestrator/core/discoveryspace/ to plugins/operators/trim/src/trim/samplers/ - Update all imports in trim plugin to reference new location - Update module name in operator.py from orchestrator.core.discoveryspace.no_priors_sampler to trim.samplers.no_priors_sampler - Delete old files from orchestrator/core/discoveryspace/ - Delete corresponding test file from tests/core/discoveryspace/ This change encapsulates no_priors functionality within the trim plugin where it belongs. 
--- .../randomwalk_clhs_operation.yaml | 26 + .../randomwalk_sobol_operation.yaml | 26 + .../trim/src/trim/samplers/__init__.py | 2 + .../src/trim/samplers/no_priors_parameters.py | 65 ++ .../src/trim/samplers/no_priors_sampler.py | 139 +++ .../trim/src/trim/samplers/no_priors_utils.py | 953 ++++++++++++++++++ 6 files changed, 1211 insertions(+) create mode 100644 examples/trim/example_yamls/randomwalk_clhs_operation.yaml create mode 100644 examples/trim/example_yamls/randomwalk_sobol_operation.yaml create mode 100644 plugins/operators/trim/src/trim/samplers/__init__.py create mode 100644 plugins/operators/trim/src/trim/samplers/no_priors_parameters.py create mode 100644 plugins/operators/trim/src/trim/samplers/no_priors_sampler.py create mode 100644 plugins/operators/trim/src/trim/samplers/no_priors_utils.py diff --git a/examples/trim/example_yamls/randomwalk_clhs_operation.yaml b/examples/trim/example_yamls/randomwalk_clhs_operation.yaml new file mode 100644 index 000000000..35efa1b2e --- /dev/null +++ b/examples/trim/example_yamls/randomwalk_clhs_operation.yaml @@ -0,0 +1,26 @@ +# Copyright IBM Corporation 2025, 2026 +# SPDX-License-Identifier: MIT +metadata: + name: 'randomwalk-clhs' + description: 'Perform a random walk using concatenated Latin hypercube sampling (CLHS) for better space coverage' +spaces: + - space-2fa5d0-2905f9 +operation: + module: + operatorName: "random_walk" + operationType: "search" + parameters: + numberEntities: 20 + batchSize: 5 + singleMeasurement: true + samplerConfig: + module: + moduleName: trim.samplers.no_priors_sampler + moduleClass: NoPriorsSampleSelector + parameters: + targetOutput: pressure + samples: 20 + batchSize: 1 + sampling_strategy: clhs + +# Made with Bob diff --git a/examples/trim/example_yamls/randomwalk_sobol_operation.yaml b/examples/trim/example_yamls/randomwalk_sobol_operation.yaml new file mode 100644 index 000000000..19ba89791 --- /dev/null +++ b/examples/trim/example_yamls/randomwalk_sobol_operation.yaml @@ -0,0 +1,26 @@
+# Copyright IBM Corporation 2025, 2026 +# SPDX-License-Identifier: MIT +metadata: + name: 'randomwalk-sobol' + description: 'Perform a random walk using Sobol quasi-random sampling for better space coverage' +spaces: + - space-2fa5d0-2905f9 +operation: + module: + operatorName: "random_walk" + operationType: "search" + parameters: + numberEntities: 20 + batchSize: 5 + singleMeasurement: true + samplerConfig: + module: + moduleName: trim.samplers.no_priors_sampler + moduleClass: NoPriorsSampleSelector + parameters: + targetOutput: pressure + samples: 20 + batchSize: 1 + sampling_strategy: sobol + +# Made with Bob diff --git a/plugins/operators/trim/src/trim/samplers/__init__.py b/plugins/operators/trim/src/trim/samplers/__init__.py new file mode 100644 index 000000000..2a4c79e8b --- /dev/null +++ b/plugins/operators/trim/src/trim/samplers/__init__.py @@ -0,0 +1,2 @@ +# Copyright IBM Corporation 2025, 2026 +# SPDX-License-Identifier: MIT diff --git a/plugins/operators/trim/src/trim/samplers/no_priors_parameters.py b/plugins/operators/trim/src/trim/samplers/no_priors_parameters.py new file mode 100644 index 000000000..1271ac769 --- /dev/null +++ b/plugins/operators/trim/src/trim/samplers/no_priors_parameters.py @@ -0,0 +1,65 @@ +# Copyright IBM Corporation 2025, 2026 +# SPDX-License-Identifier: MIT + +from typing import Annotated, Literal + +from pydantic import BaseModel, BeforeValidator, Field + + +class NoPriorsParameters(BaseModel): + """ + Parameters for sampling high-dimensional spaces without prior model structure. + + The `sampling_strategy` must be one of the Literals supported. 
+ Source of truth for supported strategies is the comment block right here: + + strategy (str): sampling subroutine: + - 'random': selects random points from the beginning + - 'one_shift': refer to one_shift_then_random_points_high_dimensional_sampling + - 'recursive_aggregation': refer to recursive_aggregation_high_dimensional_sampling + - 'clhs': refer to concatenated_latin_hypercube_sampling + - 'sobol': sobol sampling + """ + + targetOutput: Annotated[ + str, + Field( + description="The measured property you will treat as a target variable.", + ), + ] + + samples: Annotated[ + int, + Field( + ge=1, + description="Number of unique points to sample (must be >= 1).", + ), + ] = 20 + + batchSize: Annotated[ + int, + Field( + ge=1, + description=( + "Batch size parameter used by certain samplers (e.g., randomWalk) via continuous batching; " + "by default set equal to iterationSize in those contexts. Must be >= 1." + ), + ), + ] = 1 + + sampling_strategy: Annotated[ + Literal["random", "one_shift", "recursive_aggregation", "clhs", "sobol"], + BeforeValidator(lambda s: s.lower()), + Field( + description=( + "Sampling subroutine. Supported values:\n" + " - 'random': selects random points from the beginning\n" + " - 'one_shift': see one_shift_then_random_points_high_dimensional_sampling\n" + " - 'recursive_aggregation': see recursive_aggregation_high_dimensional_sampling\n" + " - 'clhs': dimension-wise random without replacement until each dim cycles\n" + " - 'sobol': sobol sampling via scipy\n" + "Aliases: 'random_shifts' → 'recursive_aggregation'.\n" + "Validation is case-insensitive; value is normalized to lowercase." 
+ ), + ), + ] = "clhs" diff --git a/plugins/operators/trim/src/trim/samplers/no_priors_sampler.py b/plugins/operators/trim/src/trim/samplers/no_priors_sampler.py new file mode 100644 index 000000000..d685b4908 --- /dev/null +++ b/plugins/operators/trim/src/trim/samplers/no_priors_sampler.py @@ -0,0 +1,139 @@ +# Copyright IBM Corporation 2025, 2026 +# SPDX-License-Identifier: MIT + +import asyncio +import logging +import typing + +from pydantic import BaseModel + +from orchestrator.core.discoveryspace.samplers import BaseSampler +from orchestrator.core.discoveryspace.space import DiscoverySpace, Entity +from orchestrator.modules.operators.discovery_space_manager import DiscoverySpaceManager +from trim.samplers.no_priors_parameters import NoPriorsParameters +from trim.samplers.no_priors_utils import ( + get_list_of_entities_from_df_and_space, + get_source_and_target, + order_df_for_sampling_with_no_priors, +) + +logger_no_priors = logging.getLogger(__name__) + + +# NOTE: to repeat the operation on the same space I can delete the operation if the output of this operation +# are not used by another operation +class NoPriorsSampleSelector(BaseSampler): + @classmethod + def samplerCompatibleWithDiscoverySpaceRemote( + cls, remoteDiscoverySpace: DiscoverySpaceManager # type: ignore[name-defined] + ) -> bool: + return True + + async def remoteEntityIterator( + self, remoteDiscoverySpace: DiscoverySpaceManager, batchsize: int = 1 + ) -> typing.AsyncGenerator[list[Entity], None]: + """ + Generate entities for no-priors characterization sampling. + + Orders the target space using a high-dimensional sampling strategy (e.g., CLHS, Sobol) + without relying on prior model knowledge or feature importance. 
+ + Args: + remoteDiscoverySpace: Manager for the discovery space state + batchsize: Number of entities to yield per iteration + + Yields: + List of Entity objects to be measured, in the determined order + """ + + async def iterator_closure( + stateHandle: DiscoverySpaceManager, # type: ignore[name-defined] + ) -> typing.Callable[[], typing.AsyncGenerator[list[Entity], None]]: + + logger_no_priors.info("Characterization with no-priors starts.\n") + logger_no_priors.info(f"Parameters are:\n{self.params}\n\n") + + discoverySpace = await stateHandle.discoverySpace.remote() + source_df, target_df = get_source_and_target( + discoverySpace, self.params.targetOutput + ) + logger_no_priors.info(f"Target dataframe has length {len(target_df)}") + + # The 'samples' parameter specifies the number of NEW entities to sample, + # regardless of how many entities have already been measured in the space + logger_no_priors.info( + f"Space has {len(source_df)} measured entities. " + f"Sampling {self.params.samples} new entities as requested." + ) + target_df = order_df_for_sampling_with_no_priors( + target_df, + [ + cp.identifier + for cp in discoverySpace.entitySpace.constitutiveProperties + ], + self.params.samples, + strategy=self.params.sampling_strategy, + ) + list_of_entities_for_no_prior_characterization = ( + get_list_of_entities_from_df_and_space( + df=target_df, space=discoverySpace + ) + ) + + logger_no_priors.info( + "\n\nCharacterization with no-priors finished. 
Starting Iterative Modeling.\n" + ) + + async def iterator() -> typing.AsyncGenerator[list[Entity], None]: # type: ignore[name-defined] + logger_no_priors.info( + "\n\nIteration over sorted entities for no priors characterization starts.\n" + ) + await asyncio.sleep(0.1) + for i in range( + 0, len(list_of_entities_for_no_prior_characterization), batchsize + ): + entities = list_of_entities_for_no_prior_characterization[ + i : i + batchsize + ] + if len(entities) == 0: + logger_no_priors.info( + "\n\nCharacterization with no-priors finished.\n" + ) + break + else: + yield entities + logger_no_priors.info("\n\nCharacterization with no-priors finished.\n") + + return iterator + + retval = await iterator_closure(remoteDiscoverySpace) + + return retval() + + def entityIterator( + self, discoverySpace: DiscoverySpace, batchsize: int = 1 + ) -> typing.Generator[list[Entity], None, None]: + """Returns an remoteEntityIterator that returns entities in order""" + + def iterator_closure( + space: DiscoverySpace, + ) -> typing.Callable[[], typing.Generator[list[Entity], None, None]]: + + # list_of_entities = list(...) 
# type: ignore[name-defined] + # numberEntities = len(list_of_entities) + + def iterator() -> typing.Generator[list[Entity], None, None]: # type: ignore[name-defined] + raise NotImplementedError + # ...for i in range(0, numberEntities, batchsize): + + return iterator + + retval = iterator_closure(discoverySpace) + return retval() + + @classmethod + def parameters_model(cls) -> type[BaseModel] | None: + return NoPriorsParameters + + def __init__(self, parameters: NoPriorsParameters) -> None: + self.params = parameters diff --git a/plugins/operators/trim/src/trim/samplers/no_priors_utils.py b/plugins/operators/trim/src/trim/samplers/no_priors_utils.py new file mode 100644 index 000000000..36a06e829 --- /dev/null +++ b/plugins/operators/trim/src/trim/samplers/no_priors_utils.py @@ -0,0 +1,953 @@ +# Copyright IBM Corporation 2025, 2026 +# SPDX-License-Identifier: MIT + +""" +Utility functions for no-priors sampling, including: +- High-dimensional sampling strategies (CLHS, Sobol, random) +- DataFrame ordering and index mapping +- Entity/point conversion and validation +- Discovery space data extraction +""" + +from __future__ import annotations + +import itertools +import logging +import math +import random +from typing import TYPE_CHECKING, Any, Literal + +import numpy as np +import pandas as pd +from scipy.stats.qmc import Sobol + +from orchestrator.core.discoveryspace.space import DiscoverySpace +from orchestrator.schema.virtual_property import PropertyAggregationMethodEnum + +if TYPE_CHECKING: + from collections.abc import Hashable + + from orchestrator.metastore.project import ProjectContext + from orchestrator.schema.entity import Entity + +logger = logging.getLogger(__name__) + + +# ============================================================================ +# 1D Sampling Functions +# ============================================================================ + + +def get_index_list_van_der_corput( + length_segment: int, + tot_points_to_sample: int, + 
sampled_indices: list[int] | None = None, + sort: bool = False, + verbose: bool = False, +) -> list[int]: + """ + Selects indices from a 1D segment using a modified Van der Corput sequence. + + Args: + length_segment: Total number of units in the 1D segment + tot_points_to_sample: Total number of indices to sample + sampled_indices: List of indices already sampled + sort: If True, returns the final list sorted + verbose: If True, prints debug information + + Returns: + List of sampled indices + + Raises: + ValueError: If tot_points_to_sample exceeds length_segment + """ + if tot_points_to_sample == 0: + return [] + + if tot_points_to_sample > length_segment: + raise ValueError( + "ValueError: You are trying to sample more points than those that are available" + ) + + if sampled_indices is None: + sampled_indices = [] + + if len(sampled_indices) == length_segment: + maximal_indices_list = list(range(length_segment)) + if sampled_indices.sort() != maximal_indices_list: + logging.error( + "Sampled indices do not correspond to [0,..., max_n_indices -1]" + "Returning list(range(max_n_indices)" + ) + return maximal_indices_list + + if len(sampled_indices) > tot_points_to_sample: + logging.warning( + "Number of sampled indices is greater than the number of indices you want to sample" + "Returning sampled indices" + ) + return sampled_indices + + index_list = list(sampled_indices) + sampled_set = set(index_list) + + for point in [0, length_segment - 1]: + if point not in sampled_set: + index_list.append(point) + sampled_set.add(point) + if len(index_list) == tot_points_to_sample: + return sorted(index_list) + + def build_prefix_and_len(index_list: list[int]) -> tuple[list[int], int]: + if not index_list: + return [0], 0 + + M = max(index_list) + 1 + sampled_set = set(index_list) + prefix = [0] * (M + 1) + s = 0 + + for i in range(M): + s += 1 if i in sampled_set else 0 + prefix[i + 1] = s + + return prefix, M + + def get_list_min_weight( + prefix: list[int], M: int, d: 
int, selectable_indices: list[int] + ) -> list[int]: + vals = {} + for i in selectable_indices: + if i >= M: + break + left = max(0, i - d) + right = min(M - 1, i + d) + total = prefix[right + 1] - prefix[left] + denom = right - left + 1 + mean = total / denom + vals[i] = mean + + if not vals: + return [] + + min_val = min(vals.values()) + out = [] + for i in selectable_indices: + if i >= M: + break + if vals.get(i) == min_val: + out.append(i) + return out + + def get_selectable_indices() -> list[int]: + return [i for i in range(length_segment) if i not in sampled_set] + + max_d = length_segment + + while len(index_list) < tot_points_to_sample: + selection = 0 + selectable_indices = get_selectable_indices() + prefix, M = build_prefix_and_len(index_list=index_list) + d = 1 + previous_set = selectable_indices + + while selection == 0: + indices = get_list_min_weight(prefix, M, d, selectable_indices) + + if not indices: + if not previous_set: + raise ValueError( + "Previous candidate set should not be empty or None" + ) + if verbose: + logger.info( + f"No intersection found with d={d}. 
Using the previous set " + f"Appending to {index_list} the first element of {previous_set}" + ) + chosen = previous_set[0] + index_list.append(chosen) + sampled_set.add(chosen) + selection = 1 + else: + previous_set = selectable_indices + selectable_indices = indices + + if len(selectable_indices) == 1 or d == max_d: + if verbose: + logger.info( + f"Appending to {index_list} the first element of {selectable_indices}" + ) + chosen = selectable_indices[0] + index_list.append(chosen) + sampled_set.add(chosen) + selection = 1 + + d += 1 + + if sort: + return sorted(index_list) + return index_list + + +# ============================================================================ +# High-Dimensional Sampling Functions +# ============================================================================ + + +def concatenated_latin_hypercube_sampling( + dimensions: list[int], + final_sample_size: int, + seed: int | None = None, +) -> list[list[int]]: + """ + Generates samples using Concatenated Latin Hypercube Sampling. 
+ + Args: + dimensions: Cardinality (size) of each dimension + final_sample_size: Total number of points to sample + seed: Optional PRNG seed for reproducibility + + Returns: + List of sampled points + + Raises: + ValueError: If any dimension size is less than 1 + """ + if any(d <= 0 for d in dimensions): + raise ValueError( + f"All dimensions must be >= 1, received dimensions={dimensions}" + ) + + if final_sample_size <= 0: + return [] + + rng = random.Random() if seed is None else random.Random(seed) # noqa: S311 + pools: list[list[int]] = [list(range(d)) for d in dimensions] + samples: list[list[int]] = [] + + for _ in range(final_sample_size): + point: list[int] = [] + for j, d in enumerate(dimensions): + if not pools[j]: + pools[j] = list(range(d)) + k = rng.randrange(len(pools[j])) + value = pools[j].pop(k) + point.append(value) + samples.append(point) + + return samples + + +def sobol_sampling( + dimensions: list[int], final_sample_size: int, seed: int | None = None +) -> list[list[int]]: + """ + Generates Sobol sampled points scaled to integer dimensions. + + Falls back to CLHS if collisions are detected. 
+ + Args: + dimensions: Size of each dimension + final_sample_size: Number of points to sample + seed: Random seed for the Sobol scrambler + + Returns: + List of sampled points + """ + sampler = Sobol(d=len(dimensions), scramble=True, rng=seed) + points = sampler.random(final_sample_size) + + discrete_points = [ + [int(val * d) for val, d in zip(p, dimensions, strict=True)] for p in points + ] + + unique_points = {tuple(p) for p in discrete_points} + n_collisions = final_sample_size - len(unique_points) + + if n_collisions > 0: + logger.error( + f"Sobol sampling failed, {n_collisions} collisions detected, defaulting to clhs sampling" + ) + return concatenated_latin_hypercube_sampling( + dimensions=dimensions, final_sample_size=final_sample_size, seed=seed + ) + + return discrete_points + + +def random_high_dimensional_sampling( + dimensions: list[int], final_sample_size: int, seed: int | None = None +) -> list[list[int]]: + """ + Generate unique random samples from a high-dimensional space. + + Args: + dimensions: Cardinality of each dimension + final_sample_size: Total number of points to sample + seed: Optional PRNG seed + + Returns: + List of sampled points + + Raises: + ValueError: If final_sample_size exceeds total configurations + """ + if seed is not None: + random.seed(seed) + + num_configs = math.prod(dimensions) + if final_sample_size > num_configs: + raise ValueError( + f"Cannot generate {final_sample_size} unique samples. " + f"The sample space only contains {num_configs} possibilities." + ) + + configs = list(itertools.product(*[range(d) for d in dimensions])) + actual_sample_size = min(final_sample_size, len(configs)) + + if actual_sample_size < final_sample_size: + logger.warning( + f"Requested {final_sample_size} samples but only {len(configs)} unique " + f"configurations available. Sampling {actual_sample_size} instead." 
+ ) + + samples = random.sample(configs, actual_sample_size) + return [list(s) for s in samples] + + +def get_sampling_indices_multi_dimensional( + dimensions: list[int], + n: int | Literal["all", "max"], + space: dict[str, int] | None = None, + strategy: Literal["random", "clhs", "sobol"] = "clhs", + seed: int | None = None, +) -> list[list[int]]: + """ + Generate sampling indices for a high-dimensional space. + + Args: + dimensions: Sizes of each dimension + n: Number of points to sample ('all', 'max', or integer) + space: Optional mapping of dimension names to sizes + strategy: Sampling strategy ('random', 'clhs', or 'sobol') + seed: Controls randomness + + Returns: + List of sampled multi-dimensional coordinates + """ + if seed is not None: + random.seed(seed) + + if space: + indices_dict = { + k: get_index_list_van_der_corput(v, v) for k, v in space.items() + } + if [len(indices) for indices in list(indices_dict.values())] != dimensions: + logger.error( + f"A space dict has been provided ->{space}. It is inconsistent with dimensions={dimensions}" + ) + raise ValueError("Space has inconsistent dimensions!") + logger.info( + "Sampling indices for each named dimension (ordered low to high): %s", + indices_dict, + ) + + orders = [get_index_list_van_der_corput(v, v) for v in dimensions] + + if logger.isEnabledFor(logging.DEBUG): + logger.debug("Dimensions: %s", dimensions) + logger.debug("Sampling orders for each dimension:") + for i, o in enumerate(orders): + logger.debug("Dimension %d order: %s", i, o) + + maximum_n = math.prod(dimensions) + lcm = math.lcm(*dimensions) + + if lcm != maximum_n: + logger.debug( + "Periodicity detected, the sampling subroutine will ensure that you will not sample" + "the same configuration more than once." 
+ ) + + if isinstance(n, str): + if n == "all": + n = maximum_n + elif n == "max": + n = max(dimensions) + else: + raise ValueError(f"Unrecognized string for n: {n}") + + if n > maximum_n: + logger.warning( + f"Maximal sample size is {maximum_n}, you requested {n} sampling prescriptions." + f"Elaborating prescription for n_samples = {maximum_n}" + ) + + logger.debug("Preparing to sample %d out of %d possible points.", n, maximum_n) + + match strategy: + case "random": + return random_high_dimensional_sampling(dimensions, n, seed=seed) + case "clhs": + return concatenated_latin_hypercube_sampling( + dimensions=dimensions, final_sample_size=n, seed=seed + ) + case "sobol": + return sobol_sampling(dimensions=dimensions, final_sample_size=n, seed=seed) + case _: + raise NotImplementedError(f"Strategy {strategy} is unknown") + + +# ============================================================================ +# DataFrame Ordering and Index Mapping +# ============================================================================ + + +def get_index_list_nn_high_dimensional( + orders_to_sample: list[list[int]], dimensions: list[int] +) -> list[int]: + """ + Map high-dimensional sampling orders to linear (flattened) indices. 
+ + Args: + orders_to_sample: List of multi-dimensional coordinates + dimensions: Size of each dimension + + Returns: + List of linear indices + + Warns: + If duplicate or out-of-bounds indices are detected + """ + indices = [] + cprod = np.cumprod(np.array(dimensions), dtype=int).tolist() + maximum_n = cprod[-1] + + for order in orders_to_sample: + index = 0 + multiplier = 1 + for i in reversed(range(len(dimensions))): + index += order[i] * multiplier + multiplier *= dimensions[i] + + if index > maximum_n: + logging.warning( + f"Out of bound index {index} computed from order {order}, dimensions are {dimensions}" + ) + indices.append(index) + + if len(set(indices)) != len(indices): + logger.error(f"{len(indices) - len(set(indices))} Duplicated indices!") + + out_of_bounds_list = [i for i in indices if i > maximum_n] + if out_of_bounds_list: + logger.error( + f"The following indices are out of bound: {out_of_bounds_list}, maximum admissible value is {maximum_n-1}" + ) + + return indices + + +def order_df_for_get_index_list_nn_high_dimensional( + df: pd.DataFrame, constitutive_properties: list[str], dimensions: list[int] +) -> pd.DataFrame: + """ + Ensure DataFrame is ordered and complete for high-dimensional index generation. + + Args: + df: Input DataFrame + constitutive_properties: Column names defining the space + dimensions: Expected cardinality for each property + + Returns: + DataFrame sorted and augmented with missing combinations + """ + df = df.sort_values(by=constitutive_properties).reset_index(drop=True) + expected_len = math.prod(dimensions) + + if len(df) == expected_len: + return df + + unique_values = [ + sorted(df[prop].dropna().unique()) for prop in constitutive_properties + ] + all_combinations = list(itertools.product(*unique_values)) + actual_expected_len = len(all_combinations) + + logger.warning( + f"DataFrame length mismatch: expected {expected_len} (product of {dimensions}), " + f"but got {len(df)}. 
Actual unique combinations: {actual_expected_len}." + ) + + existing_combinations = { + tuple(row[prop] for prop in constitutive_properties) for _, row in df.iterrows() + } + + missing_combinations = [ + comb for comb in all_combinations if comb not in existing_combinations + ] + + if missing_combinations: + logger.info( + f"Injecting {len(missing_combinations)} missing rows to satisfy the property." + ) + injected_rows = [] + for comb in missing_combinations: + row_data = dict(zip(constitutive_properties, comb, strict=False)) + for col in df.columns: + if col not in constitutive_properties: + row_data[col] = pd.NA + injected_rows.append(row_data) + + df = pd.concat([df, pd.DataFrame(injected_rows)], ignore_index=True) + df = df.sort_values(by=constitutive_properties).reset_index(drop=True) + logger.info(f"Injected rows: {injected_rows}") + + return df + + +def order_df_for_sampling_with_no_priors( + df: pd.DataFrame, + constitutive_properties: list[str], + n: int, + strategy: Literal["random", "clhs", "sobol"], +) -> pd.DataFrame: + """ + Orders a DataFrame for high-dimensional sampling without prior knowledge. + + Args: + df: Input dataset + constitutive_properties: Column names defining the configuration space + n: Number of samples to generate + strategy: Sampling strategy + + Returns: + DataFrame with n sampled rows + + Raises: + ValueError: If n <= 0 after adjustment or no samples available + """ + len_original = len(df) + df_unique = df.drop_duplicates(subset=constitutive_properties).reset_index( + drop=True + ) + delta_len = len_original - len(df_unique) + if delta_len > 0: + logging.warning( + f"Removing {delta_len} duplicate configurations." + f"They are characterized by the same combination of constitutive properties = {constitutive_properties}" + ) + + if n > len(df_unique): + logging.warning( + f"Requested {n} samples, but DataFrame has only {len(df_unique)} rows. Adjusting n to {len(df_unique)}." 
+ ) + n = len(df_unique) + + if n <= 0: + logging.error( + f"No samples available to select. DataFrame has {len(df_unique)} rows and {n} samples were requested." + ) + return pd.DataFrame(columns=df_unique.columns) + + def _get_sorted_uniques(prop: str) -> list: + vals = df_unique[prop].unique() + try: + return sorted(vals) + except TypeError: + logging.warning( + f"Cannot sort mixed types for property '{prop}'. " + "Keeping original order." + ) + return list(vals) + + value_dict = {prop: _get_sorted_uniques(prop) for prop in constitutive_properties} + space_dict = {prop: len(vals) for prop, vals in value_dict.items()} + dimensions = list(space_dict.values()) + + df_unique = order_df_for_get_index_list_nn_high_dimensional( + df_unique, constitutive_properties, dimensions=dimensions + ).reset_index(drop=True) + + orders_to_sample = get_sampling_indices_multi_dimensional( + dimensions=dimensions, space=space_dict, n=n, strategy=strategy + ) + + indices_to_sample = get_index_list_nn_high_dimensional(orders_to_sample, dimensions) + + logger.info(f"Indexes are:\n {indices_to_sample}") + try: + return df_unique.iloc[indices_to_sample] + except IndexError: + logging.error( + f"Index Error detected. Length of the dataframe is {len(df_unique)}." 
+ "The indices that cause the error are:" + ) + max_len = len(df_unique) + out_of_bounds_list = [i for i in indices_to_sample if i < 0 or i >= max_len] + logging.error(out_of_bounds_list) + logging.error("Returning empty dataset") + return pd.DataFrame({}) + + +# ============================================================================ +# Discovery Space Data Extraction +# ============================================================================ + + +def get_project_context() -> ProjectContext: + """Retrieve the current ADO project context from configuration.""" + import orchestrator.cli.core.config + + ado_configuration = orchestrator.cli.core.config.AdoConfiguration.load() + return ado_configuration.project_context # type: ignore[name-defined] + + +def get_space( + space_or_space_id: DiscoverySpace | str, +) -> DiscoverySpace: + """Get a DiscoverySpace object from either a space object or identifier string.""" + if isinstance(space_or_space_id, DiscoverySpace): + return space_or_space_id + + return DiscoverySpace.from_stored_configuration( + project_context=get_project_context(), + space_identifier=space_or_space_id, + ) + + +def get_df_all_entities_no_measurements( + discoverySpace: DiscoverySpace | str, +) -> pd.DataFrame: + """ + Return a DataFrame of all entities in the Discovery Space. 
+ + Returns: + DataFrame with columns: ['identifier', ] + """ + space = get_space(space_or_space_id=discoverySpace) + entity_space = space.entitySpace + cp_ids = [cp.identifier for cp in entity_space.constitutiveProperties] + + list_of_dicts_to_convert = [] + for point_values in entity_space.sequential_point_iterator(): + point_dict = dict(zip(cp_ids, point_values, strict=True)) + entity = entity_space.entity_for_point(point_dict) + ed = {"identifier": entity.identifier} + ed.update(point_dict) + list_of_dicts_to_convert.append(ed) + + return pd.DataFrame(list_of_dicts_to_convert) + + +def get_df_at_least_one_measured_value( + discoverySpace: DiscoverySpace | str, + targetOutput_list: list[str] | None = None, + add_measurement_id: bool = False, +) -> pd.DataFrame: + """ + Return a DataFrame of entities with at least one measured target output. + + Returns: + DataFrame with columns: ['identifier' (optional), , ] + """ + if not targetOutput_list: + targetOutput_list = [] + space = get_space(space_or_space_id=discoverySpace) + col_list = [cp.identifier for cp in space.entitySpace.constitutiveProperties] + if add_measurement_id: + col_list = ["identifier", *col_list] + + discoverySpace.sample_store.refresh() + + df = pd.DataFrame( + space.matchingEntitiesTable( + property_type="target", + aggregationMethod=PropertyAggregationMethodEnum.mean, + ) + ) + + if df.empty: + logger.warning( + "No measured properties found in the discovery space\nReturning empty DataFrame\n " + ) + return df + + all_df_cols = list(df.columns) + valid_targetOutput_list = [] + for el in targetOutput_list: + if el in all_df_cols: + valid_targetOutput_list.append(el) + elif f"{el}-mean" in all_df_cols and el not in all_df_cols: + logger.warning( + f"Column named '{el}-mean' (instead of '{el}', which is not present)" + "found in the DataFrame obtained through matchingEntitiesTable. " + f"Renaming it to '{el}'." 
+ ) + df.rename(columns={f"{el}-mean": el}, inplace=True) + valid_targetOutput_list += [el] + elif f"{el}-mean" in all_df_cols and el in all_df_cols: + logger.warning( + f"Columns named '{el}-mean' and '{el}'" + "found in the DataFrame obtained through matchingEntitiesTable. " + f"Renaming it to '{el}'." + ) + logger.error("Unexpected behavior can happen!") + df.rename(columns={f"{el}-mean": el}, inplace=True) + valid_targetOutput_list += [el] + col_list += valid_targetOutput_list + + if valid_targetOutput_list != targetOutput_list: + if len(valid_targetOutput_list) == 0: + logger.error( + "No valid target in the columns of the DataFrame." + f"columns are:\t{list(df.columns)}." + f"First rows are:\n{df.head(5)}" + ) + else: + not_found = [ + t for t in targetOutput_list if t not in valid_targetOutput_list + ] + logger.error( + f"Found measurements for the following valid targets:\t{valid_targetOutput_list}" + ) + logger.error( + f"No measurement found for the following valid targets:\t{not_found}" + ) + + removed_cols = [c for c in list(df.columns) if c not in col_list] + logger.debug( + "Obtaining df with at least one measured target." + f"Removed columns: {removed_cols}" + ) + + df = df[col_list] + df.dropna(inplace=True) + + if df.empty: + logger.warning( + "Although there were some measured properties in the discovery space." + ) + logger.warning( + "All measured properties in the discovery space" + f"are different from the desired outputs {targetOutput_list}.Returning empty DataFrame\n " + ) + + return df + + +def get_source_and_target( + discoverySpace: DiscoverySpace | str, + targetOutput: str, + log_string: str = "", +) -> tuple[pd.DataFrame, pd.DataFrame]: + """ + Build source (labeled) and target (unlabeled) DataFrames for a target output. 
+ + Returns: + Tuple of (source_df, target_df) + """ + dfm = get_df_at_least_one_measured_value(discoverySpace, [targetOutput]) + dfu = get_df_all_entities_no_measurements(discoverySpace) + keys = [c for c in dfu.columns if c in dfm.columns and c != "identifier"] + + if dfm.empty: + logger.warning("The source space is empty") + return dfm, dfu + + df = dfu.merge(dfm, on=keys, how="left") + + if targetOutput not in list(df.columns): + logger.info( + f"""The target output was not present in the columns of the measured+unmeasured DataFrame,' \ + meaning that '{targetOutput}' has never been measured in this space. + dfm.empty = {df.empty}. Adding an empty column to the DataFrame. + """ + ) + logger.debug("Adding an empty column to the DataFrame.") + df[targetOutput] = pd.NA + + if targetOutput in list(df.columns): + df_measured_drop_na = df.dropna(subset=[targetOutput]) + df_unmeasured_drop_na = df[df[targetOutput].isna()].drop(columns=[targetOutput]) + n_rows_dropped = len(df) - len(df_measured_drop_na) + logger.debug( + f"Dropped {n_rows_dropped} rows. 
Function called with log_string={log_string}" + ) + if df_measured_drop_na.empty: + logger.warning( + f"Empty source after dropping rows that contain Nan in {targetOutput} column" + ) + if df_unmeasured_drop_na.empty: + logger.warning( + f"Empty target after filtering rows that contain Nan in {targetOutput} column" + ) + return df_measured_drop_na, df_unmeasured_drop_na + + save_path = "df_with_no_targetOutput_columns.csv" + logger.error( + f"'{targetOutput}' column is missing, saving df in {save_path}, returning unmerged DataFrames" + ) + df.to_csv(save_path) + return dfm, dfu + + +# ============================================================================ +# Entity/Point Conversion +# ============================================================================ + + +def validate_points_in_space( + points: list[dict], + space: DiscoverySpace, +) -> tuple[list[dict], list[int]]: + """ + Validate point dictionaries against a Discovery Space. + + Returns: + Tuple of (valid_points, invalid_indices) + """ + valid_points: list[dict] = [] + invalid_indices: list[int] = [] + + for i, p in enumerate(points): + if space.entitySpace.isPointInSpace(p): + valid_points.append(p) + else: + invalid_indices.append(i) + return valid_points, invalid_indices + + +def df_to_points( + df: pd.DataFrame, + cols: list[str] | None = None, + dropna: bool = True, + drop_duplicates: bool = False, +) -> list[dict[Hashable, Any]]: + """ + Convert DataFrame rows to list of point dictionaries. 
+ + Args: + df: Input DataFrame + cols: Columns to include + dropna: If True, drop rows containing NaN + drop_duplicates: If True, drop duplicate rows + + Returns: + List of point dictionaries + """ + if cols is None: + cols = list(df.columns) + missing = set(cols) - set(df.columns) + if missing: + raise KeyError(f"Requested columns not present in DataFrame: {missing}") + + sub = df[cols].copy() + if dropna: + sub = sub.dropna(how="any") + if drop_duplicates: + sub = sub.drop_duplicates() + + def to_py(x: object) -> object: + if isinstance(x, (np.generic)): + return x.item() + return x + + for c in sub.columns: + sub[c] = sub[c].map(to_py) + + return sub.to_dict(orient="records") + + +def df_to_points_parsing( + df: pd.DataFrame, + cols: list[str] | None = None, + dropna: bool = True, + parse_values: bool = False, +) -> list[dict]: + """Convert DataFrame to points with optional string value parsing.""" + import ast + + points = df_to_points(df, cols=cols, dropna=dropna) + if not parse_values: + return points + + parsed = [] + for p in points: + newp = {} + for k, v in p.items(): + if isinstance(v, str): + try: + newp[k] = ast.literal_eval(v) + except Exception: + newp[k] = v + else: + newp[k] = v + parsed.append(newp) + return parsed + + +def make_points_from_df( + df: pd.DataFrame, + space: DiscoverySpace, + cols: list[str] | None = None, + dropna: bool = True, + parse_values: bool = True, +) -> list[dict]: + """ + Convert DataFrame of constitutive properties into point dictionaries. 
+ + Args: + df: Input DataFrame + space: Discovery Space providing canonical order + cols: Explicit list of columns to use + dropna: If True, drop rows with NaN + parse_values: If True, parse string values + + Returns: + List of point dictionaries + """ + if cols is None: + cols = [cp.identifier for cp in space.entitySpace.constitutiveProperties] + + missing = set(cols) - set(df.columns) + if missing: + raise KeyError(f"Requested columns not present in DataFrame: {missing}") + + return df_to_points_parsing(df, cols=cols, dropna=dropna, parse_values=parse_values) + + +def get_list_of_entities_from_df_and_space( + df: pd.DataFrame, space: DiscoverySpace +) -> list[Entity]: + """ + Convert DataFrame rows to Entity objects validated against a discovery space. + + Args: + df: DataFrame containing constitutive property values + space: DiscoverySpace defining the entity space constraints + + Returns: + List of valid Entity objects + """ + points = make_points_from_df(df=df, space=space) + valid_points, __ = validate_points_in_space(points, space) + + list_of_entities = [] + from orchestrator.schema.point import SpacePoint + + for p in valid_points: + sp = SpacePoint(entity=p) + entity = sp.to_entity(generatorid="no_priors_characterization") + list_of_entities.append(entity) + + numberEntities = len(list_of_entities) + if numberEntities != len(df): + numberEntities_log = f"""Warning: number of valid entities {numberEntities} is different from the number of rows in the ordered df {len(df)}. + This means that some rows in the ordered df did not correspond to valid entities in the discovery space. 
+ """ + logging.warning(numberEntities_log) + return list_of_entities + + +# Made with Bob From 4c7e9a1804a50765af7544cf9334a13acb1c25b2 Mon Sep 17 00:00:00 2001 From: Daniele Lotito Date: Thu, 23 Apr 2026 11:23:28 +0100 Subject: [PATCH 13/23] fix: update trim plugin imports after no_priors migration Update imports in trim operator source files to reference new location: - operator.py: update module name and import - trim_pydantic.py: update NoPriorsParameters import - trim_sampler.py: update no_priors_utils imports - utils/order.py: update get_sampling_indices_multi_dimensional import --- plugins/operators/trim/src/trim/operator.py | 4 ++-- plugins/operators/trim/src/trim/trim_pydantic.py | 2 +- plugins/operators/trim/src/trim/trim_sampler.py | 4 ++-- plugins/operators/trim/src/trim/utils/order.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/plugins/operators/trim/src/trim/operator.py b/plugins/operators/trim/src/trim/operator.py index 594b27aa9..248fd8806 100644 --- a/plugins/operators/trim/src/trim/operator.py +++ b/plugins/operators/trim/src/trim/operator.py @@ -5,11 +5,11 @@ import logging from importlib.metadata import version -from orchestrator.core.discoveryspace.no_priors_utils import get_source_and_target from orchestrator.core.discoveryspace.space import DiscoverySpace from orchestrator.core.operation.config import FunctionOperationInfo from orchestrator.core.operation.operation import OperationOutput from orchestrator.modules.operators.collections import characterize_operation +from trim.samplers.no_priors_utils import get_source_and_target from trim.trim_pydantic import ( TrimParameters, ) # Importing this way works when the package is installed @@ -98,7 +98,7 @@ def trim( # Use random-walk with no-priors sampler instead of direct operator call no_priors_module = SamplerModuleConf( moduleClass="NoPriorsSampleSelector", - moduleName="orchestrator.core.discoveryspace.no_priors_sampler", + 
moduleName="trim.samplers.no_priors_sampler", ) no_priors_sampler_config = CustomSamplerConfiguration( module=no_priors_module, diff --git a/plugins/operators/trim/src/trim/trim_pydantic.py b/plugins/operators/trim/src/trim/trim_pydantic.py index aac460b3d..05362a7ab 100644 --- a/plugins/operators/trim/src/trim/trim_pydantic.py +++ b/plugins/operators/trim/src/trim/trim_pydantic.py @@ -7,7 +7,7 @@ import pydantic from pydantic import BaseModel, ConfigDict, Field, model_validator -from orchestrator.core.discoveryspace.no_priors_parameters import NoPriorsParameters +from trim.samplers.no_priors_parameters import NoPriorsParameters class SamplingBudget(pydantic.BaseModel): diff --git a/plugins/operators/trim/src/trim/trim_sampler.py b/plugins/operators/trim/src/trim/trim_sampler.py index 116434931..9ca0f7833 100644 --- a/plugins/operators/trim/src/trim/trim_sampler.py +++ b/plugins/operators/trim/src/trim/trim_sampler.py @@ -19,12 +19,12 @@ import pandas as pd from autogluon.tabular import TabularDataset, TabularPredictor -from orchestrator.core.discoveryspace.no_priors_utils import ( +from orchestrator.core.discoveryspace.samplers import BaseSampler +from trim.samplers.no_priors_utils import ( get_index_list_van_der_corput, get_list_of_entities_from_df_and_space, get_source_and_target, ) -from orchestrator.core.discoveryspace.samplers import BaseSampler from trim.trim_pydantic import TrimParameters if TYPE_CHECKING: diff --git a/plugins/operators/trim/src/trim/utils/order.py b/plugins/operators/trim/src/trim/utils/order.py index 1a8779de7..eb7c2a8b8 100644 --- a/plugins/operators/trim/src/trim/utils/order.py +++ b/plugins/operators/trim/src/trim/utils/order.py @@ -10,7 +10,7 @@ import pandas as pd from autogluon.tabular import TabularPredictor -from orchestrator.core.discoveryspace.no_priors_utils import ( +from trim.samplers.no_priors_utils import ( get_sampling_indices_multi_dimensional, ) from trim.trim_pydantic import AutoGluonArgs From 
099d34d620b13dcd0980f434da55f66fccbf9c14 Mon Sep 17 00:00:00 2001 From: Daniele Lotito Date: Thu, 23 Apr 2026 11:29:01 +0100 Subject: [PATCH 14/23] test: update trim plugin test imports after no_priors migration Update test imports to reference new module location: - test_high_dimensional_sampling.py: update concatenated_latin_hypercube_sampling import - test_sampling.py: update get_index_list_van_der_corput import --- .../operators/trim/tests/test_high_dimensional_sampling.py | 5 +---- plugins/operators/trim/tests/test_sampling.py | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/plugins/operators/trim/tests/test_high_dimensional_sampling.py b/plugins/operators/trim/tests/test_high_dimensional_sampling.py index 724a86d53..c8971692f 100644 --- a/plugins/operators/trim/tests/test_high_dimensional_sampling.py +++ b/plugins/operators/trim/tests/test_high_dimensional_sampling.py @@ -14,10 +14,7 @@ import pytest from test_data_documentation import TEST_DATAFRAMES - -from orchestrator.core.discoveryspace.no_priors_utils import ( - concatenated_latin_hypercube_sampling, -) +from trim.samplers.no_priors_utils import concatenated_latin_hypercube_sampling class TestConcatenatedLatinHypercubeSampling: diff --git a/plugins/operators/trim/tests/test_sampling.py b/plugins/operators/trim/tests/test_sampling.py index f381e1a9f..4fcc79486 100644 --- a/plugins/operators/trim/tests/test_sampling.py +++ b/plugins/operators/trim/tests/test_sampling.py @@ -2,10 +2,7 @@ # SPDX-License-Identifier: MIT import pytest - -from orchestrator.core.discoveryspace.no_priors_utils import ( - get_index_list_van_der_corput, -) +from trim.samplers.no_priors_utils import get_index_list_van_der_corput # --- Error Handling Tests --- From f4e51522ea6e7678e6f30258a654772bb587bff6 Mon Sep 17 00:00:00 2001 From: Daniele Lotito Date: Thu, 23 Apr 2026 11:32:53 +0100 Subject: [PATCH 15/23] docs: update documentation and remove old no_priors files - Remove old no_priors files from 
orchestrator/core/discoveryspace/ - Documentation in random-walk.md already references correct new location - Fix markdown line length issues --- .../discoveryspace/no_priors_parameters.py | 65 -- .../core/discoveryspace/no_priors_sampler.py | 139 --- .../core/discoveryspace/no_priors_utils.py | 953 ------------------ .../discoveryspace/test_no_priors_sampler.py | 97 -- website/docs/operators/random-walk.md | 126 ++- 5 files changed, 86 insertions(+), 1294 deletions(-) delete mode 100644 orchestrator/core/discoveryspace/no_priors_parameters.py delete mode 100644 orchestrator/core/discoveryspace/no_priors_sampler.py delete mode 100644 orchestrator/core/discoveryspace/no_priors_utils.py delete mode 100644 tests/core/discoveryspace/test_no_priors_sampler.py diff --git a/orchestrator/core/discoveryspace/no_priors_parameters.py b/orchestrator/core/discoveryspace/no_priors_parameters.py deleted file mode 100644 index 1271ac769..000000000 --- a/orchestrator/core/discoveryspace/no_priors_parameters.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright IBM Corporation 2025, 2026 -# SPDX-License-Identifier: MIT - -from typing import Annotated, Literal - -from pydantic import BaseModel, BeforeValidator, Field - - -class NoPriorsParameters(BaseModel): - """ - Parameters for sampling high-dimensional spaces without prior model structure. - - The `sampling_strategy` must be one of the Literals supported. 
- Source of truth for supported strategies is the comment block right here: - - strategy (str): sampling subroutine: - - 'random': selects random points from the beginning - - 'one_shift': refer to one_shift_then_random_points_high_dimensional_sampling - - 'recursive_aggregation': refer to recursive_aggregation_high_dimensional_sampling - - 'clhs': refer to concatenated_latin_hypercube_sampling - - 'sobol': sobol sampling - """ - - targetOutput: Annotated[ - str, - Field( - description="The measured property you will treat as a target variable.", - ), - ] - - samples: Annotated[ - int, - Field( - ge=1, - description="Number of unique points to sample (must be >= 1).", - ), - ] = 20 - - batchSize: Annotated[ - int, - Field( - ge=1, - description=( - "Batch size parameter used by certain samplers (e.g., randomWalk) via continuous batching; " - "by default set equal to iterationSize in those contexts. Must be >= 1." - ), - ), - ] = 1 - - sampling_strategy: Annotated[ - Literal["random", "one_shift", "recursive_aggregation", "clhs", "sobol"], - BeforeValidator(lambda s: s.lower()), - Field( - description=( - "Sampling subroutine. Supported values:\n" - " - 'random': selects random points from the beginning\n" - " - 'one_shift': see one_shift_then_random_points_high_dimensional_sampling\n" - " - 'recursive_aggregation': see recursive_aggregation_high_dimensional_sampling\n" - " - 'clhs': dimension-wise random without replacement until each dim cycles\n" - " - 'sobol': sobol sampling via scipy\n" - "Aliases: 'random_shifts' → 'recursive_aggregation'.\n" - "Validation is case-insensitive; value is normalized to lowercase." 
- ), - ), - ] = "clhs" diff --git a/orchestrator/core/discoveryspace/no_priors_sampler.py b/orchestrator/core/discoveryspace/no_priors_sampler.py deleted file mode 100644 index f84da20e9..000000000 --- a/orchestrator/core/discoveryspace/no_priors_sampler.py +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright IBM Corporation 2025, 2026 -# SPDX-License-Identifier: MIT - -import asyncio -import logging -import typing - -from pydantic import BaseModel - -from orchestrator.core.discoveryspace.no_priors_parameters import NoPriorsParameters -from orchestrator.core.discoveryspace.no_priors_utils import ( - get_list_of_entities_from_df_and_space, - get_source_and_target, - order_df_for_sampling_with_no_priors, -) -from orchestrator.core.discoveryspace.samplers import BaseSampler -from orchestrator.core.discoveryspace.space import DiscoverySpace, Entity -from orchestrator.modules.operators.discovery_space_manager import DiscoverySpaceManager - -logger_no_priors = logging.getLogger(__name__) - - -# NOTE: to repeat the operation on the same space I can delete the operation if the output of this operation -# are not used by another operation -class NoPriorsSampleSelector(BaseSampler): - @classmethod - def samplerCompatibleWithDiscoverySpaceRemote( - cls, remoteDiscoverySpace: DiscoverySpaceManager # type: ignore[name-defined] - ) -> bool: - return True - - async def remoteEntityIterator( - self, remoteDiscoverySpace: DiscoverySpaceManager, batchsize: int = 1 - ) -> typing.AsyncGenerator[list[Entity], None]: - """ - Generate entities for no-priors characterization sampling. - - Orders the target space using a high-dimensional sampling strategy (e.g., CLHS, Sobol) - without relying on prior model knowledge or feature importance. 
- - Args: - remoteDiscoverySpace: Manager for the discovery space state - batchsize: Number of entities to yield per iteration - - Yields: - List of Entity objects to be measured, in the determined order - """ - - async def iterator_closure( - stateHandle: DiscoverySpaceManager, # type: ignore[name-defined] - ) -> typing.Callable[[], typing.AsyncGenerator[list[Entity], None]]: - - logger_no_priors.info("Characterization with no-priors starts.\n") - logger_no_priors.info(f"Parameters are:\n{self.params}\n\n") - - discoverySpace = await stateHandle.discoverySpace.remote() - source_df, target_df = get_source_and_target( - discoverySpace, self.params.targetOutput - ) - logger_no_priors.info(f"Target dataframe has length {len(target_df)}") - - # The 'samples' parameter specifies the number of NEW entities to sample, - # regardless of how many entities have already been measured in the space - logger_no_priors.info( - f"Space has {len(source_df)} measured entities. " - f"Sampling {self.params.samples} new entities as requested." - ) - target_df = order_df_for_sampling_with_no_priors( - target_df, - [ - cp.identifier - for cp in discoverySpace.entitySpace.constitutiveProperties - ], - self.params.samples, - strategy=self.params.sampling_strategy, - ) - list_of_entities_for_no_prior_characterization = ( - get_list_of_entities_from_df_and_space( - df=target_df, space=discoverySpace - ) - ) - - logger_no_priors.info( - "\n\nCharacterization with no-priors finished. 
Starting Iterative Modeling.\n" - ) - - async def iterator() -> typing.AsyncGenerator[list[Entity], None]: # type: ignore[name-defined] - logger_no_priors.info( - "\n\nIteration over sorted entities for no priors characterization starts.\n" - ) - await asyncio.sleep(0.1) - for i in range( - 0, len(list_of_entities_for_no_prior_characterization), batchsize - ): - entities = list_of_entities_for_no_prior_characterization[ - i : i + batchsize - ] - if len(entities) == 0: - logger_no_priors.info( - "\n\nCharacterization with no-priors finished.\n" - ) - break - else: - yield entities - logger_no_priors.info("\n\nCharacterization with no-priors finished.\n") - - return iterator - - retval = await iterator_closure(remoteDiscoverySpace) - - return retval() - - def entityIterator( - self, discoverySpace: DiscoverySpace, batchsize: int = 1 - ) -> typing.Generator[list[Entity], None, None]: - """Returns an remoteEntityIterator that returns entities in order""" - - def iterator_closure( - space: DiscoverySpace, - ) -> typing.Callable[[], typing.Generator[list[Entity], None, None]]: - - # list_of_entities = list(...) 
# type: ignore[name-defined] - # numberEntities = len(list_of_entities) - - def iterator() -> typing.Generator[list[Entity], None, None]: # type: ignore[name-defined] - raise NotImplementedError - # ...for i in range(0, numberEntities, batchsize): - - return iterator - - retval = iterator_closure(discoverySpace) - return retval() - - @classmethod - def parameters_model(cls) -> type[BaseModel] | None: - return NoPriorsParameters - - def __init__(self, parameters: NoPriorsParameters) -> None: - self.params = parameters diff --git a/orchestrator/core/discoveryspace/no_priors_utils.py b/orchestrator/core/discoveryspace/no_priors_utils.py deleted file mode 100644 index 36a06e829..000000000 --- a/orchestrator/core/discoveryspace/no_priors_utils.py +++ /dev/null @@ -1,953 +0,0 @@ -# Copyright IBM Corporation 2025, 2026 -# SPDX-License-Identifier: MIT - -""" -Utility functions for no-priors sampling, including: -- High-dimensional sampling strategies (CLHS, Sobol, random) -- DataFrame ordering and index mapping -- Entity/point conversion and validation -- Discovery space data extraction -""" - -from __future__ import annotations - -import itertools -import logging -import math -import random -from typing import TYPE_CHECKING, Any, Literal - -import numpy as np -import pandas as pd -from scipy.stats.qmc import Sobol - -from orchestrator.core.discoveryspace.space import DiscoverySpace -from orchestrator.schema.virtual_property import PropertyAggregationMethodEnum - -if TYPE_CHECKING: - from collections.abc import Hashable - - from orchestrator.metastore.project import ProjectContext - from orchestrator.schema.entity import Entity - -logger = logging.getLogger(__name__) - - -# ============================================================================ -# 1D Sampling Functions -# ============================================================================ - - -def get_index_list_van_der_corput( - length_segment: int, - tot_points_to_sample: int, - sampled_indices: list[int] | 
None = None, - sort: bool = False, - verbose: bool = False, -) -> list[int]: - """ - Selects indices from a 1D segment using a modified Van der Corput sequence. - - Args: - length_segment: Total number of units in the 1D segment - tot_points_to_sample: Total number of indices to sample - sampled_indices: List of indices already sampled - sort: If True, returns the final list sorted - verbose: If True, prints debug information - - Returns: - List of sampled indices - - Raises: - ValueError: If tot_points_to_sample exceeds length_segment - """ - if tot_points_to_sample == 0: - return [] - - if tot_points_to_sample > length_segment: - raise ValueError( - "ValueError: You are trying to sample more points than those that are available" - ) - - if sampled_indices is None: - sampled_indices = [] - - if len(sampled_indices) == length_segment: - maximal_indices_list = list(range(length_segment)) - if sampled_indices.sort() != maximal_indices_list: - logging.error( - "Sampled indices do not correspond to [0,..., max_n_indices -1]" - "Returning list(range(max_n_indices)" - ) - return maximal_indices_list - - if len(sampled_indices) > tot_points_to_sample: - logging.warning( - "Number of sampled indices is greater than the number of indices you want to sample" - "Returning sampled indices" - ) - return sampled_indices - - index_list = list(sampled_indices) - sampled_set = set(index_list) - - for point in [0, length_segment - 1]: - if point not in sampled_set: - index_list.append(point) - sampled_set.add(point) - if len(index_list) == tot_points_to_sample: - return sorted(index_list) - - def build_prefix_and_len(index_list: list[int]) -> tuple[list[int], int]: - if not index_list: - return [0], 0 - - M = max(index_list) + 1 - sampled_set = set(index_list) - prefix = [0] * (M + 1) - s = 0 - - for i in range(M): - s += 1 if i in sampled_set else 0 - prefix[i + 1] = s - - return prefix, M - - def get_list_min_weight( - prefix: list[int], M: int, d: int, selectable_indices: 
list[int] - ) -> list[int]: - vals = {} - for i in selectable_indices: - if i >= M: - break - left = max(0, i - d) - right = min(M - 1, i + d) - total = prefix[right + 1] - prefix[left] - denom = right - left + 1 - mean = total / denom - vals[i] = mean - - if not vals: - return [] - - min_val = min(vals.values()) - out = [] - for i in selectable_indices: - if i >= M: - break - if vals.get(i) == min_val: - out.append(i) - return out - - def get_selectable_indices() -> list[int]: - return [i for i in range(length_segment) if i not in sampled_set] - - max_d = length_segment - - while len(index_list) < tot_points_to_sample: - selection = 0 - selectable_indices = get_selectable_indices() - prefix, M = build_prefix_and_len(index_list=index_list) - d = 1 - previous_set = selectable_indices - - while selection == 0: - indices = get_list_min_weight(prefix, M, d, selectable_indices) - - if not indices: - if not previous_set: - raise ValueError( - "Previous candidate set should not be empty or None" - ) - if verbose: - logger.info( - f"No intersection found with d={d}. 
Using the previous set " - f"Appending to {index_list} the first element of {previous_set}" - ) - chosen = previous_set[0] - index_list.append(chosen) - sampled_set.add(chosen) - selection = 1 - else: - previous_set = selectable_indices - selectable_indices = indices - - if len(selectable_indices) == 1 or d == max_d: - if verbose: - logger.info( - f"Appending to {index_list} the first element of {selectable_indices}" - ) - chosen = selectable_indices[0] - index_list.append(chosen) - sampled_set.add(chosen) - selection = 1 - - d += 1 - - if sort: - return sorted(index_list) - return index_list - - -# ============================================================================ -# High-Dimensional Sampling Functions -# ============================================================================ - - -def concatenated_latin_hypercube_sampling( - dimensions: list[int], - final_sample_size: int, - seed: int | None = None, -) -> list[list[int]]: - """ - Generates samples using Concatenated Latin Hypercube Sampling. 
- - Args: - dimensions: Cardinality (size) of each dimension - final_sample_size: Total number of points to sample - seed: Optional PRNG seed for reproducibility - - Returns: - List of sampled points - - Raises: - ValueError: If any dimension size is less than 1 - """ - if any(d <= 0 for d in dimensions): - raise ValueError( - f"All dimensions must be >= 1, received dimensions={dimensions}" - ) - - if final_sample_size <= 0: - return [] - - rng = random.Random() if seed is None else random.Random(seed) # noqa: S311 - pools: list[list[int]] = [list(range(d)) for d in dimensions] - samples: list[list[int]] = [] - - for _ in range(final_sample_size): - point: list[int] = [] - for j, d in enumerate(dimensions): - if not pools[j]: - pools[j] = list(range(d)) - k = rng.randrange(len(pools[j])) - value = pools[j].pop(k) - point.append(value) - samples.append(point) - - return samples - - -def sobol_sampling( - dimensions: list[int], final_sample_size: int, seed: int | None = None -) -> list[list[int]]: - """ - Generates Sobol sampled points scaled to integer dimensions. - - Falls back to CLHS if collisions are detected. 
- - Args: - dimensions: Size of each dimension - final_sample_size: Number of points to sample - seed: Random seed for the Sobol scrambler - - Returns: - List of sampled points - """ - sampler = Sobol(d=len(dimensions), scramble=True, rng=seed) - points = sampler.random(final_sample_size) - - discrete_points = [ - [int(val * d) for val, d in zip(p, dimensions, strict=True)] for p in points - ] - - unique_points = {tuple(p) for p in discrete_points} - n_collisions = final_sample_size - len(unique_points) - - if n_collisions > 0: - logger.error( - f"Sobol sampling failed, {n_collisions} collisions detected, defaulting to clhs sampling" - ) - return concatenated_latin_hypercube_sampling( - dimensions=dimensions, final_sample_size=final_sample_size, seed=seed - ) - - return discrete_points - - -def random_high_dimensional_sampling( - dimensions: list[int], final_sample_size: int, seed: int | None = None -) -> list[list[int]]: - """ - Generate unique random samples from a high-dimensional space. - - Args: - dimensions: Cardinality of each dimension - final_sample_size: Total number of points to sample - seed: Optional PRNG seed - - Returns: - List of sampled points - - Raises: - ValueError: If final_sample_size exceeds total configurations - """ - if seed is not None: - random.seed(seed) - - num_configs = math.prod(dimensions) - if final_sample_size > num_configs: - raise ValueError( - f"Cannot generate {final_sample_size} unique samples. " - f"The sample space only contains {num_configs} possibilities." - ) - - configs = list(itertools.product(*[range(d) for d in dimensions])) - actual_sample_size = min(final_sample_size, len(configs)) - - if actual_sample_size < final_sample_size: - logger.warning( - f"Requested {final_sample_size} samples but only {len(configs)} unique " - f"configurations available. Sampling {actual_sample_size} instead." 
- ) - - samples = random.sample(configs, actual_sample_size) - return [list(s) for s in samples] - - -def get_sampling_indices_multi_dimensional( - dimensions: list[int], - n: int | Literal["all", "max"], - space: dict[str, int] | None = None, - strategy: Literal["random", "clhs", "sobol"] = "clhs", - seed: int | None = None, -) -> list[list[int]]: - """ - Generate sampling indices for a high-dimensional space. - - Args: - dimensions: Sizes of each dimension - n: Number of points to sample ('all', 'max', or integer) - space: Optional mapping of dimension names to sizes - strategy: Sampling strategy ('random', 'clhs', or 'sobol') - seed: Controls randomness - - Returns: - List of sampled multi-dimensional coordinates - """ - if seed is not None: - random.seed(seed) - - if space: - indices_dict = { - k: get_index_list_van_der_corput(v, v) for k, v in space.items() - } - if [len(indices) for indices in list(indices_dict.values())] != dimensions: - logger.error( - f"A space dict has been provided ->{space}. It is inconsistent with dimensions={dimensions}" - ) - raise ValueError("Space has inconsistent dimensions!") - logger.info( - "Sampling indices for each named dimension (ordered low to high): %s", - indices_dict, - ) - - orders = [get_index_list_van_der_corput(v, v) for v in dimensions] - - if logger.isEnabledFor(logging.DEBUG): - logger.debug("Dimensions: %s", dimensions) - logger.debug("Sampling orders for each dimension:") - for i, o in enumerate(orders): - logger.debug("Dimension %d order: %s", i, o) - - maximum_n = math.prod(dimensions) - lcm = math.lcm(*dimensions) - - if lcm != maximum_n: - logger.debug( - "Periodicity detected, the sampling subroutine will ensure that you will not sample" - "the same configuration more than once." 
- ) - - if isinstance(n, str): - if n == "all": - n = maximum_n - elif n == "max": - n = max(dimensions) - else: - raise ValueError(f"Unrecognized string for n: {n}") - - if n > maximum_n: - logger.warning( - f"Maximal sample size is {maximum_n}, you requested {n} sampling prescriptions." - f"Elaborating prescription for n_samples = {maximum_n}" - ) - - logger.debug("Preparing to sample %d out of %d possible points.", n, maximum_n) - - match strategy: - case "random": - return random_high_dimensional_sampling(dimensions, n, seed=seed) - case "clhs": - return concatenated_latin_hypercube_sampling( - dimensions=dimensions, final_sample_size=n, seed=seed - ) - case "sobol": - return sobol_sampling(dimensions=dimensions, final_sample_size=n, seed=seed) - case _: - raise NotImplementedError(f"Strategy {strategy} is unknown") - - -# ============================================================================ -# DataFrame Ordering and Index Mapping -# ============================================================================ - - -def get_index_list_nn_high_dimensional( - orders_to_sample: list[list[int]], dimensions: list[int] -) -> list[int]: - """ - Map high-dimensional sampling orders to linear (flattened) indices. 
- - Args: - orders_to_sample: List of multi-dimensional coordinates - dimensions: Size of each dimension - - Returns: - List of linear indices - - Warns: - If duplicate or out-of-bounds indices are detected - """ - indices = [] - cprod = np.cumprod(np.array(dimensions), dtype=int).tolist() - maximum_n = cprod[-1] - - for order in orders_to_sample: - index = 0 - multiplier = 1 - for i in reversed(range(len(dimensions))): - index += order[i] * multiplier - multiplier *= dimensions[i] - - if index > maximum_n: - logging.warning( - f"Out of bound index {index} computed from order {order}, dimensions are {dimensions}" - ) - indices.append(index) - - if len(set(indices)) != len(indices): - logger.error(f"{len(indices) - len(set(indices))} Duplicated indices!") - - out_of_bounds_list = [i for i in indices if i > maximum_n] - if out_of_bounds_list: - logger.error( - f"The following indices are out of bound: {out_of_bounds_list}, maximum admissible value is {maximum_n-1}" - ) - - return indices - - -def order_df_for_get_index_list_nn_high_dimensional( - df: pd.DataFrame, constitutive_properties: list[str], dimensions: list[int] -) -> pd.DataFrame: - """ - Ensure DataFrame is ordered and complete for high-dimensional index generation. - - Args: - df: Input DataFrame - constitutive_properties: Column names defining the space - dimensions: Expected cardinality for each property - - Returns: - DataFrame sorted and augmented with missing combinations - """ - df = df.sort_values(by=constitutive_properties).reset_index(drop=True) - expected_len = math.prod(dimensions) - - if len(df) == expected_len: - return df - - unique_values = [ - sorted(df[prop].dropna().unique()) for prop in constitutive_properties - ] - all_combinations = list(itertools.product(*unique_values)) - actual_expected_len = len(all_combinations) - - logger.warning( - f"DataFrame length mismatch: expected {expected_len} (product of {dimensions}), " - f"but got {len(df)}. 
Actual unique combinations: {actual_expected_len}." - ) - - existing_combinations = { - tuple(row[prop] for prop in constitutive_properties) for _, row in df.iterrows() - } - - missing_combinations = [ - comb for comb in all_combinations if comb not in existing_combinations - ] - - if missing_combinations: - logger.info( - f"Injecting {len(missing_combinations)} missing rows to satisfy the property." - ) - injected_rows = [] - for comb in missing_combinations: - row_data = dict(zip(constitutive_properties, comb, strict=False)) - for col in df.columns: - if col not in constitutive_properties: - row_data[col] = pd.NA - injected_rows.append(row_data) - - df = pd.concat([df, pd.DataFrame(injected_rows)], ignore_index=True) - df = df.sort_values(by=constitutive_properties).reset_index(drop=True) - logger.info(f"Injected rows: {injected_rows}") - - return df - - -def order_df_for_sampling_with_no_priors( - df: pd.DataFrame, - constitutive_properties: list[str], - n: int, - strategy: Literal["random", "clhs", "sobol"], -) -> pd.DataFrame: - """ - Orders a DataFrame for high-dimensional sampling without prior knowledge. - - Args: - df: Input dataset - constitutive_properties: Column names defining the configuration space - n: Number of samples to generate - strategy: Sampling strategy - - Returns: - DataFrame with n sampled rows - - Raises: - ValueError: If n <= 0 after adjustment or no samples available - """ - len_original = len(df) - df_unique = df.drop_duplicates(subset=constitutive_properties).reset_index( - drop=True - ) - delta_len = len_original - len(df_unique) - if delta_len > 0: - logging.warning( - f"Removing {delta_len} duplicate configurations." - f"They are characterized by the same combination of constitutive properties = {constitutive_properties}" - ) - - if n > len(df_unique): - logging.warning( - f"Requested {n} samples, but DataFrame has only {len(df_unique)} rows. Adjusting n to {len(df_unique)}." 
- ) - n = len(df_unique) - - if n <= 0: - logging.error( - f"No samples available to select. DataFrame has {len(df_unique)} rows and {n} samples were requested." - ) - return pd.DataFrame(columns=df_unique.columns) - - def _get_sorted_uniques(prop: str) -> list: - vals = df_unique[prop].unique() - try: - return sorted(vals) - except TypeError: - logging.warning( - f"Cannot sort mixed types for property '{prop}'. " - "Keeping original order." - ) - return list(vals) - - value_dict = {prop: _get_sorted_uniques(prop) for prop in constitutive_properties} - space_dict = {prop: len(vals) for prop, vals in value_dict.items()} - dimensions = list(space_dict.values()) - - df_unique = order_df_for_get_index_list_nn_high_dimensional( - df_unique, constitutive_properties, dimensions=dimensions - ).reset_index(drop=True) - - orders_to_sample = get_sampling_indices_multi_dimensional( - dimensions=dimensions, space=space_dict, n=n, strategy=strategy - ) - - indices_to_sample = get_index_list_nn_high_dimensional(orders_to_sample, dimensions) - - logger.info(f"Indexes are:\n {indices_to_sample}") - try: - return df_unique.iloc[indices_to_sample] - except IndexError: - logging.error( - f"Index Error detected. Length of the dataframe is {len(df_unique)}." 
- "The indices that cause the error are:" - ) - max_len = len(df_unique) - out_of_bounds_list = [i for i in indices_to_sample if i < 0 or i >= max_len] - logging.error(out_of_bounds_list) - logging.error("Returning empty dataset") - return pd.DataFrame({}) - - -# ============================================================================ -# Discovery Space Data Extraction -# ============================================================================ - - -def get_project_context() -> ProjectContext: - """Retrieve the current ADO project context from configuration.""" - import orchestrator.cli.core.config - - ado_configuration = orchestrator.cli.core.config.AdoConfiguration.load() - return ado_configuration.project_context # type: ignore[name-defined] - - -def get_space( - space_or_space_id: DiscoverySpace | str, -) -> DiscoverySpace: - """Get a DiscoverySpace object from either a space object or identifier string.""" - if isinstance(space_or_space_id, DiscoverySpace): - return space_or_space_id - - return DiscoverySpace.from_stored_configuration( - project_context=get_project_context(), - space_identifier=space_or_space_id, - ) - - -def get_df_all_entities_no_measurements( - discoverySpace: DiscoverySpace | str, -) -> pd.DataFrame: - """ - Return a DataFrame of all entities in the Discovery Space. 
- - Returns: - DataFrame with columns: ['identifier', ] - """ - space = get_space(space_or_space_id=discoverySpace) - entity_space = space.entitySpace - cp_ids = [cp.identifier for cp in entity_space.constitutiveProperties] - - list_of_dicts_to_convert = [] - for point_values in entity_space.sequential_point_iterator(): - point_dict = dict(zip(cp_ids, point_values, strict=True)) - entity = entity_space.entity_for_point(point_dict) - ed = {"identifier": entity.identifier} - ed.update(point_dict) - list_of_dicts_to_convert.append(ed) - - return pd.DataFrame(list_of_dicts_to_convert) - - -def get_df_at_least_one_measured_value( - discoverySpace: DiscoverySpace | str, - targetOutput_list: list[str] | None = None, - add_measurement_id: bool = False, -) -> pd.DataFrame: - """ - Return a DataFrame of entities with at least one measured target output. - - Returns: - DataFrame with columns: ['identifier' (optional), , ] - """ - if not targetOutput_list: - targetOutput_list = [] - space = get_space(space_or_space_id=discoverySpace) - col_list = [cp.identifier for cp in space.entitySpace.constitutiveProperties] - if add_measurement_id: - col_list = ["identifier", *col_list] - - discoverySpace.sample_store.refresh() - - df = pd.DataFrame( - space.matchingEntitiesTable( - property_type="target", - aggregationMethod=PropertyAggregationMethodEnum.mean, - ) - ) - - if df.empty: - logger.warning( - "No measured properties found in the discovery space\nReturning empty DataFrame\n " - ) - return df - - all_df_cols = list(df.columns) - valid_targetOutput_list = [] - for el in targetOutput_list: - if el in all_df_cols: - valid_targetOutput_list.append(el) - elif f"{el}-mean" in all_df_cols and el not in all_df_cols: - logger.warning( - f"Column named '{el}-mean' (instead of '{el}', which is not present)" - "found in the DataFrame obtained through matchingEntitiesTable. " - f"Renaming it to '{el}'." 
- ) - df.rename(columns={f"{el}-mean": el}, inplace=True) - valid_targetOutput_list += [el] - elif f"{el}-mean" in all_df_cols and el in all_df_cols: - logger.warning( - f"Columns named '{el}-mean' and '{el}'" - "found in the DataFrame obtained through matchingEntitiesTable. " - f"Renaming it to '{el}'." - ) - logger.error("Unexpected behavior can happen!") - df.rename(columns={f"{el}-mean": el}, inplace=True) - valid_targetOutput_list += [el] - col_list += valid_targetOutput_list - - if valid_targetOutput_list != targetOutput_list: - if len(valid_targetOutput_list) == 0: - logger.error( - "No valid target in the columns of the DataFrame." - f"columns are:\t{list(df.columns)}." - f"First rows are:\n{df.head(5)}" - ) - else: - not_found = [ - t for t in targetOutput_list if t not in valid_targetOutput_list - ] - logger.error( - f"Found measurements for the following valid targets:\t{valid_targetOutput_list}" - ) - logger.error( - f"No measurement found for the following valid targets:\t{not_found}" - ) - - removed_cols = [c for c in list(df.columns) if c not in col_list] - logger.debug( - "Obtaining df with at least one measured target." - f"Removed columns: {removed_cols}" - ) - - df = df[col_list] - df.dropna(inplace=True) - - if df.empty: - logger.warning( - "Although there were some measured properties in the discovery space." - ) - logger.warning( - "All measured properties in the discovery space" - f"are different from the desired outputs {targetOutput_list}.Returning empty DataFrame\n " - ) - - return df - - -def get_source_and_target( - discoverySpace: DiscoverySpace | str, - targetOutput: str, - log_string: str = "", -) -> tuple[pd.DataFrame, pd.DataFrame]: - """ - Build source (labeled) and target (unlabeled) DataFrames for a target output. 
- - Returns: - Tuple of (source_df, target_df) - """ - dfm = get_df_at_least_one_measured_value(discoverySpace, [targetOutput]) - dfu = get_df_all_entities_no_measurements(discoverySpace) - keys = [c for c in dfu.columns if c in dfm.columns and c != "identifier"] - - if dfm.empty: - logger.warning("The source space is empty") - return dfm, dfu - - df = dfu.merge(dfm, on=keys, how="left") - - if targetOutput not in list(df.columns): - logger.info( - f"""The target output was not present in the columns of the measured+unmeasured DataFrame,' \ - meaning that '{targetOutput}' has never been measured in this space. - dfm.empty = {df.empty}. Adding an empty column to the DataFrame. - """ - ) - logger.debug("Adding an empty column to the DataFrame.") - df[targetOutput] = pd.NA - - if targetOutput in list(df.columns): - df_measured_drop_na = df.dropna(subset=[targetOutput]) - df_unmeasured_drop_na = df[df[targetOutput].isna()].drop(columns=[targetOutput]) - n_rows_dropped = len(df) - len(df_measured_drop_na) - logger.debug( - f"Dropped {n_rows_dropped} rows. 
Function called with log_string={log_string}" - ) - if df_measured_drop_na.empty: - logger.warning( - f"Empty source after dropping rows that contain Nan in {targetOutput} column" - ) - if df_unmeasured_drop_na.empty: - logger.warning( - f"Empty target after filtering rows that contain Nan in {targetOutput} column" - ) - return df_measured_drop_na, df_unmeasured_drop_na - - save_path = "df_with_no_targetOutput_columns.csv" - logger.error( - f"'{targetOutput}' column is missing, saving df in {save_path}, returning unmerged DataFrames" - ) - df.to_csv(save_path) - return dfm, dfu - - -# ============================================================================ -# Entity/Point Conversion -# ============================================================================ - - -def validate_points_in_space( - points: list[dict], - space: DiscoverySpace, -) -> tuple[list[dict], list[int]]: - """ - Validate point dictionaries against a Discovery Space. - - Returns: - Tuple of (valid_points, invalid_indices) - """ - valid_points: list[dict] = [] - invalid_indices: list[int] = [] - - for i, p in enumerate(points): - if space.entitySpace.isPointInSpace(p): - valid_points.append(p) - else: - invalid_indices.append(i) - return valid_points, invalid_indices - - -def df_to_points( - df: pd.DataFrame, - cols: list[str] | None = None, - dropna: bool = True, - drop_duplicates: bool = False, -) -> list[dict[Hashable, Any]]: - """ - Convert DataFrame rows to list of point dictionaries. 
- - Args: - df: Input DataFrame - cols: Columns to include - dropna: If True, drop rows containing NaN - drop_duplicates: If True, drop duplicate rows - - Returns: - List of point dictionaries - """ - if cols is None: - cols = list(df.columns) - missing = set(cols) - set(df.columns) - if missing: - raise KeyError(f"Requested columns not present in DataFrame: {missing}") - - sub = df[cols].copy() - if dropna: - sub = sub.dropna(how="any") - if drop_duplicates: - sub = sub.drop_duplicates() - - def to_py(x: object) -> object: - if isinstance(x, (np.generic)): - return x.item() - return x - - for c in sub.columns: - sub[c] = sub[c].map(to_py) - - return sub.to_dict(orient="records") - - -def df_to_points_parsing( - df: pd.DataFrame, - cols: list[str] | None = None, - dropna: bool = True, - parse_values: bool = False, -) -> list[dict]: - """Convert DataFrame to points with optional string value parsing.""" - import ast - - points = df_to_points(df, cols=cols, dropna=dropna) - if not parse_values: - return points - - parsed = [] - for p in points: - newp = {} - for k, v in p.items(): - if isinstance(v, str): - try: - newp[k] = ast.literal_eval(v) - except Exception: - newp[k] = v - else: - newp[k] = v - parsed.append(newp) - return parsed - - -def make_points_from_df( - df: pd.DataFrame, - space: DiscoverySpace, - cols: list[str] | None = None, - dropna: bool = True, - parse_values: bool = True, -) -> list[dict]: - """ - Convert DataFrame of constitutive properties into point dictionaries. 
- - Args: - df: Input DataFrame - space: Discovery Space providing canonical order - cols: Explicit list of columns to use - dropna: If True, drop rows with NaN - parse_values: If True, parse string values - - Returns: - List of point dictionaries - """ - if cols is None: - cols = [cp.identifier for cp in space.entitySpace.constitutiveProperties] - - missing = set(cols) - set(df.columns) - if missing: - raise KeyError(f"Requested columns not present in DataFrame: {missing}") - - return df_to_points_parsing(df, cols=cols, dropna=dropna, parse_values=parse_values) - - -def get_list_of_entities_from_df_and_space( - df: pd.DataFrame, space: DiscoverySpace -) -> list[Entity]: - """ - Convert DataFrame rows to Entity objects validated against a discovery space. - - Args: - df: DataFrame containing constitutive property values - space: DiscoverySpace defining the entity space constraints - - Returns: - List of valid Entity objects - """ - points = make_points_from_df(df=df, space=space) - valid_points, __ = validate_points_in_space(points, space) - - list_of_entities = [] - from orchestrator.schema.point import SpacePoint - - for p in valid_points: - sp = SpacePoint(entity=p) - entity = sp.to_entity(generatorid="no_priors_characterization") - list_of_entities.append(entity) - - numberEntities = len(list_of_entities) - if numberEntities != len(df): - numberEntities_log = f"""Warning: number of valid entities {numberEntities} is different from the number of rows in the ordered df {len(df)}. - This means that some rows in the ordered df did not correspond to valid entities in the discovery space. 
- """ - logging.warning(numberEntities_log) - return list_of_entities - - -# Made with Bob diff --git a/tests/core/discoveryspace/test_no_priors_sampler.py b/tests/core/discoveryspace/test_no_priors_sampler.py deleted file mode 100644 index a31730426..000000000 --- a/tests/core/discoveryspace/test_no_priors_sampler.py +++ /dev/null @@ -1,97 +0,0 @@ -# Copyright IBM Corporation 2025, 2026 -# SPDX-License-Identifier: MIT - -"""Tests for the no-priors sampler in core discoveryspace.""" - -import pytest -from pydantic import ValidationError - -from orchestrator.core.discoveryspace.no_priors_parameters import NoPriorsParameters -from orchestrator.core.discoveryspace.no_priors_sampler import NoPriorsSampleSelector - - -class TestNoPriorsParameters: - """Test NoPriorsParameters model.""" - - def test_default_parameters(self) -> None: - """Test default parameter values.""" - params = NoPriorsParameters(targetOutput="test_target") - assert params.targetOutput == "test_target" - assert params.samples == 20 - assert params.batchSize == 1 - assert params.sampling_strategy == "clhs" - - def test_custom_parameters(self) -> None: - """Test custom parameter values.""" - params = NoPriorsParameters( - targetOutput="custom_target", - samples=50, - batchSize=5, - sampling_strategy="sobol", - ) - assert params.targetOutput == "custom_target" - assert params.samples == 50 - assert params.batchSize == 5 - assert params.sampling_strategy == "sobol" - - def test_case_insensitive_strategy(self) -> None: - """Test that sampling_strategy is case-insensitive.""" - params = NoPriorsParameters(targetOutput="test", sampling_strategy="CLHS") - assert params.sampling_strategy == "clhs" - - params = NoPriorsParameters(targetOutput="test", sampling_strategy="Sobol") - assert params.sampling_strategy == "sobol" - - def test_invalid_strategy(self) -> None: - """Test that invalid strategy raises validation error.""" - with pytest.raises(ValidationError, match="sampling_strategy"): - 
NoPriorsParameters(targetOutput="test", sampling_strategy="invalid") - - def test_samples_validation(self) -> None: - """Test that samples must be >= 1.""" - with pytest.raises(ValidationError, match="samples"): - NoPriorsParameters(targetOutput="test", samples=0) - - with pytest.raises(ValidationError, match="samples"): - NoPriorsParameters(targetOutput="test", samples=-1) - - def test_batch_size_validation(self) -> None: - """Test that batchSize must be >= 1.""" - with pytest.raises(ValidationError, match="batchSize"): - NoPriorsParameters(targetOutput="test", batchSize=0) - - -class TestNoPriorsSampleSelector: - """Test NoPriorsSampleSelector sampler.""" - - def test_sampler_initialization(self) -> None: - """Test sampler can be initialized with parameters.""" - params = NoPriorsParameters(targetOutput="test_target", samples=10) - sampler = NoPriorsSampleSelector(parameters=params) - assert sampler.params == params - assert sampler.params.targetOutput == "test_target" - assert sampler.params.samples == 10 - - def test_parameters_model(self) -> None: - """Test that parameters_model returns correct type.""" - assert NoPriorsSampleSelector.parameters_model() == NoPriorsParameters - - def test_sampler_compatible_with_discovery_space_remote(self) -> None: - """Test that sampler reports compatibility with any discovery space.""" - # This is a simple compatibility check - always returns True - # We don't need a real DiscoverySpaceManager for this test - assert NoPriorsSampleSelector.samplerCompatibleWithDiscoverySpaceRemote(None) - - def test_entity_iterator_not_implemented(self) -> None: - """Test that entityIterator raises NotImplementedError.""" - params = NoPriorsParameters(targetOutput="test_target") - sampler = NoPriorsSampleSelector(parameters=params) - - # entityIterator is not implemented for this sampler - # The NotImplementedError is raised when the iterator is called - iterator = sampler.entityIterator(discoverySpace=None, batchsize=1) - with 
pytest.raises(NotImplementedError): - next(iterator) - - -# Made with Bob diff --git a/website/docs/operators/random-walk.md b/website/docs/operators/random-walk.md index 4767f4702..34e438032 100644 --- a/website/docs/operators/random-walk.md +++ b/website/docs/operators/random-walk.md @@ -279,46 +279,6 @@ spaces: - your-spaces ``` -### Advanced Samplers - -When the base samplers are not enough, `random_walk` can also use more -specialized samplers that still integrate with its normal batching, filtering, and -memoization. - -#### Quasi-Random Sampling Strategies - -The `NoPriorsSampleSelector` provides quasi-random sampling strategies designed -for high-dimensional discrete spaces. These strategies produce sequences where -consecutive elements are maximally dispersed, favoring uniform coverage of the -space: - -- **`sobol`**: Sobol sequences are low-discrepancy quasi-random sequences widely - used for space-filling designs. They provide better coverage than pure random - sampling by ensuring points are well-distributed across all dimensions. -- **`clhs`**: Concatenated Latin Hypercube Sampling (CLHS) samples each dimension - independently without replacement, cycling through all values before repeating. - This ensures each dimension is uniformly covered. - -**Collision Handling**: Sobol sampling may produce collisions (duplicate points), -when this happens the sampler automatically falls back to CLHS to ensure -the requested number of unique samples. - -#### Example: Sobol Sampling - -Example using Sobol ordering for quasi-random low-discrepancy coverage: - -```yaml -samplerConfig: - module: - moduleName: orchestrator.core.discoveryspace.no_priors_sampler - moduleClass: NoPriorsSampleSelector - parameters: - targetOutput: yield - samples: 100 - batchSize: 1 - sampling_strategy: sobol -``` - ### Custom Samplers It is also possible to specify that `random_walk` uses a custom sampler. This is @@ -377,6 +337,92 @@ class MySampler(BaseSampler): ... 
``` +#### Quasi-Random Sampling Strategies + +Some useful custom samplers are provided through the TRIM plugin. +To use these samplers, you must first install TRIM, from root the command is: + +```bash +pip install plugins/operators/trim/ +``` + +The `NoPriorsSampleSelector` provides quasi-random sampling strategies designed +for high-dimensional discrete spaces. These strategies produce sequences where +consecutive elements are maximally dispersed, favoring uniform coverage of the +space: + +- **`sobol`**: Sobol sequences are low-discrepancy quasi-random sequences widely + used for space-filling designs. They provide better coverage than pure random + sampling by ensuring points are well-distributed across all dimensions. +- **`clhs`**: Concatenated Latin Hypercube Sampling (CLHS) samples each dimension + independently without replacement, cycling through all values before repeating. + This ensures each dimension is uniformly covered. + +**Collision Handling**: Sobol sampling may produce collisions (duplicate points), +when this happens the sampler automatically falls back to CLHS to ensure +the requested number of unique samples. + +#### Example: Sobol Sampling + +Here we write an example using Sobol ordering for quasi-random +low-discrepancy coverage. Make sure to install the TRIM package first. 
+Then install TRIM custom experiments with + +```bash +pip install examples/trim/custom_experiments/ +``` + +To create a discoveryspace and explore it with the TRIM operator, execute the +following from the root of the ado repository: + +```bash +ado create space -f examples/trim/example_yamls/space_pressure.yaml --new-sample-store + +ado create operation -f \ + examples/trim/example_yamls/randomwalk_sobol_operation.yaml \ + --use-latest space +``` + +The configuration file `randomwalk_sobol_operation.yaml` contains the following +to specify which points to sample + +```yaml +samplerConfig: + module: + moduleName: trim.samplers.no_priors_sampler + moduleClass: NoPriorsSampleSelector + parameters: + targetOutput: pressure + samples: 20 + batchSize: 1 + sampling_strategy: sobol +``` + +Since `batchSize: 1` the operation will sample one point at a time, this +ensures that the sequence of measurements has the desired uniform coverage + +```bash +ado show entities operation --use-latest -o csv --output-file your_file.csv +``` + +The file `your_file.csv` will contain the sequence of sampled points, you +will see something like this: + + + +```csv +request_index,result_index,identifier,experiment_id,generatorid,mol,temperature,volume,pressure,request_id,entity_index,valid +0,0,mol.0.2-temperature.274-volume.8,custom_experiments.calculate_pressure_ideal_gas,no_priors_characterization,0.2,274,8,56.9540689333,c8f814,0,True +1,0,mol.0.7-temperature.284-volume.1,custom_experiments.calculate_pressure_ideal_gas,no_priors_characterization,0.7,284,1,1652.9151684584,232c8e,0,True +2,0,mol.0.4-temperature.294-volume.7,custom_experiments.calculate_pressure_ideal_gas,no_priors_characterization,0.4,294,7,139.6829719824,9c6ae3,0,True +3,0,mol.0.9-temperature.284-volume.5,custom_experiments.calculate_pressure_ideal_gas,no_priors_characterization,0.9,284,5,425.03532903216,83a93d,0,True 
+4,0,mol.0.5-temperature.280-volume.6,custom_experiments.calculate_pressure_ideal_gas,no_priors_characterization,0.5,280,6,194.00412775333334,9e8ecd,0,True +5,0,mol.0.1-temperature.298-volume.4,custom_experiments.calculate_pressure_ideal_gas,no_priors_characterization,0.1,298,4,61.9427465041,db9284,0,True +... +``` + + + ### Sampling all Entities If either of the following conditions are true you can specify a value of "all" From 13b9b75ef6c657ccb3156f4009325a6db31e8518 Mon Sep 17 00:00:00 2001 From: Daniele Lotito Date: Thu, 23 Apr 2026 12:22:36 +0100 Subject: [PATCH 16/23] docs: update docs --- website/docs/operators/random-walk.md | 87 +++++++++++++-------------- 1 file changed, 42 insertions(+), 45 deletions(-) diff --git a/website/docs/operators/random-walk.md b/website/docs/operators/random-walk.md index 34e438032..674e17193 100644 --- a/website/docs/operators/random-walk.md +++ b/website/docs/operators/random-walk.md @@ -279,14 +279,10 @@ spaces: - your-spaces ``` -### Custom Samplers +## Custom Samplers -It is also possible to specify that `random_walk` uses a custom sampler. This is -a class that inherits from -`orchestrator.core.discoveryspace.samplers.BaseSampler`. This is useful for -implementing more complex sampling schemes. For example, for developers who want -to use random_walk to drive an exploration but have custom logic to execute -before choosing each sample/entity. +`random_walk` can also use custom samplers for +more complex sampling schemes. For custom samplers the `samplerConfig` field has the following structure: @@ -302,45 +298,11 @@ parameters: # A dictionary of key value pairs with the values for the custom sam -#### Implementing a Custom Sampler - -To implement a custom sampler create a sub-class of -`orchestrator.core.discovery.samplers.BaseSampler` and implement all required -methods - -The `BaseSampler` class does not specify any `__init__` parameters. 
If your -custom class requires initialization parameters then - -- define a pydantic model for them -- override the `parameters_model` class method to return this model -- add a non keyword parameter to your custom classes `__init__` that is this - type. +### Available Custom Samplers -For example: +#### No Priors Sample Selector -```python -# Class for the custom samplers parameters -class MySamplerParams(BaseModel): - ... - -# Subclass of BaseSampler implementing the custom sampling logic -class MySampler(BaseSampler): - - @classmethod - def parameters_model(cls) -> Optional[Type[BaseModel]]: - - # Return the custom samplers parameters model - return MySamplerParams - - # Add an init arg to take the parameters model - def __init__(self, parameters: MySamplerParams): - ... -``` - -#### Quasi-Random Sampling Strategies - -Some useful custom samplers are provided through the TRIM plugin. -To use these samplers, you must first install TRIM, from root the command is: +To install `NoPriorsSampleSelector` execute ```bash pip install plugins/operators/trim/ @@ -362,7 +324,7 @@ space: when this happens the sampler automatically falls back to CLHS to ensure the requested number of unique samples. -#### Example: Sobol Sampling +##### Example: Sobol Sampling Here we write an example using Sobol ordering for quasi-random low-discrepancy coverage. Make sure to install the TRIM package first. @@ -423,6 +385,41 @@ request_index,result_index,identifier,experiment_id,generatorid,mol,temperature, +#### Implementing a Custom Sampler + +To implement a custom sampler create a sub-class of +`orchestrator.core.discovery.samplers.BaseSampler` and implement all required +methods + +The `BaseSampler` class does not specify any `__init__` parameters. 
If your +custom class requires initialization parameters then + +- define a pydantic model for them +- override the `parameters_model` class method to return this model +- add a non keyword parameter to your custom classes `__init__` that is this + type. + +For example: + +```python +# Class for the custom samplers parameters +class MySamplerParams(BaseModel): + ... + +# Subclass of BaseSampler implementing the custom sampling logic +class MySampler(BaseSampler): + + @classmethod + def parameters_model(cls) -> Optional[Type[BaseModel]]: + + # Return the custom samplers parameters model + return MySamplerParams + + # Add an init arg to take the parameters model + def __init__(self, parameters: MySamplerParams): + ... +``` + ### Sampling all Entities If either of the following conditions are true you can specify a value of "all" From 214230288cc1af4dd641aba2a415bd6b07240dfc Mon Sep 17 00:00:00 2001 From: Daniele Lotito Date: Fri, 24 Apr 2026 10:00:55 +0100 Subject: [PATCH 17/23] fix(test): complete import refactoring --- tests/operators/test_trim_example_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/operators/test_trim_example_integration.py b/tests/operators/test_trim_example_integration.py index 7c75471d0..260e58074 100644 --- a/tests/operators/test_trim_example_integration.py +++ b/tests/operators/test_trim_example_integration.py @@ -13,7 +13,6 @@ import orchestrator.modules.operators.randomwalk # noqa: F401 from orchestrator.core.discoveryspace.config import DiscoverySpaceConfiguration -from orchestrator.core.discoveryspace.no_priors_parameters import NoPriorsParameters from orchestrator.core.discoveryspace.space import DiscoverySpace from orchestrator.core.operation.resource import ( OperationExitStateEnum, @@ -31,6 +30,7 @@ pytest.importorskip("autogluon") +from trim.samplers.no_priors_parameters import NoPriorsParameters from trim.trim_pydantic import ( AutoGluonArgs, SamplingBudget, From 
1a70404cf156267efe861bc7e7d5628844adc104 Mon Sep 17 00:00:00 2001 From: Daniele Lotito Date: Fri, 24 Apr 2026 15:22:50 +0100 Subject: [PATCH 18/23] build: remove scipy deps as it is no longer needed --- pyproject.toml | 1 - requirements.txt | 100 ----------------------------------------------- uv.lock | 11 ------ 3 files changed, 112 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5251c82ed..2300b3854 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,6 @@ dependencies = [ "pymysql[rsa]>=1.1.1", "pyyaml>=6.0.2", "ray[serve]>=2.9", - "scipy", "sqlalchemy>2", "typer>=0.22.0", "uv>=0.10.4", diff --git a/requirements.txt b/requirements.txt index a518082bd..199703253 100644 --- a/requirements.txt +++ b/requirements.txt @@ -422,7 +422,6 @@ googleapis-common-protos==1.74.0 \ # via google-api-core greenlet==3.4.0 ; platform_machine == 'AMD64' or platform_machine == 'WIN32' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'ppc64le' or platform_machine == 'win32' or platform_machine == 'x86_64' \ --hash=sha256:04403ac74fe295a361f650818de93be11b5038a78f49ccfb64d3b1be8fbf1267 \ - --hash=sha256:0e1254cf0cbaa17b04320c3a78575f29f3c161ef38f59c977108f19ffddaf077 \ --hash=sha256:1054c5a3c78e2ab599d452f23f7adafef55062a783a8e241d24f3b633ba6ff82 \ --hash=sha256:16dec271460a9a2b154e3b1c2fa1050ce6280878430320e85e08c166772e3f97 \ --hash=sha256:1a54a921561dd9518d31d2d3db4d7f80e589083063ab4d3e2e950756ef809e1a \ @@ -436,27 +435,20 @@ greenlet==3.4.0 ; platform_machine == 'AMD64' or platform_machine == 'WIN32' or --hash=sha256:5b99e87be7eba788dd5b75ba1cde5639edffdec5f91fe0d734a249535ec3408c \ --hash=sha256:5cb614ace7c27571270354e9c9f696554d073f8aa9319079dcba466bbdead711 \ --hash=sha256:636d2f95c309e35f650e421c23297d5011716be15d966e6328b367c9fc513a82 \ - --hash=sha256:6f0def07ec9a71d72315cf26c061aceee53b306c36ed38c35caba952ea1b319d \ --hash=sha256:805bebb4945094acbab757d34d6e1098be6de8966009ab9ca54f06ff492def58 \ 
--hash=sha256:8424683caf46eb0eb6f626cb95e008e8cc30d0cb675bdfa48200925c79b38a08 \ --hash=sha256:849f8bc17acd6295fcb5de8e46d55cc0e52381c56eaf50a2afd258e97bc65940 \ - --hash=sha256:89995ce5ddcd2896d89615116dd39b9703bfa0c07b583b85b89bf1b5d6eddf81 \ - --hash=sha256:8c5696c42e6bb5cfb7c6ff4453789081c66b9b91f061e5e9367fa15792644e76 \ --hash=sha256:90036ce224ed6fe75508c1907a77e4540176dcf0744473627785dd519c6f9996 \ --hash=sha256:9390ad88b652b1903814eaabd629ca184db15e0eeb6fe8a390bbf8b9106ae15a \ --hash=sha256:956215d5e355fffa7c021d168728321fd4d31fd730ac609b1653b450f6a4bc71 \ - --hash=sha256:98eedd1803353daf1cd9ef23eef23eda5a4d22f99b1f998d273a8b78b70dd47f \ --hash=sha256:9b2d9a138ffa0e306d0e2b72976d2fb10b97e690d40ab36a472acaab0838e2de \ --hash=sha256:a0a53fb071531d003b075c444014ff8f8b1a9898d36bb88abd9ac7b3524648a2 \ --hash=sha256:a19093fbad824ed7c0f355b5ff4214bffda5f1a7f35f29b31fcaa240cc0135ab \ --hash=sha256:a1c4f6b453006efb8310affb2d132832e9bbb4fc01ce6df6b70d810d38f1f6dc \ --hash=sha256:a70ed1cb0295bee1df57b63bf7f46b4e56a5c93709eea769c1fec1bb23a95875 \ - --hash=sha256:ac6a5f618be581e1e0713aecec8e54093c235e5fa17d6d8eb7ffc487e2300508 \ --hash=sha256:b45e45fe47a19051a396abb22e19e7836a59ee6c5a90f3be427343c37908d65b \ - --hash=sha256:b7857e2202aae67bc5725e0c1f6403c20a8ff46094ece015e7d474f5f7020b55 \ --hash=sha256:c660bce1940a1acae5f51f0a064f1bc785d07ea16efcb4bc708090afc4d69e83 \ --hash=sha256:d18eae9a7fb0f499efcd146b8c9750a2e1f6e0e93b5a382b3481875354a430e6 \ - --hash=sha256:d336d46878e486de7d9458653c722875547ac8d36a1cff9ffaf4a74a3c1f62eb \ --hash=sha256:ee407d4d1ca9dc632265aee1c8732c4a2d60adff848057cdebfe5fe94eb2c8a2 \ --hash=sha256:f38b81880ba28f232f1f675893a39cf7b6db25b31cc0a09bb50787ecf957e85e \ --hash=sha256:f50a96b64dafd6169e595a5c56c9146ef80333e67d4476a65a9c55f400fc22ff \ @@ -766,7 +758,6 @@ numpy==2.2.6 \ # via # ado-core # pandas - # scipy opencensus==0.11.4 \ --hash=sha256:a18487ce68bc19900336e0ff4655c5a116daf10c1b3685ece8d971bddad6a864 \ 
--hash=sha256:cbef87d8b8773064ab60e5c2a1ced58bbaa38a6d052c41aec224958ce544eff2 @@ -1257,97 +1248,6 @@ rpds-py==0.30.0 \ # via # jsonschema # referencing -scipy==1.15.3 ; python_full_version < '3.11' \ - --hash=sha256:05dc6abcd105e1a29f95eada46d4a3f251743cfd7d3ae8ddb4088047f24ea477 \ - --hash=sha256:06efcba926324df1696931a57a176c80848ccd67ce6ad020c810736bfd58eb1c \ - --hash=sha256:0a769105537aa07a69468a0eefcd121be52006db61cdd8cac8a0e68980bbb723 \ - --hash=sha256:0bdd905264c0c9cfa74a4772cdb2070171790381a5c4d312c973382fc6eaf730 \ - --hash=sha256:0ff17c0bb1cb32952c09217d8d1eed9b53d1463e5f1dd6052c7857f83127d539 \ - --hash=sha256:14ed70039d182f411ffc74789a16df3835e05dc469b898233a245cdfd7f162cb \ - --hash=sha256:185cd3d6d05ca4b44a8f1595af87f9c372bb6acf9c808e99aa3e9aa03bd98cf6 \ - --hash=sha256:18aaacb735ab38b38db42cb01f6b92a2d0d4b6aabefeb07f02849e47f8fb3594 \ - --hash=sha256:1c832e1bd78dea67d5c16f786681b28dd695a8cb1fb90af2e27580d3d0967e92 \ - --hash=sha256:263961f658ce2165bbd7b99fa5135195c3a12d9bef045345016b8b50c315cb82 \ - --hash=sha256:271e3713e645149ea5ea3e97b57fdab61ce61333f97cfae392c28ba786f9bb49 \ - --hash=sha256:2c620736bcc334782e24d173c0fdbb7590a0a436d2fdf39310a8902505008759 \ - --hash=sha256:34716e281f181a02341ddeaad584205bd2fd3c242063bd3423d61ac259ca7eba \ - --hash=sha256:39cb9c62e471b1bb3750066ecc3a3f3052b37751c7c3dfd0fd7e48900ed52982 \ - --hash=sha256:3ac07623267feb3ae308487c260ac684b32ea35fd81e12845039952f558047b8 \ - --hash=sha256:3b0334816afb8b91dab859281b1b9786934392aa3d527cd847e41bb6f45bee65 \ - --hash=sha256:40e54d5c7e7ebf1aa596c374c49fa3135f04648a0caabcb66c52884b943f02b4 \ - --hash=sha256:50f9e62461c95d933d5c5ef4a1f2ebf9a2b4e83b0db374cb3f1de104d935922e \ - --hash=sha256:52092bc0472cfd17df49ff17e70624345efece4e1a12b23783a1ac59a1b728ed \ - --hash=sha256:5380741e53df2c566f4d234b100a484b420af85deb39ea35a1cc1be84ff53a5c \ - --hash=sha256:5e721fed53187e71d0ccf382b6bf977644c533e506c4d33c3fb24de89f5c3ed5 \ - 
--hash=sha256:6487aa99c2a3d509a5227d9a5e889ff05830a06b2ce08ec30df6d79db5fcd5c5 \ - --hash=sha256:6ac6310fdbfb7aa6612408bd2f07295bcbd3fda00d2d702178434751fe48e019 \ - --hash=sha256:6cfd56fc1a8e53f6e89ba3a7a7251f7396412d655bca2aa5611c8ec9a6784a1e \ - --hash=sha256:6db907c7368e3092e24919b5e31c76998b0ce1684d51a90943cb0ed1b4ffd6c1 \ - --hash=sha256:721d6b4ef5dc82ca8968c25b111e307083d7ca9091bc38163fb89243e85e3889 \ - --hash=sha256:76ad1fb5f8752eabf0fa02e4cc0336b4e8f021e2d5f061ed37d6d264db35e3ca \ - --hash=sha256:79167bba085c31f38603e11a267d862957cbb3ce018d8b38f79ac043bc92d825 \ - --hash=sha256:795c46999bae845966368a3c013e0e00947932d68e235702b5c3f6ea799aa8c9 \ - --hash=sha256:7e11270a000969409d37ed399585ee530b9ef6aa99d50c019de4cb01e8e54e62 \ - --hash=sha256:8c9ed3ba2c8a2ce098163a9bdb26f891746d02136995df25227a20e71c396ebb \ - --hash=sha256:993439ce220d25e3696d1b23b233dd010169b62f6456488567e830654ee37a6b \ - --hash=sha256:9d61e97b186a57350f6d6fd72640f9e99d5a4a2b8fbf4b9ee9a841eab327dc13 \ - --hash=sha256:9db984639887e3dffb3928d118145ffe40eff2fa40cb241a306ec57c219ebbbb \ - --hash=sha256:9e2abc762b0811e09a0d3258abee2d98e0c703eee49464ce0069590846f31d40 \ - --hash=sha256:a345928c86d535060c9c2b25e71e87c39ab2f22fc96e9636bd74d1dbf9de448c \ - --hash=sha256:ad3432cb0f9ed87477a8d97f03b763fd1d57709f1bbde3c9369b1dff5503b253 \ - --hash=sha256:ae48a786a28412d744c62fd7816a4118ef97e5be0bee968ce8f0a2fba7acf3bb \ - --hash=sha256:aef683a9ae6eb00728a542b796f52a5477b78252edede72b8327a886ab63293f \ - --hash=sha256:b90ab29d0c37ec9bf55424c064312930ca5f4bde15ee8619ee44e69319aab163 \ - --hash=sha256:c05045d8b9bfd807ee1b9f38761993297b10b245f012b11b13b91ba8945f7e45 \ - --hash=sha256:c9deabd6d547aee2c9a81dee6cc96c6d7e9a9b1953f74850c179f91fdc729cb7 \ - --hash=sha256:dde4fc32993071ac0c7dd2d82569e544f0bdaff66269cb475e0f369adad13f11 \ - --hash=sha256:eae3cf522bc7df64b42cad3925c876e1b0b6c35c1337c93e12c0f366f55b0eaf \ - --hash=sha256:ed7284b21a7a0c8f1b6e5977ac05396c0d008b89e05498c8b7e8f4a1423bba0e \ - 
--hash=sha256:f77f853d584e72e874d87357ad70f44b437331507d1c311457bed8ed2b956126 - # via ado-core -scipy==1.16.3 ; python_full_version >= '3.11' \ - --hash=sha256:0151a0749efeaaab78711c78422d413c583b8cdd2011a3c1d6c794938ee9fdb2 \ - --hash=sha256:01e87659402762f43bd2fee13370553a17ada367d42e7487800bf2916535aecb \ - --hash=sha256:03192a35e661470197556de24e7cb1330d84b35b94ead65c46ad6f16f6b28f2a \ - --hash=sha256:0553371015692a898e1aa858fed67a3576c34edefa6b7ebdb4e9dde49ce5c203 \ - --hash=sha256:062246acacbe9f8210de8e751b16fc37458213f124bef161a5a02c7a39284304 \ - --hash=sha256:0c3b4dd3d9b08dbce0f3440032c52e9e2ab9f96ade2d3943313dfe51a7056959 \ - --hash=sha256:0c623a54f7b79dd88ef56da19bc2873afec9673a48f3b85b18e4d402bdd29a5a \ - --hash=sha256:16b8bc35a4cc24db80a0ec836a9286d0e31b2503cb2fd7ff7fb0e0374a97081d \ - --hash=sha256:1fb2472e72e24d1530debe6ae078db70fb1605350c88a3d14bc401d6306dbffe \ - --hash=sha256:2a207a6ce9c24f1951241f4693ede2d393f59c07abc159b2cb2be980820e01fb \ - --hash=sha256:2b71d93c8a9936046866acebc915e2af2e292b883ed6e2cbe5c34beb094b82d9 \ - --hash=sha256:2d1ae2cf0c350e7705168ff2429962a89ad90c2d49d1dd300686d8b2a5af22fc \ - --hash=sha256:3d4a07a8e785d80289dfe66b7c27d8634a773020742ec7187b85ccc4b0e7b686 \ - --hash=sha256:40be6cf99e68b6c4321e9f8782e7d5ff8265af28ef2cd56e9c9b2638fa08ad97 \ - --hash=sha256:50a3dbf286dbc7d84f176f9a1574c705f277cb6565069f88f60db9eafdbe3ee2 \ - --hash=sha256:532fb5ad6a87e9e9cd9c959b106b73145a03f04c7d57ea3e6f6bb60b86ab0876 \ - --hash=sha256:53c3844d527213631e886621df5695d35e4f6a75f620dca412bcd292f6b87d78 \ - --hash=sha256:56edc65510d1331dae01ef9b658d428e33ed48b4f77b1d51caf479a0253f96dc \ - --hash=sha256:57d01cb6f85e34f0946b33caa66e892aae072b64b034183f3d87c4025802a119 \ - --hash=sha256:5803c5fadd29de0cf27fa08ccbfe7a9e5d741bf63e4ab1085437266f12460ff9 \ - --hash=sha256:63d3cdacb8a824a295191a723ee5e4ea7768ca5ca5f2838532d9f2e2b3ce2135 \ - --hash=sha256:663b8d66a8748051c3ee9c96465fb417509315b99c71550fda2591d7dd634234 \ - 
--hash=sha256:72d1717fd3b5e6ec747327ce9bda32d5463f472c9dce9f54499e81fbd50245a1 \ - --hash=sha256:7dc1360c06535ea6116a2220f760ae572db9f661aba2d88074fe30ec2aa1ff88 \ - --hash=sha256:81fc5827606858cf71446a5e98715ba0e11f0dbc83d71c7409d05486592a45d6 \ - --hash=sha256:8be1ca9170fcb6223cc7c27f4305d680ded114a1567c0bd2bfcbf947d1b17511 \ - --hash=sha256:8d09d72dc92742988b0e7750bddb8060b0c7079606c0d24a8cc8e9c9c11f9079 \ - --hash=sha256:9452781bd879b14b6f055b26643703551320aa8d79ae064a71df55c00286a184 \ - --hash=sha256:96491a6a54e995f00a28a3c3badfff58fd093bf26cd5fb34a2188c8c756a3a2c \ - --hash=sha256:a8a26c78ef223d3e30920ef759e25625a0ecdd0d60e5a8818b7513c3e5384cf2 \ - --hash=sha256:b7180967113560cca57418a7bc719e30366b47959dd845a93206fbed693c867e \ - --hash=sha256:b81c27fc41954319a943d43b20e07c40bdcd3ff7cf013f4fb86286faefe546c4 \ - --hash=sha256:bea0a62734d20d67608660f69dcda23e7f90fb4ca20974ab80b6ed40df87a005 \ - --hash=sha256:c5192722cffe15f9329a3948c4b1db789fbb1f05c97899187dcf009b283aea70 \ - --hash=sha256:c97176013d404c7346bf57874eaac5187d969293bf40497140b0a2b2b7482e07 \ - --hash=sha256:cd13e354df9938598af2be05822c323e97132d5e6306b83a3b4ee6724c6e522e \ - --hash=sha256:d2ec56337675e61b312179a1ad124f5f570c00f920cc75e1000025451b88241c \ - --hash=sha256:deb3841c925eeddb6afc1e4e4a45e418d19ec7b87c5df177695224078e8ec733 \ - --hash=sha256:e7efa2681ea410b10dde31a52b18b0154d66f2485328830e45fdf183af5aefc6 \ - --hash=sha256:eab43fae33a0c39006a88096cd7b4f4ef545ea0447d250d5ac18202d40b6611d \ - --hash=sha256:fb4b29f4cf8cc5a8d628bc8d8e26d12d7278cd1f219f22698a378c3d67db5e4b - # via ado-core shellingham==1.5.4 \ --hash=sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686 \ --hash=sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de diff --git a/uv.lock b/uv.lock index 4465980c2..853607571 100644 --- a/uv.lock +++ b/uv.lock @@ -80,8 +80,6 @@ dependencies = [ { name = "pymysql", extra = ["rsa"] }, { name = "pyyaml" }, { name = "ray", extra = ["serve"] }, - { 
name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "scipy", version = "1.16.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "sqlalchemy" }, { name = "typer" }, { name = "uv" }, @@ -147,7 +145,6 @@ requires-dist = [ { name = "pymysql", extras = ["rsa"], specifier = ">=1.1.1" }, { name = "pyyaml", specifier = ">=6.0.2" }, { name = "ray", extras = ["serve"], specifier = ">=2.9" }, - { name = "scipy" }, { name = "sqlalchemy", specifier = ">2" }, { name = "typer", specifier = ">=0.22.0" }, { name = "uv", specifier = ">=0.10.4" }, @@ -2597,18 +2594,14 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0c/bc/e30e1e3d5e8860b0e0ce4d2b16b2681b77fd13542fc0d72f7e3c22d16eff/greenlet-3.4.0-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:d18eae9a7fb0f499efcd146b8c9750a2e1f6e0e93b5a382b3481875354a430e6", size = 284315, upload-time = "2026-04-08T17:02:52.322Z" }, { url = "https://files.pythonhosted.org/packages/5b/cc/e023ae1967d2a26737387cac083e99e47f65f58868bd155c4c80c01ec4e0/greenlet-3.4.0-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:636d2f95c309e35f650e421c23297d5011716be15d966e6328b367c9fc513a82", size = 601916, upload-time = "2026-04-08T16:24:35.533Z" }, { url = "https://files.pythonhosted.org/packages/67/32/5be1677954b6d8810b33abe94e3eb88726311c58fa777dc97e390f7caf5a/greenlet-3.4.0-cp310-cp310-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:234582c20af9742583c3b2ddfbdbb58a756cfff803763ffaae1ac7990a9fac31", size = 616399, upload-time = "2026-04-08T16:30:54.536Z" }, - { url = "https://files.pythonhosted.org/packages/82/0a/3a4af092b09ea02bcda30f33fd7db397619132fe52c6ece24b9363130d34/greenlet-3.4.0-cp310-cp310-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ac6a5f618be581e1e0713aecec8e54093c235e5fa17d6d8eb7ffc487e2300508", size = 621077, 
upload-time = "2026-04-08T16:40:34.946Z" }, { url = "https://files.pythonhosted.org/packages/74/bf/2d58d5ea515704f83e34699128c9072a34bea27d2b6a556e102105fe62a5/greenlet-3.4.0-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:523677e69cd4711b5a014e37bc1fb3a29947c3e3a5bb6a527e1cc50312e5a398", size = 611978, upload-time = "2026-04-08T15:56:31.335Z" }, - { url = "https://files.pythonhosted.org/packages/8c/39/3786520a7d5e33ee87b3da2531f589a3882abf686a42a3773183a41ef010/greenlet-3.4.0-cp310-cp310-manylinux_2_39_riscv64.whl", hash = "sha256:d336d46878e486de7d9458653c722875547ac8d36a1cff9ffaf4a74a3c1f62eb", size = 416893, upload-time = "2026-04-08T16:43:02.392Z" }, { url = "https://files.pythonhosted.org/packages/bd/69/6525049b6c179d8a923256304d8387b8bdd4acab1acf0407852463c6d514/greenlet-3.4.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b45e45fe47a19051a396abb22e19e7836a59ee6c5a90f3be427343c37908d65b", size = 1571957, upload-time = "2026-04-08T16:26:17.041Z" }, { url = "https://files.pythonhosted.org/packages/4e/6c/bbfb798b05fec736a0d24dc23e81b45bcee87f45a83cfb39db031853bddc/greenlet-3.4.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:5434271357be07f3ad0936c312645853b7e689e679e29310e2de09a9ea6c3adf", size = 1637223, upload-time = "2026-04-08T15:57:27.556Z" }, { url = "https://files.pythonhosted.org/packages/b7/7d/981fe0e7c07bd9d5e7eb18decb8590a11e3955878291f7a7de2e9c668eb7/greenlet-3.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:a19093fbad824ed7c0f355b5ff4214bffda5f1a7f35f29b31fcaa240cc0135ab", size = 237902, upload-time = "2026-04-08T17:03:14.16Z" }, { url = "https://files.pythonhosted.org/packages/fb/c6/dba32cab7e3a625b011aa5647486e2d28423a48845a2998c126dd69c85e1/greenlet-3.4.0-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:805bebb4945094acbab757d34d6e1098be6de8966009ab9ca54f06ff492def58", size = 285504, upload-time = "2026-04-08T15:52:14.071Z" }, { url = 
"https://files.pythonhosted.org/packages/54/f4/7cb5c2b1feb9a1f50e038be79980dfa969aa91979e5e3a18fdbcfad2c517/greenlet-3.4.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:439fc2f12b9b512d9dfa681c5afe5f6b3232c708d13e6f02c845e0d9f4c2d8c6", size = 605476, upload-time = "2026-04-08T16:24:37.064Z" }, { url = "https://files.pythonhosted.org/packages/d6/af/b66ab0b2f9a4c5a867c136bf66d9599f34f21a1bcca26a2884a29c450bd9/greenlet-3.4.0-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a70ed1cb0295bee1df57b63bf7f46b4e56a5c93709eea769c1fec1bb23a95875", size = 618336, upload-time = "2026-04-08T16:30:56.59Z" }, - { url = "https://files.pythonhosted.org/packages/6d/31/56c43d2b5de476f77d36ceeec436328533bff960a4cba9a07616e93063ab/greenlet-3.4.0-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8c5696c42e6bb5cfb7c6ff4453789081c66b9b91f061e5e9367fa15792644e76", size = 625045, upload-time = "2026-04-08T16:40:37.111Z" }, { url = "https://files.pythonhosted.org/packages/e5/5c/8c5633ece6ba611d64bf2770219a98dd439921d6424e4e8cf16b0ac74ea5/greenlet-3.4.0-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c660bce1940a1acae5f51f0a064f1bc785d07ea16efcb4bc708090afc4d69e83", size = 613515, upload-time = "2026-04-08T15:56:32.478Z" }, - { url = "https://files.pythonhosted.org/packages/80/ca/704d4e2c90acb8bdf7ae593f5cbc95f58e82de95cc540fb75631c1054533/greenlet-3.4.0-cp311-cp311-manylinux_2_39_riscv64.whl", hash = "sha256:89995ce5ddcd2896d89615116dd39b9703bfa0c07b583b85b89bf1b5d6eddf81", size = 419745, upload-time = "2026-04-08T16:43:04.022Z" }, { url = "https://files.pythonhosted.org/packages/a9/df/950d15bca0d90a0e7395eb777903060504cdb509b7b705631e8fb69ff415/greenlet-3.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ee407d4d1ca9dc632265aee1c8732c4a2d60adff848057cdebfe5fe94eb2c8a2", size = 1574623, upload-time = "2026-04-08T16:26:18.596Z" }, { url = 
"https://files.pythonhosted.org/packages/1a/e7/0839afab829fcb7333c9ff6d80c040949510055d2d4d63251f0d1c7c804e/greenlet-3.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:956215d5e355fffa7c021d168728321fd4d31fd730ac609b1653b450f6a4bc71", size = 1639579, upload-time = "2026-04-08T15:57:29.231Z" }, { url = "https://files.pythonhosted.org/packages/d9/2b/b4482401e9bcaf9f5c97f67ead38db89c19520ff6d0d6699979c6efcc200/greenlet-3.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:5cb614ace7c27571270354e9c9f696554d073f8aa9319079dcba466bbdead711", size = 238233, upload-time = "2026-04-08T17:02:54.286Z" }, @@ -2616,9 +2609,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/65/8b/3669ad3b3f247a791b2b4aceb3aa5a31f5f6817bf547e4e1ff712338145a/greenlet-3.4.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:1a54a921561dd9518d31d2d3db4d7f80e589083063ab4d3e2e950756ef809e1a", size = 286902, upload-time = "2026-04-08T15:52:12.138Z" }, { url = "https://files.pythonhosted.org/packages/38/3e/3c0e19b82900873e2d8469b590a6c4b3dfd2b316d0591f1c26b38a4879a5/greenlet-3.4.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:16dec271460a9a2b154e3b1c2fa1050ce6280878430320e85e08c166772e3f97", size = 606099, upload-time = "2026-04-08T16:24:38.408Z" }, { url = "https://files.pythonhosted.org/packages/b5/33/99fef65e7754fc76a4ed14794074c38c9ed3394a5bd129d7f61b705f3168/greenlet-3.4.0-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:90036ce224ed6fe75508c1907a77e4540176dcf0744473627785dd519c6f9996", size = 618837, upload-time = "2026-04-08T16:30:58.298Z" }, - { url = "https://files.pythonhosted.org/packages/44/57/eae2cac10421feae6c0987e3dc106c6d86262b1cb379e171b017aba893a6/greenlet-3.4.0-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6f0def07ec9a71d72315cf26c061aceee53b306c36ed38c35caba952ea1b319d", size = 624901, upload-time = "2026-04-08T16:40:38.981Z" }, { url = 
"https://files.pythonhosted.org/packages/36/f7/229f3aed6948faa20e0616a0b8568da22e365ede6a54d7d369058b128afd/greenlet-3.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a1c4f6b453006efb8310affb2d132832e9bbb4fc01ce6df6b70d810d38f1f6dc", size = 615062, upload-time = "2026-04-08T15:56:33.766Z" }, - { url = "https://files.pythonhosted.org/packages/6a/8a/0e73c9b94f31d1cc257fe79a0eff621674141cdae7d6d00f40de378a1e42/greenlet-3.4.0-cp312-cp312-manylinux_2_39_riscv64.whl", hash = "sha256:0e1254cf0cbaa17b04320c3a78575f29f3c161ef38f59c977108f19ffddaf077", size = 423927, upload-time = "2026-04-08T16:43:05.293Z" }, { url = "https://files.pythonhosted.org/packages/08/97/d988180011aa40135c46cd0d0cf01dd97f7162bae14139b4a3ef54889ba5/greenlet-3.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:9b2d9a138ffa0e306d0e2b72976d2fb10b97e690d40ab36a472acaab0838e2de", size = 1573511, upload-time = "2026-04-08T16:26:20.058Z" }, { url = "https://files.pythonhosted.org/packages/d4/0f/a5a26fe152fb3d12e6a474181f6e9848283504d0afd095f353d85726374b/greenlet-3.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:8424683caf46eb0eb6f626cb95e008e8cc30d0cb675bdfa48200925c79b38a08", size = 1640396, upload-time = "2026-04-08T15:57:30.88Z" }, { url = "https://files.pythonhosted.org/packages/42/cf/bb2c32d9a100e36ee9f6e38fad6b1e082b8184010cb06259b49e1266ca01/greenlet-3.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:a0a53fb071531d003b075c444014ff8f8b1a9898d36bb88abd9ac7b3524648a2", size = 238892, upload-time = "2026-04-08T17:03:10.094Z" }, @@ -2626,9 +2617,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7a/75/7e9cd1126a1e1f0cd67b0eda02e5221b28488d352684704a78ed505bd719/greenlet-3.4.0-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:43748988b097f9c6f09364f260741aa73c80747f63389824435c7a50bfdfd5c1", size = 285856, upload-time = "2026-04-08T15:52:45.82Z" }, { url = 
"https://files.pythonhosted.org/packages/9d/c4/3e2df392e5cb199527c4d9dbcaa75c14edcc394b45040f0189f649631e3c/greenlet-3.4.0-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5566e4e2cd7a880e8c27618e3eab20f3494452d12fd5129edef7b2f7aa9a36d1", size = 610208, upload-time = "2026-04-08T16:24:39.674Z" }, { url = "https://files.pythonhosted.org/packages/da/af/750cdfda1d1bd30a6c28080245be8d0346e669a98fdbae7f4102aa95fff3/greenlet-3.4.0-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:1054c5a3c78e2ab599d452f23f7adafef55062a783a8e241d24f3b633ba6ff82", size = 621269, upload-time = "2026-04-08T16:30:59.767Z" }, - { url = "https://files.pythonhosted.org/packages/e0/93/c8c508d68ba93232784bbc1b5474d92371f2897dfc6bc281b419f2e0d492/greenlet-3.4.0-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:98eedd1803353daf1cd9ef23eef23eda5a4d22f99b1f998d273a8b78b70dd47f", size = 628455, upload-time = "2026-04-08T16:40:40.698Z" }, { url = "https://files.pythonhosted.org/packages/54/78/0cbc693622cd54ebe25207efbb3a0eb07c2639cb8594f6e3aaaa0bb077a8/greenlet-3.4.0-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f82cb6cddc27dd81c96b1506f4aa7def15070c3b2a67d4e46fd19016aacce6cf", size = 617549, upload-time = "2026-04-08T15:56:34.893Z" }, - { url = "https://files.pythonhosted.org/packages/7f/46/cfaaa0ade435a60550fd83d07dfd5c41f873a01da17ede5c4cade0b9bab8/greenlet-3.4.0-cp313-cp313-manylinux_2_39_riscv64.whl", hash = "sha256:b7857e2202aae67bc5725e0c1f6403c20a8ff46094ece015e7d474f5f7020b55", size = 426238, upload-time = "2026-04-08T16:43:06.865Z" }, { url = "https://files.pythonhosted.org/packages/ba/c0/8966767de01343c1ff47e8b855dc78e7d1a8ed2b7b9c83576a57e289f81d/greenlet-3.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:227a46251ecba4ff46ae742bc5ce95c91d5aceb4b02f885487aff269c127a729", size = 1575310, upload-time = "2026-04-08T16:26:21.671Z" }, { url = 
"https://files.pythonhosted.org/packages/b8/38/bcdc71ba05e9a5fda87f63ffc2abcd1f15693b659346df994a48c968003d/greenlet-3.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5b99e87be7eba788dd5b75ba1cde5639edffdec5f91fe0d734a249535ec3408c", size = 1640435, upload-time = "2026-04-08T15:57:32.572Z" }, { url = "https://files.pythonhosted.org/packages/a1/c2/19b664b7173b9e4ef5f77e8cef9f14c20ec7fce7920dc1ccd7afd955d093/greenlet-3.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:849f8bc17acd6295fcb5de8e46d55cc0e52381c56eaf50a2afd258e97bc65940", size = 238760, upload-time = "2026-04-08T17:04:03.878Z" }, From 0b30157e59755b8fe7e0e28d1695b012725e223c Mon Sep 17 00:00:00 2001 From: Daniele Lotito Date: Fri, 24 Apr 2026 15:25:53 +0100 Subject: [PATCH 19/23] chore: remove legacy content --- .../trim/src/trim/samplers/no_priors_parameters.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/plugins/operators/trim/src/trim/samplers/no_priors_parameters.py b/plugins/operators/trim/src/trim/samplers/no_priors_parameters.py index 1271ac769..c1240c4b1 100644 --- a/plugins/operators/trim/src/trim/samplers/no_priors_parameters.py +++ b/plugins/operators/trim/src/trim/samplers/no_priors_parameters.py @@ -15,8 +15,6 @@ class NoPriorsParameters(BaseModel): strategy (str): sampling subroutine: - 'random': selects random points from the beginning - - 'one_shift': refer to one_shift_then_random_points_high_dimensional_sampling - - 'recursive_aggregation': refer to recursive_aggregation_high_dimensional_sampling - 'clhs': refer to concatenated_latin_hypercube_sampling - 'sobol': sobol sampling """ @@ -48,17 +46,14 @@ class NoPriorsParameters(BaseModel): ] = 1 sampling_strategy: Annotated[ - Literal["random", "one_shift", "recursive_aggregation", "clhs", "sobol"], + Literal["random", "clhs", "sobol"], BeforeValidator(lambda s: s.lower()), Field( description=( "Sampling subroutine. 
Supported values:\n" " - 'random': selects random points from the beginning\n" - " - 'one_shift': see one_shift_then_random_points_high_dimensional_sampling\n" - " - 'recursive_aggregation': see recursive_aggregation_high_dimensional_sampling\n" " - 'clhs': dimension-wise random without replacement until each dim cycles\n" " - 'sobol': sobol sampling via scipy\n" - "Aliases: 'random_shifts' → 'recursive_aggregation'.\n" "Validation is case-insensitive; value is normalized to lowercase." ), ), From 9766e6050047ce3918d4deaf3f12ea3f5dea812d Mon Sep 17 00:00:00 2001 From: Daniele Lotito Date: Fri, 24 Apr 2026 15:45:18 +0100 Subject: [PATCH 20/23] fix: missing check bug --- plugins/operators/trim/src/trim/samplers/no_priors_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/plugins/operators/trim/src/trim/samplers/no_priors_utils.py b/plugins/operators/trim/src/trim/samplers/no_priors_utils.py index 36a06e829..ccf6a2544 100644 --- a/plugins/operators/trim/src/trim/samplers/no_priors_utils.py +++ b/plugins/operators/trim/src/trim/samplers/no_priors_utils.py @@ -74,10 +74,10 @@ def get_index_list_van_der_corput( if len(sampled_indices) == length_segment: maximal_indices_list = list(range(length_segment)) - if sampled_indices.sort() != maximal_indices_list: + if sorted(sampled_indices) != maximal_indices_list: logging.error( - "Sampled indices do not correspond to [0,..., max_n_indices -1]" - "Returning list(range(max_n_indices)" + "Sampled indices do not correspond to [0,..., max_n_indices -1]. 
" + "Returning list(range(max_n_indices))" ) return maximal_indices_list From ba84268e9cefd828d1a39987f222eb030091fa62 Mon Sep 17 00:00:00 2001 From: Daniele Lotito Date: Fri, 24 Apr 2026 15:46:55 +0100 Subject: [PATCH 21/23] feat: enable synchronous entity iterator --- .../src/trim/samplers/no_priors_sampler.py | 64 +++++++++++++++++-- 1 file changed, 58 insertions(+), 6 deletions(-) diff --git a/plugins/operators/trim/src/trim/samplers/no_priors_sampler.py b/plugins/operators/trim/src/trim/samplers/no_priors_sampler.py index d685b4908..3030d4965 100644 --- a/plugins/operators/trim/src/trim/samplers/no_priors_sampler.py +++ b/plugins/operators/trim/src/trim/samplers/no_priors_sampler.py @@ -113,18 +113,70 @@ async def iterator() -> typing.AsyncGenerator[list[Entity], None]: # type: igno def entityIterator( self, discoverySpace: DiscoverySpace, batchsize: int = 1 ) -> typing.Generator[list[Entity], None, None]: - """Returns an remoteEntityIterator that returns entities in order""" + """ + Generate entities for no-priors characterization sampling (synchronous version). + + Orders the target space using a high-dimensional sampling strategy (e.g., CLHS, Sobol) + without relying on prior model knowledge or feature importance. + + Args: + discoverySpace: The discovery space to sample from + batchsize: Number of entities to yield per iteration + + Yields: + List of Entity objects to be measured, in the determined order + """ def iterator_closure( space: DiscoverySpace, ) -> typing.Callable[[], typing.Generator[list[Entity], None, None]]: - # list_of_entities = list(...) 
# type: ignore[name-defined] - # numberEntities = len(list_of_entities) + logger_no_priors.info("Characterization with no-priors starts.\n") + logger_no_priors.info(f"Parameters are:\n{self.params}\n\n") + + source_df, target_df = get_source_and_target( + space, self.params.targetOutput + ) + logger_no_priors.info(f"Target dataframe has length {len(target_df)}") + + # The 'samples' parameter specifies the number of NEW entities to sample, + # regardless of how many entities have already been measured in the space + logger_no_priors.info( + f"Space has {len(source_df)} measured entities. " + f"Sampling {self.params.samples} new entities as requested." + ) + target_df = order_df_for_sampling_with_no_priors( + target_df, + [cp.identifier for cp in space.entitySpace.constitutiveProperties], + self.params.samples, + strategy=self.params.sampling_strategy, + ) + list_of_entities_for_no_prior_characterization = ( + get_list_of_entities_from_df_and_space(df=target_df, space=space) + ) + + logger_no_priors.info( + "\n\nCharacterization with no-priors finished. 
Starting Iterative Modeling.\n" + ) - def iterator() -> typing.Generator[list[Entity], None, None]: # type: ignore[name-defined] - raise NotImplementedError - # ...for i in range(0, numberEntities, batchsize): + def iterator() -> typing.Generator[list[Entity], None, None]: + logger_no_priors.info( + "\n\nIteration over sorted entities for no priors characterization starts.\n" + ) + for i in range( + 0, len(list_of_entities_for_no_prior_characterization), batchsize + ): + entities = list_of_entities_for_no_prior_characterization[ + i : i + batchsize + ] + if len(entities) == 0: + logger_no_priors.info( + "\n\nCharacterization with no-priors finished.\n" + ) + break + else: + yield entities + logger_no_priors.info("\n\nCharacterization with no-priors finished.\n") return iterator From b25f4f862642ba79e4f13c762f5d551e03b85ca9 Mon Sep 17 00:00:00 2001 From: michaelj Date: Sat, 25 Apr 2026 18:39:55 +0100 Subject: [PATCH 22/23] docs(website): random_walk Reorder sections and header levels. --- website/docs/operators/random-walk.md | 137 +++++++++++++------------- 1 file changed, 70 insertions(+), 67 deletions(-) diff --git a/website/docs/operators/random-walk.md b/website/docs/operators/random-walk.md index 674e17193..e9327a250 100644 --- a/website/docs/operators/random-walk.md +++ b/website/docs/operators/random-walk.md @@ -59,25 +59,6 @@ After the second operation: replayed (as they were already measured during the first operation) - The timeseries of this second operation is stored. It has 200 entities in it. -## Controlling sampling and measurements: Continuous batching - -When a `random_walk` operation encounters an unmeasured entity in the -`discoveryspace`, it applies the experiments defined by its `measurementspace`. -Depending on the experiments, you may want to control how many concurrent -experiments are being executed. 
- `random_walk` uses continuous batching to set the number of concurrent -**requested** experiments and ensure that, as far as possible, there is always -this number of experiments in flight. - -This approach maximizes throughput compared to standard batch-wise submission. -In the normal case the time to finish measuring batch of N entities is, at a -minimum, the time taken for the longest experiment to complete. This means if -one experiment is very long and the others short, there can be capacity in the -system for (N-1) additional entities to be measured but it will not be used. - -The next section explains more about configuring continuous batching - ## Configuring a `random_walk` operation The parameters for a `random_walk` operation are (default values shown): @@ -123,6 +104,8 @@ spaces: - your-spaces ``` +The following sections explain the different options + !!! info end You can get a default `random_walk` operation template and the schema of its @@ -131,10 +114,27 @@ spaces: The information output by this command should always be preferred over the information presented here if there is an inconsistency. +## Continuous batching + +When a `random_walk` operation encounters an unmeasured entity in the +`discoveryspace`, it applies the experiments defined by its `measurementspace`. +Depending on the experiments, you may want to control how many concurrent +experiments are being executed. + +`random_walk` uses continuous batching to set the number of concurrent +**requested** experiments and ensure that, as far as possible, there is always +this number of experiments in flight. + +This approach maximizes throughput compared to standard batch-wise submission. +In the normal case the time to finish measuring a batch of N entities is, at a +minimum, the time taken for the longest experiment to complete.
This means if +one experiment is very long and the others short, there can be capacity in the +system for (N-1) additional entities to be measured but it will not be used. + ### Batch Size and Concurrent Experiments -When it comes to managing resources during an exploration, the key variable one -wants to control is the number of concurrent experiments. +When it comes to managing resources during an exploration, the key variable +to control is the number of concurrent experiments. For the `random_walk` operator, this number is its `batchSize` parameter (the number of initial entities submitted) multiplied by the number of experiments in @@ -151,7 +151,37 @@ this many concurrent experiment requests during the operation. Hence, continuous batching can only maintain that there are N experiments requested at any time. -### Base Sampling Types and Modes +### Sampling all Entities + +If either of the following conditions is true you can specify a value of "all" +for the `numberOfEntities` field in the random walk configuration: + +- All dimensions in the `entityspace`s are discrete and bounded or categorical +- The sampling type is `selector` i.e. you are iterating over an existing set + number of entities in a `samplestore` + +In the first case `all` will be converted to the size of the space. In the +second case `all` will be converted to the number of matching entities in the +`samplestore`. + +If both of these conditions are False the `random_walk` operator will raise a +ValueError when the execution starts. + +!!! info end + + Depending on the Filter settings a randomwalk operation may not sample "all" + entities even if "all" is specified. This is because the filter may filter out + some entities. + +!!! warning end + + For `discoveryspaces` where one/both of the above conditions are True setting + `numberOfEntities` greater than the corresponding size (size of space, or number + of matching entities in `samplestore`) will raise a ValueError. This means you
This means you + cannot set `numberOfEntities` to an arbitrarily large number to ensure sampling + all of them - use `all` instead. + +## Basic Sampling The `samplerConfig` field controls how Entities are sampled during the operation. The base `samplerConfig` is shown in the examples above and has the @@ -163,7 +193,7 @@ samplerType: selector grouping: [] ``` -#### Sampling Types +### Sampling Types There are two sampling types: `generator` and `selector`. @@ -175,7 +205,7 @@ are bounded. The `selector` sampling type draws _existing matching entities_ from the `samplestore` of the `discoveryspace` i.e. it doesn't use the entity space. -#### Sampler Modes +### Sampler Modes Both sampling types support four modes, which can be categorised as flat or grouped: @@ -230,7 +260,7 @@ for x in propertyN.values: entity({'propertyN':x, 'propertyN_1':y, ..., 'property1':z}) ``` -#### Why Grouped Modes? +### Why Grouped Modes? The advantage of the group modes is that they can allow [actuators](../actuators/working-with-actuators.md) to reuse their test @@ -248,7 +278,7 @@ allows. See the docs of the specific actuator you are using to see if and how it can benefit from grouping. -#### Enabling Grouping +### Enabling Grouping To use the grouped modes (`randomgrouped`, `sequentialgrouped`) you need to supply a list of constitutive properties to group by using the `grouping` @@ -385,7 +415,7 @@ request_index,result_index,identifier,experiment_id,generatorid,mol,temperature, -#### Implementing a Custom Sampler +### Implementing a Custom Sampler To implement a custom sampler create a sub-class of `orchestrator.core.discovery.samplers.BaseSampler` and implement all required @@ -420,37 +450,7 @@ class MySampler(BaseSampler): ... 
``` -### Sampling all Entities - -If either of the following conditions are true you can specify a value of "all" -for the `numberOfEntities` field in the random walk configuration: - -- All dimensions in the `entityspace`s are discrete and bounded or categorical -- The sampling type is `selector` i.e. you are iterating over an existing set - number of entities in a `samplestore` - -In the first case `all` will be converted to the size of the space. In the -second case `all` will be converted to the number of matching entities in the -`samplestore`. - -If both of these conditions is False the `random_walk` operator will raise a -ValueError when the execution starts. - -!!! info end - - Depending on the Filter settings a randomwalk operation may not sample "all" - entities even if "all" is specified. This is because the filter may filter out - some entities. - -!!! warning end - - For `discoveryspaces` where one/both of the above conditions are True setting - `numberOfEntities` greater than the corresponding size (size of space, or number - of matching entities in `samplestore`) will raise a ValueError. This means you - cannot set `numberOfEntities` to an arbitrarily large number to ensure sampling - all of them - use `all` instead. - -### Filtering Entities +## Filtering Entities In some circumstance you may want to only sample a subset of Entities. Some examples include @@ -474,26 +474,29 @@ which can take the following values: - `measured`: Only Entities fully measured by the experiments in the `measurementspace` will be sampled -### Multiple Measurement +## Memoization: Reusing existing measurements -By setting `singleMeasurement:` to False the random walk operation will measure -ALL entities it samples, even if they already have measurements. +If `singleMeasurement:` is False, all experiments are applied to +ALL entities sampled, even if they already have the results for that +experiment. -If entities have multiple measurements e.g. 
you turned this off and then turned -it on again, then if an entity has multiple measurements each one will be -replayed. +By setting `singleMeasurement:` to True (the default), a random walk operation +will check if an experiment has already been applied to an entity and, +if it has, reuse (a.k.a. replay) the result. -Check [replayed measurements](explore_operators.md#memoization-replaying-measurements) +If the entity has multiple results for the same experiment, each one will be +replayed. +See [replayed measurements](explore_operators.md#memoization-replaying-measurements) for more details. -### Retrying Failed Measurements +## Retrying Failed Measurements If the measurement of an entity by an experiment fails `random_walk` can retry it. The parameter controlling this is `maxRetries` which by default is 0 - no retries. If `maxRetries` is N then failing measurements will be retried up to `N` times. -#### Experiment request index v number of experiments requested +### Experiment request index v number of experiments requested To understand a `random_walk` operations logs when maxRetries is greater than 0 it's necessary to understand how it tracks the entity+experiment combinations it From eb2defa5adb0581bb6109952c0dffa2bfc17044f Mon Sep 17 00:00:00 2001 From: Daniele Lotito Date: Mon, 27 Apr 2026 09:21:57 +0100 Subject: [PATCH 23/23] fix: operationInfo is None Added ternary check: operationInfo.actuatorConfigurationIdentifiers if operationInfo else [] Mirrors the guard already present in the no-priors block (lines 122-126) --- plugins/operators/trim/src/trim/operator.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/plugins/operators/trim/src/trim/operator.py b/plugins/operators/trim/src/trim/operator.py index 248fd8806..5873e8616 100644 --- a/plugins/operators/trim/src/trim/operator.py +++ b/plugins/operators/trim/src/trim/operator.py @@ -171,7 +171,11 @@ def trim( operationInfo=FunctionOperationInfo.model_validate( { "metadata": {"completed 
operation": "Iterative Modeling Operation"}, - "actuatorConfigurationIdentifiers": operationInfo.actuatorConfigurationIdentifiers, + "actuatorConfigurationIdentifiers": ( + operationInfo.actuatorConfigurationIdentifiers + if operationInfo + else [] + ), } ), **trim_rwparams.model_dump(),