28 changes: 28 additions & 0 deletions plugins/actuators/sfttrainer/ado_actuators/sfttrainer/README.md
@@ -229,6 +229,13 @@ Optional:
- stop_after_seconds: Default is `-1.0`. If set, the optimizer will be asked to
stop after the specified time elapses. The check is performed after the end of
each training step.
- auto_stop_method: Default is `None`. Defines the method used to automatically
stop the fine-tuning job. Supported values are
`WARMUP_60S_STABLE_120S_OR_10_STEPS` and `None`. If set to
`WARMUP_60S_STABLE_120S_OR_10_STEPS`, the job stops after spending at least 60
seconds in the warmup phase plus the longer of 120 seconds or the duration of
10 optimization steps. This method excludes the first 60 seconds of training
when calculating throughput and system metrics. See the sketch after this
parameter list for the stopping rule.
- distributed_backend: Default is `FSDP` for multi-gpu measurements, `None`
(i.e. Data Parallel (DP)) for single-gpu measurements. Controls which PyTorch
backend to use when training with multiple GPU devices.
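
A rough, standalone sketch of the `WARMUP_60S_STABLE_120S_OR_10_STEPS` rule
described above, assuming the check runs at the end of every optimization step.
The class and attribute names below are illustrative only and are not part of
the actuator's actual implementation:

```python
import time


class AutoStopSketch:
    """Illustrative sketch of WARMUP_60S_STABLE_120S_OR_10_STEPS.

    Warm up for at least 60 seconds, then train for the longer of
    120 seconds or 10 optimization steps before stopping.
    """

    WARMUP_SECONDS = 60.0
    STABLE_SECONDS = 120.0
    STABLE_STEPS = 10

    def __init__(self) -> None:
        self.training_start = time.monotonic()
        self.warmup_end: float | None = None
        self.stable_steps = 0

    def should_stop(self) -> bool:
        """Intended to be called at the end of every optimization step."""
        now = time.monotonic()

        # Warmup phase: never stop during the first 60 seconds. Metrics from
        # this window are also excluded from throughput/system aggregation.
        if self.warmup_end is None:
            if now - self.training_start < self.WARMUP_SECONDS:
                return False
            self.warmup_end = now
            return False

        # Stable phase: stop once both 120 seconds have elapsed and 10 steps
        # have completed, i.e. after max(120 s, duration of 10 steps).
        self.stable_steps += 1
        return (
            now - self.warmup_end >= self.STABLE_SECONDS
            and self.stable_steps >= self.STABLE_STEPS
        )
```

In this reading, "the longer of 120 seconds or 10 steps" is simply the point at
which both conditions have been satisfied.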
@@ -839,6 +846,13 @@ Optional:
- stop_after_seconds: Default is `-1.0`. If set, the optimizer will be asked to
stop after the specified time elapses. The check is performed after the end of
each training step.
- auto_stop_method: Default is `None`. Defines the method used to automatically
stop the fine-tuning job. Supported values are
`WARMUP_60S_STABLE_120S_OR_10_STEPS` and `None`. If set to
`WARMUP_60S_STABLE_120S_OR_10_STEPS`, the job stops after spending at least 60
seconds in the warmup phase plus the longer of 120 seconds or the duration of
10 optimization steps. This method excludes the first 60 seconds of training
when calculating throughput and system metrics.
- distributed_backend: Default is `FSDP` for multi-gpu measurements, `None`
(i.e. Data Parallel (DP)) for single-gpu measurements. Controls which PyTorch
backend to use when training with multiple GPU devices.
@@ -1263,6 +1277,13 @@ Optional:
- stop_after_seconds: Default is `-1.0`. If set, the optimizer will be asked to
stop after the specified time elapses. The check is performed after the end of
each training step.
- auto_stop_method: Default is `None`. Defines the method used to automatically
stop the fine-tuning job. Supported values are
`WARMUP_60S_STABLE_120S_OR_10_STEPS` and `None`. If set to
`WARMUP_60S_STABLE_120S_OR_10_STEPS`, the job stops after spending at least 60
seconds in the warmup phase plus the longer of 120 seconds or the duration of
10 optimization steps. This method excludes the first 60 seconds of training
when calculating throughput and system metrics.
- distributed_backend: Default is `FSDP` for multi-gpu measurements, `None`
(i.e. Data Parallel (DP)) for single-gpu measurements. Controls which PyTorch
backend to use when training with multiple GPU devices.
@@ -1667,6 +1688,13 @@ Optional:
- stop_after_seconds: Default is `-1.0`. If set, the optimizer will be asked to
stop after the specified time elapses. The check is performed after the end of
each training step.
- auto_stop_method: Default is `None`. Defines the method used to automatically
stop the fine-tuning job. Supported values are
`WARMUP_60S_STABLE_120S_OR_10_STEPS` and `None`. If set to
`WARMUP_60S_STABLE_120S_OR_10_STEPS`, the job stops after spending at least 60
seconds in the warmup phase plus the longer of 120 seconds or the duration of
10 optimization steps. This method excludes the first 60 seconds of training
when calculating throughput and system metrics.
- distributed_backend: Default is `FSDP` for multi-gpu measurements, `None`
(i.e. Data Parallel (DP)) for single-gpu measurements. Controls which PyTorch
backend to use when training with multiple GPU devices.
52 changes: 30 additions & 22 deletions plugins/actuators/sfttrainer/ado_actuators/sfttrainer/actuators.py
@@ -22,6 +22,28 @@
import ray.util.placement_group
import ray.util.state
import yaml
from ado_actuators.sfttrainer.experiments import (
full_finetune,
gptq_lora,
lora,
prompt_tuning,
)
from ado_actuators.sfttrainer.experiments.common import (
ACTUATOR_IDENTIFIER,
FMS_HF_TUNING_COMMIT,
PATH_PINNED_PACKAGES,
DatasetMap,
EntitySpace,
ExperimentParameters,
InternalInconsistencyError,
InvalidEntityError,
ModelMap,
WeightsFormat,
experiment_parameters_from_experiment,
get_fms_hf_tuning_package,
get_ray_environment,
packages_requiring_nvidia_development_binaries,
)
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

import orchestrator.modules.actuators.catalog
@@ -43,24 +65,6 @@
from orchestrator.schema.result import InvalidMeasurementResult, ValidMeasurementResult
from orchestrator.utilities.environment import enable_ray_actor_coverage

from .experiments import full_finetune, gptq_lora, lora, prompt_tuning
from .experiments.common import (
ACTUATOR_IDENTIFIER,
FMS_HF_TUNING_COMMIT,
PATH_PINNED_PACKAGES,
DatasetMap,
EntitySpace,
ExperimentParameters,
InternalInconsistencyError,
InvalidEntityError,
ModelMap,
WeightsFormat,
experiment_parameters_from_experiment,
get_fms_hf_tuning_package,
get_ray_environment,
packages_requiring_nvidia_development_binaries,
)

# VV: Required module variables
identifier = ACTUATOR_IDENTIFIER

@@ -1226,17 +1230,21 @@ async def _evaluate_one_entity(
await self._stateUpdateQueue.put_async(request, block=False)
return request.requestid

if context is not None and context.args.stop_after_seconds > 0.0:
# VV: When we switch on stop_after_seconds we are effectively dynamically terminating the training job
# in turn this confuses transformers causing it to report the wrong number of train tokens
# as a result we should just omit train_tokens_per_second and train_tokens_per_second_per_gpu entirely
if context is not None and (
(context.args.stop_after_seconds > 0.0)
or (context.args.auto_stop_method is not None)
):
# VV: Dynamically terminating the training job confuses transformers and causes it to report incorrect
# throughput figures, so we drop those metrics from the observations here.
scalar_observations = {
k: v
for k, v in scalar_observations.items()
if k
not in [
"train_tokens_per_second",
"train_tokens_per_gpu_per_second",
"train_samples_per_second",
"train_steps_per_second",
]
}

@@ -13,6 +13,7 @@
import os
import typing

import ado_actuators.sfttrainer.wrapper_fms_hf_tuning.constants as constants
import pydantic
import pydantic.fields
import pydantic_core
@@ -185,6 +186,7 @@ def get_default_measured_properties() -> list[str]:
"cpu_compute_utilization",
"cpu_memory_utilization",
"train_runtime",
# VV: the next 4 are inaccurate when terminating the job early
"train_samples_per_second",
"train_steps_per_second",
"train_tokens_per_second",
@@ -645,7 +647,9 @@ def property_domain_for_prop(
class SFTTrainerCLIArgs(pydantic.BaseModel):
"""These are Entity properties which map to a CLI arg"""

model_config = pydantic.ConfigDict(extra="forbid", protected_namespaces=())
model_config = pydantic.ConfigDict(
extra="forbid", protected_namespaces=(), use_enum_values=True
)

# VV: If you're updating these, then make sure you also update domain_for_constitutive_property()
# the code uses `examples` to populate the categorical values of the constitutive property's domain
@@ -713,6 +717,20 @@ class SFTTrainerCLIArgs(pydantic.BaseModel):
"The check is performed after the end of each training step.",
)

auto_stop_method: constants.AutoStopMethod | None = pydantic.Field(
default=None,
examples=[
constants.AutoStopMethod.WARMUP_60S_STABLE_120S_OR_10_STEPS.value,
None,
],
description="The default value is `None`. This parameter defines the method used to automatically "
"stop the fine-tuning job. Supported values are `WARMUP_60S_STABLE_120S_OR_10_STEPS` and "
"`None`. If set to `WARMUP_60S_STABLE_120S_OR_10_STEPS`, the job stops after spending at least "
"60 seconds in the warmup phase plus the longer of 120 seconds or the duration of 10 "
"optimization steps. This method excludes the first 60 seconds of training when calculating "
"throughput and system metrics.",
)

# VV: lora specific parameters
r: int = pydantic.Field(4, examples=[4, 8, 16], description="The LORA rank")
