28 changes: 28 additions & 0 deletions plugins/actuators/sfttrainer/ado_actuators/sfttrainer/README.md
@@ -229,6 +229,13 @@ Optional:
- stop_after_seconds: Default is `-1.0`. If set, the optimizer will be asked to
stop after the specified time elapses. The check is performed after the end of
each training step.
- auto_stop_method: Default is `None`. Defines the method used to automatically
stop the fine-tuning job. Supported values are
`WARMUP_60S_STABLE_120S_OR_10_STEPS` and `None`. If set to
`WARMUP_60S_STABLE_120S_OR_10_STEPS`, the job stops after spending at least 60
seconds in the warmup phase plus the longer of 120 seconds or the duration of
10 optimization steps. This method excludes the first 60 seconds of training
when calculating throughput and system metrics. See the sketch after this
parameter list for the stopping rule.
- distributed_backend: Default is `FSDP` for multi-gpu measurements, `None`
(i.e. Data Parallel (DP)) for single-gpu measurements. Controls which PyTorch
backend to use when training with multiple GPU devices.
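
A rough, standalone sketch of the `WARMUP_60S_STABLE_120S_OR_10_STEPS` rule
described above, assuming the check runs at the end of every optimization step.
The class and attribute names below are illustrative only and are not part of
the actuator's actual implementation:

```python
import time


class AutoStopSketch:
    """Illustrative sketch of WARMUP_60S_STABLE_120S_OR_10_STEPS.

    Warm up for at least 60 seconds, then train for the longer of
    120 seconds or 10 optimization steps before stopping.
    """

    WARMUP_SECONDS = 60.0
    STABLE_SECONDS = 120.0
    STABLE_STEPS = 10

    def __init__(self) -> None:
        self.training_start = time.monotonic()
        self.warmup_end: float | None = None
        self.stable_steps = 0

    def should_stop(self) -> bool:
        """Intended to be called at the end of every optimization step."""
        now = time.monotonic()

        # Warmup phase: never stop during the first 60 seconds. Metrics from
        # this window are also excluded from throughput/system aggregation.
        if self.warmup_end is None:
            if now - self.training_start < self.WARMUP_SECONDS:
                return False
            self.warmup_end = now
            return False

        # Stable phase: stop once both 120 seconds have elapsed and 10 steps
        # have completed, i.e. after max(120 s, duration of 10 steps).
        self.stable_steps += 1
        return (
            now - self.warmup_end >= self.STABLE_SECONDS
            and self.stable_steps >= self.STABLE_STEPS
        )
```

In this reading, "the longer of 120 seconds or 10 steps" is simply the point at
which both conditions have been satisfied.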
@@ -839,6 +846,13 @@ Optional:
- stop_after_seconds: Default is `-1.0`. If set, the optimizer will be asked to
stop after the specified time elapses. The check is performed after the end of
each training step.
- auto_stop_method: Default is `None`. Defines the method used to automatically
stop the fine-tuning job. Supported values are
`WARMUP_60S_STABLE_120S_OR_10_STEPS` and `None`. If set to
`WARMUP_60S_STABLE_120S_OR_10_STEPS`, the job stops after spending at least 60
seconds in the warmup phase plus the longer of 120 seconds or the duration of
10 optimization steps. This method excludes the first 60 seconds of training
when calculating throughput and system metrics.
- distributed_backend: Default is `FSDP` for multi-gpu measurements, `None`
(i.e. Data Parallel (DP)) for single-gpu measurements. Controls which PyTorch
backend to use when training with multiple GPU devices.
@@ -1263,6 +1277,13 @@ Optional:
- stop_after_seconds: Default is `-1.0`. If set, the optimizer will be asked to
stop after the specified time elapses. The check is performed after the end of
each training step.
- auto_stop_method: Default is `None`. Defines the method used to automatically
stop the fine-tuning job. Supported values are
`WARMUP_60S_STABLE_120S_OR_10_STEPS` and `None`. If set to
`WARMUP_60S_STABLE_120S_OR_10_STEPS`, the job stops after spending at least 60
seconds in the warmup phase plus the longer of 120 seconds or the duration of
10 optimization steps. This method excludes the first 60 seconds of training
when calculating throughput and system metrics.
- distributed_backend: Default is `FSDP` for multi-gpu measurements, `None`
(i.e. Data Parallel (DP)) for single-gpu measurements. Controls which PyTorch
backend to use when training with multiple GPU devices.
@@ -1667,6 +1688,13 @@ Optional:
- stop_after_seconds: Default is `-1.0`. If set, the optimizer will be asked to
stop after the specified time elapses. The check is performed after the end of
each training step.
- auto_stop_method: Default is `None`. Defines the method used to automatically
stop the fine-tuning job. Supported values are
`WARMUP_60S_STABLE_120S_OR_10_STEPS` and `None`. If set to
`WARMUP_60S_STABLE_120S_OR_10_STEPS`, the job stops after spending at least 60
seconds in the warmup phase plus the longer of 120 seconds or the duration of
10 optimization steps. This method excludes the first 60 seconds of training
when calculating throughput and system metrics.
- distributed_backend: Default is `FSDP` for multi-gpu measurements, `None`
(i.e. Data Parallel (DP)) for single-gpu measurements. Controls which PyTorch
backend to use when training with multiple GPU devices.
52 changes: 30 additions & 22 deletions plugins/actuators/sfttrainer/ado_actuators/sfttrainer/actuators.py
@@ -22,6 +22,28 @@
import ray.util.placement_group
import ray.util.state
import yaml
from ado_actuators.sfttrainer.experiments import (
full_finetune,
gptq_lora,
lora,
prompt_tuning,
)
from ado_actuators.sfttrainer.experiments.common import (
ACTUATOR_IDENTIFIER,
FMS_HF_TUNING_COMMIT,
PATH_PINNED_PACKAGES,
DatasetMap,
EntitySpace,
ExperimentParameters,
InternalInconsistencyError,
InvalidEntityError,
ModelMap,
WeightsFormat,
experiment_parameters_from_experiment,
get_fms_hf_tuning_package,
get_ray_environment,
packages_requiring_nvidia_development_binaries,
)
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

import orchestrator.modules.actuators.catalog
@@ -43,24 +65,6 @@
from orchestrator.schema.result import InvalidMeasurementResult, ValidMeasurementResult
from orchestrator.utilities.environment import enable_ray_actor_coverage

from .experiments import full_finetune, gptq_lora, lora, prompt_tuning
from .experiments.common import (
ACTUATOR_IDENTIFIER,
FMS_HF_TUNING_COMMIT,
PATH_PINNED_PACKAGES,
DatasetMap,
EntitySpace,
ExperimentParameters,
InternalInconsistencyError,
InvalidEntityError,
ModelMap,
WeightsFormat,
experiment_parameters_from_experiment,
get_fms_hf_tuning_package,
get_ray_environment,
packages_requiring_nvidia_development_binaries,
)

# VV: Required module variables
identifier = ACTUATOR_IDENTIFIER

@@ -1226,17 +1230,21 @@ async def _evaluate_one_entity(
await self._stateUpdateQueue.put_async(request, block=False)
return request.requestid

if context is not None and context.args.stop_after_seconds > 0.0:
# VV: When we switch on stop_after_seconds we are effectively dynamically terminating the training job
# in turn this confuses transformers causing it to report the wrong number of train tokens
# as a result we should just omit train_tokens_per_second and train_tokens_per_second_per_gpu entirely
if context is not None and (
(context.args.stop_after_seconds > 0.0)
or (context.args.auto_stop_method is not None)
):
# VV: Dynamically terminating the training job confuses transformers and causes it to report incorrect
# throughput figures, so we drop those metrics from the observations here.
scalar_observations = {
k: v
for k, v in scalar_observations.items()
if k
not in [
"train_tokens_per_second",
"train_tokens_per_gpu_per_second",
"train_samples_per_second",
"train_steps_per_second",
]
}

@@ -13,6 +13,7 @@
import os
import typing

import ado_actuators.sfttrainer.wrapper_fms_hf_tuning.constants as constants
import pydantic
import pydantic.fields
import pydantic_core
@@ -185,6 +186,7 @@ def get_default_measured_properties() -> list[str]:
"cpu_compute_utilization",
"cpu_memory_utilization",
"train_runtime",
# VV: the next 4 are inaccurate when terminating the job early
"train_samples_per_second",
"train_steps_per_second",
"train_tokens_per_second",
@@ -645,7 +647,9 @@ def property_domain_for_prop(
class SFTTrainerCLIArgs(pydantic.BaseModel):
"""These are Entity properties which map to a CLI arg"""

model_config = pydantic.ConfigDict(extra="forbid", protected_namespaces=())
model_config = pydantic.ConfigDict(
extra="forbid", protected_namespaces=(), use_enum_values=True
)

# VV: If you're updating these, then make sure you also update domain_for_constitutive_property()
# the code uses `examples` to populate the categorical values of the constitutive property's domain
@@ -713,6 +717,20 @@ class SFTTrainerCLIArgs(pydantic.BaseModel):
"The check is performed after the end of each training step.",
)

auto_stop_method: constants.AutoStopMethod | None = pydantic.Field(
default=None,
examples=[
constants.AutoStopMethod.WARMUP_60S_STABLE_120S_OR_10_STEPS.value,
None,
],
description="The default value is `None`. This parameter defines the method used to automatically "
"stop the fine-tuning job. Supported values are `WARMUP_60S_STABLE_120S_OR_10_STEPS` and "
"`None`. If set to `WARMUP_60S_STABLE_120S_OR_10_STEPS`, the job stops after spending at least "
"60 seconds in the warmup phase plus the longer of 120 seconds or the duration of 10 "
"optimization steps. This method excludes the first 60 seconds of training when calculating "
"throughput and system metrics.",
)

# VV: lora specific parameters
r: int = pydantic.Field(4, examples=[4, 8, 16], description="The LORA rank")
