Fabric: auto default (#16842)
carmocca committed Feb 23, 2023
1 parent bc96513 commit d486f94
Showing 8 changed files with 147 additions and 69 deletions.
6 changes: 3 additions & 3 deletions docs/source-pytorch/fabric/fundamentals/accelerators.rst
@@ -15,12 +15,12 @@ Fabric enables you to take full advantage of the hardware on your system. It supports
- GPU (NVIDIA, AMD, Apple Silicon)
- TPU

By default, Fabric recognizes the accelerator(s) on your system
By default, Fabric tries to maximize the hardware utilization of your system

.. code-block:: python
# Default settings
fabric = Fabric(accelerator="auto", devices="auto")
fabric = Fabric(accelerator="auto", devices="auto", strategy="auto")
# Same as
fabric = Fabric()
@@ -40,7 +40,7 @@ You can also explicitly set which accelerator to use:
fabric = Fabric(accelerator="gpu", devices=8)
# GPU: Apple M1/M2 only
fabric = Fabric(accelerator="mps", devices=8)
fabric = Fabric(accelerator="mps")
# GPU: NVIDIA CUDA only
fabric = Fabric(accelerator="cuda", devices=8)
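As a usage sketch of the defaults documented above (assuming a standard `lightning.fabric` install), constructing `Fabric()` with no arguments now resolves the accelerator, strategy, and device count automatically:

```python
from lightning.fabric import Fabric

# Equivalent to Fabric(accelerator="auto", strategy="auto", devices="auto")
fabric = Fabric()

# Root device chosen for this process, e.g. cuda:0, mps:0, or cpu
print(fabric.device)
```
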
3 changes: 3 additions & 0 deletions src/lightning/fabric/CHANGELOG.md
@@ -20,6 +20,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Changed

- Fabric now chooses `accelerator="auto", strategy="auto", devices="auto"` as defaults ([#16842](https://github.com/Lightning-AI/lightning/pull/16842))


- Checkpoint saving and loading redesign ([#16434](https://github.com/Lightning-AI/lightning/pull/16434))
* Changed the method signature of `Fabric.save` and `Fabric.load`
* Changed the method signature of `Strategy.save_checkpoint` and `Fabric.load_checkpoint`
68 changes: 28 additions & 40 deletions src/lightning/fabric/connector.py
@@ -100,18 +100,18 @@ class _Connector:

def __init__(
self,
accelerator: Optional[Union[str, Accelerator]] = None,
strategy: Optional[Union[str, Strategy]] = None,
devices: Optional[Union[List[int], str, int]] = None,
accelerator: Union[str, Accelerator] = "auto",
strategy: Union[str, Strategy] = "auto",
devices: Union[List[int], str, int] = "auto",
num_nodes: int = 1,
precision: _PRECISION_INPUT = "32-true",
plugins: Optional[Union[_PLUGIN_INPUT, List[_PLUGIN_INPUT]]] = None,
) -> None:

# These arguments can be set through environment variables set by the CLI
accelerator = self._argument_from_env("accelerator", accelerator, default=None)
strategy = self._argument_from_env("strategy", strategy, default=None)
devices = self._argument_from_env("devices", devices, default=None)
accelerator = self._argument_from_env("accelerator", accelerator, default="auto")
strategy = self._argument_from_env("strategy", strategy, default="auto")
devices = self._argument_from_env("devices", devices, default="auto")
num_nodes = self._argument_from_env("num_nodes", num_nodes, default=1)
precision = self._argument_from_env("precision", precision, default="32-true")

@@ -123,8 +123,8 @@ def __init__(
# Raise an exception if there are conflicts between flags
# Set each valid flag to `self._x_flag` after validation
# For devices: Assign gpus, etc. to the accelerator flag and devices flag
self._strategy_flag: Optional[Union[Strategy, str]] = None
self._accelerator_flag: Optional[Union[Accelerator, str]] = None
self._strategy_flag: Union[Strategy, str] = "auto"
self._accelerator_flag: Union[Accelerator, str] = "auto"
self._precision_input: _PRECISION_INPUT_STR = "32-true"
self._precision_instance: Optional[Precision] = None
self._cluster_environment_flag: Optional[Union[ClusterEnvironment, str]] = None
@@ -141,7 +141,7 @@ def __init__(

# 2. Instantiate Accelerator
# handle `auto`, `None` and `gpu`
if self._accelerator_flag == "auto" or self._accelerator_flag is None:
if self._accelerator_flag == "auto":
self._accelerator_flag = self._choose_auto_accelerator()
elif self._accelerator_flag == "gpu":
self._accelerator_flag = self._choose_gpu_accelerator_backend()
@@ -152,7 +152,7 @@ def __init__(
self.cluster_environment: ClusterEnvironment = self._choose_and_init_cluster_environment()

# 4. Instantiate Strategy - Part 1
if self._strategy_flag is None:
if self._strategy_flag == "auto":
self._strategy_flag = self._choose_strategy()
# In specific cases, ignore user selection and fall back to a different strategy
self._check_strategy_and_fallback()
@@ -166,8 +166,8 @@

def _check_config_and_set_final_flags(
self,
strategy: Optional[Union[str, Strategy]],
accelerator: Optional[Union[str, Accelerator]],
strategy: Union[str, Strategy],
accelerator: Union[str, Accelerator],
precision: _PRECISION_INPUT,
plugins: Optional[Union[_PLUGIN_INPUT, List[_PLUGIN_INPUT]]],
) -> None:
@@ -188,26 +188,24 @@ def _check_config_and_set_final_flags(
if isinstance(strategy, str):
strategy = strategy.lower()

if strategy is not None:
self._strategy_flag = strategy
self._strategy_flag = strategy

if strategy is not None and strategy not in self._registered_strategies and not isinstance(strategy, Strategy):
if strategy != "auto" and strategy not in self._registered_strategies and not isinstance(strategy, Strategy):
raise ValueError(
f"You selected an invalid strategy name: `strategy={strategy!r}`."
" It must be either a string or an instance of `lightning.fabric.strategies.Strategy`."
" Example choices: ddp, ddp_spawn, deepspeed, dp, ..."
" Example choices: auto, ddp, ddp_spawn, deepspeed, dp, ..."
" Find a complete list of options in our documentation at https://lightning.ai"
)

if (
accelerator is not None
and accelerator not in self._registered_accelerators
accelerator not in self._registered_accelerators
and accelerator not in ("auto", "gpu")
and not isinstance(accelerator, Accelerator)
):
raise ValueError(
f"You selected an invalid accelerator name: `accelerator={accelerator!r}`."
f" Available names are: {', '.join(self._registered_accelerators)}."
f" Available names are: auto, {', '.join(self._registered_accelerators)}."
)

# MPS accelerator is incompatible with DDP family of strategies. It supports single-device operation only.
@@ -256,9 +254,9 @@ def _check_config_and_set_final_flags(
# handle the case when the user passes in a strategy instance which has an accelerator, precision,
# checkpoint io or cluster env set up
# TODO: improve the error messages below
if self._strategy_flag and isinstance(self._strategy_flag, Strategy):
if isinstance(self._strategy_flag, Strategy):
if self._strategy_flag._accelerator:
if self._accelerator_flag:
if self._accelerator_flag != "auto":
raise ValueError("accelerator set through both strategy class and accelerator flag, choose one")
else:
self._accelerator_flag = self._strategy_flag._accelerator
@@ -297,9 +295,7 @@ def _check_config_and_set_final_flags(
self._accelerator_flag = "cuda"
self._parallel_devices = self._strategy_flag.parallel_devices

def _check_device_config_and_set_final_flags(
self, devices: Optional[Union[List[int], str, int]], num_nodes: int
) -> None:
def _check_device_config_and_set_final_flags(self, devices: Union[List[int], str, int], num_nodes: int) -> None:
self._num_nodes_flag = int(num_nodes) if num_nodes is not None else 1
self._devices_flag = devices

@@ -314,21 +310,14 @@
f" using {accelerator_name} accelerator."
)

if self._devices_flag == "auto" and self._accelerator_flag is None:
raise ValueError(
f"You passed `devices={devices}` but haven't specified"
" `accelerator=('auto'|'tpu'|'gpu'|'cpu'|'mps')` for the devices mapping."
)

def _choose_auto_accelerator(self) -> str:
"""Choose the accelerator type (str) based on availability when ``accelerator='auto'``."""
if self._accelerator_flag == "auto":
if TPUAccelerator.is_available():
return "tpu"
if MPSAccelerator.is_available():
return "mps"
if CUDAAccelerator.is_available():
return "cuda"
if TPUAccelerator.is_available():
return "tpu"
if MPSAccelerator.is_available():
return "mps"
if CUDAAccelerator.is_available():
return "cuda"
return "cpu"

@staticmethod
@@ -337,7 +326,6 @@ def _choose_gpu_accelerator_backend() -> str:
return "mps"
if CUDAAccelerator.is_available():
return "cuda"

raise RuntimeError("No supported gpu backend found!")

def _set_parallel_devices_and_init_accelerator(self) -> None:
@@ -368,7 +356,7 @@ def _set_parallel_devices_and_init_accelerator(self) -> None:
self._parallel_devices = accelerator_cls.get_parallel_devices(self._devices_flag)

def _set_devices_flag_if_auto_passed(self) -> None:
if self._devices_flag == "auto" or self._devices_flag is None:
if self._devices_flag == "auto":
self._devices_flag = self.accelerator.auto_device_count()

def _choose_and_init_cluster_environment(self) -> ClusterEnvironment:
@@ -527,7 +515,7 @@ def _lazy_init_strategy(self) -> None:
raise RuntimeError(
f"`Fabric(strategy={self._strategy_flag!r})` is not compatible with an interactive"
" environment. Run your code as a script, or choose one of the compatible strategies:"
f" `Fabric(strategy=None|'dp'|'ddp_notebook')`."
f" `Fabric(strategy='dp'|'ddp_notebook')`."
" In case you are spawning processes yourself, make sure to include the Fabric"
" creation inside the worker function."
)
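For readability, the resolution order implemented by `_choose_auto_accelerator` above can be read as the following standalone sketch. It is illustrative only, and assumes the accelerator classes are importable from `lightning.fabric.accelerators`:

```python
from lightning.fabric.accelerators import CUDAAccelerator, MPSAccelerator, TPUAccelerator


def choose_auto_accelerator() -> str:
    """Mirror the priority used when accelerator='auto': TPU > MPS > CUDA > CPU."""
    if TPUAccelerator.is_available():
        return "tpu"
    if MPSAccelerator.is_available():
        return "mps"
    if CUDAAccelerator.is_available():
        return "cuda"
    return "cpu"


print(choose_auto_accelerator())  # e.g. "cuda" on a machine with an NVIDIA GPU
```
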
6 changes: 3 additions & 3 deletions src/lightning/fabric/fabric.py
@@ -78,9 +78,9 @@ class Fabric:

def __init__(
self,
accelerator: Optional[Union[str, Accelerator]] = None,
strategy: Optional[Union[str, Strategy]] = None,
devices: Optional[Union[List[int], str, int]] = None,
accelerator: Union[str, Accelerator] = "auto",
strategy: Union[str, Strategy] = "auto",
devices: Union[List[int], str, int] = "auto",
num_nodes: int = 1,
precision: _PRECISION_INPUT = "32-true",
plugins: Optional[Union[_PLUGIN_INPUT, List[_PLUGIN_INPUT]]] = None,
21 changes: 15 additions & 6 deletions tests/tests_fabric/conftest.py
@@ -75,17 +75,26 @@ def reset_deterministic_algorithm():
torch.use_deterministic_algorithms(False)


def mock_xla_available(monkeypatch: pytest.MonkeyPatch, value: bool = True) -> None:
monkeypatch.setattr(lightning.fabric.accelerators.tpu, "_XLA_AVAILABLE", value)
monkeypatch.setattr(lightning.fabric.plugins.environments.xla, "_XLA_AVAILABLE", value)
monkeypatch.setattr(lightning.fabric.strategies.xla, "_XLA_AVAILABLE", value)
monkeypatch.setattr(lightning.fabric.strategies.launchers.xla, "_XLA_AVAILABLE", value)


@pytest.fixture(scope="function")
def xla_available(monkeypatch: pytest.MonkeyPatch) -> None:
monkeypatch.setattr(lightning.fabric.accelerators.tpu, "_XLA_AVAILABLE", True)
monkeypatch.setattr(lightning.fabric.plugins.environments.xla, "_XLA_AVAILABLE", True)
monkeypatch.setattr(lightning.fabric.strategies.xla, "_XLA_AVAILABLE", True)
monkeypatch.setattr(lightning.fabric.strategies.launchers.xla, "_XLA_AVAILABLE", True)
mock_xla_available(monkeypatch)


def mock_tpu_available(monkeypatch: pytest.MonkeyPatch, value: bool = True) -> None:
mock_xla_available(monkeypatch, value)
monkeypatch.setattr(lightning.fabric.accelerators.tpu.TPUAccelerator, "is_available", lambda: value)


@pytest.fixture(scope="function")
def tpu_available(xla_available, monkeypatch) -> None:
monkeypatch.setattr(lightning.fabric.accelerators.tpu.TPUAccelerator, "is_available", lambda: True)
def tpu_available(monkeypatch: pytest.MonkeyPatch) -> None:
mock_tpu_available(monkeypatch)


@pytest.fixture
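For illustration, a hedged sketch of how a test could use the new `mock_tpu_available` helper introduced above. The import path is an assumption; the helper patches `_XLA_AVAILABLE` and `TPUAccelerator.is_available` exactly as defined in the diff:

```python
import pytest

from lightning.fabric.accelerators import TPUAccelerator
from tests_fabric.conftest import mock_tpu_available  # assumed import path


def test_tpu_reported_available(monkeypatch: pytest.MonkeyPatch) -> None:
    # After mocking, the accelerator reports availability without real XLA hardware
    mock_tpu_available(monkeypatch)
    assert TPUAccelerator.is_available()
```
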
4 changes: 3 additions & 1 deletion tests/tests_fabric/plugins/precision/test_amp_integration.py
@@ -68,7 +68,9 @@ def after_backward(self, model):
],
)
def test_amp(accelerator, precision, expected_dtype):
fabric = MixedPrecisionBoringFabric(accelerator=accelerator, precision=precision)
# TODO: devices>1 fails with:
# DDP expects same model across all ranks, but Rank 0 has 2 params, while rank 1 has inconsistent 1 params
fabric = MixedPrecisionBoringFabric(accelerator=accelerator, precision=precision, devices=1)
fabric.expected_dtype = expected_dtype
fabric.run()

