Fabric: auto default (#16842)
carmocca committed Feb 23, 2023
1 parent bc96513 commit d486f94
Showing 8 changed files with 147 additions and 69 deletions.
6 changes: 3 additions & 3 deletions docs/source-pytorch/fabric/fundamentals/accelerators.rst
@@ -15,12 +15,12 @@ Fabric enables you to take full advantage of the hardware on your system. It supports
- GPU (NVIDIA, AMD, Apple Silicon)
- TPU

By default, Fabric recognizes the accelerator(s) on your system
By default, Fabric tries to maximize the hardware utilization of your system

.. code-block:: python
# Default settings
fabric = Fabric(accelerator="auto", devices="auto")
fabric = Fabric(accelerator="auto", devices="auto", strategy="auto")
# Same as
fabric = Fabric()
@@ -40,7 +40,7 @@ You can also explicitly set which accelerator to use:
fabric = Fabric(accelerator="gpu", devices=8)
# GPU: Apple M1/M2 only
fabric = Fabric(accelerator="mps", devices=8)
fabric = Fabric(accelerator="mps")
# GPU: NVIDIA CUDA only
fabric = Fabric(accelerator="cuda", devices=8)
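As a usage sketch of the defaults documented above (assuming a standard `lightning.fabric` install), constructing `Fabric()` with no arguments now resolves the accelerator, strategy, and device count automatically:

```python
from lightning.fabric import Fabric

# Equivalent to Fabric(accelerator="auto", strategy="auto", devices="auto")
fabric = Fabric()

# Root device chosen for this process, e.g. cuda:0, mps:0, or cpu
print(fabric.device)
```
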
3 changes: 3 additions & 0 deletions src/lightning/fabric/CHANGELOG.md
@@ -20,6 +20,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Changed

- Fabric now chooses `accelerator="auto", strategy="auto", devices="auto"` as defaults ([#16842](https://github.com/Lightning-AI/lightning/pull/16842))


- Checkpoint saving and loading redesign ([#16434](https://github.com/Lightning-AI/lightning/pull/16434))
* Changed the method signature of `Fabric.save` and `Fabric.load`
* Changed the method signature of `Strategy.save_checkpoint` and `Fabric.load_checkpoint`
68 changes: 28 additions & 40 deletions src/lightning/fabric/connector.py
@@ -100,18 +100,18 @@ class _Connector:

def __init__(
self,
accelerator: Optional[Union[str, Accelerator]] = None,
strategy: Optional[Union[str, Strategy]] = None,
devices: Optional[Union[List[int], str, int]] = None,
accelerator: Union[str, Accelerator] = "auto",
strategy: Union[str, Strategy] = "auto",
devices: Union[List[int], str, int] = "auto",
num_nodes: int = 1,
precision: _PRECISION_INPUT = "32-true",
plugins: Optional[Union[_PLUGIN_INPUT, List[_PLUGIN_INPUT]]] = None,
) -> None:

# These arguments can be set through environment variables set by the CLI
accelerator = self._argument_from_env("accelerator", accelerator, default=None)
strategy = self._argument_from_env("strategy", strategy, default=None)
devices = self._argument_from_env("devices", devices, default=None)
accelerator = self._argument_from_env("accelerator", accelerator, default="auto")
strategy = self._argument_from_env("strategy", strategy, default="auto")
devices = self._argument_from_env("devices", devices, default="auto")
num_nodes = self._argument_from_env("num_nodes", num_nodes, default=1)
precision = self._argument_from_env("precision", precision, default="32-true")

@@ -123,8 +123,8 @@ def __init__(
# Raise an exception if there are conflicts between flags
# Set each valid flag to `self._x_flag` after validation
# For devices: Assign gpus, etc. to the accelerator flag and devices flag
self._strategy_flag: Optional[Union[Strategy, str]] = None
self._accelerator_flag: Optional[Union[Accelerator, str]] = None
self._strategy_flag: Union[Strategy, str] = "auto"
self._accelerator_flag: Union[Accelerator, str] = "auto"
self._precision_input: _PRECISION_INPUT_STR = "32-true"
self._precision_instance: Optional[Precision] = None
self._cluster_environment_flag: Optional[Union[ClusterEnvironment, str]] = None
@@ -141,7 +141,7 @@ def __init__(

# 2. Instantiate Accelerator
# handle `auto`, `None` and `gpu`
if self._accelerator_flag == "auto" or self._accelerator_flag is None:
if self._accelerator_flag == "auto":
self._accelerator_flag = self._choose_auto_accelerator()
elif self._accelerator_flag == "gpu":
self._accelerator_flag = self._choose_gpu_accelerator_backend()
@@ -152,7 +152,7 @@ def __init__(
self.cluster_environment: ClusterEnvironment = self._choose_and_init_cluster_environment()

# 4. Instantiate Strategy - Part 1
if self._strategy_flag is None:
if self._strategy_flag == "auto":
self._strategy_flag = self._choose_strategy()
# In specific cases, ignore user selection and fall back to a different strategy
self._check_strategy_and_fallback()
@@ -166,8 +166,8 @@

def _check_config_and_set_final_flags(
self,
strategy: Optional[Union[str, Strategy]],
accelerator: Optional[Union[str, Accelerator]],
strategy: Union[str, Strategy],
accelerator: Union[str, Accelerator],
precision: _PRECISION_INPUT,
plugins: Optional[Union[_PLUGIN_INPUT, List[_PLUGIN_INPUT]]],
) -> None:
@@ -188,26 +188,24 @@ def _check_config_and_set_final_flags(
if isinstance(strategy, str):
strategy = strategy.lower()

if strategy is not None:
self._strategy_flag = strategy
self._strategy_flag = strategy

if strategy is not None and strategy not in self._registered_strategies and not isinstance(strategy, Strategy):
if strategy != "auto" and strategy not in self._registered_strategies and not isinstance(strategy, Strategy):
raise ValueError(
f"You selected an invalid strategy name: `strategy={strategy!r}`."
" It must be either a string or an instance of `lightning.fabric.strategies.Strategy`."
" Example choices: ddp, ddp_spawn, deepspeed, dp, ..."
" Example choices: auto, ddp, ddp_spawn, deepspeed, dp, ..."
" Find a complete list of options in our documentation at https://lightning.ai"
)

if (
accelerator is not None
and accelerator not in self._registered_accelerators
accelerator not in self._registered_accelerators
and accelerator not in ("auto", "gpu")
and not isinstance(accelerator, Accelerator)
):
raise ValueError(
f"You selected an invalid accelerator name: `accelerator={accelerator!r}`."
f" Available names are: {', '.join(self._registered_accelerators)}."
f" Available names are: auto, {', '.join(self._registered_accelerators)}."
)

# MPS accelerator is incompatible with DDP family of strategies. It supports single-device operation only.
@@ -256,9 +254,9 @@ def _check_config_and_set_final_flags(
# handle the case when the user passes in a strategy instance which has an accelerator, precision,
# checkpoint io or cluster env set up
# TODO: improve the error messages below
if self._strategy_flag and isinstance(self._strategy_flag, Strategy):
if isinstance(self._strategy_flag, Strategy):
if self._strategy_flag._accelerator:
if self._accelerator_flag:
if self._accelerator_flag != "auto":
raise ValueError("accelerator set through both strategy class and accelerator flag, choose one")
else:
self._accelerator_flag = self._strategy_flag._accelerator
@@ -297,9 +295,7 @@ def _check_config_and_set_final_flags(
self._accelerator_flag = "cuda"
self._parallel_devices = self._strategy_flag.parallel_devices

def _check_device_config_and_set_final_flags(
self, devices: Optional[Union[List[int], str, int]], num_nodes: int
) -> None:
def _check_device_config_and_set_final_flags(self, devices: Union[List[int], str, int], num_nodes: int) -> None:
self._num_nodes_flag = int(num_nodes) if num_nodes is not None else 1
self._devices_flag = devices

@@ -314,21 +310,14 @@
f" using {accelerator_name} accelerator."
)

if self._devices_flag == "auto" and self._accelerator_flag is None:
raise ValueError(
f"You passed `devices={devices}` but haven't specified"
" `accelerator=('auto'|'tpu'|'gpu'|'cpu'|'mps')` for the devices mapping."
)

def _choose_auto_accelerator(self) -> str:
"""Choose the accelerator type (str) based on availability when ``accelerator='auto'``."""
if self._accelerator_flag == "auto":
if TPUAccelerator.is_available():
return "tpu"
if MPSAccelerator.is_available():
return "mps"
if CUDAAccelerator.is_available():
return "cuda"
if TPUAccelerator.is_available():
return "tpu"
if MPSAccelerator.is_available():
return "mps"
if CUDAAccelerator.is_available():
return "cuda"
return "cpu"

@staticmethod
@@ -337,7 +326,6 @@ def _choose_gpu_accelerator_backend() -> str:
return "mps"
if CUDAAccelerator.is_available():
return "cuda"

raise RuntimeError("No supported gpu backend found!")

def _set_parallel_devices_and_init_accelerator(self) -> None:
@@ -368,7 +356,7 @@ def _set_parallel_devices_and_init_accelerator(self) -> None:
self._parallel_devices = accelerator_cls.get_parallel_devices(self._devices_flag)

def _set_devices_flag_if_auto_passed(self) -> None:
if self._devices_flag == "auto" or self._devices_flag is None:
if self._devices_flag == "auto":
self._devices_flag = self.accelerator.auto_device_count()

def _choose_and_init_cluster_environment(self) -> ClusterEnvironment:
@@ -527,7 +515,7 @@ def _lazy_init_strategy(self) -> None:
raise RuntimeError(
f"`Fabric(strategy={self._strategy_flag!r})` is not compatible with an interactive"
" environment. Run your code as a script, or choose one of the compatible strategies:"
f" `Fabric(strategy=None|'dp'|'ddp_notebook')`."
f" `Fabric(strategy='dp'|'ddp_notebook')`."
" In case you are spawning processes yourself, make sure to include the Fabric"
" creation inside the worker function."
)
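For readability, the resolution order implemented by `_choose_auto_accelerator` above can be read as the following standalone sketch. It is illustrative only, and assumes the accelerator classes are importable from `lightning.fabric.accelerators`:

```python
from lightning.fabric.accelerators import CUDAAccelerator, MPSAccelerator, TPUAccelerator


def choose_auto_accelerator() -> str:
    """Mirror the priority used when accelerator='auto': TPU > MPS > CUDA > CPU."""
    if TPUAccelerator.is_available():
        return "tpu"
    if MPSAccelerator.is_available():
        return "mps"
    if CUDAAccelerator.is_available():
        return "cuda"
    return "cpu"


print(choose_auto_accelerator())  # e.g. "cuda" on a machine with an NVIDIA GPU
```
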
6 changes: 3 additions & 3 deletions src/lightning/fabric/fabric.py
@@ -78,9 +78,9 @@ class Fabric:

def __init__(
self,
accelerator: Optional[Union[str, Accelerator]] = None,
strategy: Optional[Union[str, Strategy]] = None,
devices: Optional[Union[List[int], str, int]] = None,
accelerator: Union[str, Accelerator] = "auto",
strategy: Union[str, Strategy] = "auto",
devices: Union[List[int], str, int] = "auto",
num_nodes: int = 1,
precision: _PRECISION_INPUT = "32-true",
plugins: Optional[Union[_PLUGIN_INPUT, List[_PLUGIN_INPUT]]] = None,
21 changes: 15 additions & 6 deletions tests/tests_fabric/conftest.py
@@ -75,17 +75,26 @@ def reset_deterministic_algorithm():
torch.use_deterministic_algorithms(False)


def mock_xla_available(monkeypatch: pytest.MonkeyPatch, value: bool = True) -> None:
monkeypatch.setattr(lightning.fabric.accelerators.tpu, "_XLA_AVAILABLE", value)
monkeypatch.setattr(lightning.fabric.plugins.environments.xla, "_XLA_AVAILABLE", value)
monkeypatch.setattr(lightning.fabric.strategies.xla, "_XLA_AVAILABLE", value)
monkeypatch.setattr(lightning.fabric.strategies.launchers.xla, "_XLA_AVAILABLE", value)


@pytest.fixture(scope="function")
def xla_available(monkeypatch: pytest.MonkeyPatch) -> None:
monkeypatch.setattr(lightning.fabric.accelerators.tpu, "_XLA_AVAILABLE", True)
monkeypatch.setattr(lightning.fabric.plugins.environments.xla, "_XLA_AVAILABLE", True)
monkeypatch.setattr(lightning.fabric.strategies.xla, "_XLA_AVAILABLE", True)
monkeypatch.setattr(lightning.fabric.strategies.launchers.xla, "_XLA_AVAILABLE", True)
mock_xla_available(monkeypatch)


def mock_tpu_available(monkeypatch: pytest.MonkeyPatch, value: bool = True) -> None:
mock_xla_available(monkeypatch, value)
monkeypatch.setattr(lightning.fabric.accelerators.tpu.TPUAccelerator, "is_available", lambda: value)


@pytest.fixture(scope="function")
def tpu_available(xla_available, monkeypatch) -> None:
monkeypatch.setattr(lightning.fabric.accelerators.tpu.TPUAccelerator, "is_available", lambda: True)
def tpu_available(monkeypatch: pytest.MonkeyPatch) -> None:
mock_tpu_available(monkeypatch)


@pytest.fixture
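For illustration, a hedged sketch of how a test could use the new `mock_tpu_available` helper introduced above. The import path is an assumption; the helper patches `_XLA_AVAILABLE` and `TPUAccelerator.is_available` exactly as defined in the diff:

```python
import pytest

from lightning.fabric.accelerators import TPUAccelerator
from tests_fabric.conftest import mock_tpu_available  # assumed import path


def test_tpu_reported_available(monkeypatch: pytest.MonkeyPatch) -> None:
    # After mocking, the accelerator reports availability without real XLA hardware
    mock_tpu_available(monkeypatch)
    assert TPUAccelerator.is_available()
```
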
4 changes: 3 additions & 1 deletion tests/tests_fabric/plugins/precision/test_amp_integration.py
@@ -68,7 +68,9 @@ def after_backward(self, model):
],
)
def test_amp(accelerator, precision, expected_dtype):
fabric = MixedPrecisionBoringFabric(accelerator=accelerator, precision=precision)
# TODO: devices>1 fails with:
# DDP expects same model across all ranks, but Rank 0 has 2 params, while rank 1 has inconsistent 1 params
fabric = MixedPrecisionBoringFabric(accelerator=accelerator, precision=precision, devices=1)
fabric.expected_dtype = expected_dtype
fabric.run()

