From 782ca4aa26c33456f45e74c700d42bf37b966450 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 22 Jun 2022 21:17:25 +0200 Subject: [PATCH 01/57] fork --- src/pytorch_lightning/strategies/launchers/spawn.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/pytorch_lightning/strategies/launchers/spawn.py b/src/pytorch_lightning/strategies/launchers/spawn.py index 6af2688e47419..28149dc5ea4a2 100644 --- a/src/pytorch_lightning/strategies/launchers/spawn.py +++ b/src/pytorch_lightning/strategies/launchers/spawn.py @@ -55,7 +55,8 @@ def is_interactive_compatible(self) -> bool: # The start method 'spawn' is currently the only one that works with DDP and CUDA support # The start method 'fork' is the only one supported in Jupyter environments but not compatible with CUDA # For more context, see https://github.com/Lightning-AI/lightning/issues/7550 - return self._start_method == "fork" and self._strategy.root_device.type != "cuda" + # return self._start_method == "fork" and self._strategy.root_device.type != "cuda" + return True def launch(self, function: Callable, *args: Any, trainer: Optional["pl.Trainer"] = None, **kwargs: Any) -> Any: """Spawns processes that run the given function in parallel. 
@@ -76,11 +77,11 @@ def launch(self, function: Callable, *args: Any, trainer: Optional["pl.Trainer"] os.environ["MASTER_PORT"] = str(self._strategy.cluster_environment.main_port) context = mp.get_context(self._start_method) return_queue = context.SimpleQueue() - mp.spawn( + mp.start_processes( self._wrapping_function, args=(trainer, function, args, kwargs, return_queue), nprocs=self._strategy.num_processes, - start_method=self._start_method, + start_method="fork", ) spawn_output = return_queue.get() if trainer is None: From aefac45d626e294bd6070b9d369c6d331625be19 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Thu, 23 Jun 2022 03:16:05 +0200 Subject: [PATCH 02/57] dont set device --- src/pytorch_lightning/accelerators/gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/accelerators/gpu.py b/src/pytorch_lightning/accelerators/gpu.py index 898ce09b91431..dba0a750063e1 100644 --- a/src/pytorch_lightning/accelerators/gpu.py +++ b/src/pytorch_lightning/accelerators/gpu.py @@ -40,7 +40,7 @@ def setup_environment(self, root_device: torch.device) -> None: super().setup_environment(root_device) if root_device.type != "cuda": raise MisconfigurationException(f"Device should be GPU, got {root_device} instead") - torch.cuda.set_device(root_device) + # torch.cuda.set_device(root_device) def setup(self, trainer: "pl.Trainer") -> None: # TODO refactor input from trainer to local_rank @four4fish From efce3c41b2ef5bb41d5498c1ed7325d8c83547df Mon Sep 17 00:00:00 2001 From: awaelchli Date: Thu, 23 Jun 2022 03:23:24 +0200 Subject: [PATCH 03/57] parallel dev --- src/pytorch_lightning/strategies/ddp_spawn.py | 8 ++-- .../connectors/accelerator_connector.py | 48 +++++++++---------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/src/pytorch_lightning/strategies/ddp_spawn.py b/src/pytorch_lightning/strategies/ddp_spawn.py index f9d386ed3fa2b..c97bec421a909 100644 --- a/src/pytorch_lightning/strategies/ddp_spawn.py +++ 
b/src/pytorch_lightning/strategies/ddp_spawn.py @@ -60,7 +60,7 @@ class DDPSpawnStrategy(ParallelStrategy): def __init__( self, accelerator: Optional["pl.accelerators.accelerator.Accelerator"] = None, - parallel_devices: Optional[List[torch.device]] = None, + # parallel_devices: Optional[List[torch.device]] = None, cluster_environment: Optional[ClusterEnvironment] = None, checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, @@ -72,7 +72,7 @@ def __init__( ): super().__init__( accelerator=accelerator, - parallel_devices=parallel_devices, + # parallel_devices=parallel_devices, cluster_environment=cluster_environment, checkpoint_io=checkpoint_io, precision_plugin=precision_plugin, @@ -100,11 +100,11 @@ def local_rank(self) -> int: @property def root_device(self): - return self.parallel_devices[self.local_rank] + return torch.device("cuda", self.local_rank) @property def num_processes(self): - return len(self.parallel_devices) if self.parallel_devices is not None else 0 + return 2 @property def distributed_sampler_kwargs(self): diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index 265cfdaf13f08..90fa2fdd047b0 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -351,23 +351,23 @@ def _check_config_and_set_final_flags( else: self._cluster_environment_flag = getattr(self._strategy_flag, "cluster_environment") - if hasattr(self._strategy_flag, "parallel_devices"): - if self._strategy_flag.parallel_devices: - if self._strategy_flag.parallel_devices[0].type == "cpu": - if self._accelerator_flag and self._accelerator_flag not in ("auto", "cpu"): - raise MisconfigurationException( - f"CPU parallel_devices set through {self._strategy_flag.__class__.__name__} class," - f" but accelerator set to {self._accelerator_flag}, please choose one 
device type" - ) - self._accelerator_flag = "cpu" - if self._strategy_flag.parallel_devices[0].type == "cuda": - if self._accelerator_flag and self._accelerator_flag not in ("auto", "gpu"): - raise MisconfigurationException( - f"GPU parallel_devices set through {self._strategy_flag.__class__.__name__} class," - f" but accelerator set to {self._accelerator_flag}, please choose one device type" - ) - self._accelerator_flag = "gpu" - self._parallel_devices = self._strategy_flag.parallel_devices + # if hasattr(self._strategy_flag, "parallel_devices"): + # if self._strategy_flag.parallel_devices: + # if self._strategy_flag.parallel_devices[0].type == "cpu": + # if self._accelerator_flag and self._accelerator_flag not in ("auto", "cpu"): + # raise MisconfigurationException( + # f"CPU parallel_devices set through {self._strategy_flag.__class__.__name__} class," + # f" but accelerator set to {self._accelerator_flag}, please choose one device type" + # ) + # self._accelerator_flag = "cpu" + # if self._strategy_flag.parallel_devices[0].type == "cuda": + # if self._accelerator_flag and self._accelerator_flag not in ("auto", "gpu"): + # raise MisconfigurationException( + # f"GPU parallel_devices set through {self._strategy_flag.__class__.__name__} class," + # f" but accelerator set to {self._accelerator_flag}, please choose one device type" + # ) + # self._accelerator_flag = "gpu" + # self._parallel_devices = self._strategy_flag.parallel_devices amp_type = amp_type if isinstance(amp_type, str) else None self._amp_type_flag = AMPType.from_str(amp_type) @@ -523,8 +523,8 @@ def _set_parallel_devices_and_init_accelerator(self) -> None: self._set_devices_flag_if_auto_select_gpus_passed() self._devices_flag = self.accelerator.parse_devices(self._devices_flag) - if not self._parallel_devices: - self._parallel_devices = self.accelerator.get_parallel_devices(self._devices_flag) + # if not self._parallel_devices: + # self._parallel_devices = 
self.accelerator.get_parallel_devices(self._devices_flag) def _set_devices_flag_if_auto_passed(self) -> None: if self._devices_flag == "auto" or self._devices_flag is None: @@ -762,11 +762,11 @@ def _lazy_init_strategy(self) -> None: self.strategy.checkpoint_io = self.checkpoint_io if hasattr(self.strategy, "cluster_environment"): self.strategy.cluster_environment = self.cluster_environment - if hasattr(self.strategy, "parallel_devices"): - if self.strategy.parallel_devices: - self._parallel_devices = self.strategy.parallel_devices - else: - self.strategy.parallel_devices = self._parallel_devices + # if hasattr(self.strategy, "parallel_devices"): + # if self.strategy.parallel_devices: + # self._parallel_devices = self.strategy.parallel_devices + # else: + # self.strategy.parallel_devices = self._parallel_devices if hasattr(self.strategy, "num_nodes"): self.strategy._num_nodes = self._num_nodes_flag if hasattr(self.strategy, "_layer_sync"): From 810c0ba453d674ccb16c6c4c63b8e58275beef99 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sat, 25 Jun 2022 00:29:43 +0200 Subject: [PATCH 04/57] add cuda --- src/pytorch_lightning/utilities/device_parser.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/pytorch_lightning/utilities/device_parser.py b/src/pytorch_lightning/utilities/device_parser.py index 881a02a809ec2..16cd84d0da0a3 100644 --- a/src/pytorch_lightning/utilities/device_parser.py +++ b/src/pytorch_lightning/utilities/device_parser.py @@ -14,6 +14,8 @@ from typing import Any, List, MutableSequence, Optional, Tuple, Union import torch +import torch.cuda +import multiprocessing from pytorch_lightning.plugins.environments import TorchElasticEnvironment from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus @@ -330,3 +332,8 @@ def parse_hpus(devices: Optional[Union[int, str, List[int]]]) -> Optional[int]: raise MisconfigurationException("`devices` for `HPUAccelerator` must be int, string or None.") return int(devices) if 
isinstance(devices, str) else devices + + +def num_cuda_devices() -> int: + with multiprocessing.Pool(1) as pool: + return pool.apply(torch.cuda.device_count) From 5fd9cda74af375a0e052fa907def3bbcfb1a8bdb Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sat, 25 Jun 2022 00:38:07 +0200 Subject: [PATCH 05/57] update device count --- src/pytorch_lightning/accelerators/gpu.py | 8 ++++---- .../trainer/connectors/accelerator_connector.py | 3 ++- src/pytorch_lightning/tuner/auto_gpu_select.py | 5 +++-- src/pytorch_lightning/utilities/device_parser.py | 2 +- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/pytorch_lightning/accelerators/gpu.py b/src/pytorch_lightning/accelerators/gpu.py index dba0a750063e1..7bcb2e98966d4 100644 --- a/src/pytorch_lightning/accelerators/gpu.py +++ b/src/pytorch_lightning/accelerators/gpu.py @@ -52,7 +52,7 @@ def setup(self, trainer: "pl.Trainer") -> None: def set_nvidia_flags(local_rank: int) -> None: # set the correct cuda visible devices (using pci order) os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - all_gpu_ids = ",".join(str(x) for x in range(torch.cuda.device_count())) + all_gpu_ids = ",".join(str(x) for x in range(num_cuda_devices())) devices = os.getenv("CUDA_VISIBLE_DEVICES", all_gpu_ids) _log.info(f"LOCAL_RANK: {local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]") @@ -84,11 +84,11 @@ def get_parallel_devices(devices: List[int]) -> List[torch.device]: @staticmethod def auto_device_count() -> int: """Get the devices when set to auto.""" - return torch.cuda.device_count() + return device_parser.num_cuda_devices() @staticmethod def is_available() -> bool: - return torch.cuda.device_count() > 0 + return device_parser.num_cuda_devices() > 0 @classmethod def register_accelerators(cls, accelerator_registry: Dict) -> None: @@ -156,6 +156,6 @@ def _to_float(x: str) -> float: def _get_gpu_id(device_id: int) -> str: """Get the unmasked real GPU IDs.""" # All devices if `CUDA_VISIBLE_DEVICES` unset - default = ",".join(str(i) for 
i in range(torch.cuda.device_count())) + default = ",".join(str(i) for i in range(device_parser.num_cuda_devices())) cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES", default=default).split(",") return cuda_visible_devices[device_id].strip() diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index 90fa2fdd047b0..799da011a67b4 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -82,6 +82,7 @@ rank_zero_info, rank_zero_warn, ) +from pytorch_lightning.utilities.device_parser import num_cuda_devices from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import ( _HOROVOD_AVAILABLE, @@ -487,7 +488,7 @@ def _choose_accelerator(self) -> str: return "hpu" if MPSAccelerator.is_available(): return "mps" - if torch.cuda.is_available() and torch.cuda.device_count() > 0: + if torch.cuda.is_available() and num_cuda_devices() > 0: return "gpu" return "cpu" diff --git a/src/pytorch_lightning/tuner/auto_gpu_select.py b/src/pytorch_lightning/tuner/auto_gpu_select.py index d87eba64494f0..a42e55a61321d 100644 --- a/src/pytorch_lightning/tuner/auto_gpu_select.py +++ b/src/pytorch_lightning/tuner/auto_gpu_select.py @@ -15,6 +15,7 @@ import torch +from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -31,7 +32,7 @@ def pick_multiple_gpus(nb: int) -> List[int]: " Please select a valid number of GPU resources when using auto_select_gpus." 
) - num_gpus = torch.cuda.device_count() + num_gpus = device_parser.num_cuda_devices() if nb > num_gpus: raise MisconfigurationException(f"You requested {nb} GPUs but your machine only has {num_gpus} GPUs.") nb = num_gpus if nb == -1 else nb @@ -51,7 +52,7 @@ def pick_single_gpu(exclude_gpus: List[int]) -> int: """ previously_used_gpus = [] unused_gpus = [] - for i in range(torch.cuda.device_count()): + for i in range(device_parser.num_cuda_devices()): if i in exclude_gpus: continue diff --git a/src/pytorch_lightning/utilities/device_parser.py b/src/pytorch_lightning/utilities/device_parser.py index 16cd84d0da0a3..b32fc75e46c1c 100644 --- a/src/pytorch_lightning/utilities/device_parser.py +++ b/src/pytorch_lightning/utilities/device_parser.py @@ -252,7 +252,7 @@ def _get_all_available_cuda_gpus() -> List[int]: Returns: a list of all available CUDA gpus """ - return list(range(torch.cuda.device_count())) + return list(range(num_cuda_devices())) def _check_unique(device_ids: List[int]) -> None: From c1b4fd0d610fab83d1856521e34464177c251748 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sat, 25 Jun 2022 00:39:04 +0200 Subject: [PATCH 06/57] fork --- src/pytorch_lightning/strategies/launchers/spawn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/strategies/launchers/spawn.py b/src/pytorch_lightning/strategies/launchers/spawn.py index 28149dc5ea4a2..f9cbf41324a77 100644 --- a/src/pytorch_lightning/strategies/launchers/spawn.py +++ b/src/pytorch_lightning/strategies/launchers/spawn.py @@ -48,7 +48,7 @@ class _SpawnLauncher(_Launcher): def __init__(self, strategy: Strategy) -> None: self._strategy = strategy - self._start_method = "spawn" + self._start_method = "fork" @property def is_interactive_compatible(self) -> bool: From daa07ee1abcff3e3abd26141f8886da3ad89592a Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sat, 25 Jun 2022 00:45:06 +0200 Subject: [PATCH 07/57] cuda available --- 
.../trainer/connectors/accelerator_connector.py | 4 ++-- src/pytorch_lightning/trainer/trainer.py | 3 ++- src/pytorch_lightning/utilities/device_parser.py | 5 +++++ 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index 799da011a67b4..08f259bf46c5e 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -82,7 +82,7 @@ rank_zero_info, rank_zero_warn, ) -from pytorch_lightning.utilities.device_parser import num_cuda_devices +from pytorch_lightning.utilities.device_parser import num_cuda_devices, is_cuda_available from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import ( _HOROVOD_AVAILABLE, @@ -488,7 +488,7 @@ def _choose_accelerator(self) -> str: return "hpu" if MPSAccelerator.is_available(): return "mps" - if torch.cuda.is_available() and num_cuda_devices() > 0: + if num_cuda_devices() > 0: return "gpu" return "cpu" diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index 46774395fd5e2..c3b8d9cf3b063 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -106,6 +106,7 @@ from pytorch_lightning.utilities.auto_restart import _add_capture_metadata_collate from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.data import _auto_add_worker_init_fn, has_len_all_ranks +from pytorch_lightning.utilities.device_parser import is_cuda_available from pytorch_lightning.utilities.distributed import distributed_available from pytorch_lightning.utilities.exceptions import ExitGracefullyException, MisconfigurationException from pytorch_lightning.utilities.imports import _fault_tolerant_training @@ -1762,7 +1763,7 @@ def 
_log_device_info(self) -> None: rank_zero_info(f"HPU available: {_HPU_AVAILABLE}, using: {num_hpus} HPUs") # TODO: Integrate MPS Accelerator here, once gpu maps to both - if torch.cuda.is_available() and not isinstance(self.accelerator, GPUAccelerator): + if is_cuda_available() and not isinstance(self.accelerator, GPUAccelerator): rank_zero_warn( "GPU available but not used. Set `accelerator` and `devices` using" f" `Trainer(accelerator='gpu', devices={GPUAccelerator.auto_device_count()})`.", diff --git a/src/pytorch_lightning/utilities/device_parser.py b/src/pytorch_lightning/utilities/device_parser.py index b32fc75e46c1c..c6433cabf2edd 100644 --- a/src/pytorch_lightning/utilities/device_parser.py +++ b/src/pytorch_lightning/utilities/device_parser.py @@ -337,3 +337,8 @@ def parse_hpus(devices: Optional[Union[int, str, List[int]]]) -> Optional[int]: def num_cuda_devices() -> int: with multiprocessing.Pool(1) as pool: return pool.apply(torch.cuda.device_count) + + +def is_cuda_available() -> bool: + with multiprocessing.Pool(1) as pool: + return pool.apply(torch.cuda.is_available) From 679f3631589011631f7280c471f284cdd9ec0390 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sat, 25 Jun 2022 00:51:25 +0200 Subject: [PATCH 08/57] set device --- src/pytorch_lightning/accelerators/gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/accelerators/gpu.py b/src/pytorch_lightning/accelerators/gpu.py index 7bcb2e98966d4..e277c4830bc1b 100644 --- a/src/pytorch_lightning/accelerators/gpu.py +++ b/src/pytorch_lightning/accelerators/gpu.py @@ -40,7 +40,7 @@ def setup_environment(self, root_device: torch.device) -> None: super().setup_environment(root_device) if root_device.type != "cuda": raise MisconfigurationException(f"Device should be GPU, got {root_device} instead") - # torch.cuda.set_device(root_device) + torch.cuda.set_device(root_device) def setup(self, trainer: "pl.Trainer") -> None: # TODO refactor input from trainer to 
local_rank @four4fish From c43f8277a7fe72d552bd71da25a047f4d23917e2 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sat, 25 Jun 2022 00:52:22 +0200 Subject: [PATCH 09/57] update --- src/pytorch_lightning/accelerators/gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/accelerators/gpu.py b/src/pytorch_lightning/accelerators/gpu.py index e277c4830bc1b..42b96dae61596 100644 --- a/src/pytorch_lightning/accelerators/gpu.py +++ b/src/pytorch_lightning/accelerators/gpu.py @@ -52,7 +52,7 @@ def setup(self, trainer: "pl.Trainer") -> None: def set_nvidia_flags(local_rank: int) -> None: # set the correct cuda visible devices (using pci order) os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - all_gpu_ids = ",".join(str(x) for x in range(num_cuda_devices())) + all_gpu_ids = ",".join(str(x) for x in range(device_parser.num_cuda_devices())) devices = os.getenv("CUDA_VISIBLE_DEVICES", all_gpu_ids) _log.info(f"LOCAL_RANK: {local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]") From b7e529ef3eb1b32cdc9961187cc43b83f14da181 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sat, 25 Jun 2022 01:07:39 +0200 Subject: [PATCH 10/57] update --- src/pytorch_lightning/strategies/ddp_spawn.py | 37 +++++++------- .../strategies/launchers/spawn.py | 15 +++--- .../connectors/accelerator_connector.py | 48 +++++++++---------- 3 files changed, 52 insertions(+), 48 deletions(-) diff --git a/src/pytorch_lightning/strategies/ddp_spawn.py b/src/pytorch_lightning/strategies/ddp_spawn.py index c97bec421a909..6da3a57d5be05 100644 --- a/src/pytorch_lightning/strategies/ddp_spawn.py +++ b/src/pytorch_lightning/strategies/ddp_spawn.py @@ -60,7 +60,7 @@ class DDPSpawnStrategy(ParallelStrategy): def __init__( self, accelerator: Optional["pl.accelerators.accelerator.Accelerator"] = None, - # parallel_devices: Optional[List[torch.device]] = None, + parallel_devices: Optional[List[torch.device]] = None, cluster_environment: Optional[ClusterEnvironment] = None, checkpoint_io: 
Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, @@ -68,11 +68,12 @@ def __init__( ddp_comm_hook: Optional[callable] = None, ddp_comm_wrapper: Optional[callable] = None, process_group_backend: Optional[str] = None, + start_method: str = "spawn", **kwargs: Any, ): super().__init__( accelerator=accelerator, - # parallel_devices=parallel_devices, + parallel_devices=parallel_devices, cluster_environment=cluster_environment, checkpoint_io=checkpoint_io, precision_plugin=precision_plugin, @@ -84,6 +85,7 @@ def __init__( self._ddp_comm_wrapper = ddp_comm_wrapper self._local_rank = 0 self._process_group_backend: Optional[str] = process_group_backend + self._start_method = start_method @property def num_nodes(self) -> int: @@ -100,11 +102,11 @@ def local_rank(self) -> int: @property def root_device(self): - return torch.device("cuda", self.local_rank) + return self.parallel_devices[self.local_rank] @property def num_processes(self): - return 2 + return len(self.parallel_devices) @property def distributed_sampler_kwargs(self): @@ -120,7 +122,7 @@ def process_group_backend(self) -> Optional[str]: return self._process_group_backend def _configure_launcher(self): - self._launcher = _SpawnLauncher(self) + self._launcher = _SpawnLauncher(self, start_method=self._start_method) def setup(self, trainer: "pl.Trainer") -> None: os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port) @@ -270,17 +272,20 @@ def post_training_step(self): @classmethod def register_strategies(cls, strategy_registry: Dict) -> None: - strategy_registry.register( - "ddp_spawn_find_unused_parameters_false", - cls, - description="DDPSpawn Strategy with `find_unused_parameters` as False", - find_unused_parameters=False, - ) - strategy_registry.register( - cls.strategy_name, - cls, - description=f"{cls.__class__.__name__}", - ) + for start_method in ("spawn", "fork"): + strategy_registry.register( + f"ddp_{start_method}_find_unused_parameters_false", + cls, + 
description="DDPSpawn Strategy with `find_unused_parameters` as False", + find_unused_parameters=False, + start_method=start_method, + ) + strategy_registry.register( + f"ddp_{start_method}", + cls, + description=f"{cls.__class__.__name__}", + start_method=start_method, + ) def teardown(self) -> None: log.detail(f"{self.__class__.__name__}: tearing down strategy") diff --git a/src/pytorch_lightning/strategies/launchers/spawn.py b/src/pytorch_lightning/strategies/launchers/spawn.py index f9cbf41324a77..cfad40471d645 100644 --- a/src/pytorch_lightning/strategies/launchers/spawn.py +++ b/src/pytorch_lightning/strategies/launchers/spawn.py @@ -46,17 +46,16 @@ class _SpawnLauncher(_Launcher): strategy: A reference to the strategy that is used together with this launcher. """ - def __init__(self, strategy: Strategy) -> None: + def __init__(self, strategy: Strategy, start_method: str = "spawn") -> None: self._strategy = strategy - self._start_method = "fork" + self._start_method = start_method @property def is_interactive_compatible(self) -> bool: - # The start method 'spawn' is currently the only one that works with DDP and CUDA support - # The start method 'fork' is the only one supported in Jupyter environments but not compatible with CUDA - # For more context, see https://github.com/Lightning-AI/lightning/issues/7550 - # return self._start_method == "fork" and self._strategy.root_device.type != "cuda" - return True + # The start method 'spawn' is not supporrted in interactive environments + # The start method 'fork' is the only one supported in Jupyter environments, with constraints around CUDA + # initialization. For more context, see https://github.com/Lightning-AI/lightning/issues/7550 + return self._start_method == "fork" def launch(self, function: Callable, *args: Any, trainer: Optional["pl.Trainer"] = None, **kwargs: Any) -> Any: """Spawns processes that run the given function in parallel. 
@@ -81,7 +80,7 @@ def launch(self, function: Callable, *args: Any, trainer: Optional["pl.Trainer"] self._wrapping_function, args=(trainer, function, args, kwargs, return_queue), nprocs=self._strategy.num_processes, - start_method="fork", + start_method=self._start_method, ) spawn_output = return_queue.get() if trainer is None: diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index 08f259bf46c5e..1f84e3d4e7e15 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -352,23 +352,23 @@ def _check_config_and_set_final_flags( else: self._cluster_environment_flag = getattr(self._strategy_flag, "cluster_environment") - # if hasattr(self._strategy_flag, "parallel_devices"): - # if self._strategy_flag.parallel_devices: - # if self._strategy_flag.parallel_devices[0].type == "cpu": - # if self._accelerator_flag and self._accelerator_flag not in ("auto", "cpu"): - # raise MisconfigurationException( - # f"CPU parallel_devices set through {self._strategy_flag.__class__.__name__} class," - # f" but accelerator set to {self._accelerator_flag}, please choose one device type" - # ) - # self._accelerator_flag = "cpu" - # if self._strategy_flag.parallel_devices[0].type == "cuda": - # if self._accelerator_flag and self._accelerator_flag not in ("auto", "gpu"): - # raise MisconfigurationException( - # f"GPU parallel_devices set through {self._strategy_flag.__class__.__name__} class," - # f" but accelerator set to {self._accelerator_flag}, please choose one device type" - # ) - # self._accelerator_flag = "gpu" - # self._parallel_devices = self._strategy_flag.parallel_devices + if hasattr(self._strategy_flag, "parallel_devices"): + if self._strategy_flag.parallel_devices: + if self._strategy_flag.parallel_devices[0].type == "cpu": + if self._accelerator_flag and self._accelerator_flag not in 
("auto", "cpu"): + raise MisconfigurationException( + f"CPU parallel_devices set through {self._strategy_flag.__class__.__name__} class," + f" but accelerator set to {self._accelerator_flag}, please choose one device type" + ) + self._accelerator_flag = "cpu" + if self._strategy_flag.parallel_devices[0].type == "cuda": + if self._accelerator_flag and self._accelerator_flag not in ("auto", "gpu"): + raise MisconfigurationException( + f"GPU parallel_devices set through {self._strategy_flag.__class__.__name__} class," + f" but accelerator set to {self._accelerator_flag}, please choose one device type" + ) + self._accelerator_flag = "gpu" + self._parallel_devices = self._strategy_flag.parallel_devices amp_type = amp_type if isinstance(amp_type, str) else None self._amp_type_flag = AMPType.from_str(amp_type) @@ -524,8 +524,8 @@ def _set_parallel_devices_and_init_accelerator(self) -> None: self._set_devices_flag_if_auto_select_gpus_passed() self._devices_flag = self.accelerator.parse_devices(self._devices_flag) - # if not self._parallel_devices: - # self._parallel_devices = self.accelerator.get_parallel_devices(self._devices_flag) + if not self._parallel_devices: + self._parallel_devices = self.accelerator.get_parallel_devices(self._devices_flag) def _set_devices_flag_if_auto_passed(self) -> None: if self._devices_flag == "auto" or self._devices_flag is None: @@ -763,11 +763,11 @@ def _lazy_init_strategy(self) -> None: self.strategy.checkpoint_io = self.checkpoint_io if hasattr(self.strategy, "cluster_environment"): self.strategy.cluster_environment = self.cluster_environment - # if hasattr(self.strategy, "parallel_devices"): - # if self.strategy.parallel_devices: - # self._parallel_devices = self.strategy.parallel_devices - # else: - # self.strategy.parallel_devices = self._parallel_devices + if hasattr(self.strategy, "parallel_devices"): + if self.strategy.parallel_devices: + self._parallel_devices = self.strategy.parallel_devices + else: + 
self.strategy.parallel_devices = self._parallel_devices if hasattr(self.strategy, "num_nodes"): self.strategy._num_nodes = self._num_nodes_flag if hasattr(self.strategy, "_layer_sync"): From b51d172e0e0b444be3a08738ce181723de4a34ce Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sat, 25 Jun 2022 04:55:40 +0200 Subject: [PATCH 11/57] cuda available --- src/pytorch_lightning/profilers/pytorch.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pytorch_lightning/profilers/pytorch.py b/src/pytorch_lightning/profilers/pytorch.py index c9340444a06eb..079aafe37ec8b 100644 --- a/src/pytorch_lightning/profilers/pytorch.py +++ b/src/pytorch_lightning/profilers/pytorch.py @@ -24,6 +24,7 @@ from torch.autograd.profiler import record_function from pytorch_lightning.profilers.profiler import Profiler +from pytorch_lightning.utilities.device_parser import is_cuda_available from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _KINETO_AVAILABLE from pytorch_lightning.utilities.rank_zero import rank_zero_warn @@ -368,7 +369,7 @@ def _default_activities(self) -> List["ProfilerActivity"]: return activities if self._profiler_kwargs.get("use_cpu", True): activities.append(ProfilerActivity.CPU) - if self._profiler_kwargs.get("use_cuda", torch.cuda.is_available()): + if self._profiler_kwargs.get("use_cuda", is_cuda_available()): activities.append(ProfilerActivity.CUDA) return activities From 9b4194122dc1c01cb1d27c057798e3457719524d Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sat, 25 Jun 2022 04:57:37 +0200 Subject: [PATCH 12/57] formatting --- src/pytorch_lightning/strategies/ddp_spawn.py | 2 +- .../trainer/connectors/accelerator_connector.py | 2 +- src/pytorch_lightning/utilities/device_parser.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pytorch_lightning/strategies/ddp_spawn.py b/src/pytorch_lightning/strategies/ddp_spawn.py index 6da3a57d5be05..9b3ca7d03a0f3 
100644 --- a/src/pytorch_lightning/strategies/ddp_spawn.py +++ b/src/pytorch_lightning/strategies/ddp_spawn.py @@ -106,7 +106,7 @@ def root_device(self): @property def num_processes(self): - return len(self.parallel_devices) + return len(self.parallel_devices) if self.parallel_devices is not None else 0 @property def distributed_sampler_kwargs(self): diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index 1f84e3d4e7e15..c0cdaadc8a17c 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -82,7 +82,7 @@ rank_zero_info, rank_zero_warn, ) -from pytorch_lightning.utilities.device_parser import num_cuda_devices, is_cuda_available +from pytorch_lightning.utilities.device_parser import is_cuda_available, num_cuda_devices from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import ( _HOROVOD_AVAILABLE, diff --git a/src/pytorch_lightning/utilities/device_parser.py b/src/pytorch_lightning/utilities/device_parser.py index c6433cabf2edd..26c1d41e64f77 100644 --- a/src/pytorch_lightning/utilities/device_parser.py +++ b/src/pytorch_lightning/utilities/device_parser.py @@ -11,11 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import multiprocessing from typing import Any, List, MutableSequence, Optional, Tuple, Union import torch import torch.cuda -import multiprocessing from pytorch_lightning.plugins.environments import TorchElasticEnvironment from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus From 9cea97911999636055db573c161b97acc696d775 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sat, 25 Jun 2022 04:59:43 +0200 Subject: [PATCH 13/57] unused import --- .../trainer/connectors/accelerator_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index c0cdaadc8a17c..95ce629c7a478 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -82,7 +82,7 @@ rank_zero_info, rank_zero_warn, ) -from pytorch_lightning.utilities.device_parser import is_cuda_available, num_cuda_devices +from pytorch_lightning.utilities.device_parser import num_cuda_devices from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import ( _HOROVOD_AVAILABLE, From daccd2121c85d018dd90bdda8b86bd8b9ab3a98b Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 27 Jun 2022 09:58:31 +0200 Subject: [PATCH 14/57] test fixes --- .../connectors/accelerator_connector.py | 3 +- .../test_accelerator_connector.py | 67 +++++++++---------- .../tests_pytorch/accelerators/test_common.py | 4 +- .../deprecated_api/test_remove_2-0.py | 4 +- tests/tests_pytorch/models/test_gpu.py | 4 +- .../tests_pytorch/plugins/test_amp_plugins.py | 12 ++-- .../plugins/test_cluster_integration.py | 4 +- .../strategies/test_bagua_strategy.py | 2 +- tests/tests_pytorch/strategies/test_ddp.py | 8 +-- .../test_ddp_fully_sharded_native.py | 4 +- ..._ddp_fully_sharded_with_full_state_dict.py | 4 +- 
.../strategies/test_deepspeed_strategy.py | 2 +- tests/tests_pytorch/strategies/test_dp.py | 4 +- .../trainer/flags/test_env_vars.py | 4 +- .../properties/test_auto_gpu_select.py | 4 +- .../tests_pytorch/trainer/test_supporters.py | 4 +- tests/tests_pytorch/trainer/test_trainer.py | 4 +- tests/tests_pytorch/utilities/test_cli.py | 4 +- 18 files changed, 70 insertions(+), 72 deletions(-) diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index 95ce629c7a478..62eebab400a67 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -82,7 +82,6 @@ rank_zero_info, rank_zero_warn, ) -from pytorch_lightning.utilities.device_parser import num_cuda_devices from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import ( _HOROVOD_AVAILABLE, @@ -488,7 +487,7 @@ def _choose_accelerator(self) -> str: return "hpu" if MPSAccelerator.is_available(): return "mps" - if num_cuda_devices() > 0: + if GPUAccelerator.is_available(): return "gpu" return "cpu" diff --git a/tests/tests_pytorch/accelerators/test_accelerator_connector.py b/tests/tests_pytorch/accelerators/test_accelerator_connector.py index 20cac155f9915..b1ec1bc65cf94 100644 --- a/tests/tests_pytorch/accelerators/test_accelerator_connector.py +++ b/tests/tests_pytorch/accelerators/test_accelerator_connector.py @@ -101,7 +101,7 @@ def _test_strategy_choice_ddp_and_cpu(tmpdir, ddp_strategy_class): "SLURM_LOCALID": "0", }, ) -@mock.patch("torch.cuda.device_count", return_value=0) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=0) def test_custom_cluster_environment_in_slurm_environment(_, tmpdir): """Test that we choose the custom cluster even when SLURM or TE flags are around.""" @@ -138,7 +138,7 @@ def creates_processes_externally(self) 
-> bool: "SLURM_LOCALID": "0", }, ) -@mock.patch("torch.cuda.device_count", return_value=0) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=0) @mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) def test_custom_accelerator(device_count_mock, setup_distributed_mock): class Accel(Accelerator): @@ -197,7 +197,7 @@ class Strat(DDPStrategy): "SLURM_LOCALID": "0", }, ) -@mock.patch("torch.cuda.device_count", return_value=0) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=0) @mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) def test_dist_backend_accelerator_mapping(*_): trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", accelerator="cpu", devices=2) @@ -206,7 +206,7 @@ def test_dist_backend_accelerator_mapping(*_): assert trainer.strategy.local_rank == 0 -@mock.patch("torch.cuda.device_count", return_value=2) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) def test_ipython_incompatible_backend_error(_, monkeypatch): monkeypatch.setattr(pytorch_lightning.utilities, "_IS_INTERACTIVE", True) with pytest.raises(MisconfigurationException, match=r"strategy='ddp'\)`.*is not compatible"): @@ -226,7 +226,7 @@ def test_ipython_incompatible_backend_error(_, monkeypatch): Trainer(strategy="dp") -@mock.patch("torch.cuda.device_count", return_value=2) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) def test_ipython_compatible_dp_strategy_gpu(_, monkeypatch): monkeypatch.setattr(pytorch_lightning.utilities, "_IS_INTERACTIVE", True) trainer = Trainer(strategy="dp", accelerator="gpu") @@ -251,8 +251,8 @@ def test_ipython_compatible_strategy_tpu(mock_tpu_acc_avail, monkeypatch): ], ) @pytest.mark.parametrize("devices", [1, 2]) -@mock.patch("torch.cuda.is_available", return_value=True) -@mock.patch("torch.cuda.device_count", return_value=2) 
+@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) def test_accelerator_choice_multi_node_gpu( mock_is_available, mock_device_count, tmpdir, strategy, strategy_class, devices ): @@ -260,7 +260,7 @@ def test_accelerator_choice_multi_node_gpu( assert isinstance(trainer.strategy, strategy_class) -@mock.patch("torch.cuda.is_available", return_value=False) +@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=False) def test_accelerator_cpu(_): trainer = Trainer(accelerator="cpu") assert isinstance(trainer.accelerator, CPUAccelerator) @@ -282,8 +282,8 @@ def test_accelerator_cpu(_): Trainer(accelerator="cpu", gpus=1) -@mock.patch("torch.cuda.device_count", return_value=2) -@mock.patch("torch.cuda.is_available", return_value=True) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) @pytest.mark.parametrize("device_count", (["0"], [0, "1"], ["GPU"], [["0", "1"], [0, 1]], [False])) def test_accelererator_invalid_type_devices(mock_is_available, mock_device_count, device_count): with pytest.raises( @@ -431,8 +431,8 @@ def test_strategy_choice_ddp_spawn_cpu(tmpdir): @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) -@mock.patch("torch.cuda.device_count", return_value=2) -@mock.patch("torch.cuda.is_available", return_value=True) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) def test_strategy_choice_ddp(*_): trainer = Trainer(fast_dev_run=True, strategy="ddp", accelerator="gpu", devices=1) assert isinstance(trainer.accelerator, GPUAccelerator) @@ -441,8 +441,8 @@ def test_strategy_choice_ddp(*_): @mock.patch.dict(os.environ, 
{"CUDA_VISIBLE_DEVICES": "0,1"}) -@mock.patch("torch.cuda.device_count", return_value=2) -@mock.patch("torch.cuda.is_available", return_value=True) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) def test_strategy_choice_ddp_spawn(cuda_available_mock, device_count_mock): trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", accelerator="gpu", devices=1) assert isinstance(trainer.accelerator, GPUAccelerator) @@ -486,9 +486,9 @@ def test_strategy_choice_ddp_slurm(setup_distributed_mock, strategy): }, ) @mock.patch("torch.cuda.set_device") -@mock.patch("torch.cuda.device_count", return_value=2) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) @mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) -@mock.patch("torch.cuda.is_available", return_value=True) +@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) @pytest.mark.parametrize("strategy", ["ddp2", DDP2Strategy()]) def test_strategy_choice_ddp2_slurm( set_device_mock, device_count_mock, setup_distributed_mock, is_available_mock, strategy @@ -515,10 +515,10 @@ def test_strategy_choice_ddp2_slurm( }, ) @mock.patch("torch.cuda.set_device") -@mock.patch("torch.cuda.device_count", return_value=2) -@mock.patch("torch.cuda.is_available", return_value=True) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) @mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) -@mock.patch("torch.cuda.is_available", return_value=True) +@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) def test_strategy_choice_ddp_te(*_): trainer = Trainer(fast_dev_run=True, strategy="ddp", 
accelerator="gpu", devices=2) assert isinstance(trainer.accelerator, GPUAccelerator) @@ -541,10 +541,10 @@ def test_strategy_choice_ddp_te(*_): }, ) @mock.patch("torch.cuda.set_device") -@mock.patch("torch.cuda.device_count", return_value=2) -@mock.patch("torch.cuda.is_available", return_value=True) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) @mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) -@mock.patch("torch.cuda.is_available", return_value=True) +@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) def test_strategy_choice_ddp2_te(*_): trainer = Trainer(fast_dev_run=True, strategy="ddp2", accelerator="gpu", devices=2) assert isinstance(trainer.accelerator, GPUAccelerator) @@ -565,7 +565,7 @@ def test_strategy_choice_ddp2_te(*_): "TORCHELASTIC_RUN_ID": "1", }, ) -@mock.patch("torch.cuda.device_count", return_value=0) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=0) @mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) def test_strategy_choice_ddp_cpu_te(*_): trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", accelerator="cpu", devices=2) @@ -588,10 +588,10 @@ def test_strategy_choice_ddp_cpu_te(*_): }, ) @mock.patch("torch.cuda.set_device") -@mock.patch("torch.cuda.device_count", return_value=1) -@mock.patch("torch.cuda.is_available", return_value=True) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1) +@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) @mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) -@mock.patch("torch.cuda.is_available", return_value=True) +@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", 
return_value=True) def test_strategy_choice_ddp_kubeflow(*_): trainer = Trainer(fast_dev_run=True, strategy="ddp", accelerator="gpu", devices=1) assert isinstance(trainer.accelerator, GPUAccelerator) @@ -611,7 +611,7 @@ def test_strategy_choice_ddp_kubeflow(*_): "RANK": "1", }, ) -@mock.patch("torch.cuda.device_count", return_value=0) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=0) @mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) def test_strategy_choice_ddp_cpu_kubeflow(*_): trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", accelerator="cpu", devices=2) @@ -633,7 +633,7 @@ def test_strategy_choice_ddp_cpu_kubeflow(*_): "SLURM_LOCALID": "0", }, ) -@mock.patch("torch.cuda.device_count", return_value=0) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=0) @mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) @pytest.mark.parametrize("strategy", ["ddp", DDPStrategy()]) def test_strategy_choice_ddp_cpu_slurm(device_count_mock, setup_distributed_mock, strategy): @@ -655,8 +655,8 @@ def test_check_native_fsdp_strategy_and_fallback(): @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}) -@mock.patch("torch.cuda.device_count", return_value=1) -@mock.patch("torch.cuda.is_available", return_value=True) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1) +@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) @RunIf(min_torch="1.11") def test_mixed_precision_support_with_native_fsdp_strategy(device_count_mock, mock_cuda_available, tmpdir): with pytest.raises( @@ -702,7 +702,7 @@ def test_unsupported_ipu_choice(mock_ipu_acc_avail, monkeypatch): Trainer(accelerator="ipu", precision=64) -@mock.patch("torch.cuda.is_available", return_value=False) +@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", 
return_value=False) @mock.patch("pytorch_lightning.utilities.imports._TPU_AVAILABLE", return_value=False) @mock.patch("pytorch_lightning.utilities.imports._IPU_AVAILABLE", return_value=False) @mock.patch("pytorch_lightning.utilities.imports._HPU_AVAILABLE", return_value=False) @@ -713,11 +713,10 @@ def test_devices_auto_choice_cpu( assert trainer.num_devices == 1 -@mock.patch("torch.cuda.is_available", return_value=True) -@mock.patch("torch.cuda.device_count", return_value=2) +@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) @RunIf(mps=False) def test_devices_auto_choice_gpu(is_gpu_available_mock, device_count_mock): - trainer = Trainer(accelerator="auto", devices="auto") assert isinstance(trainer.accelerator, GPUAccelerator) assert trainer.num_devices == 2 diff --git a/tests/tests_pytorch/accelerators/test_common.py b/tests/tests_pytorch/accelerators/test_common.py index 3de26b5888390..a98a192c6b659 100644 --- a/tests/tests_pytorch/accelerators/test_common.py +++ b/tests/tests_pytorch/accelerators/test_common.py @@ -18,8 +18,8 @@ from pytorch_lightning.strategies import DDPStrategy -@mock.patch("torch.cuda.device_count", return_value=2) -def test_auto_device_count(device_count_mock): +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) +def test_auto_device_count(_): assert CPUAccelerator.auto_device_count() == 1 assert GPUAccelerator.auto_device_count() == 2 assert TPUAccelerator.auto_device_count() == 8 diff --git a/tests/tests_pytorch/deprecated_api/test_remove_2-0.py b/tests/tests_pytorch/deprecated_api/test_remove_2-0.py index c54afd0931cff..3a13ba340d3d0 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_2-0.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_2-0.py @@ -27,8 +27,8 @@ def test_v2_0_0_deprecated_num_processes(): _ = Trainer(num_processes=2) 
-@mock.patch("torch.cuda.is_available", return_value=True) -@mock.patch("torch.cuda.device_count", return_value=2) +@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) def test_v2_0_0_deprecated_gpus(*_): with pytest.deprecated_call(match=r"is deprecated in v1.7 and will be removed in v2.0."): _ = Trainer(gpus=0) diff --git a/tests/tests_pytorch/models/test_gpu.py b/tests/tests_pytorch/models/test_gpu.py index ffd093e6ee0e3..a524d7cd73837 100644 --- a/tests/tests_pytorch/models/test_gpu.py +++ b/tests/tests_pytorch/models/test_gpu.py @@ -185,8 +185,8 @@ def test_parse_gpu_returns_none_when_no_devices_are_available(mocked_device_coun "TORCHELASTIC_RUN_ID": "1", }, ) -@mock.patch("torch.cuda.device_count", return_value=1) -@mock.patch("torch.cuda.is_available", return_value=True) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1) +@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) @pytest.mark.parametrize("gpus", [[0, 1, 2], 2, "0", [0, 2]]) def test_torchelastic_gpu_parsing(mocked_device_count, mocked_is_available, gpus): """Ensure when using torchelastic and nproc_per_node is set to the default of 1 per GPU device That we omit diff --git a/tests/tests_pytorch/plugins/test_amp_plugins.py b/tests/tests_pytorch/plugins/test_amp_plugins.py index 4f990e74b20aa..c6e7c018af711 100644 --- a/tests/tests_pytorch/plugins/test_amp_plugins.py +++ b/tests/tests_pytorch/plugins/test_amp_plugins.py @@ -45,8 +45,8 @@ class MyApexPlugin(ApexMixedPrecisionPlugin): "SLURM_LOCALID": "0", }, ) -@mock.patch("torch.cuda.is_available", return_value=True) -@mock.patch("torch.cuda.device_count", return_value=2) +@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", 
return_value=2) @pytest.mark.parametrize("strategy,devices", [("ddp", 2), ("ddp2", 2), ("ddp_spawn", 2)]) @pytest.mark.parametrize( "amp,custom_plugin,plugin_cls", @@ -272,16 +272,16 @@ def test_precision_selection_raises(monkeypatch): with pytest.raises(MisconfigurationException, match=r"amp_type='apex', precision='bf16'\)` but it's not supported"): Trainer(amp_backend="apex", precision="bf16") - with mock.patch("torch.cuda.device_count", return_value=1), pytest.raises( + with mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1), pytest.raises( MisconfigurationException, match="Sharded plugins are not supported with apex" ): - with mock.patch("torch.cuda.is_available", return_value=True): + with mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True): Trainer(amp_backend="apex", precision=16, accelerator="gpu", devices=1, strategy="ddp_fully_sharded") import pytorch_lightning.plugins.precision.apex_amp as apex monkeypatch.setattr(apex, "_APEX_AVAILABLE", False) - with mock.patch("torch.cuda.device_count", return_value=1), mock.patch( - "torch.cuda.is_available", return_value=True + with mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1), mock.patch( + "pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True ), pytest.raises(MisconfigurationException, match="asked for Apex AMP but you have not installed it"): Trainer(amp_backend="apex", precision=16, accelerator="gpu", devices=1) diff --git a/tests/tests_pytorch/plugins/test_cluster_integration.py b/tests/tests_pytorch/plugins/test_cluster_integration.py index 10ed1d6a4a650..804a5bd659f1d 100644 --- a/tests/tests_pytorch/plugins/test_cluster_integration.py +++ b/tests/tests_pytorch/plugins/test_cluster_integration.py @@ -90,8 +90,8 @@ def test_ranks_available_manual_strategy_selection(mock_gpu_acc_available, strat dict(strategy="ddp_spawn", accelerator="gpu", devices=[1, 2]), ], ) 
-@mock.patch("torch.cuda.is_available", return_value=True) -@mock.patch("torch.cuda.device_count", return_value=4) +@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=4) def test_ranks_available_automatic_strategy_selection(mock0, mock1, trainer_kwargs): """Test that the rank information is readily available after Trainer initialization.""" num_nodes = 2 diff --git a/tests/tests_pytorch/strategies/test_bagua_strategy.py b/tests/tests_pytorch/strategies/test_bagua_strategy.py index c9ccae43edbf3..79ec701964f8f 100644 --- a/tests/tests_pytorch/strategies/test_bagua_strategy.py +++ b/tests/tests_pytorch/strategies/test_bagua_strategy.py @@ -118,6 +118,6 @@ def test_bagua_not_available(monkeypatch): import pytorch_lightning.strategies.bagua as imports monkeypatch.setattr(imports, "_BAGUA_AVAILABLE", False) - with mock.patch("torch.cuda.device_count", return_value=1): + with mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1): with pytest.raises(MisconfigurationException, match="you must have `Bagua` installed"): Trainer(strategy="bagua", accelerator="gpu", devices=1) diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py index 58fa28559b97f..00e83ec6bf5ed 100644 --- a/tests/tests_pytorch/strategies/test_ddp.py +++ b/tests/tests_pytorch/strategies/test_ddp.py @@ -80,11 +80,11 @@ def test_multi_gpu_model_ddp_fit_test(tmpdir, as_module): @RunIf(skip_windows=True) @pytest.mark.skipif(torch.cuda.is_available(), reason="test doesn't requires GPU machine") -@mock.patch("torch.cuda.is_available", return_value=True) +@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) def test_torch_distributed_backend_env_variables(tmpdir): """This test set `undefined` as torch backend and should raise an `Backend.UNDEFINED` ValueError.""" 
_environ = {"PL_TORCH_DISTRIBUTED_BACKEND": "undefined", "CUDA_VISIBLE_DEVICES": "0,1", "WORLD_SIZE": "2"} - with patch.dict(os.environ, _environ), patch("torch.cuda.device_count", return_value=2): + with patch.dict(os.environ, _environ), patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2): with pytest.deprecated_call(match="Environment variable `PL_TORCH_DISTRIBUTED_BACKEND` was deprecated in v1.6"): with pytest.raises(ValueError, match="Invalid backend: 'undefined'"): model = BoringModel() @@ -101,8 +101,8 @@ def test_torch_distributed_backend_env_variables(tmpdir): @RunIf(skip_windows=True) @mock.patch("torch.cuda.set_device") -@mock.patch("torch.cuda.is_available", return_value=True) -@mock.patch("torch.cuda.device_count", return_value=1) +@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1) @mock.patch("pytorch_lightning.accelerators.gpu.GPUAccelerator.is_available", return_value=True) @mock.patch.dict(os.environ, {"PL_TORCH_DISTRIBUTED_BACKEND": "gloo"}, clear=True) def test_ddp_torch_dist_is_available_in_setup( diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py index b6dbff1792668..10233b84af3b9 100644 --- a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py +++ b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py @@ -32,8 +32,8 @@ def test_invalid_on_cpu(tmpdir): @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}) -@mock.patch("torch.cuda.device_count", return_value=1) -@mock.patch("torch.cuda.is_available", return_value=True) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1) +@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) @RunIf(min_torch="1.12dev") def 
test_fsdp_with_sharded_amp(device_count_mock, mock_cuda_available, tmpdir): """Test to ensure that plugin native amp plugin raises Misconfiguration error.""" diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py index c1120fa4e2be9..2790f014c7212 100644 --- a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py +++ b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_with_full_state_dict.py @@ -29,8 +29,8 @@ def test_invalid_on_cpu(tmpdir): @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}) -@mock.patch("torch.cuda.device_count", return_value=1) -@mock.patch("torch.cuda.is_available", return_value=True) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1) +@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) @RunIf(fairscale_fully_sharded=True) def test_fsdp_with_sharded_amp(device_count_mock, mock_cuda_available, tmpdir): """Test to ensure that plugin native amp plugin is correctly chosen when using sharded.""" diff --git a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py index 41faee02f315d..cba1b34d20f7c 100644 --- a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py +++ b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py @@ -176,7 +176,7 @@ def test_deepspeed_strategy_env(tmpdir, monkeypatch, deepspeed_config): @RunIf(deepspeed=True) -@mock.patch("torch.cuda.device_count", return_value=1) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1) @pytest.mark.parametrize("precision", [16, "mixed"]) @pytest.mark.parametrize( "amp_backend", diff --git a/tests/tests_pytorch/strategies/test_dp.py b/tests/tests_pytorch/strategies/test_dp.py index 4a1c504e12bf8..30e0e5b19a845 100644 --- 
a/tests/tests_pytorch/strategies/test_dp.py +++ b/tests/tests_pytorch/strategies/test_dp.py @@ -154,8 +154,8 @@ def _assert_extra_outputs(self, outputs): assert out.dtype is torch.float -@mock.patch("torch.cuda.device_count", return_value=2) -@mock.patch("torch.cuda.is_available", return_value=True) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) def test_dp_raise_exception_with_batch_transfer_hooks(mock_is_available, mock_device_count, tmpdir): """Test that an exception is raised when overriding batch_transfer_hooks in DP model.""" diff --git a/tests/tests_pytorch/trainer/flags/test_env_vars.py b/tests/tests_pytorch/trainer/flags/test_env_vars.py index e7c9a13a0cd3c..9e7bd70468482 100644 --- a/tests/tests_pytorch/trainer/flags/test_env_vars.py +++ b/tests/tests_pytorch/trainer/flags/test_env_vars.py @@ -46,8 +46,8 @@ def test_passing_env_variables_defaults(): @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1", "PL_TRAINER_DEVICES": "2"}) -@mock.patch("torch.cuda.device_count", return_value=2) -@mock.patch("torch.cuda.is_available", return_value=True) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) def test_passing_env_variables_devices(cuda_available_mock, device_count_mock): """Testing overwriting trainer arguments.""" trainer = Trainer() diff --git a/tests/tests_pytorch/trainer/properties/test_auto_gpu_select.py b/tests/tests_pytorch/trainer/properties/test_auto_gpu_select.py index 3800f5bc8c529..aa9f15bc43c18 100644 --- a/tests/tests_pytorch/trainer/properties/test_auto_gpu_select.py +++ b/tests/tests_pytorch/trainer/properties/test_auto_gpu_select.py @@ -42,13 +42,13 @@ def test_pick_multiple_gpus(nb, expected_gpu_idxs, expected_error): assert expected_gpu_idxs == 
pick_multiple_gpus(nb) -@mock.patch("torch.cuda.device_count", return_value=1) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1) def test_pick_multiple_gpus_more_than_available(*_): with pytest.raises(MisconfigurationException, match="You requested 3 GPUs but your machine only has 1 GPUs"): pick_multiple_gpus(3) -@mock.patch("torch.cuda.device_count", return_value=2) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) @mock.patch("pytorch_lightning.trainer.connectors.accelerator_connector.pick_multiple_gpus", return_value=[1]) def test_auto_select_gpus(*_): diff --git a/tests/tests_pytorch/trainer/test_supporters.py b/tests/tests_pytorch/trainer/test_supporters.py index 22b10c8451b70..324070fa87602 100644 --- a/tests/tests_pytorch/trainer/test_supporters.py +++ b/tests/tests_pytorch/trainer/test_supporters.py @@ -314,8 +314,8 @@ def test_nested_calc_num_data(input_data, compute_func, expected_length): @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) -@mock.patch("torch.cuda.device_count", return_value=2) -@mock.patch("torch.cuda.is_available", return_value=True) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) +@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) @pytest.mark.parametrize("use_fault_tolerant", [False, True]) @pytest.mark.parametrize("replace_sampler_ddp", [False, True]) def test_combined_data_loader_validation_test( diff --git a/tests/tests_pytorch/trainer/test_trainer.py b/tests/tests_pytorch/trainer/test_trainer.py index 5966f4a41267e..a4d0006828fdd 100644 --- a/tests/tests_pytorch/trainer/test_trainer.py +++ b/tests/tests_pytorch/trainer/test_trainer.py @@ -1242,8 +1242,8 @@ def __init__(self, **kwargs): "trainer_params", [{"max_epochs": 1, "accelerator": "gpu", "devices": 1}, {"max_epochs": 1, "accelerator": "gpu", "devices": [0]}], ) 
-@mock.patch("torch.cuda.is_available", return_value=True) -@mock.patch("torch.cuda.device_count", return_value=1) +@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1) def test_trainer_omegaconf(_, __, trainer_params): config = OmegaConf.create(trainer_params) Trainer(**config) diff --git a/tests/tests_pytorch/utilities/test_cli.py b/tests/tests_pytorch/utilities/test_cli.py index 655d9849a64ca..fad402a9ced67 100644 --- a/tests/tests_pytorch/utilities/test_cli.py +++ b/tests/tests_pytorch/utilities/test_cli.py @@ -201,8 +201,8 @@ def test_parse_args_parsing_complex_types(cli_args, expected, instantiate): ) def test_parse_args_parsing_gpus(monkeypatch, cli_args, expected_gpu): """Test parsing of gpus and instantiation of Trainer.""" - monkeypatch.setattr("torch.cuda.device_count", lambda: 2) - monkeypatch.setattr("torch.cuda.is_available", lambda: True) + monkeypatch.setattr("pytorch_lightning.utilities.device_parser.num_cuda_devices", lambda: 2) + monkeypatch.setattr("pytorch_lightning.utilities.device_parser.is_cuda_available", lambda: True) cli_args = cli_args.split(" ") if cli_args else [] with mock.patch("sys.argv", ["any.py"] + cli_args): parser = LightningArgumentParser(add_help=False, parse_as_dict=False) From 0ccc3b908bfb85f188d098fd52beb73dcace5050 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 27 Jun 2022 08:00:06 +0000 Subject: [PATCH 15/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/tests_pytorch/strategies/test_ddp.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py index 00e83ec6bf5ed..91740102c7daf 100644 --- a/tests/tests_pytorch/strategies/test_ddp.py +++ 
b/tests/tests_pytorch/strategies/test_ddp.py @@ -84,7 +84,9 @@ def test_multi_gpu_model_ddp_fit_test(tmpdir, as_module): def test_torch_distributed_backend_env_variables(tmpdir): """This test set `undefined` as torch backend and should raise an `Backend.UNDEFINED` ValueError.""" _environ = {"PL_TORCH_DISTRIBUTED_BACKEND": "undefined", "CUDA_VISIBLE_DEVICES": "0,1", "WORLD_SIZE": "2"} - with patch.dict(os.environ, _environ), patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2): + with patch.dict(os.environ, _environ), patch( + "pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2 + ): with pytest.deprecated_call(match="Environment variable `PL_TORCH_DISTRIBUTED_BACKEND` was deprecated in v1.6"): with pytest.raises(ValueError, match="Invalid backend: 'undefined'"): model = BoringModel() From a080ec018c90d0f9bc9fa6461b3d586b706498d6 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 27 Jun 2022 10:02:56 +0200 Subject: [PATCH 16/57] add docstring --- src/pytorch_lightning/utilities/device_parser.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/pytorch_lightning/utilities/device_parser.py b/src/pytorch_lightning/utilities/device_parser.py index 26c1d41e64f77..478dd63c87159 100644 --- a/src/pytorch_lightning/utilities/device_parser.py +++ b/src/pytorch_lightning/utilities/device_parser.py @@ -335,10 +335,18 @@ def parse_hpus(devices: Optional[Union[int, str, List[int]]]) -> Optional[int]: def num_cuda_devices() -> int: + """Returns the number of GPUs available. + + Unlike :func:`torch.cuda.device_count`, calling this function will not create a new CUDA context. + """ with multiprocessing.Pool(1) as pool: return pool.apply(torch.cuda.device_count) def is_cuda_available() -> bool: + """Returns a bool indicating if CUDA is currently available. + + Unlike :func:`torch.cuda.is_available`, calling this function will not create a new CUDA context. 
+ """ with multiprocessing.Pool(1) as pool: return pool.apply(torch.cuda.is_available) From eae67cd59837103f27b46542fc752b6955dad383 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 27 Jun 2022 10:04:07 +0200 Subject: [PATCH 17/57] update --- src/pytorch_lightning/trainer/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index c3b8d9cf3b063..af9612ccad9c9 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -1763,7 +1763,7 @@ def _log_device_info(self) -> None: rank_zero_info(f"HPU available: {_HPU_AVAILABLE}, using: {num_hpus} HPUs") # TODO: Integrate MPS Accelerator here, once gpu maps to both - if is_cuda_available() and not isinstance(self.accelerator, GPUAccelerator): + if GPUAccelerator.is_available() and not isinstance(self.accelerator, GPUAccelerator): rank_zero_warn( "GPU available but not used. Set `accelerator` and `devices` using" f" `Trainer(accelerator='gpu', devices={GPUAccelerator.auto_device_count()})`.", From 167a710644fa1617ddd711c202a5cba4b6bccabf Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 27 Jun 2022 10:04:24 +0200 Subject: [PATCH 18/57] update --- src/pytorch_lightning/trainer/trainer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index af9612ccad9c9..5f9ba8a6e38e7 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -106,7 +106,6 @@ from pytorch_lightning.utilities.auto_restart import _add_capture_metadata_collate from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.data import _auto_add_worker_init_fn, has_len_all_ranks -from pytorch_lightning.utilities.device_parser import is_cuda_available from pytorch_lightning.utilities.distributed import distributed_available from 
pytorch_lightning.utilities.exceptions import ExitGracefullyException, MisconfigurationException from pytorch_lightning.utilities.imports import _fault_tolerant_training From 1bdd79de9e6b1b1e3b4e6b33bbc3fec8fd8b6066 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 27 Jun 2022 11:35:15 +0200 Subject: [PATCH 19/57] fix mocks in tests --- .../deprecated_api/test_remove_1-8.py | 25 ++++++++++--------- tests/tests_pytorch/models/test_gpu.py | 6 ++--- .../test_estimated_stepping_batches.py | 6 ++--- tests/tests_pytorch/trainer/test_trainer.py | 9 ++++--- .../tests_pytorch/trainer/test_trainer_cli.py | 6 ++--- 5 files changed, 27 insertions(+), 25 deletions(-) diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index 758367d1dd40b..350b059389e67 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -45,6 +45,7 @@ from pytorch_lightning.strategies import ParallelStrategy from pytorch_lightning.trainer.configuration_validator import _check_datamodule_checkpoint_hooks from pytorch_lightning.trainer.states import RunningStage +from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.apply_func import move_data_to_device from pytorch_lightning.utilities.enums import DeviceType, DistributedType from pytorch_lightning.utilities.imports import _TORCHTEXT_LEGACY @@ -903,8 +904,8 @@ def test_trainer_config_device_ids(): ], ) def test_root_gpu_property(monkeypatch, gpus, expected_root_gpu, strategy): - monkeypatch.setattr(torch.cuda, "is_available", lambda: True) - monkeypatch.setattr(torch.cuda, "device_count", lambda: 16) + monkeypatch.setattr(device_parser, "is_cuda_available", lambda: True) + monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: 16) with pytest.deprecated_call( match="`Trainer.root_gpu` is deprecated in v1.6 and will be removed in v1.8. 
" "Please use `Trainer.strategy.root_device.index` instead." @@ -921,7 +922,7 @@ def test_root_gpu_property(monkeypatch, gpus, expected_root_gpu, strategy): ], ) def test_root_gpu_property_0_passing(monkeypatch, gpus, expected_root_gpu, strategy): - monkeypatch.setattr(torch.cuda, "device_count", lambda: 0) + monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: 0) with pytest.deprecated_call( match="`Trainer.root_gpu` is deprecated in v1.6 and will be removed in v1.8. " "Please use `Trainer.strategy.root_device.index` instead." @@ -941,8 +942,8 @@ def test_root_gpu_property_0_passing(monkeypatch, gpus, expected_root_gpu, strat ], ) def test_trainer_gpu_parse(monkeypatch, gpus, expected_num_gpus, strategy): - monkeypatch.setattr(torch.cuda, "is_available", lambda: True) - monkeypatch.setattr(torch.cuda, "device_count", lambda: 16) + monkeypatch.setattr(device_parser, "is_cuda_available", lambda: True) + monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: 16) with pytest.deprecated_call( match="`Trainer.num_gpus` was deprecated in v1.6 and will be removed in v1.8." " Please use `Trainer.num_devices` instead." @@ -958,7 +959,7 @@ def test_trainer_gpu_parse(monkeypatch, gpus, expected_num_gpus, strategy): ], ) def test_trainer_num_gpu_0(monkeypatch, gpus, expected_num_gpus, strategy): - monkeypatch.setattr(torch.cuda, "device_count", lambda: 0) + monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: 0) with pytest.deprecated_call( match="`Trainer.num_gpus` was deprecated in v1.6 and will be removed in v1.8." " Please use `Trainer.num_devices` instead." 
@@ -1020,8 +1021,8 @@ def test_trainer_config_ipus(monkeypatch, trainer_kwargs, expected_ipus): ) def test_trainer_num_processes(monkeypatch, trainer_kwargs, expected_num_processes): if trainer_kwargs.get("accelerator") == "gpu": - monkeypatch.setattr(torch.cuda, "is_available", lambda: True) - monkeypatch.setattr(torch.cuda, "device_count", lambda: 16) + monkeypatch.setattr(device_parser, "is_cuda_available", lambda: True) + monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: 16) trainer = Trainer(**trainer_kwargs) with pytest.deprecated_call( match="`Trainer.num_processes` is deprecated in v1.6 and will be removed in v1.8. " @@ -1045,8 +1046,8 @@ def test_trainer_num_processes(monkeypatch, trainer_kwargs, expected_num_process def test_trainer_data_parallel_device_ids(monkeypatch, trainer_kwargs, expected_data_parallel_device_ids): """Test multi type argument with bool.""" if trainer_kwargs.get("accelerator") == "gpu": - monkeypatch.setattr(torch.cuda, "is_available", lambda: True) - monkeypatch.setattr(torch.cuda, "device_count", lambda: 2) + monkeypatch.setattr(device_parser, "is_cuda_available", lambda: True) + monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: 2) trainer = Trainer(**trainer_kwargs) with pytest.deprecated_call( @@ -1128,8 +1129,8 @@ def on_save_checkpoint(self, trainer, pl_module, checkpoint): ], ) def test_trainer_gpus(monkeypatch, trainer_kwargs): - monkeypatch.setattr(torch.cuda, "is_available", lambda: True) - monkeypatch.setattr(torch.cuda, "device_count", lambda: 4) + monkeypatch.setattr(device_parser, "is_cuda_available", lambda: True) + monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: 4) trainer = Trainer(**trainer_kwargs) with pytest.deprecated_call( match="`Trainer.gpus` was deprecated in v1.6 and will be removed in v1.8." 
diff --git a/tests/tests_pytorch/models/test_gpu.py b/tests/tests_pytorch/models/test_gpu.py index a524d7cd73837..04f2f8fbe53d2 100644 --- a/tests/tests_pytorch/models/test_gpu.py +++ b/tests/tests_pytorch/models/test_gpu.py @@ -83,8 +83,8 @@ def device_count(): def is_available(): return True - monkeypatch.setattr(torch.cuda, "is_available", is_available) - monkeypatch.setattr(torch.cuda, "device_count", device_count) + monkeypatch.setattr(device_parser, "is_cuda_available", is_available) + monkeypatch.setattr(device_parser, "num_cuda_devices", device_count) @pytest.fixture @@ -92,7 +92,7 @@ def mocked_device_count_0(monkeypatch): def device_count(): return 0 - monkeypatch.setattr(torch.cuda, "device_count", device_count) + monkeypatch.setattr(device_parser, "num_cuda_devices", device_count) # Asking for a gpu when non are available will result in a MisconfigurationException diff --git a/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py b/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py index ab19002d751a2..7d96ebbf2ef82 100644 --- a/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py +++ b/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py @@ -16,13 +16,13 @@ from unittest import mock import pytest -import torch from torch.utils.data import DataLoader from pytorch_lightning import Trainer from pytorch_lightning.callbacks.gradient_accumulation_scheduler import GradientAccumulationScheduler from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.strategies.ipu import IPUStrategy +from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests_pytorch.helpers.datasets import RandomIterableDataset from tests_pytorch.helpers.runif import RunIf @@ -129,8 +129,8 @@ def test_num_stepping_batches_accumulate_gradients(accumulate_grad_batches, expe ) def 
test_num_stepping_batches_gpu(trainer_kwargs, estimated_steps, monkeypatch): """Test stepping batches with GPU strategies.""" - monkeypatch.setattr(torch.cuda, "is_available", lambda: True) - monkeypatch.setattr(torch.cuda, "device_count", lambda: 7) + monkeypatch.setattr(device_parser, "is_cuda_available", lambda: True) + monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: 7) trainer = Trainer(max_epochs=1, devices=7, accelerator="gpu", **trainer_kwargs) model = BoringModel() trainer._data_connector.attach_data(model) diff --git a/tests/tests_pytorch/trainer/test_trainer.py b/tests/tests_pytorch/trainer/test_trainer.py index a4d0006828fdd..f7a8a5140a40d 100644 --- a/tests/tests_pytorch/trainer/test_trainer.py +++ b/tests/tests_pytorch/trainer/test_trainer.py @@ -54,6 +54,7 @@ SingleDeviceStrategy, ) from pytorch_lightning.trainer.states import RunningStage, TrainerFn +from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.exceptions import DeadlockDetectedException, MisconfigurationException from pytorch_lightning.utilities.imports import _OMEGACONF_AVAILABLE, _TORCH_GREATER_EQUAL_1_12 @@ -2103,8 +2104,8 @@ def training_step(self, batch, batch_idx): ) def test_trainer_config_strategy(monkeypatch, trainer_kwargs, strategy_cls, strategy_name, accelerator_cls, devices): if trainer_kwargs.get("accelerator") == "gpu": - monkeypatch.setattr(torch.cuda, "is_available", lambda: True) - monkeypatch.setattr(torch.cuda, "device_count", lambda: trainer_kwargs["devices"]) + monkeypatch.setattr(device_parser, "is_cuda_available", lambda: True) + monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: trainer_kwargs["devices"]) trainer = Trainer(**trainer_kwargs) @@ -2170,8 +2171,8 @@ def test_dataloaders_are_not_loaded_if_disabled_through_limit_batches(running_st ) def test_trainer_config_device_ids(monkeypatch, trainer_kwargs, expected_device_ids): if 
trainer_kwargs.get("accelerator") == "gpu": - monkeypatch.setattr(torch.cuda, "is_available", lambda: True) - monkeypatch.setattr(torch.cuda, "device_count", lambda: 4) + monkeypatch.setattr(device_parser, "is_cuda_available", lambda: True) + monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: 4) elif trainer_kwargs.get("accelerator") == "ipu": monkeypatch.setattr(pytorch_lightning.accelerators.ipu.IPUAccelerator, "is_available", lambda _: True) monkeypatch.setattr(pytorch_lightning.strategies.ipu, "_IPU_AVAILABLE", lambda: True) diff --git a/tests/tests_pytorch/trainer/test_trainer_cli.py b/tests/tests_pytorch/trainer/test_trainer_cli.py index 989a06f4193ed..492ad87707150 100644 --- a/tests/tests_pytorch/trainer/test_trainer_cli.py +++ b/tests/tests_pytorch/trainer/test_trainer_cli.py @@ -21,7 +21,7 @@ import tests_pytorch.helpers.utils as tutils from pytorch_lightning import Trainer -from pytorch_lightning.utilities import argparse +from pytorch_lightning.utilities import argparse, device_parser @mock.patch("argparse.ArgumentParser.parse_args") @@ -167,8 +167,8 @@ def test_argparse_args_parsing_fast_dev_run(cli_args, expected): def test_argparse_args_parsing_devices(cli_args, expected_parsed, monkeypatch): """Test multi type argument with bool.""" - monkeypatch.setattr(torch.cuda, "is_available", lambda: True) - monkeypatch.setattr(torch.cuda, "device_count", lambda: 1) + monkeypatch.setattr(device_parser, "is_cuda_available", lambda: True) + monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: 1) cli_args = cli_args.split(" ") if cli_args else [] with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): From 297b55a90accb84a4f785c92c06e3b6f1916d3f0 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 27 Jun 2022 11:35:36 +0200 Subject: [PATCH 20/57] refactor --- src/pytorch_lightning/strategies/launchers/xla_spawn.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/pytorch_lightning/strategies/launchers/xla_spawn.py 
b/src/pytorch_lightning/strategies/launchers/xla_spawn.py index b3e1bf3465203..5c860eff2501d 100644 --- a/src/pytorch_lightning/strategies/launchers/xla_spawn.py +++ b/src/pytorch_lightning/strategies/launchers/xla_spawn.py @@ -51,8 +51,7 @@ class _XLASpawnLauncher(_SpawnLauncher): """ def __init__(self, strategy: "Strategy") -> None: - super().__init__(strategy) - self._start_method = "fork" + super().__init__(strategy=strategy, start_method="fork") @property def is_interactive_compatible(self) -> bool: From 6c5b769452daf3b199feec9cb040e5c2bef97f15 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 27 Jun 2022 12:10:59 +0200 Subject: [PATCH 21/57] fix test --- .../strategies/test_strategy_registry.py | 38 ++++++++++++++----- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/tests/tests_pytorch/strategies/test_strategy_registry.py b/tests/tests_pytorch/strategies/test_strategy_registry.py index 5f9e6208c4fa5..57dfd278a6997 100644 --- a/tests/tests_pytorch/strategies/test_strategy_registry.py +++ b/tests/tests_pytorch/strategies/test_strategy_registry.py @@ -105,22 +105,40 @@ def test_fsdp_strategy_registry(tmpdir): @pytest.mark.parametrize( - "strategy_name, strategy", + "strategy_name, strategy, expected_init_params", [ - ("ddp_find_unused_parameters_false", DDPStrategy), - ("ddp_spawn_find_unused_parameters_false", DDPSpawnStrategy), - ("ddp_sharded_spawn_find_unused_parameters_false", DDPSpawnShardedStrategy), - ("ddp_sharded_find_unused_parameters_false", DDPShardedStrategy), + ( + "ddp_find_unused_parameters_false", + DDPStrategy, + {"find_unused_parameters": False}, + ), + ( + "ddp_spawn_find_unused_parameters_false", + DDPSpawnStrategy, + {"find_unused_parameters": False, "start_method": "spawn"}, + ), + ( + "ddp_fork_find_unused_parameters_false", + DDPSpawnStrategy, + {"find_unused_parameters": False, "start_method ": "fork"}, + ), + ( + "ddp_sharded_spawn_find_unused_parameters_false", + DDPSpawnShardedStrategy, + {"find_unused_parameters": 
False}, + ), + ( + "ddp_sharded_find_unused_parameters_false", + DDPShardedStrategy, + {"find_unused_parameters": False}, + ), ], ) -def test_ddp_find_unused_parameters_strategy_registry(tmpdir, strategy_name, strategy): - +def test_ddp_find_unused_parameters_strategy_registry(tmpdir, strategy_name, strategy, expected_init_params): trainer = Trainer(default_root_dir=tmpdir, strategy=strategy_name) - assert isinstance(trainer.strategy, strategy) - assert strategy_name in StrategyRegistry - assert StrategyRegistry[strategy_name]["init_params"] == {"find_unused_parameters": False} + assert StrategyRegistry[strategy_name]["init_params"] == expected_init_params assert StrategyRegistry[strategy_name]["strategy"] == strategy From 2671810c7fcc6792f91cf3cd29acea41dee92bc9 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 27 Jun 2022 12:15:41 +0200 Subject: [PATCH 22/57] update lite and enums --- src/pytorch_lightning/lite/lite.py | 1 + src/pytorch_lightning/utilities/enums.py | 2 ++ tests/tests_pytorch/lite/test_lite.py | 2 ++ tests/tests_pytorch/trainer/test_trainer_cli.py | 1 - 4 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/pytorch_lightning/lite/lite.py b/src/pytorch_lightning/lite/lite.py index 4dfcde177f953..1009f5f6c7994 100644 --- a/src/pytorch_lightning/lite/lite.py +++ b/src/pytorch_lightning/lite/lite.py @@ -475,6 +475,7 @@ def _supported_strategy_types() -> Sequence[_StrategyType]: _StrategyType.DP, _StrategyType.DDP, _StrategyType.DDP_SPAWN, + _StrategyType.DDP_FORK, _StrategyType.DEEPSPEED, _StrategyType.DDP_SHARDED, _StrategyType.DDP_SHARDED_SPAWN, diff --git a/src/pytorch_lightning/utilities/enums.py b/src/pytorch_lightning/utilities/enums.py index b7f714d230971..91f8466b77500 100644 --- a/src/pytorch_lightning/utilities/enums.py +++ b/src/pytorch_lightning/utilities/enums.py @@ -214,6 +214,7 @@ class _StrategyType(LightningEnum): DDP = "ddp" DDP2 = "ddp2" DDP_SPAWN = "ddp_spawn" + DDP_FORK = "ddp_fork" TPU_SPAWN = "tpu_spawn" DEEPSPEED = 
"deepspeed" HOROVOD = "horovod" @@ -229,6 +230,7 @@ def interactive_compatible_types() -> list[_StrategyType]: return [ _StrategyType.DP, _StrategyType.TPU_SPAWN, + _StrategyType.DDP_FORK, ] def is_interactive_compatible(self) -> bool: diff --git a/tests/tests_pytorch/lite/test_lite.py b/tests/tests_pytorch/lite/test_lite.py index 7166be0981846..76596d0ee072d 100644 --- a/tests/tests_pytorch/lite/test_lite.py +++ b/tests/tests_pytorch/lite/test_lite.py @@ -267,6 +267,7 @@ def test_seed_everything(): _StrategyType.DP, _StrategyType.DDP, _StrategyType.DDP_SPAWN, + _StrategyType.DDP_FORK, pytest.param(_StrategyType.DEEPSPEED, marks=RunIf(deepspeed=True)), pytest.param(_StrategyType.DDP_SHARDED, marks=RunIf(fairscale=True)), pytest.param(_StrategyType.DDP_SHARDED_SPAWN, marks=RunIf(fairscale=True)), @@ -295,6 +296,7 @@ def test_setup_dataloaders_replace_custom_sampler(strategy): _StrategyType.DP, _StrategyType.DDP, _StrategyType.DDP_SPAWN, + _StrategyType.DDP_FORK, pytest.param(_StrategyType.DEEPSPEED, marks=RunIf(deepspeed=True)), pytest.param(_StrategyType.DDP_SHARDED, marks=RunIf(fairscale=True)), pytest.param(_StrategyType.DDP_SHARDED_SPAWN, marks=RunIf(fairscale=True)), diff --git a/tests/tests_pytorch/trainer/test_trainer_cli.py b/tests/tests_pytorch/trainer/test_trainer_cli.py index 492ad87707150..468650e234f81 100644 --- a/tests/tests_pytorch/trainer/test_trainer_cli.py +++ b/tests/tests_pytorch/trainer/test_trainer_cli.py @@ -17,7 +17,6 @@ from unittest import mock import pytest -import torch import tests_pytorch.helpers.utils as tutils from pytorch_lightning import Trainer From ff2a825c609c83586d43d9de74c2c10f7e3a2ee9 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 27 Jun 2022 12:16:01 +0200 Subject: [PATCH 23/57] typo --- src/pytorch_lightning/strategies/launchers/spawn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/strategies/launchers/spawn.py b/src/pytorch_lightning/strategies/launchers/spawn.py index 
cfad40471d645..147f5f2170484 100644 --- a/src/pytorch_lightning/strategies/launchers/spawn.py +++ b/src/pytorch_lightning/strategies/launchers/spawn.py @@ -52,7 +52,7 @@ def __init__(self, strategy: Strategy, start_method: str = "spawn") -> None: @property def is_interactive_compatible(self) -> bool: - # The start method 'spawn' is not supporrted in interactive environments + # The start method 'spawn' is not supported in interactive environments # The start method 'fork' is the only one supported in Jupyter environments, with constraints around CUDA # initialization. For more context, see https://github.com/Lightning-AI/lightning/issues/7550 return self._start_method == "fork" From 0879751b67b6a19ac7894ee8acfbf3589095ba2f Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 27 Jun 2022 12:29:39 +0200 Subject: [PATCH 24/57] update docs --- src/pytorch_lightning/strategies/launchers/spawn.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/pytorch_lightning/strategies/launchers/spawn.py b/src/pytorch_lightning/strategies/launchers/spawn.py index 147f5f2170484..85fa9bdf29bf8 100644 --- a/src/pytorch_lightning/strategies/launchers/spawn.py +++ b/src/pytorch_lightning/strategies/launchers/spawn.py @@ -41,9 +41,15 @@ class _SpawnLauncher(_Launcher): Note: - This launcher requires all objects to be pickleable. - It is important that the entry point to the program/script is guarded by ``if __name__ == "__main__"``. + - With start method 'fork' the user must ensure that no CUDA context gets created in the main process before + the launcher is invoked. E.g., one should avoid creating cuda tensors or calling ``torch.cuda.*`` functions + before calling ``Trainer.fit``. Args: strategy: A reference to the strategy that is used together with this launcher. + start_method: The method how to start the processes. + - 'spawn': The default start method. Requires all objects to be pickleable. 
+ - 'fork': Preferrable for IPython/Jupyter environments where 'spawn' is not available. """ def __init__(self, strategy: Strategy, start_method: str = "spawn") -> None: From fe16575715635148634a6b017e1c7a0fe2e8f46d Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 27 Jun 2022 18:22:25 +0200 Subject: [PATCH 25/57] add validation for forking on platforms --- .../strategies/launchers/spawn.py | 5 ++- .../connectors/accelerator_connector.py | 6 ++- .../test_accelerator_connector.py | 7 ++++ .../strategies/launchers/__init__.py | 0 .../strategies/launchers/test_spawn.py | 40 +++++++++++++++++++ 5 files changed, 56 insertions(+), 2 deletions(-) create mode 100644 tests/tests_pytorch/strategies/launchers/__init__.py create mode 100644 tests/tests_pytorch/strategies/launchers/test_spawn.py diff --git a/src/pytorch_lightning/strategies/launchers/spawn.py b/src/pytorch_lightning/strategies/launchers/spawn.py index 85fa9bdf29bf8..faf51e1a526d4 100644 --- a/src/pytorch_lightning/strategies/launchers/spawn.py +++ b/src/pytorch_lightning/strategies/launchers/spawn.py @@ -49,12 +49,15 @@ class _SpawnLauncher(_Launcher): strategy: A reference to the strategy that is used together with this launcher. start_method: The method how to start the processes. - 'spawn': The default start method. Requires all objects to be pickleable. - - 'fork': Preferrable for IPython/Jupyter environments where 'spawn' is not available. + - 'fork': Preferrable for IPython/Jupyter environments where 'spawn' is not available. Not available on + the Windows platform. """ def __init__(self, strategy: Strategy, start_method: str = "spawn") -> None: self._strategy = strategy self._start_method = start_method + if start_method == "fork" and not hasattr(os, "fork"): + raise ValueError("The start method 'fork' is not available on this platform. 
Choose 'spawn' instead.") @property def is_interactive_compatible(self) -> bool: diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index 62eebab400a67..4a71311416c36 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -610,7 +610,11 @@ def _check_strategy_and_fallback(self) -> None: f"You selected strategy to be `{DDPFullyShardedNativeStrategy.strategy_name}`, " "but GPU accelerator is not used." ) - + if strategy_flag in ("ddp_fork", "ddp_fork_find_unused_parameters_false") and not hasattr(os, "fork"): + raise ValueError( + f"You selected `Trainer(strategy='{strategy_flag}')` but process forking is not supported on this" + f" platform. We recommed to choose `Trainer(strategy='ddp_spawn')` instead." + ) if strategy_flag: self._strategy_flag = strategy_flag diff --git a/tests/tests_pytorch/accelerators/test_accelerator_connector.py b/tests/tests_pytorch/accelerators/test_accelerator_connector.py index b1ec1bc65cf94..120e4a6151f03 100644 --- a/tests/tests_pytorch/accelerators/test_accelerator_connector.py +++ b/tests/tests_pytorch/accelerators/test_accelerator_connector.py @@ -811,3 +811,10 @@ def test_plugin_only_one_instance_for_one_type(plugins, expected): def test_passing_zero_and_empty_list_to_devices_flag(accelerator, devices): with pytest.raises(MisconfigurationException, match="value is not a valid input using"): Trainer(accelerator=accelerator, devices=devices) + + +@pytest.mark.parametrize("strategy", ["ddp_fork", "ddp_fork_find_unused_parameters_false"]) +def test_ddp_fork_on_unsupported_platform(strategy, monkeypatch): + monkeypatch.delattr(os, "fork") + with pytest.raises(ValueError, match="process forking is not supported on this platform"): + Trainer(strategy=strategy) diff --git a/tests/tests_pytorch/strategies/launchers/__init__.py 
b/tests/tests_pytorch/strategies/launchers/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/tests_pytorch/strategies/launchers/test_spawn.py b/tests/tests_pytorch/strategies/launchers/test_spawn.py new file mode 100644 index 0000000000000..f7c90f8c1689a --- /dev/null +++ b/tests/tests_pytorch/strategies/launchers/test_spawn.py @@ -0,0 +1,40 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from unittest import mock +from unittest.mock import ANY, Mock + +import pytest + +import pytorch_lightning +from pytorch_lightning.strategies.launchers.spawn import _SpawnLauncher + + +def test_spawn_launcher_forking_on_unsupported_platform(monkeypatch): + monkeypatch.delattr(pytorch_lightning.strategies.launchers.spawn.os, "fork") + with pytest.raises(ValueError, match="The start method 'fork' is not available on this platform"): + _SpawnLauncher(strategy=Mock(), start_method="fork") + + +@pytest.mark.parametrize("start_method", ["spawn", "fork"]) +@mock.patch("pytorch_lightning.strategies.launchers.spawn.mp") +def test_spawn_launcher_start_method(mp_mock, start_method): + launcher = _SpawnLauncher(strategy=Mock(), start_method=start_method) + launcher.launch(function=Mock()) + mp_mock.get_context.assert_called_with(start_method) + mp_mock.start_processes.assert_called_with( + ANY, + args=ANY, + nprocs=ANY, + start_method=start_method, + ) From 582872cdf8994bc8ba0d42e03778f950c2eabe64 Mon Sep 17 
00:00:00 2001 From: awaelchli Date: Mon, 27 Jun 2022 19:01:04 +0200 Subject: [PATCH 26/57] debug no breaking change for devices=1 --- src/pytorch_lightning/utilities/device_parser.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/pytorch_lightning/utilities/device_parser.py b/src/pytorch_lightning/utilities/device_parser.py index 478dd63c87159..0cf3004b7c62a 100644 --- a/src/pytorch_lightning/utilities/device_parser.py +++ b/src/pytorch_lightning/utilities/device_parser.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import multiprocessing +import os from typing import Any, List, MutableSequence, Optional, Tuple, Union import torch @@ -339,7 +340,9 @@ def num_cuda_devices() -> int: Unlike :func:`torch.cuda.device_count`, calling this function will not create a new CUDA context. """ - with multiprocessing.Pool(1) as pool: + if not hasattr(os, "fork"): + return torch.cuda.device_count() + with multiprocessing.get_context("fork").Pool(1) as pool: return pool.apply(torch.cuda.device_count) @@ -348,5 +351,7 @@ def is_cuda_available() -> bool: Unlike :func:`torch.cuda.is_available`, calling this function will not create a new CUDA context. 
""" - with multiprocessing.Pool(1) as pool: + if not hasattr(os, "fork"): + return torch.cuda.is_available() + with multiprocessing.get_context("fork").Pool(1) as pool: return pool.apply(torch.cuda.is_available) From da70271a55ae4c6b5f0ab287a2ceb5e64d2b14f6 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 27 Jun 2022 19:20:33 +0200 Subject: [PATCH 27/57] fix typo in test --- tests/tests_pytorch/strategies/test_strategy_registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tests_pytorch/strategies/test_strategy_registry.py b/tests/tests_pytorch/strategies/test_strategy_registry.py index 57dfd278a6997..4cc45b058678c 100644 --- a/tests/tests_pytorch/strategies/test_strategy_registry.py +++ b/tests/tests_pytorch/strategies/test_strategy_registry.py @@ -120,7 +120,7 @@ def test_fsdp_strategy_registry(tmpdir): ( "ddp_fork_find_unused_parameters_false", DDPSpawnStrategy, - {"find_unused_parameters": False, "start_method ": "fork"}, + {"find_unused_parameters": False, "start_method": "fork"}, ), ( "ddp_sharded_spawn_find_unused_parameters_false", From 3f9a872c9732cd7ef2bdd5d89107bd6dfe31bbcb Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 27 Jun 2022 19:20:40 +0200 Subject: [PATCH 28/57] update docstring --- src/pytorch_lightning/utilities/device_parser.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/pytorch_lightning/utilities/device_parser.py b/src/pytorch_lightning/utilities/device_parser.py index 0cf3004b7c62a..da36dbda83c80 100644 --- a/src/pytorch_lightning/utilities/device_parser.py +++ b/src/pytorch_lightning/utilities/device_parser.py @@ -338,7 +338,8 @@ def parse_hpus(devices: Optional[Union[int, str, List[int]]]) -> Optional[int]: def num_cuda_devices() -> int: """Returns the number of GPUs available. - Unlike :func:`torch.cuda.device_count`, calling this function will not create a new CUDA context. 
+ Unlike :func:`torch.cuda.device_count`, this function will do its best not to create a CUDA context to for fork + support, if the platform allows it. """ if not hasattr(os, "fork"): return torch.cuda.device_count() @@ -349,7 +350,8 @@ def num_cuda_devices() -> int: def is_cuda_available() -> bool: """Returns a bool indicating if CUDA is currently available. - Unlike :func:`torch.cuda.is_available`, calling this function will not create a new CUDA context. + Unlike :func:`torch.cuda.is_available`, this function will do its best not to create a CUDA context to for fork + support, if the platform allows it. """ if not hasattr(os, "fork"): return torch.cuda.is_available() From 7291fa351aceb2513045d1c8fc3f8df08d1c7613 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 27 Jun 2022 20:04:40 +0200 Subject: [PATCH 29/57] added windows test for device parser --- .../utilities/test_device_parser.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 tests/tests_pytorch/utilities/test_device_parser.py diff --git a/tests/tests_pytorch/utilities/test_device_parser.py b/tests/tests_pytorch/utilities/test_device_parser.py new file mode 100644 index 0000000000000..59750cf94f90b --- /dev/null +++ b/tests/tests_pytorch/utilities/test_device_parser.py @@ -0,0 +1,31 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import sys +from unittest import mock + +import pytest + +from pytorch_lightning.utilities import device_parser + + +@pytest.mark.skipif(sys.platform != "win32", reason="Requires Windows without forking support") +@mock.patch("torch.cuda.is_available", return_value=True) +@mock.patch("torch.cuda.device_count", return_value=2) +def test_num_cuda_devices_without_forking(*_): + """This merely tests that on platforms without fork support our helper functions fall back to the default + implementation for determining cuda availability.""" + assert not hasattr(os, "fork") + assert device_parser.is_cuda_available() + assert device_parser.num_cuda_devices() == 2 From 785c830ef28edf108b909eab6e263008544685b3 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 27 Jun 2022 20:22:08 +0200 Subject: [PATCH 30/57] add changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 314fa55a61f98..4db6ba4b96c18 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -84,6 +84,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added Apple Silicon Support via `MPSAccelerator` ([#13123](https://github.com/PyTorchLightning/pytorch-lightning/pull/13123)) +- Added support for DDP Fork ([#13405](https://github.com/PyTorchLightning/pytorch-lightning/pull/13405)) + + ### Changed - Enable validation during overfitting ([#12527](https://github.com/PyTorchLightning/pytorch-lightning/pull/12527)) From dd043adb5120b3fb9c7d9e136fe1e87af34f0bb9 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 27 Jun 2022 20:24:56 +0200 Subject: [PATCH 31/57] add test --- .../accelerators/test_accelerator_connector.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tests/tests_pytorch/accelerators/test_accelerator_connector.py b/tests/tests_pytorch/accelerators/test_accelerator_connector.py index 120e4a6151f03..a7b36e0c85af9 100644 --- a/tests/tests_pytorch/accelerators/test_accelerator_connector.py +++ b/tests/tests_pytorch/accelerators/test_accelerator_connector.py @@ -423,11 +423,21 @@ def test_amp_level_raises_error_with_native(): _ = Trainer(amp_level="O2", amp_backend="native", precision=16) -def test_strategy_choice_ddp_spawn_cpu(tmpdir): - trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", accelerator="cpu", devices=2) +def test_strategy_choice_ddp_spawn_cpu(): + trainer = Trainer(strategy="ddp_spawn", accelerator="cpu", devices=2) + assert isinstance(trainer.accelerator, CPUAccelerator) + assert isinstance(trainer.strategy, DDPSpawnStrategy) + assert isinstance(trainer.strategy.cluster_environment, LightningEnvironment) + assert trainer.strategy.launcher._start_method == "spawn" + + +@RunIf(skip_windows=True) +def test_strategy_choice_ddp_fork_cpu(): + trainer = Trainer(strategy="ddp_fork", accelerator="cpu", devices=2) assert isinstance(trainer.accelerator, CPUAccelerator) assert isinstance(trainer.strategy, DDPSpawnStrategy) assert isinstance(trainer.strategy.cluster_environment, LightningEnvironment) + assert trainer.strategy.launcher._start_method == "fork" 
@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) From da843ee8a1d374174d0785f573175c3c0ac48dac Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 27 Jun 2022 20:29:01 +0200 Subject: [PATCH 32/57] add tests --- .../accelerators/test_accelerator_connector.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/tests_pytorch/accelerators/test_accelerator_connector.py b/tests/tests_pytorch/accelerators/test_accelerator_connector.py index a7b36e0c85af9..bddcd2b37e27d 100644 --- a/tests/tests_pytorch/accelerators/test_accelerator_connector.py +++ b/tests/tests_pytorch/accelerators/test_accelerator_connector.py @@ -230,14 +230,21 @@ def test_ipython_incompatible_backend_error(_, monkeypatch): def test_ipython_compatible_dp_strategy_gpu(_, monkeypatch): monkeypatch.setattr(pytorch_lightning.utilities, "_IS_INTERACTIVE", True) trainer = Trainer(strategy="dp", accelerator="gpu") - assert trainer.strategy.launcher is None or trainer.strategy.launcher.is_interactive_compatible + assert trainer.strategy.launcher is None @mock.patch("pytorch_lightning.accelerators.tpu.TPUAccelerator.is_available", return_value=True) -def test_ipython_compatible_strategy_tpu(mock_tpu_acc_avail, monkeypatch): +def test_ipython_compatible_strategy_tpu(_, monkeypatch): monkeypatch.setattr(pytorch_lightning.utilities, "_IS_INTERACTIVE", True) trainer = Trainer(accelerator="tpu") - assert trainer.strategy.launcher is None or trainer.strategy.launcher.is_interactive_compatible + assert trainer.strategy.launcher.is_interactive_compatible + + +@RunIf(skip_windows=True) +def test_ipython_compatible_strategy_ddp_fork(monkeypatch): + monkeypatch.setattr(pytorch_lightning.utilities, "_IS_INTERACTIVE", True) + trainer = Trainer(strategy="ddp_fork", accelerator="cpu") + assert trainer.strategy.launcher.is_interactive_compatible @pytest.mark.parametrize( From 1a6366257fa9cd4b6079f3fb4459c193dcee1aa0 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 27 Jun 2022 
20:34:29 +0200 Subject: [PATCH 33/57] update error message --- .../trainer/connectors/accelerator_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index 4a71311416c36..983331b056549 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -267,7 +267,7 @@ def _check_config_and_set_final_flags( if strategy == "ddp_cpu": raise MisconfigurationException( "`Trainer(strategy='ddp_cpu')` is not a valid strategy," - " you can use `Trainer(strategy='ddp'|'ddp_spawn', accelerator='cpu')` instead." + " you can use `Trainer(strategy='ddp'|'ddp_spawn'|'ddp_fork', accelerator='cpu')` instead." ) if strategy == "tpu_spawn": raise MisconfigurationException( From 093a52e249630bcb614ae088c64fe929306d34cd Mon Sep 17 00:00:00 2001 From: awaelchli Date: Tue, 28 Jun 2022 02:53:51 +0200 Subject: [PATCH 34/57] Comparison section --- .../accelerators/gpu_intermediate.rst | 53 ++++++++++++++++++- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/docs/source-pytorch/accelerators/gpu_intermediate.rst b/docs/source-pytorch/accelerators/gpu_intermediate.rst index c4d9ad8817621..f024a73096961 100644 --- a/docs/source-pytorch/accelerators/gpu_intermediate.rst +++ b/docs/source-pytorch/accelerators/gpu_intermediate.rst @@ -21,8 +21,10 @@ Lightning supports multiple ways of doing distributed training. | - Data Parallel (``strategy='dp'``) (multiple-gpus, 1 machine) -- DistributedDataParallel (``strategy='ddp'``) (multiple-gpus across many machines (python script based)). -- DistributedDataParallel (``strategy='ddp_spawn'``) (multiple-gpus across many machines (spawn based)). 
+- DistributedDataParallel (multiple-gpus across many machines) + - Regular (``strategy='ddp'``) + - Spawn (``strategy='ddp_spawn'``) + - Fork (``strategy='ddp_fork'``) - DistributedDataParallel 2 (``strategy='ddp2'``) (DP in a machine, DDP across machines). - Horovod (``strategy='horovod'``) (multi-machine, multi-gpu, configured at runtime) - Bagua (``strategy='bagua'``) (multiple-gpus across many machines with advanced training algorithms) @@ -194,6 +196,53 @@ You can then call your scripts anywhere python some_file.py --accelerator 'gpu' --devices 8 --strategy 'ddp' +Distributed Data Parallel Fork +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +TODO + + +Comparison of DDP variants and tradeoffs +**************************************** + +.. list-table:: DDP variants and their tradeoffs + :widths: 40 20 20 20 + :header-rows: 1 + + * - + - DDP + - DDP Spawn + - DDP Fork + * - Works in Jupyter notebooks / IPython environments + - No + - Yes + - No + * - Supports multi-node + - Yes + - Yes + - Yes + * - Supported platforms + - Linux, Mac, Win + - Linux, Mac, Win + - Linux, Mac + * - Requires all objects to be picklable + - No + - Yes + - No + * - Is the guard ``if __name__ == "__main__"`` required? + - Yes + - Yes + - No + * - Limitations in the main process + - None + - None + - GPU operations such as moving tensors to the GPU or calling ``torch.cuda`` functions before invoking ``Trainer.fit`` are not allowed. 
+ * - Process creation time + - Slow + - Slow + - Fast + + Horovod ^^^^^^^ `Horovod `_ allows the same training script to be used for single-GPU, From 7b3c1321d7ae07620dbee8bb67d77551d928289d Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 29 Jun 2022 15:53:02 +0200 Subject: [PATCH 35/57] fork docs --- docs/source-pytorch/accelerators/gpu_intermediate.rst | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/source-pytorch/accelerators/gpu_intermediate.rst b/docs/source-pytorch/accelerators/gpu_intermediate.rst index f024a73096961..257c22594a7e1 100644 --- a/docs/source-pytorch/accelerators/gpu_intermediate.rst +++ b/docs/source-pytorch/accelerators/gpu_intermediate.rst @@ -199,7 +199,15 @@ You can then call your scripts anywhere Distributed Data Parallel Fork ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -TODO +DDP Fork is an alternative to Spawn that can be used in interactive Python and Jupyter notebooks, Google Colab, Kaggle and so on: + +.. code-block:: python + + # train on 8 GPUs in a Jupyter notebook + trainer = Trainer(accelerator="gpu", devices=8, strategy="ddp_fork") + +Data Parallel (``strategy="dp"``) is the only other strategy supported in interactive environments but is slower, is discouraged by PyTorch and has other limitations. +Among the native distributed strategies, regular DDP (``strategy="ddp"``) is still recommended as the go-to strategy over Spawn and Fork for its speed and stability. 
Comparison of DDP variants and tradeoffs From 1b9595426c91df6180922f1c01e9dfffea365eba Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 29 Jun 2022 17:40:29 +0200 Subject: [PATCH 36/57] typing --- src/pytorch_lightning/strategies/ddp_spawn.py | 4 ++-- src/pytorch_lightning/strategies/launchers/spawn.py | 4 ++-- src/pytorch_lightning/strategies/tpu_spawn.py | 3 +-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/pytorch_lightning/strategies/ddp_spawn.py b/src/pytorch_lightning/strategies/ddp_spawn.py index 9b3ca7d03a0f3..fd5b5288950db 100644 --- a/src/pytorch_lightning/strategies/ddp_spawn.py +++ b/src/pytorch_lightning/strategies/ddp_spawn.py @@ -13,7 +13,7 @@ # limitations under the License. import logging import os -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union, Literal import torch import torch.distributed @@ -68,7 +68,7 @@ def __init__( ddp_comm_hook: Optional[callable] = None, ddp_comm_wrapper: Optional[callable] = None, process_group_backend: Optional[str] = None, - start_method: str = "spawn", + start_method: Literal["spawn", "fork"] = "spawn", **kwargs: Any, ): super().__init__( diff --git a/src/pytorch_lightning/strategies/launchers/spawn.py b/src/pytorch_lightning/strategies/launchers/spawn.py index faf51e1a526d4..182a59ce2172b 100644 --- a/src/pytorch_lightning/strategies/launchers/spawn.py +++ b/src/pytorch_lightning/strategies/launchers/spawn.py @@ -14,7 +14,7 @@ import os from collections import UserList from multiprocessing.queues import SimpleQueue -from typing import Any, Callable, NamedTuple, Optional +from typing import Any, Callable, NamedTuple, Optional, Literal import numpy as np import torch @@ -53,7 +53,7 @@ class _SpawnLauncher(_Launcher): the Windows platform. 
""" - def __init__(self, strategy: Strategy, start_method: str = "spawn") -> None: + def __init__(self, strategy: Strategy, start_method: Literal["spawn", "fork"] = "spawn") -> None: self._strategy = strategy self._start_method = start_method if start_method == "fork" and not hasattr(os, "fork"): diff --git a/src/pytorch_lightning/strategies/tpu_spawn.py b/src/pytorch_lightning/strategies/tpu_spawn.py index 464eb6b57d4de..c6d948e64d104 100644 --- a/src/pytorch_lightning/strategies/tpu_spawn.py +++ b/src/pytorch_lightning/strategies/tpu_spawn.py @@ -71,11 +71,11 @@ def __init__( cluster_environment=XLAEnvironment(), checkpoint_io=checkpoint_io, precision_plugin=precision_plugin, + start_method="fork", ) self.debug = debug self.tpu_local_core_rank = 0 self.tpu_global_core_rank = 0 - self.start_method = "fork" @property def root_device(self) -> torch.device: @@ -116,7 +116,6 @@ def _configure_launcher(self): self._launcher = _XLASpawnLauncher(self) def setup(self, trainer: "pl.Trainer") -> None: - self.start_method = "fork" self.accelerator.setup(trainer) if self.debug: From 8df3457cfed6fe86a99d93c0c13d2c2c0ba8bcc3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 30 Jun 2022 07:11:57 +0000 Subject: [PATCH 37/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/pytorch_lightning/strategies/ddp_spawn.py | 2 +- src/pytorch_lightning/strategies/launchers/spawn.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pytorch_lightning/strategies/ddp_spawn.py b/src/pytorch_lightning/strategies/ddp_spawn.py index 7082639315ba9..4bcb2ae4b61d4 100644 --- a/src/pytorch_lightning/strategies/ddp_spawn.py +++ b/src/pytorch_lightning/strategies/ddp_spawn.py @@ -14,7 +14,7 @@ import logging import os from datetime import timedelta -from typing import Any, Dict, List, Optional, Union, Literal +from typing import Any, Dict, List, Literal, 
Optional, Union import torch import torch.distributed diff --git a/src/pytorch_lightning/strategies/launchers/spawn.py b/src/pytorch_lightning/strategies/launchers/spawn.py index 182a59ce2172b..20af57f6d093a 100644 --- a/src/pytorch_lightning/strategies/launchers/spawn.py +++ b/src/pytorch_lightning/strategies/launchers/spawn.py @@ -14,7 +14,7 @@ import os from collections import UserList from multiprocessing.queues import SimpleQueue -from typing import Any, Callable, NamedTuple, Optional, Literal +from typing import Any, Callable, Literal, NamedTuple, Optional import numpy as np import torch From d5c28b9831a8c0819d38431450e259e18b8470fc Mon Sep 17 00:00:00 2001 From: awaelchli Date: Fri, 1 Jul 2022 06:12:17 +0200 Subject: [PATCH 38/57] fix tests --- src/pytorch_lightning/strategies/ddp_spawn.py | 3 ++- src/pytorch_lightning/strategies/launchers/spawn.py | 3 ++- .../accelerators/test_accelerator_connector.py | 8 +++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/pytorch_lightning/strategies/ddp_spawn.py b/src/pytorch_lightning/strategies/ddp_spawn.py index 4bcb2ae4b61d4..8fe0068f545a1 100644 --- a/src/pytorch_lightning/strategies/ddp_spawn.py +++ b/src/pytorch_lightning/strategies/ddp_spawn.py @@ -14,7 +14,7 @@ import logging import os from datetime import timedelta -from typing import Any, Dict, List, Literal, Optional, Union +from typing import Any, Dict, List, Optional, Union import torch import torch.distributed @@ -22,6 +22,7 @@ from torch.distributed.constants import default_pg_timeout from torch.nn import Module from torch.nn.parallel.distributed import DistributedDataParallel +from typing_extensions import Literal import pytorch_lightning as pl from pytorch_lightning.overrides import LightningDistributedModule diff --git a/src/pytorch_lightning/strategies/launchers/spawn.py b/src/pytorch_lightning/strategies/launchers/spawn.py index 20af57f6d093a..4f0d0de6cbae0 100644 --- a/src/pytorch_lightning/strategies/launchers/spawn.py +++ 
b/src/pytorch_lightning/strategies/launchers/spawn.py @@ -14,12 +14,13 @@ import os from collections import UserList from multiprocessing.queues import SimpleQueue -from typing import Any, Callable, Literal, NamedTuple, Optional +from typing import Any, Callable, NamedTuple, Optional import numpy as np import torch import torch.multiprocessing as mp from torch import Tensor +from typing_extensions import Literal import pytorch_lightning as pl from pytorch_lightning.strategies.launchers.base import _Launcher diff --git a/tests/tests_pytorch/accelerators/test_accelerator_connector.py b/tests/tests_pytorch/accelerators/test_accelerator_connector.py index f4b9da4460617..da149704cb8a4 100644 --- a/tests/tests_pytorch/accelerators/test_accelerator_connector.py +++ b/tests/tests_pytorch/accelerators/test_accelerator_connector.py @@ -259,7 +259,7 @@ def test_accelerator_choice_multi_node_gpu( assert isinstance(trainer.strategy, strategy_class) -@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=False) +@mock.patch("pytorch_lightning.accelerators.gpu.device_parser.num_cuda_devices", return_value=0) def test_accelerator_cpu(_): trainer = Trainer(accelerator="cpu") assert isinstance(trainer.accelerator, CPUAccelerator) @@ -648,13 +648,11 @@ def test_unsupported_ipu_choice(mock_ipu_acc_avail, monkeypatch): Trainer(accelerator="ipu", precision=64) -@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=False) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=0) @mock.patch("pytorch_lightning.utilities.imports._TPU_AVAILABLE", return_value=False) @mock.patch("pytorch_lightning.utilities.imports._IPU_AVAILABLE", return_value=False) @mock.patch("pytorch_lightning.utilities.imports._HPU_AVAILABLE", return_value=False) -def test_devices_auto_choice_cpu( - is_ipu_available_mock, is_tpu_available_mock, is_gpu_available_mock, is_hpu_available_mock -): +def 
test_devices_auto_choice_cpu(*_): trainer = Trainer(accelerator="auto", devices="auto") assert trainer.num_devices == 1 From a6a0d094a61cf7a6c334d43c265c9166ad41069a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 3 Jul 2022 05:01:39 -0400 Subject: [PATCH 39/57] Update docs/source-pytorch/accelerators/gpu_intermediate.rst Co-authored-by: Rohit Gupta --- docs/source-pytorch/accelerators/gpu_intermediate.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source-pytorch/accelerators/gpu_intermediate.rst b/docs/source-pytorch/accelerators/gpu_intermediate.rst index 58bf5718b06e0..6f1cc192b1bee 100644 --- a/docs/source-pytorch/accelerators/gpu_intermediate.rst +++ b/docs/source-pytorch/accelerators/gpu_intermediate.rst @@ -204,7 +204,7 @@ You can then call your scripts anywhere Distributed Data Parallel Fork ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -DDP Fork is an alternative to Spawn that can be used in interactive Python and Jupyter notebooks, Google Colab, Kaggle and so on: +DDP Fork is an alternative to Spawn that can be used in interactive Python and Jupyter notebooks, Google Colab, Kaggle notebooks, and so on: .. 
code-block:: python From 4ede4cb93a8b2f9f0ff56bf914b9991ba5f6c133 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 3 Jul 2022 05:01:51 -0400 Subject: [PATCH 40/57] Update docs/source-pytorch/accelerators/gpu_intermediate.rst Co-authored-by: Rohit Gupta --- docs/source-pytorch/accelerators/gpu_intermediate.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source-pytorch/accelerators/gpu_intermediate.rst b/docs/source-pytorch/accelerators/gpu_intermediate.rst index 6f1cc192b1bee..d7f98d7b81f83 100644 --- a/docs/source-pytorch/accelerators/gpu_intermediate.rst +++ b/docs/source-pytorch/accelerators/gpu_intermediate.rst @@ -212,7 +212,7 @@ DDP Fork is an alternative to Spawn that can be used in interactive Python and J trainer = Trainer(accelerator="gpu", devices=8, strategy="ddp_fork") Data Parallel (``strategy="dp"``) is the only other strategy supported in interactive environments but is slower, is discouraged by PyTorch and has other limitations. -Among the native distributed strategies, regular DDP (``strategy="ddp"``) is still recommended as the go-to strategy over Spawn and Fork for its speed and stability. +Among the native distributed strategies, regular DDP (``strategy="ddp"``) is still recommended as the go-to strategy over Spawn and Fork for its speed and stability but it can only be used with scripts. 
Comparison of DDP variants and tradeoffs From 877ed0799a00bedf7cfb5f460f024044fdd60d4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 3 Jul 2022 05:01:59 -0400 Subject: [PATCH 41/57] Update docs/source-pytorch/accelerators/gpu_intermediate.rst Co-authored-by: Rohit Gupta --- docs/source-pytorch/accelerators/gpu_intermediate.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source-pytorch/accelerators/gpu_intermediate.rst b/docs/source-pytorch/accelerators/gpu_intermediate.rst index d7f98d7b81f83..40f681cf1cc27 100644 --- a/docs/source-pytorch/accelerators/gpu_intermediate.rst +++ b/docs/source-pytorch/accelerators/gpu_intermediate.rst @@ -229,7 +229,7 @@ Comparison of DDP variants and tradeoffs * - Works in Jupyter notebooks / IPython environments - No - Yes - - No + - Yes * - Supports multi-node - Yes - Yes From bf362591d628889dbc5fba667b9b53b6a37f42e0 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sun, 3 Jul 2022 11:02:33 +0200 Subject: [PATCH 42/57] reviews --- docs/source-pytorch/accelerators/gpu_intermediate.rst | 4 ++-- src/pytorch_lightning/strategies/ddp_spawn.py | 4 ++-- src/pytorch_lightning/strategies/launchers/spawn.py | 2 +- src/pytorch_lightning/utilities/device_parser.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/source-pytorch/accelerators/gpu_intermediate.rst b/docs/source-pytorch/accelerators/gpu_intermediate.rst index 58bf5718b06e0..78217648761c7 100644 --- a/docs/source-pytorch/accelerators/gpu_intermediate.rst +++ b/docs/source-pytorch/accelerators/gpu_intermediate.rst @@ -204,7 +204,7 @@ You can then call your scripts anywhere Distributed Data Parallel Fork ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -DDP Fork is an alternative to Spawn that can be used in interactive Python and Jupyter notebooks, Google Colab, Kaggle and so on: +DDP Fork is an alternative to Spawn that can be used in interactive Python and Jupyter notebooks, Google Colab, Kaggle notebooks and so on: .. 
code-block:: python @@ -228,8 +228,8 @@ Comparison of DDP variants and tradeoffs - DDP Fork * - Works in Jupyter notebooks / IPython environments - No - - Yes - No + - Yes * - Supports multi-node - Yes - Yes diff --git a/src/pytorch_lightning/strategies/ddp_spawn.py b/src/pytorch_lightning/strategies/ddp_spawn.py index 8fe0068f545a1..44974977b4c70 100644 --- a/src/pytorch_lightning/strategies/ddp_spawn.py +++ b/src/pytorch_lightning/strategies/ddp_spawn.py @@ -287,14 +287,14 @@ def register_strategies(cls, strategy_registry: Dict) -> None: strategy_registry.register( f"ddp_{start_method}_find_unused_parameters_false", cls, - description="DDPSpawn Strategy with `find_unused_parameters` as False", + description=f"DDP {start_method.title()} strategy with `find_unused_parameters` as False", find_unused_parameters=False, start_method=start_method, ) strategy_registry.register( f"ddp_{start_method}", cls, - description=f"{cls.__class__.__name__}", + description=f"DDP {start_method.title()} strategy", start_method=start_method, ) diff --git a/src/pytorch_lightning/strategies/launchers/spawn.py b/src/pytorch_lightning/strategies/launchers/spawn.py index b34414160b2ad..6cbc97233a66e 100644 --- a/src/pytorch_lightning/strategies/launchers/spawn.py +++ b/src/pytorch_lightning/strategies/launchers/spawn.py @@ -58,7 +58,7 @@ def __init__(self, strategy: Strategy, start_method: Literal["spawn", "fork"] = self._strategy = strategy self._start_method = start_method if start_method == "fork" and not hasattr(os, "fork"): - raise ValueError("The start method 'fork' is not available on this platform. Choose 'spawn' instead.") + raise ValueError("The start method 'fork' is not available on this platform. 
Use 'spawn' instead.") @property def is_interactive_compatible(self) -> bool: diff --git a/src/pytorch_lightning/utilities/device_parser.py b/src/pytorch_lightning/utilities/device_parser.py index da36dbda83c80..bf1a33e51713a 100644 --- a/src/pytorch_lightning/utilities/device_parser.py +++ b/src/pytorch_lightning/utilities/device_parser.py @@ -338,7 +338,7 @@ def parse_hpus(devices: Optional[Union[int, str, List[int]]]) -> Optional[int]: def num_cuda_devices() -> int: """Returns the number of GPUs available. - Unlike :func:`torch.cuda.device_count`, this function will do its best not to create a CUDA context to for fork + Unlike :func:`torch.cuda.device_count`, this function will do its best not to create a CUDA context for fork support, if the platform allows it. """ if not hasattr(os, "fork"): @@ -350,7 +350,7 @@ def num_cuda_devices() -> int: def is_cuda_available() -> bool: """Returns a bool indicating if CUDA is currently available. - Unlike :func:`torch.cuda.is_available`, this function will do its best not to create a CUDA context to for fork + Unlike :func:`torch.cuda.is_available`, this function will do its best not to create a CUDA context for fork support, if the platform allows it. 
""" if not hasattr(os, "fork"): From c9b2601861d21798861de72adaed5f0c3a4b4369 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Tue, 19 Jul 2022 18:10:07 +0200 Subject: [PATCH 43/57] handle start methods --- .../strategies/launchers/spawn.py | 14 +++++++++----- .../trainer/connectors/accelerator_connector.py | 5 ++++- src/pytorch_lightning/utilities/device_parser.py | 5 ++--- .../strategies/launchers/test_spawn.py | 7 +++---- .../tests_pytorch/utilities/test_device_parser.py | 8 ++++---- 5 files changed, 22 insertions(+), 17 deletions(-) diff --git a/src/pytorch_lightning/strategies/launchers/spawn.py b/src/pytorch_lightning/strategies/launchers/spawn.py index 08fd34f27bd54..e69ecc3be7199 100644 --- a/src/pytorch_lightning/strategies/launchers/spawn.py +++ b/src/pytorch_lightning/strategies/launchers/spawn.py @@ -35,7 +35,7 @@ class _SpawnLauncher(_Launcher): r"""Spawns processes that run a given function in parallel, and joins them all at the end. The main process in which this launcher is invoked creates N so-called worker processes (using - :func:`torch.multiprocessing.spawn`) that run the given function. + :func:`torch.multiprocessing.start_processes`) that run the given function. Worker processes have a rank that ranges from 0 to N - 1. Note: @@ -50,14 +50,18 @@ class _SpawnLauncher(_Launcher): start_method: The method how to start the processes. - 'spawn': The default start method. Requires all objects to be pickleable. - 'fork': Preferrable for IPython/Jupyter environments where 'spawn' is not available. Not available on - the Windows platform. + the Windows platform for example. + - 'forkserver': Alternative implementation to 'fork'. 
""" - def __init__(self, strategy: Strategy, start_method: Literal["spawn", "fork"] = "spawn") -> None: + def __init__(self, strategy: Strategy, start_method: Literal["spawn", "fork", "forkserver"] = "spawn") -> None: self._strategy = strategy self._start_method = start_method - if start_method == "fork" and not hasattr(os, "fork"): - raise ValueError("The start method 'fork' is not available on this platform. Use 'spawn' instead.") + if start_method not in mp.get_all_start_methods(): + raise ValueError( + f"The start method '{start_method}' is not available on this platform. Available methods are:" + f" {', '.join(mp.get_all_start_methods())}" + ) @property def is_interactive_compatible(self) -> bool: diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index 4e6a2d1ab0cab..ae457d7335814 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -614,7 +614,10 @@ def _check_strategy_and_fallback(self) -> None: f"You selected strategy to be `{DDPFullyShardedNativeStrategy.strategy_name}`, " "but GPU accelerator is not used." ) - if strategy_flag in ("ddp_fork", "ddp_fork_find_unused_parameters_false") and not hasattr(os, "fork"): + if ( + strategy_flag in ("ddp_fork", "ddp_fork_find_unused_parameters_false") + and "fork" not in torch.multiprocessing.get_all_start_methods() + ): raise ValueError( f"You selected `Trainer(strategy='{strategy_flag}')` but process forking is not supported on this" f" platform. We recommed to choose `Trainer(strategy='ddp_spawn')` instead." 
diff --git a/src/pytorch_lightning/utilities/device_parser.py b/src/pytorch_lightning/utilities/device_parser.py index bf1a33e51713a..c76933e489db7 100644 --- a/src/pytorch_lightning/utilities/device_parser.py +++ b/src/pytorch_lightning/utilities/device_parser.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. import multiprocessing -import os from typing import Any, List, MutableSequence, Optional, Tuple, Union import torch @@ -341,7 +340,7 @@ def num_cuda_devices() -> int: Unlike :func:`torch.cuda.device_count`, this function will do its best not to create a CUDA context for fork support, if the platform allows it. """ - if not hasattr(os, "fork"): + if "fork" not in torch.multiprocessing.get_all_start_methods(): return torch.cuda.device_count() with multiprocessing.get_context("fork").Pool(1) as pool: return pool.apply(torch.cuda.device_count) @@ -353,7 +352,7 @@ def is_cuda_available() -> bool: Unlike :func:`torch.cuda.is_available`, this function will do its best not to create a CUDA context for fork support, if the platform allows it. """ - if not hasattr(os, "fork"): + if "fork" not in torch.multiprocessing.get_all_start_methods(): return torch.cuda.is_available() with multiprocessing.get_context("fork").Pool(1) as pool: return pool.apply(torch.cuda.is_available) diff --git a/tests/tests_pytorch/strategies/launchers/test_spawn.py b/tests/tests_pytorch/strategies/launchers/test_spawn.py index f7c90f8c1689a..8b408d542640c 100644 --- a/tests/tests_pytorch/strategies/launchers/test_spawn.py +++ b/tests/tests_pytorch/strategies/launchers/test_spawn.py @@ -13,15 +13,13 @@ # limitations under the License. 
from unittest import mock from unittest.mock import ANY, Mock - import pytest -import pytorch_lightning from pytorch_lightning.strategies.launchers.spawn import _SpawnLauncher -def test_spawn_launcher_forking_on_unsupported_platform(monkeypatch): - monkeypatch.delattr(pytorch_lightning.strategies.launchers.spawn.os, "fork") +@mock.patch("pytorch_lightning.strategies.launchers.spawn.mp.get_all_start_methods", return_value=[]) +def test_spawn_launcher_forking_on_unsupported_platform(_): with pytest.raises(ValueError, match="The start method 'fork' is not available on this platform"): _SpawnLauncher(strategy=Mock(), start_method="fork") @@ -29,6 +27,7 @@ def test_spawn_launcher_forking_on_unsupported_platform(monkeypatch): @pytest.mark.parametrize("start_method", ["spawn", "fork"]) @mock.patch("pytorch_lightning.strategies.launchers.spawn.mp") def test_spawn_launcher_start_method(mp_mock, start_method): + mp_mock.get_all_start_methods.return_value = [start_method] launcher = _SpawnLauncher(strategy=Mock(), start_method=start_method) launcher.launch(function=Mock()) mp_mock.get_context.assert_called_with(start_method) diff --git a/tests/tests_pytorch/utilities/test_device_parser.py b/tests/tests_pytorch/utilities/test_device_parser.py index 59750cf94f90b..d496db487f55c 100644 --- a/tests/tests_pytorch/utilities/test_device_parser.py +++ b/tests/tests_pytorch/utilities/test_device_parser.py @@ -11,21 +11,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import os -import sys from unittest import mock import pytest +import torch from pytorch_lightning.utilities import device_parser -@pytest.mark.skipif(sys.platform != "win32", reason="Requires Windows without forking support") +@pytest.mark.skipif( + "fork" in torch.multiprocessing.get_all_start_methods(), reason="Requires platform without forking support" +) @mock.patch("torch.cuda.is_available", return_value=True) @mock.patch("torch.cuda.device_count", return_value=2) def test_num_cuda_devices_without_forking(*_): """This merely tests that on platforms without fork support our helper functions fall back to the default implementation for determining cuda availability.""" - assert not hasattr(os, "fork") assert device_parser.is_cuda_available() assert device_parser.num_cuda_devices() == 2 From cca16065fe5ceffa0e0a22cbd1b3ba53648d88ac Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 19 Jul 2022 16:12:16 +0000 Subject: [PATCH 44/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/tests_pytorch/strategies/launchers/test_spawn.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/tests_pytorch/strategies/launchers/test_spawn.py b/tests/tests_pytorch/strategies/launchers/test_spawn.py index 8b408d542640c..3bb2e94175477 100644 --- a/tests/tests_pytorch/strategies/launchers/test_spawn.py +++ b/tests/tests_pytorch/strategies/launchers/test_spawn.py @@ -13,6 +13,7 @@ # limitations under the License. 
from unittest import mock from unittest.mock import ANY, Mock + import pytest from pytorch_lightning.strategies.launchers.spawn import _SpawnLauncher From e6b19a12f142a2f4fca93e8229b2c4801406eb56 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Tue, 19 Jul 2022 18:17:17 +0200 Subject: [PATCH 45/57] update tests --- .../trainer/connectors/accelerator_connector.py | 2 +- .../accelerators/test_accelerator_connector.py | 7 +++++-- tests/tests_pytorch/strategies/launchers/test_spawn.py | 1 + 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index ae457d7335814..217f261634ac2 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -620,7 +620,7 @@ def _check_strategy_and_fallback(self) -> None: ): raise ValueError( f"You selected `Trainer(strategy='{strategy_flag}')` but process forking is not supported on this" - f" platform. We recommed to choose `Trainer(strategy='ddp_spawn')` instead." + f" platform. We recommed `Trainer(strategy='ddp_spawn')` instead." 
) if strategy_flag: self._strategy_flag = strategy_flag diff --git a/tests/tests_pytorch/accelerators/test_accelerator_connector.py b/tests/tests_pytorch/accelerators/test_accelerator_connector.py index da149704cb8a4..175f404ce1245 100644 --- a/tests/tests_pytorch/accelerators/test_accelerator_connector.py +++ b/tests/tests_pytorch/accelerators/test_accelerator_connector.py @@ -758,7 +758,10 @@ def test_passing_zero_and_empty_list_to_devices_flag(accelerator, devices): @pytest.mark.parametrize("strategy", ["ddp_fork", "ddp_fork_find_unused_parameters_false"]) -def test_ddp_fork_on_unsupported_platform(strategy, monkeypatch): - monkeypatch.delattr(os, "fork") +@mock.patch( + "pytorch_lightning.trainer.connectors.accelerator_connector.torch.multiprocessing.get_all_start_methods", + return_value=[], +) +def test_ddp_fork_on_unsupported_platform(_, strategy): with pytest.raises(ValueError, match="process forking is not supported on this platform"): Trainer(strategy=strategy) diff --git a/tests/tests_pytorch/strategies/launchers/test_spawn.py b/tests/tests_pytorch/strategies/launchers/test_spawn.py index 8b408d542640c..3bb2e94175477 100644 --- a/tests/tests_pytorch/strategies/launchers/test_spawn.py +++ b/tests/tests_pytorch/strategies/launchers/test_spawn.py @@ -13,6 +13,7 @@ # limitations under the License. 
from unittest import mock from unittest.mock import ANY, Mock + import pytest from pytorch_lightning.strategies.launchers.spawn import _SpawnLauncher From cc06e124e3d8a66257b5803fe0a7cd177efef0f6 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Tue, 19 Jul 2022 18:25:25 +0200 Subject: [PATCH 46/57] update type --- src/pytorch_lightning/strategies/ddp_spawn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/strategies/ddp_spawn.py b/src/pytorch_lightning/strategies/ddp_spawn.py index 44974977b4c70..a0b0a7865869f 100644 --- a/src/pytorch_lightning/strategies/ddp_spawn.py +++ b/src/pytorch_lightning/strategies/ddp_spawn.py @@ -72,7 +72,7 @@ def __init__( ddp_comm_wrapper: Optional[callable] = None, process_group_backend: Optional[str] = None, timeout: Optional[timedelta] = default_pg_timeout, - start_method: Literal["spawn", "fork"] = "spawn", + start_method: Literal["spawn", "fork", "forkserver"] = "spawn", **kwargs: Any, ): super().__init__( From c5480a113f6c025c0f39b822680ee59cc66e3a03 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Tue, 19 Jul 2022 23:43:59 +0200 Subject: [PATCH 47/57] fix merge errors --- .../trainer/connectors/accelerator_connector.py | 2 +- src/pytorch_lightning/trainer/trainer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index 34bb70f1b3900..62cedb6818150 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -496,7 +496,7 @@ def _choose_accelerator(self) -> str: return "hpu" if MPSAccelerator.is_available(): return "mps" - if GPUAccelerator.is_available(): + if CUDAAccelerator.is_available(): return "cuda" return "cpu" diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index 4e8e2a3ae61eb..1374d0474c3c3 
100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -1758,7 +1758,7 @@ def _log_device_info(self) -> None: rank_zero_info(f"HPU available: {_HPU_AVAILABLE}, using: {num_hpus} HPUs") # TODO: Integrate MPS Accelerator here, once gpu maps to both - if GPUAccelerator.is_available() and not isinstance(self.accelerator, CUDAAccelerator): + if CUDAAccelerator.is_available() and not isinstance(self.accelerator, CUDAAccelerator): rank_zero_warn( "GPU available but not used. Set `accelerator` and `devices` using" f" `Trainer(accelerator='gpu', devices={CUDAAccelerator.auto_device_count()})`.", From 87cd34472277e44c329c024d617af4f6317e3daf Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 20 Jul 2022 00:25:34 +0200 Subject: [PATCH 48/57] update tests --- src/pytorch_lightning/strategies/launchers/spawn.py | 11 ++++++----- .../accelerators/test_accelerator_connector.py | 8 ++++++-- tests/tests_pytorch/lite/test_lite.py | 4 ++-- .../tests_pytorch/strategies/launchers/test_spawn.py | 3 ++- .../strategies/test_strategy_registry.py | 4 +++- 5 files changed, 19 insertions(+), 11 deletions(-) diff --git a/src/pytorch_lightning/strategies/launchers/spawn.py b/src/pytorch_lightning/strategies/launchers/spawn.py index e69ecc3be7199..acefbfc3a2253 100644 --- a/src/pytorch_lightning/strategies/launchers/spawn.py +++ b/src/pytorch_lightning/strategies/launchers/spawn.py @@ -57,11 +57,6 @@ class _SpawnLauncher(_Launcher): def __init__(self, strategy: Strategy, start_method: Literal["spawn", "fork", "forkserver"] = "spawn") -> None: self._strategy = strategy self._start_method = start_method - if start_method not in mp.get_all_start_methods(): - raise ValueError( - f"The start method '{start_method}' is not available on this platform. 
Available methods are:" - f" {', '.join(mp.get_all_start_methods())}" - ) @property def is_interactive_compatible(self) -> bool: @@ -83,6 +78,12 @@ def launch(self, function: Callable, *args: Any, trainer: Optional["pl.Trainer"] a selected set of attributes get restored in the main process after processes join. **kwargs: Optional keyword arguments to be passed to the given function. """ + if self._start_method not in mp.get_all_start_methods(): + raise ValueError( + f"The start method '{self._start_method}' is not available on this platform. Available methods are:" + f" {', '.join(mp.get_all_start_methods())}" + ) + # The default cluster environment in Lightning chooses a random free port number # This needs to be done in the main process here before spawning to ensure each rank will connect # through the same port diff --git a/tests/tests_pytorch/accelerators/test_accelerator_connector.py b/tests/tests_pytorch/accelerators/test_accelerator_connector.py index d8e62ddc2054d..9935300fda2d1 100644 --- a/tests/tests_pytorch/accelerators/test_accelerator_connector.py +++ b/tests/tests_pytorch/accelerators/test_accelerator_connector.py @@ -26,6 +26,7 @@ from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.cuda import CUDAAccelerator from pytorch_lightning.accelerators.mps import MPSAccelerator +from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.plugins import DoublePrecisionPlugin, LayerSync, NativeSyncBatchNorm, PrecisionPlugin from pytorch_lightning.plugins.environments import ( KubeflowEnvironment, @@ -226,7 +227,10 @@ def test_ipython_compatible_dp_strategy_gpu(_, monkeypatch): @mock.patch("pytorch_lightning.accelerators.tpu.TPUAccelerator.is_available", return_value=True) -def test_ipython_compatible_strategy_tpu(_, monkeypatch): +@mock.patch( + "pytorch_lightning.strategies.launchers.spawn.torch.multiprocessing.get_all_start_methods", return_value=["fork"] +) +def 
test_ipython_compatible_strategy_tpu(_, __, monkeypatch): monkeypatch.setattr(pytorch_lightning.utilities, "_IS_INTERACTIVE", True) trainer = Trainer(accelerator="tpu") assert trainer.strategy.launcher.is_interactive_compatible @@ -259,7 +263,7 @@ def test_accelerator_choice_multi_node_gpu( assert isinstance(trainer.strategy, strategy_class) -@mock.patch("pytorch_lightning.accelerators.gpu.device_parser.num_cuda_devices", return_value=0) +@mock.patch("pytorch_lightning.accelerators.cuda.device_parser.num_cuda_devices", return_value=0) def test_accelerator_cpu(_): trainer = Trainer(accelerator="cpu") assert isinstance(trainer.accelerator, CPUAccelerator) diff --git a/tests/tests_pytorch/lite/test_lite.py b/tests/tests_pytorch/lite/test_lite.py index 76596d0ee072d..6d0c0fe891695 100644 --- a/tests/tests_pytorch/lite/test_lite.py +++ b/tests/tests_pytorch/lite/test_lite.py @@ -267,7 +267,7 @@ def test_seed_everything(): _StrategyType.DP, _StrategyType.DDP, _StrategyType.DDP_SPAWN, - _StrategyType.DDP_FORK, + pytest.param(_StrategyType.DDP_FORK, marks=RunIf(skip_windows=True)), pytest.param(_StrategyType.DEEPSPEED, marks=RunIf(deepspeed=True)), pytest.param(_StrategyType.DDP_SHARDED, marks=RunIf(fairscale=True)), pytest.param(_StrategyType.DDP_SHARDED_SPAWN, marks=RunIf(fairscale=True)), @@ -296,7 +296,7 @@ def test_setup_dataloaders_replace_custom_sampler(strategy): _StrategyType.DP, _StrategyType.DDP, _StrategyType.DDP_SPAWN, - _StrategyType.DDP_FORK, + pytest.param(_StrategyType.DDP_FORK, marks=RunIf(skip_windows=True)), pytest.param(_StrategyType.DEEPSPEED, marks=RunIf(deepspeed=True)), pytest.param(_StrategyType.DDP_SHARDED, marks=RunIf(fairscale=True)), pytest.param(_StrategyType.DDP_SHARDED_SPAWN, marks=RunIf(fairscale=True)), diff --git a/tests/tests_pytorch/strategies/launchers/test_spawn.py b/tests/tests_pytorch/strategies/launchers/test_spawn.py index 3bb2e94175477..1be8cfbc53238 100644 --- a/tests/tests_pytorch/strategies/launchers/test_spawn.py +++ 
b/tests/tests_pytorch/strategies/launchers/test_spawn.py @@ -21,8 +21,9 @@ @mock.patch("pytorch_lightning.strategies.launchers.spawn.mp.get_all_start_methods", return_value=[]) def test_spawn_launcher_forking_on_unsupported_platform(_): + launcher = _SpawnLauncher(strategy=Mock(), start_method="fork") with pytest.raises(ValueError, match="The start method 'fork' is not available on this platform"): - _SpawnLauncher(strategy=Mock(), start_method="fork") + launcher.launch(function=Mock()) @pytest.mark.parametrize("start_method", ["spawn", "fork"]) diff --git a/tests/tests_pytorch/strategies/test_strategy_registry.py b/tests/tests_pytorch/strategies/test_strategy_registry.py index 4cc45b058678c..649b7bfadba39 100644 --- a/tests/tests_pytorch/strategies/test_strategy_registry.py +++ b/tests/tests_pytorch/strategies/test_strategy_registry.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import pytest +import torch.multiprocessing from pytorch_lightning import Trainer from pytorch_lightning.plugins import CheckpointIO @@ -117,10 +118,11 @@ def test_fsdp_strategy_registry(tmpdir): DDPSpawnStrategy, {"find_unused_parameters": False, "start_method": "spawn"}, ), - ( + pytest.param( "ddp_fork_find_unused_parameters_false", DDPSpawnStrategy, {"find_unused_parameters": False, "start_method": "fork"}, + marks=RunIf(skip_windows=True), ), ( "ddp_sharded_spawn_find_unused_parameters_false", From b686b3ba0e832cb6971b992c21ad7d268b50588c Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 20 Jul 2022 00:38:36 +0200 Subject: [PATCH 49/57] remove unused import --- tests/tests_pytorch/accelerators/test_accelerator_connector.py | 1 - tests/tests_pytorch/strategies/test_strategy_registry.py | 1 - 2 files changed, 2 deletions(-) diff --git a/tests/tests_pytorch/accelerators/test_accelerator_connector.py b/tests/tests_pytorch/accelerators/test_accelerator_connector.py index 9935300fda2d1..00007d114016e 100644 --- 
a/tests/tests_pytorch/accelerators/test_accelerator_connector.py +++ b/tests/tests_pytorch/accelerators/test_accelerator_connector.py @@ -26,7 +26,6 @@ from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.accelerators.cuda import CUDAAccelerator from pytorch_lightning.accelerators.mps import MPSAccelerator -from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.plugins import DoublePrecisionPlugin, LayerSync, NativeSyncBatchNorm, PrecisionPlugin from pytorch_lightning.plugins.environments import ( KubeflowEnvironment, diff --git a/tests/tests_pytorch/strategies/test_strategy_registry.py b/tests/tests_pytorch/strategies/test_strategy_registry.py index 649b7bfadba39..91689c18f6e39 100644 --- a/tests/tests_pytorch/strategies/test_strategy_registry.py +++ b/tests/tests_pytorch/strategies/test_strategy_registry.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pytest -import torch.multiprocessing from pytorch_lightning import Trainer from pytorch_lightning.plugins import CheckpointIO From 2d05ac3438b3ac119cfeec4e7400cbd3637e803e Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 20 Jul 2022 12:25:20 +0200 Subject: [PATCH 50/57] revert weird change --- .../tests_pytorch/accelerators/test_accelerator_connector.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/tests_pytorch/accelerators/test_accelerator_connector.py b/tests/tests_pytorch/accelerators/test_accelerator_connector.py index 00007d114016e..c8710b0c630a6 100644 --- a/tests/tests_pytorch/accelerators/test_accelerator_connector.py +++ b/tests/tests_pytorch/accelerators/test_accelerator_connector.py @@ -226,10 +226,7 @@ def test_ipython_compatible_dp_strategy_gpu(_, monkeypatch): @mock.patch("pytorch_lightning.accelerators.tpu.TPUAccelerator.is_available", return_value=True) -@mock.patch( - "pytorch_lightning.strategies.launchers.spawn.torch.multiprocessing.get_all_start_methods", return_value=["fork"] -) -def test_ipython_compatible_strategy_tpu(_, __, monkeypatch): +def test_ipython_compatible_strategy_tpu(_, monkeypatch): monkeypatch.setattr(pytorch_lightning.utilities, "_IS_INTERACTIVE", True) trainer = Trainer(accelerator="tpu") assert trainer.strategy.launcher.is_interactive_compatible From 78e542edf9809043d6360b4371926a516005b3aa Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 20 Jul 2022 12:32:51 +0200 Subject: [PATCH 51/57] remove redundant start method attribute --- src/pytorch_lightning/strategies/tpu_spawn.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/pytorch_lightning/strategies/tpu_spawn.py b/src/pytorch_lightning/strategies/tpu_spawn.py index 04aaf6af569c0..4b097b96e62a4 100644 --- a/src/pytorch_lightning/strategies/tpu_spawn.py +++ b/src/pytorch_lightning/strategies/tpu_spawn.py @@ -73,7 +73,6 @@ def __init__( start_method="fork", ) self.debug = debug - self.start_method = "fork" @property def 
root_device(self) -> torch.device: From 0b13047fd827686d473c43a9bc32756571087140 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 20 Jul 2022 15:40:37 +0200 Subject: [PATCH 52/57] they insist --- src/pytorch_lightning/strategies/launchers/spawn.py | 11 +++++------ .../tests_pytorch/strategies/launchers/test_spawn.py | 3 +-- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/pytorch_lightning/strategies/launchers/spawn.py b/src/pytorch_lightning/strategies/launchers/spawn.py index acefbfc3a2253..91482b66e5de3 100644 --- a/src/pytorch_lightning/strategies/launchers/spawn.py +++ b/src/pytorch_lightning/strategies/launchers/spawn.py @@ -57,6 +57,11 @@ class _SpawnLauncher(_Launcher): def __init__(self, strategy: Strategy, start_method: Literal["spawn", "fork", "forkserver"] = "spawn") -> None: self._strategy = strategy self._start_method = start_method + if start_method not in mp.get_all_start_methods(): + raise ValueError( + f"The start method '{self._start_method}' is not available on this platform. Available methods are:" + f" {', '.join(mp.get_all_start_methods())}" + ) @property def is_interactive_compatible(self) -> bool: @@ -78,12 +83,6 @@ def launch(self, function: Callable, *args: Any, trainer: Optional["pl.Trainer"] a selected set of attributes get restored in the main process after processes join. **kwargs: Optional keyword arguments to be passed to the given function. """ - if self._start_method not in mp.get_all_start_methods(): - raise ValueError( - f"The start method '{self._start_method}' is not available on this platform. 
Available methods are:" - f" {', '.join(mp.get_all_start_methods())}" - ) - # The default cluster environment in Lightning chooses a random free port number # This needs to be done in the main process here before spawning to ensure each rank will connect # through the same port diff --git a/tests/tests_pytorch/strategies/launchers/test_spawn.py b/tests/tests_pytorch/strategies/launchers/test_spawn.py index 1be8cfbc53238..3bb2e94175477 100644 --- a/tests/tests_pytorch/strategies/launchers/test_spawn.py +++ b/tests/tests_pytorch/strategies/launchers/test_spawn.py @@ -21,9 +21,8 @@ @mock.patch("pytorch_lightning.strategies.launchers.spawn.mp.get_all_start_methods", return_value=[]) def test_spawn_launcher_forking_on_unsupported_platform(_): - launcher = _SpawnLauncher(strategy=Mock(), start_method="fork") with pytest.raises(ValueError, match="The start method 'fork' is not available on this platform"): - launcher.launch(function=Mock()) + _SpawnLauncher(strategy=Mock(), start_method="fork") @pytest.mark.parametrize("start_method", ["spawn", "fork"]) From 3d7095de76949f2a4a72e1e6b7b3f0532724980d Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 20 Jul 2022 16:18:40 +0200 Subject: [PATCH 53/57] update tests --- .../tests_pytorch/accelerators/test_accelerator_connector.py | 3 ++- tests/tests_pytorch/deprecated_api/test_remove_1-8.py | 5 +++-- tests/tests_pytorch/deprecated_api/test_remove_2-0.py | 1 + tests/tests_pytorch/strategies/test_strategy_registry.py | 5 ++++- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/tests_pytorch/accelerators/test_accelerator_connector.py b/tests/tests_pytorch/accelerators/test_accelerator_connector.py index af12f6500e52b..af95d8ae1804c 100644 --- a/tests/tests_pytorch/accelerators/test_accelerator_connector.py +++ b/tests/tests_pytorch/accelerators/test_accelerator_connector.py @@ -225,8 +225,9 @@ def test_ipython_compatible_dp_strategy_gpu(_, monkeypatch): assert trainer.strategy.launcher is None 
+@mock.patch("pytorch_lightning.strategies.launchers.spawn.mp.get_all_start_methods", return_value=["fork"]) @mock.patch("pytorch_lightning.accelerators.tpu.TPUAccelerator.is_available", return_value=True) -def test_ipython_compatible_strategy_tpu(_, monkeypatch): +def test_ipython_compatible_strategy_tpu(_, __, monkeypatch): monkeypatch.setattr(pytorch_lightning.utilities, "_IS_INTERACTIVE", True) trainer = Trainer(accelerator="tpu") assert trainer.strategy.launcher.is_interactive_compatible diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index dff1999289cba..a93c08ddc665f 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -1140,8 +1140,9 @@ def test_trainer_gpus(monkeypatch, trainer_kwargs): assert trainer.gpus == trainer_kwargs["devices"] -def test_trainer_tpu_cores(monkeypatch): - monkeypatch.setattr(pytorch_lightning.accelerators.tpu.TPUAccelerator, "is_available", lambda _: True) +@mock.patch("pytorch_lightning.strategies.launchers.spawn.mp.get_all_start_methods", return_value=["fork"]) +@mock.patch("pytorch_lightning.accelerators.tpu.TPUAccelerator.is_available", return_value=True) +def test_trainer_tpu_cores(*_): trainer = Trainer(accelerator="tpu", devices=8) with pytest.deprecated_call( match=( diff --git a/tests/tests_pytorch/deprecated_api/test_remove_2-0.py b/tests/tests_pytorch/deprecated_api/test_remove_2-0.py index 3a13ba340d3d0..11b00828cf66d 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_2-0.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_2-0.py @@ -36,6 +36,7 @@ def test_v2_0_0_deprecated_gpus(*_): @mock.patch("pytorch_lightning.accelerators.tpu.TPUAccelerator.is_available", return_value=True) @mock.patch("pytorch_lightning.accelerators.tpu.TPUAccelerator.parse_devices", return_value=8) +@mock.patch("pytorch_lightning.strategies.launchers.spawn.mp.get_all_start_methods", 
return_value=["fork"]) def test_v2_0_0_deprecated_tpu_cores(*_): with pytest.deprecated_call(match=r"is deprecated in v1.7 and will be removed in v2.0."): _ = Trainer(tpu_cores=8) diff --git a/tests/tests_pytorch/strategies/test_strategy_registry.py b/tests/tests_pytorch/strategies/test_strategy_registry.py index 91689c18f6e39..3c55cb1b771ae 100644 --- a/tests/tests_pytorch/strategies/test_strategy_registry.py +++ b/tests/tests_pytorch/strategies/test_strategy_registry.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from unittest import mock + import pytest from pytorch_lightning import Trainer @@ -79,7 +81,8 @@ def test_deepspeed_strategy_registry_with_trainer(tmpdir, strategy): assert isinstance(trainer.strategy, DeepSpeedStrategy) -def test_tpu_spawn_debug_strategy_registry(tmpdir): +@mock.patch("pytorch_lightning.strategies.launchers.spawn.mp.get_all_start_methods", return_value=["fork"]) +def test_tpu_spawn_debug_strategy_registry(_): strategy = "tpu_spawn_debug" From a5c35920fda74bbd7fc61075950a703a58e8cf57 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 20 Jul 2022 16:18:40 +0200 Subject: [PATCH 54/57] Revert "update tests" This reverts commit 3d7095de76949f2a4a72e1e6b7b3f0532724980d. 
--- .../tests_pytorch/accelerators/test_accelerator_connector.py | 3 +-- tests/tests_pytorch/deprecated_api/test_remove_1-8.py | 5 ++--- tests/tests_pytorch/deprecated_api/test_remove_2-0.py | 1 - tests/tests_pytorch/strategies/test_strategy_registry.py | 5 +---- 4 files changed, 4 insertions(+), 10 deletions(-) diff --git a/tests/tests_pytorch/accelerators/test_accelerator_connector.py b/tests/tests_pytorch/accelerators/test_accelerator_connector.py index af95d8ae1804c..af12f6500e52b 100644 --- a/tests/tests_pytorch/accelerators/test_accelerator_connector.py +++ b/tests/tests_pytorch/accelerators/test_accelerator_connector.py @@ -225,9 +225,8 @@ def test_ipython_compatible_dp_strategy_gpu(_, monkeypatch): assert trainer.strategy.launcher is None -@mock.patch("pytorch_lightning.strategies.launchers.spawn.mp.get_all_start_methods", return_value=["fork"]) @mock.patch("pytorch_lightning.accelerators.tpu.TPUAccelerator.is_available", return_value=True) -def test_ipython_compatible_strategy_tpu(_, __, monkeypatch): +def test_ipython_compatible_strategy_tpu(_, monkeypatch): monkeypatch.setattr(pytorch_lightning.utilities, "_IS_INTERACTIVE", True) trainer = Trainer(accelerator="tpu") assert trainer.strategy.launcher.is_interactive_compatible diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index a93c08ddc665f..dff1999289cba 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -1140,9 +1140,8 @@ def test_trainer_gpus(monkeypatch, trainer_kwargs): assert trainer.gpus == trainer_kwargs["devices"] -@mock.patch("pytorch_lightning.strategies.launchers.spawn.mp.get_all_start_methods", return_value=["fork"]) -@mock.patch("pytorch_lightning.accelerators.tpu.TPUAccelerator.is_available", return_value=True) -def test_trainer_tpu_cores(*_): +def test_trainer_tpu_cores(monkeypatch): + 
monkeypatch.setattr(pytorch_lightning.accelerators.tpu.TPUAccelerator, "is_available", lambda _: True) trainer = Trainer(accelerator="tpu", devices=8) with pytest.deprecated_call( match=( diff --git a/tests/tests_pytorch/deprecated_api/test_remove_2-0.py b/tests/tests_pytorch/deprecated_api/test_remove_2-0.py index 11b00828cf66d..3a13ba340d3d0 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_2-0.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_2-0.py @@ -36,7 +36,6 @@ def test_v2_0_0_deprecated_gpus(*_): @mock.patch("pytorch_lightning.accelerators.tpu.TPUAccelerator.is_available", return_value=True) @mock.patch("pytorch_lightning.accelerators.tpu.TPUAccelerator.parse_devices", return_value=8) -@mock.patch("pytorch_lightning.strategies.launchers.spawn.mp.get_all_start_methods", return_value=["fork"]) def test_v2_0_0_deprecated_tpu_cores(*_): with pytest.deprecated_call(match=r"is deprecated in v1.7 and will be removed in v2.0."): _ = Trainer(tpu_cores=8) diff --git a/tests/tests_pytorch/strategies/test_strategy_registry.py b/tests/tests_pytorch/strategies/test_strategy_registry.py index 3c55cb1b771ae..91689c18f6e39 100644 --- a/tests/tests_pytorch/strategies/test_strategy_registry.py +++ b/tests/tests_pytorch/strategies/test_strategy_registry.py @@ -11,8 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from unittest import mock - import pytest from pytorch_lightning import Trainer @@ -81,8 +79,7 @@ def test_deepspeed_strategy_registry_with_trainer(tmpdir, strategy): assert isinstance(trainer.strategy, DeepSpeedStrategy) -@mock.patch("pytorch_lightning.strategies.launchers.spawn.mp.get_all_start_methods", return_value=["fork"]) -def test_tpu_spawn_debug_strategy_registry(_): +def test_tpu_spawn_debug_strategy_registry(tmpdir): strategy = "tpu_spawn_debug" From 8c74f9a1ec0d66f4bdda0d8f67956dcf9d2ca679 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 20 Jul 2022 17:26:45 +0200 Subject: [PATCH 55/57] insist --- tests/tests_pytorch/accelerators/test_accelerator_connector.py | 1 + tests/tests_pytorch/deprecated_api/test_remove_1-8.py | 1 + tests/tests_pytorch/deprecated_api/test_remove_2-0.py | 3 +++ tests/tests_pytorch/strategies/test_strategy_registry.py | 3 ++- 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/tests_pytorch/accelerators/test_accelerator_connector.py b/tests/tests_pytorch/accelerators/test_accelerator_connector.py index af12f6500e52b..4f2b729888a4e 100644 --- a/tests/tests_pytorch/accelerators/test_accelerator_connector.py +++ b/tests/tests_pytorch/accelerators/test_accelerator_connector.py @@ -225,6 +225,7 @@ def test_ipython_compatible_dp_strategy_gpu(_, monkeypatch): assert trainer.strategy.launcher is None +@RunIf(skip_windows=True) @mock.patch("pytorch_lightning.accelerators.tpu.TPUAccelerator.is_available", return_value=True) def test_ipython_compatible_strategy_tpu(_, monkeypatch): monkeypatch.setattr(pytorch_lightning.utilities, "_IS_INTERACTIVE", True) diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index dff1999289cba..12aca123eacc1 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -1140,6 +1140,7 @@ def test_trainer_gpus(monkeypatch, trainer_kwargs): assert 
trainer.gpus == trainer_kwargs["devices"] +@RunIf(skip_windows=True) def test_trainer_tpu_cores(monkeypatch): monkeypatch.setattr(pytorch_lightning.accelerators.tpu.TPUAccelerator, "is_available", lambda _: True) trainer = Trainer(accelerator="tpu", devices=8) diff --git a/tests/tests_pytorch/deprecated_api/test_remove_2-0.py b/tests/tests_pytorch/deprecated_api/test_remove_2-0.py index 3a13ba340d3d0..ed8cc4e3e9ebd 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_2-0.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_2-0.py @@ -21,6 +21,8 @@ from pytorch_lightning.demos.boring_classes import BoringModel from tests_pytorch.callbacks.test_callbacks import OldStatefulCallback +from tests_pytorch.helpers.runif import RunIf + def test_v2_0_0_deprecated_num_processes(): with pytest.deprecated_call(match=r"is deprecated in v1.7 and will be removed in v2.0."): @@ -34,6 +36,7 @@ def test_v2_0_0_deprecated_gpus(*_): _ = Trainer(gpus=0) +@RunIf(skip_windows=True) @mock.patch("pytorch_lightning.accelerators.tpu.TPUAccelerator.is_available", return_value=True) @mock.patch("pytorch_lightning.accelerators.tpu.TPUAccelerator.parse_devices", return_value=8) def test_v2_0_0_deprecated_tpu_cores(*_): diff --git a/tests/tests_pytorch/strategies/test_strategy_registry.py b/tests/tests_pytorch/strategies/test_strategy_registry.py index 91689c18f6e39..f5576fa14eb8a 100644 --- a/tests/tests_pytorch/strategies/test_strategy_registry.py +++ b/tests/tests_pytorch/strategies/test_strategy_registry.py @@ -79,7 +79,8 @@ def test_deepspeed_strategy_registry_with_trainer(tmpdir, strategy): assert isinstance(trainer.strategy, DeepSpeedStrategy) -def test_tpu_spawn_debug_strategy_registry(tmpdir): +@RunIf(skip_windows=True) +def test_tpu_spawn_debug_strategy_registry(): strategy = "tpu_spawn_debug" From 1832c3efbb79c8d8a0d20b5a5de336582139b671 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 20 Jul 2022 15:28:29 
+0000 Subject: [PATCH 56/57] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/tests_pytorch/deprecated_api/test_remove_2-0.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/tests_pytorch/deprecated_api/test_remove_2-0.py b/tests/tests_pytorch/deprecated_api/test_remove_2-0.py index ed8cc4e3e9ebd..b39c6dafc1696 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_2-0.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_2-0.py @@ -20,7 +20,6 @@ from pytorch_lightning import Trainer from pytorch_lightning.demos.boring_classes import BoringModel from tests_pytorch.callbacks.test_callbacks import OldStatefulCallback - from tests_pytorch.helpers.runif import RunIf From 5c2e6ea930750c98fbae039be0a0b378a8f00c3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 21 Jul 2022 13:26:52 -0400 Subject: [PATCH 57/57] Update docs/source-pytorch/accelerators/gpu_intermediate.rst Co-authored-by: Akihiro Nitta --- docs/source-pytorch/accelerators/gpu_intermediate.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source-pytorch/accelerators/gpu_intermediate.rst b/docs/source-pytorch/accelerators/gpu_intermediate.rst index ede7bea653dcd..dbd2dfd790bd6 100644 --- a/docs/source-pytorch/accelerators/gpu_intermediate.rst +++ b/docs/source-pytorch/accelerators/gpu_intermediate.rst @@ -242,7 +242,7 @@ Comparison of DDP variants and tradeoffs - No - Yes - No - * - Is the guard ``if "__name__"=="__main__"`` required? + * - Is the guard ``if __name__=="__main__"`` required? - Yes - Yes - No