From 7e12ea2178e6bc8ace4a6faf78f8b7a0f1168b9a Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Tue, 19 Jul 2022 15:46:35 +0530 Subject: [PATCH 01/35] Rename GPUAccelerator to CUDAAccelerator --- docs/source-pytorch/api_references.rst | 2 +- docs/source-pytorch/common/trainer.rst | 8 ++-- .../source-pytorch/extensions/accelerator.rst | 2 +- .../accelerators/__init__.py | 2 +- .../accelerators/{gpu.py => cuda.py} | 10 ++++- .../loops/dataloader/evaluation_loop.py | 4 +- src/pytorch_lightning/loops/fit_loop.py | 4 +- src/pytorch_lightning/strategies/deepspeed.py | 4 +- src/pytorch_lightning/strategies/hivemind.py | 4 +- .../connectors/accelerator_connector.py | 20 +++++----- src/pytorch_lightning/trainer/trainer.py | 16 ++++---- src/pytorch_lightning/utilities/memory.py | 2 +- .../test_accelerator_connector.py | 30 +++++++-------- .../accelerators/test_accelerator_registry.py | 2 +- .../tests_pytorch/accelerators/test_common.py | 4 +- tests/tests_pytorch/accelerators/test_gpu.py | 8 ++-- .../callbacks/test_quantization.py | 6 +-- tests/tests_pytorch/models/test_gpu.py | 4 +- .../plugins/test_cluster_integration.py | 2 +- tests/tests_pytorch/strategies/test_ddp.py | 2 +- tests/tests_pytorch/trainer/test_trainer.py | 38 +++++++++---------- 21 files changed, 90 insertions(+), 84 deletions(-) rename src/pytorch_lightning/accelerators/{gpu.py => cuda.py} (94%) diff --git a/docs/source-pytorch/api_references.rst b/docs/source-pytorch/api_references.rst index ba95e74428a15..96a061a941b57 100644 --- a/docs/source-pytorch/api_references.rst +++ b/docs/source-pytorch/api_references.rst @@ -12,7 +12,7 @@ accelerators Accelerator CPUAccelerator - GPUAccelerator + CUDAAccelerator HPUAccelerator IPUAccelerator TPUAccelerator diff --git a/docs/source-pytorch/common/trainer.rst b/docs/source-pytorch/common/trainer.rst index ee4cd6c5c0005..b7c4c21018dcc 100644 --- a/docs/source-pytorch/common/trainer.rst +++ b/docs/source-pytorch/common/trainer.rst @@ -249,8 +249,8 @@ Example:: .. code-block:: python - # This is part of the built-in `GPUAccelerator` - class GPUAccelerator(Accelerator): + # This is part of the built-in `CUDAAccelerator` + class CUDAAccelerator(Accelerator): """Accelerator for GPU devices.""" @staticmethod @@ -603,8 +603,8 @@ based on the accelerator type (``"cpu", "gpu", "tpu", "ipu", "auto"``). .. code-block:: python - # This is part of the built-in `GPUAccelerator` - class GPUAccelerator(Accelerator): + # This is part of the built-in `CUDAAccelerator` + class CUDAAccelerator(Accelerator): """Accelerator for GPU devices.""" @staticmethod diff --git a/docs/source-pytorch/extensions/accelerator.rst b/docs/source-pytorch/extensions/accelerator.rst index fdfe9660b90aa..5fedd441fdd2c 100644 --- a/docs/source-pytorch/extensions/accelerator.rst +++ b/docs/source-pytorch/extensions/accelerator.rst @@ -125,7 +125,7 @@ Accelerator API Accelerator CPUAccelerator - GPUAccelerator + CUDAAccelerator HPUAccelerator IPUAccelerator MPSAccelerator diff --git a/src/pytorch_lightning/accelerators/__init__.py b/src/pytorch_lightning/accelerators/__init__.py index e7d757cd73149..b4521d931c734 100644 --- a/src/pytorch_lightning/accelerators/__init__.py +++ b/src/pytorch_lightning/accelerators/__init__.py @@ -12,7 +12,7 @@ # limitations under the License. 
from pytorch_lightning.accelerators.accelerator import Accelerator # noqa: F401 from pytorch_lightning.accelerators.cpu import CPUAccelerator # noqa: F401 -from pytorch_lightning.accelerators.gpu import GPUAccelerator # noqa: F401 +from pytorch_lightning.accelerators.cuda import CUDAAccelerator # noqa: F401 from pytorch_lightning.accelerators.hpu import HPUAccelerator # noqa: F401 from pytorch_lightning.accelerators.ipu import IPUAccelerator # noqa: F401 from pytorch_lightning.accelerators.mps import MPSAccelerator # noqa: F401 diff --git a/src/pytorch_lightning/accelerators/gpu.py b/src/pytorch_lightning/accelerators/cuda.py similarity index 94% rename from src/pytorch_lightning/accelerators/gpu.py rename to src/pytorch_lightning/accelerators/cuda.py index 898ce09b91431..89d1a5b284b0c 100644 --- a/src/pytorch_lightning/accelerators/gpu.py +++ b/src/pytorch_lightning/accelerators/cuda.py @@ -28,8 +28,8 @@ _log = logging.getLogger(__name__) -class GPUAccelerator(Accelerator): - """Accelerator for GPU devices.""" +class CUDAAccelerator(Accelerator): + """Accelerator for NVIDIA CUDA devices.""" def setup_environment(self, root_device: torch.device) -> None: """ @@ -92,6 +92,12 @@ def is_available() -> bool: @classmethod def register_accelerators(cls, accelerator_registry: Dict) -> None: + accelerator_registry.register( + "cuda", + cls, + description=f"{cls.__class__.__name__}", + ) + # temporarily enable "gpu" to point to the CUDA Accelerator accelerator_registry.register( "gpu", cls, diff --git a/src/pytorch_lightning/loops/dataloader/evaluation_loop.py b/src/pytorch_lightning/loops/dataloader/evaluation_loop.py index baf65d566d2dc..3a9c9ec0ac391 100644 --- a/src/pytorch_lightning/loops/dataloader/evaluation_loop.py +++ b/src/pytorch_lightning/loops/dataloader/evaluation_loop.py @@ -23,7 +23,7 @@ from torch.utils.data.dataloader import DataLoader import pytorch_lightning as pl -from pytorch_lightning.accelerators import GPUAccelerator +from pytorch_lightning.accelerators import CUDAAccelerator from pytorch_lightning.callbacks.progress.rich_progress import _RICH_AVAILABLE from pytorch_lightning.loops.dataloader import DataLoaderLoop from pytorch_lightning.loops.epoch import EvaluationEpochLoop @@ -411,7 +411,7 @@ def _select_data_fetcher_type(trainer: "pl.Trainer") -> Type[AbstractDataFetcher ) return DataLoaderIterDataFetcher elif os.getenv("PL_INTER_BATCH_PARALLELISM", "0") == "1": - if not isinstance(trainer.accelerator, GPUAccelerator): + if not isinstance(trainer.accelerator, CUDAAccelerator): raise MisconfigurationException("Inter batch parallelism is available only when using Nvidia GPUs.") return InterBatchParallelDataFetcher return DataFetcher diff --git a/src/pytorch_lightning/loops/fit_loop.py b/src/pytorch_lightning/loops/fit_loop.py index ab63b0e6df3be..8b54579a6bbfb 100644 --- a/src/pytorch_lightning/loops/fit_loop.py +++ b/src/pytorch_lightning/loops/fit_loop.py @@ -17,7 +17,7 @@ from typing import Optional, Type import pytorch_lightning as pl -from pytorch_lightning.accelerators import GPUAccelerator +from pytorch_lightning.accelerators import CUDAAccelerator from pytorch_lightning.loops import Loop from pytorch_lightning.loops.epoch import TrainingEpochLoop from pytorch_lightning.loops.epoch.training_epoch_loop import _OUTPUTS_TYPE as _EPOCH_OUTPUTS_TYPE @@ -340,7 +340,7 @@ def _select_data_fetcher(trainer: "pl.Trainer") -> Type[AbstractDataFetcher]: ) return DataLoaderIterDataFetcher elif os.getenv("PL_INTER_BATCH_PARALLELISM", "0") == "1": - if not 
isinstance(trainer.accelerator, GPUAccelerator): + if not isinstance(trainer.accelerator, CUDAAccelerator): raise MisconfigurationException("Inter batch parallelism is available only when using Nvidia GPUs.") return InterBatchParallelDataFetcher return DataFetcher diff --git a/src/pytorch_lightning/strategies/deepspeed.py b/src/pytorch_lightning/strategies/deepspeed.py index 9b4d3513c1aab..ede42754aafc9 100644 --- a/src/pytorch_lightning/strategies/deepspeed.py +++ b/src/pytorch_lightning/strategies/deepspeed.py @@ -27,7 +27,7 @@ from torch.optim import Optimizer import pytorch_lightning as pl -from pytorch_lightning.accelerators.gpu import GPUAccelerator +from pytorch_lightning.accelerators.cuda import CUDAAccelerator from pytorch_lightning.core.optimizer import _init_optimizers_and_lr_schedulers from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment @@ -452,7 +452,7 @@ def init_deepspeed(self): if self.lightning_module.trainer.gradient_clip_algorithm == GradClipAlgorithmType.VALUE: raise MisconfigurationException("DeepSpeed does not support clipping gradients by value.") - if not isinstance(self.accelerator, GPUAccelerator): + if not isinstance(self.accelerator, CUDAAccelerator): raise MisconfigurationException( f"DeepSpeed strategy is only supported on GPU but `{self.accelerator.__class__.__name__}` is used." ) diff --git a/src/pytorch_lightning/strategies/hivemind.py b/src/pytorch_lightning/strategies/hivemind.py index b274856bb6113..b258fe7f738ad 100644 --- a/src/pytorch_lightning/strategies/hivemind.py +++ b/src/pytorch_lightning/strategies/hivemind.py @@ -172,9 +172,9 @@ def num_peers(self) -> int: @property def root_device(self) -> torch.device: from pytorch_lightning.accelerators.cpu import CPUAccelerator - from pytorch_lightning.accelerators.gpu import GPUAccelerator + from pytorch_lightning.accelerators.cuda import CUDAAccelerator - if isinstance(self.accelerator, GPUAccelerator): + if isinstance(self.accelerator, CUDAAccelerator): return torch.device(f"cuda:{torch.cuda.current_device()}") elif isinstance(self.accelerator, CPUAccelerator): return torch.device("cpu") diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index 2e112c754cbc5..ece0e5d27bdce 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -22,7 +22,7 @@ from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.cpu import CPUAccelerator -from pytorch_lightning.accelerators.gpu import GPUAccelerator +from pytorch_lightning.accelerators.cuda import CUDAAccelerator from pytorch_lightning.accelerators.hpu import HPUAccelerator from pytorch_lightning.accelerators.ipu import IPUAccelerator from pytorch_lightning.accelerators.mps import MPSAccelerator @@ -370,12 +370,12 @@ def _check_config_and_set_final_flags( ) self._accelerator_flag = "cpu" if self._strategy_flag.parallel_devices[0].type == "cuda": - if self._accelerator_flag and self._accelerator_flag not in ("auto", "gpu"): + if self._accelerator_flag and self._accelerator_flag not in ("auto", "cuda", "gpu"): raise MisconfigurationException( f"GPU parallel_devices set through {self._strategy_flag.__class__.__name__} class," f" but accelerator set to {self._accelerator_flag}, please 
choose one device type" ) - self._accelerator_flag = "gpu" + self._accelerator_flag = "cuda" self._parallel_devices = self._strategy_flag.parallel_devices amp_type = amp_type if isinstance(amp_type, str) else None @@ -475,7 +475,7 @@ def _map_deprecated_devices_specific_info_to_accelerator_and_device_flag( if tpu_cores: self._accelerator_flag = "tpu" if gpus: - self._accelerator_flag = "gpu" + self._accelerator_flag = "cuda" if num_processes: self._accelerator_flag = "cpu" @@ -497,7 +497,7 @@ def _choose_accelerator(self) -> str: if MPSAccelerator.is_available(): return "mps" if torch.cuda.is_available() and torch.cuda.device_count() > 0: - return "gpu" + return "cuda" return "cpu" def _set_parallel_devices_and_init_accelerator(self) -> None: @@ -534,7 +534,7 @@ def _set_devices_flag_if_auto_passed(self) -> None: self._devices_flag = self.accelerator.auto_device_count() def _set_devices_flag_if_auto_select_gpus_passed(self) -> None: - if self._auto_select_gpus and isinstance(self._gpus, int) and isinstance(self.accelerator, GPUAccelerator): + if self._auto_select_gpus and isinstance(self._gpus, int) and isinstance(self.accelerator, CUDAAccelerator): self._devices_flag = pick_multiple_gpus(self._gpus) log.info(f"Auto select gpus: {self._devices_flag}") @@ -579,8 +579,8 @@ def _choose_strategy(self) -> Union[Strategy, str]: return DDPStrategy.strategy_name if len(self._parallel_devices) <= 1: # TODO: Change this once gpu accelerator was renamed to cuda accelerator - if isinstance(self._accelerator_flag, (GPUAccelerator, MPSAccelerator)) or ( - isinstance(self._accelerator_flag, str) and self._accelerator_flag in ("gpu", "mps") + if isinstance(self._accelerator_flag, (CUDAAccelerator, MPSAccelerator)) or ( + isinstance(self._accelerator_flag, str) and self._accelerator_flag in ("cuda", "gpu", "mps") ): device = device_parser.determine_root_gpu_device(self._parallel_devices) else: @@ -609,7 +609,7 @@ def _check_strategy_and_fallback(self) -> None: if ( strategy_flag in DDPFullyShardedNativeStrategy.get_registered_strategies() or isinstance(self._strategy_flag, DDPFullyShardedNativeStrategy) - ) and self._accelerator_flag != "gpu": + ) and self._accelerator_flag not in ("cuda", "gpu"): raise MisconfigurationException( f"You selected strategy to be `{DDPFullyShardedNativeStrategy.strategy_name}`, " "but GPU accelerator is not used." 
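Illustrative sketch, not part of the patch: taken together, the connector changes above make "cuda" the canonical accelerator flag while keeping "gpu" accepted as an alias. Assuming a machine with at least one CUDA device, and that `AcceleratorRegistry` is re-exported from `pytorch_lightning.accelerators` as the updated tests suggest, the intended user-facing behavior is roughly:

    from pytorch_lightning import Trainer
    from pytorch_lightning.accelerators import AcceleratorRegistry, CUDAAccelerator

    # Both spellings now resolve to the same accelerator class on a CUDA machine.
    trainer = Trainer(accelerator="cuda", devices=1)
    assert isinstance(trainer.accelerator, CUDAAccelerator)

    trainer = Trainer(accelerator="gpu", devices=1)  # kept temporarily as an alias
    assert isinstance(trainer.accelerator, CUDAAccelerator)

    # The registry lists the new canonical name alongside the alias.
    assert "cuda" in AcceleratorRegistry.available_accelerators()

Later patches in this series (03 and 04) drop the temporary registry alias and instead reroute "gpu" explicitly, so that it can resolve to either "cuda" or "mps" depending on the hardware.
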
@@ -632,7 +632,7 @@ def _handle_horovod(self) -> None: ) hvd.init() - if isinstance(self.accelerator, GPUAccelerator): + if isinstance(self.accelerator, CUDAAccelerator): # Horovod assigns one local GPU per process self._parallel_devices = [torch.device(f"cuda:{i}") for i in range(hvd.local_size())] else: diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index 25357578ea24e..882326f870de6 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -38,7 +38,7 @@ import pytorch_lightning as pl from pytorch_lightning.accelerators import ( Accelerator, - GPUAccelerator, + CUDAAccelerator, HPUAccelerator, IPUAccelerator, MPSAccelerator, @@ -1735,7 +1735,7 @@ def __setup_profiler(self) -> None: def _log_device_info(self) -> None: - if GPUAccelerator.is_available(): + if CUDAAccelerator.is_available(): gpu_available = True gpu_type = " (cuda)" elif MPSAccelerator.is_available(): @@ -1745,7 +1745,7 @@ def _log_device_info(self) -> None: gpu_available = False gpu_type = "" - gpu_used = isinstance(self.accelerator, (GPUAccelerator, MPSAccelerator)) + gpu_used = isinstance(self.accelerator, (CUDAAccelerator, MPSAccelerator)) rank_zero_info(f"GPU available: {gpu_available}{gpu_type}, used: {gpu_used}") num_tpu_cores = self.num_devices if isinstance(self.accelerator, TPUAccelerator) else 0 @@ -1758,10 +1758,10 @@ def _log_device_info(self) -> None: rank_zero_info(f"HPU available: {_HPU_AVAILABLE}, using: {num_hpus} HPUs") # TODO: Integrate MPS Accelerator here, once gpu maps to both - if torch.cuda.is_available() and not isinstance(self.accelerator, GPUAccelerator): + if torch.cuda.is_available() and not isinstance(self.accelerator, CUDAAccelerator): rank_zero_warn( "GPU available but not used. Set `accelerator` and `devices` using" - f" `Trainer(accelerator='gpu', devices={GPUAccelerator.auto_device_count()})`.", + f" `Trainer(accelerator='gpu', devices={CUDAAccelerator.auto_device_count()})`.", category=PossibleUserWarning, ) @@ -2069,7 +2069,7 @@ def root_gpu(self) -> Optional[int]: "`Trainer.root_gpu` is deprecated in v1.6 and will be removed in v1.8. " "Please use `Trainer.strategy.root_device.index` instead." ) - return self.strategy.root_device.index if isinstance(self.accelerator, GPUAccelerator) else None + return self.strategy.root_device.index if isinstance(self.accelerator, CUDAAccelerator) else None @property def tpu_cores(self) -> int: @@ -2093,7 +2093,7 @@ def num_gpus(self) -> int: "`Trainer.num_gpus` was deprecated in v1.6 and will be removed in v1.8." " Please use `Trainer.num_devices` instead." ) - return self.num_devices if isinstance(self.accelerator, GPUAccelerator) else 0 + return self.num_devices if isinstance(self.accelerator, CUDAAccelerator) else 0 @property def devices(self) -> int: @@ -2109,7 +2109,7 @@ def data_parallel_device_ids(self) -> Optional[List[int]]: "`Trainer.data_parallel_device_ids` was deprecated in v1.6 and will be removed in v1.8." " Please use `Trainer.device_ids` instead." 
) - return self.device_ids if isinstance(self.accelerator, GPUAccelerator) else None + return self.device_ids if isinstance(self.accelerator, CUDAAccelerator) else None @property def lightning_module(self) -> "pl.LightningModule": diff --git a/src/pytorch_lightning/utilities/memory.py b/src/pytorch_lightning/utilities/memory.py index 286a571001b0f..573dd6ed0f129 100644 --- a/src/pytorch_lightning/utilities/memory.py +++ b/src/pytorch_lightning/utilities/memory.py @@ -101,7 +101,7 @@ def get_gpu_memory_map() -> Dict[str, float]: r""" .. deprecated:: v1.5 This function was deprecated in v1.5 in favor of - `pytorch_lightning.accelerators.gpu._get_nvidia_gpu_stats` and will be removed in v1.7. + `pytorch_lightning.accelerators.cuda._get_nvidia_gpu_stats` and will be removed in v1.7. Get the current gpu usage. diff --git a/tests/tests_pytorch/accelerators/test_accelerator_connector.py b/tests/tests_pytorch/accelerators/test_accelerator_connector.py index 100a4cc1d1c7a..33911bffb0eb7 100644 --- a/tests/tests_pytorch/accelerators/test_accelerator_connector.py +++ b/tests/tests_pytorch/accelerators/test_accelerator_connector.py @@ -24,7 +24,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.cpu import CPUAccelerator -from pytorch_lightning.accelerators.gpu import GPUAccelerator +from pytorch_lightning.accelerators.cuda import CUDAAccelerator from pytorch_lightning.accelerators.mps import MPSAccelerator from pytorch_lightning.plugins import DoublePrecisionPlugin, LayerSync, NativeSyncBatchNorm, PrecisionPlugin from pytorch_lightning.plugins.environments import ( @@ -259,14 +259,14 @@ def test_accelerator_cpu(_): with pytest.raises( MisconfigurationException, - match="GPUAccelerator can not run on your system since the accelerator is not available.", + match="CUDAAccelerator can not run on your system since the accelerator is not available.", ): with pytest.deprecated_call(match=r"is deprecated in v1.7 and will be removed"): Trainer(gpus=1) with pytest.raises( MisconfigurationException, - match="GPUAccelerator can not run on your system since the accelerator is not available.", + match="CUDAAccelerator can not run on your system since the accelerator is not available.", ): Trainer(accelerator="gpu") @@ -287,13 +287,13 @@ def test_accelererator_invalid_type_devices(mock_is_available, mock_device_count @RunIf(min_cuda_gpus=1) def test_accelerator_gpu(): trainer = Trainer(accelerator="gpu", devices=1) - assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.accelerator, CUDAAccelerator) trainer = Trainer(accelerator="gpu") - assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.accelerator, CUDAAccelerator) trainer = Trainer(accelerator="auto", devices=1) - assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.accelerator, CUDAAccelerator) @pytest.mark.parametrize(["devices", "strategy_class"], [(1, SingleDeviceStrategy), (5, DDPSpawnStrategy)]) @@ -312,13 +312,13 @@ def test_accelerator_gpu_with_devices(devices, strategy_class): trainer = Trainer(accelerator="gpu", devices=devices) assert trainer.num_devices == len(devices) if isinstance(devices, list) else devices assert isinstance(trainer.strategy, strategy_class) - assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.accelerator, CUDAAccelerator) @RunIf(min_cuda_gpus=1) def test_accelerator_auto_with_devices_gpu(): trainer = 
Trainer(accelerator="auto", devices=1) - assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.accelerator, CUDAAccelerator) assert trainer.num_devices == 1 @@ -392,7 +392,7 @@ def test_device_type_when_strategy_instance_gpu_passed(strategy_class): trainer = Trainer(strategy=strategy_class(), accelerator="gpu", devices=2) assert isinstance(trainer.strategy, strategy_class) - assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.accelerator, CUDAAccelerator) @pytest.mark.parametrize("precision", [1, 12, "invalid"]) @@ -419,7 +419,7 @@ def test_strategy_choice_ddp_spawn_cpu(): @mock.patch("torch.cuda.is_available", return_value=True) def test_strategy_choice_ddp(*_): trainer = Trainer(fast_dev_run=True, strategy="ddp", accelerator="gpu", devices=1) - assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.accelerator, CUDAAccelerator) assert isinstance(trainer.strategy, DDPStrategy) assert isinstance(trainer.strategy.cluster_environment, LightningEnvironment) @@ -429,7 +429,7 @@ def test_strategy_choice_ddp(*_): @mock.patch("torch.cuda.is_available", return_value=True) def test_strategy_choice_ddp_spawn(cuda_available_mock, device_count_mock): trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", accelerator="gpu", devices=1) - assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.accelerator, CUDAAccelerator) assert isinstance(trainer.strategy, DDPSpawnStrategy) assert isinstance(trainer.strategy.cluster_environment, LightningEnvironment) @@ -451,7 +451,7 @@ def test_strategy_choice_ddp_spawn(cuda_available_mock, device_count_mock): def test_strategy_choice_ddp_slurm(setup_distributed_mock, strategy): trainer = Trainer(fast_dev_run=True, strategy=strategy, accelerator="gpu", devices=2) assert trainer._accelerator_connector._is_slurm_managing_tasks() - assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.accelerator, CUDAAccelerator) assert isinstance(trainer.strategy, DDPStrategy) assert isinstance(trainer.strategy.cluster_environment, SLURMEnvironment) assert trainer.strategy.cluster_environment.local_rank() == 1 @@ -477,7 +477,7 @@ def test_strategy_choice_ddp_slurm(setup_distributed_mock, strategy): @mock.patch("torch.cuda.is_available", return_value=True) def test_strategy_choice_ddp_te(*_): trainer = Trainer(fast_dev_run=True, strategy="ddp", accelerator="gpu", devices=2) - assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.accelerator, CUDAAccelerator) assert isinstance(trainer.strategy, DDPStrategy) assert isinstance(trainer.strategy.cluster_environment, TorchElasticEnvironment) assert trainer.strategy.cluster_environment.local_rank() == 1 @@ -524,7 +524,7 @@ def test_strategy_choice_ddp_cpu_te(*_): @mock.patch("torch.cuda.is_available", return_value=True) def test_strategy_choice_ddp_kubeflow(*_): trainer = Trainer(fast_dev_run=True, strategy="ddp", accelerator="gpu", devices=1) - assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.accelerator, CUDAAccelerator) assert isinstance(trainer.strategy, DDPStrategy) assert isinstance(trainer.strategy.cluster_environment, KubeflowEnvironment) assert trainer.strategy.cluster_environment.local_rank() == 0 @@ -649,7 +649,7 @@ def test_devices_auto_choice_cpu( def test_devices_auto_choice_gpu(is_gpu_available_mock, device_count_mock): trainer = Trainer(accelerator="auto", devices="auto") - assert isinstance(trainer.accelerator, 
GPUAccelerator) + assert isinstance(trainer.accelerator, CUDAAccelerator) assert trainer.num_devices == 2 diff --git a/tests/tests_pytorch/accelerators/test_accelerator_registry.py b/tests/tests_pytorch/accelerators/test_accelerator_registry.py index 11c806a21c740..791d4c33dbbe8 100644 --- a/tests/tests_pytorch/accelerators/test_accelerator_registry.py +++ b/tests/tests_pytorch/accelerators/test_accelerator_registry.py @@ -63,4 +63,4 @@ def is_available(): def test_available_accelerators_in_registry(): - assert AcceleratorRegistry.available_accelerators() == ["cpu", "gpu", "hpu", "ipu", "mps", "tpu"] + assert AcceleratorRegistry.available_accelerators() == ["cpu", "cuda", "gpu", "hpu", "ipu", "mps", "tpu"] diff --git a/tests/tests_pytorch/accelerators/test_common.py b/tests/tests_pytorch/accelerators/test_common.py index 3de26b5888390..9395c7e84c709 100644 --- a/tests/tests_pytorch/accelerators/test_common.py +++ b/tests/tests_pytorch/accelerators/test_common.py @@ -14,14 +14,14 @@ from unittest import mock from pytorch_lightning import Trainer -from pytorch_lightning.accelerators import Accelerator, CPUAccelerator, GPUAccelerator, IPUAccelerator, TPUAccelerator +from pytorch_lightning.accelerators import Accelerator, CPUAccelerator, CUDAAccelerator, IPUAccelerator, TPUAccelerator from pytorch_lightning.strategies import DDPStrategy @mock.patch("torch.cuda.device_count", return_value=2) def test_auto_device_count(device_count_mock): assert CPUAccelerator.auto_device_count() == 1 - assert GPUAccelerator.auto_device_count() == 2 + assert CUDAAccelerator.auto_device_count() == 2 assert TPUAccelerator.auto_device_count() == 8 assert IPUAccelerator.auto_device_count() == 4 diff --git a/tests/tests_pytorch/accelerators/test_gpu.py b/tests/tests_pytorch/accelerators/test_gpu.py index f6334780d75a5..e660ff270f921 100644 --- a/tests/tests_pytorch/accelerators/test_gpu.py +++ b/tests/tests_pytorch/accelerators/test_gpu.py @@ -17,8 +17,8 @@ import torch from pytorch_lightning import Trainer -from pytorch_lightning.accelerators import GPUAccelerator -from pytorch_lightning.accelerators.gpu import get_nvidia_gpu_stats +from pytorch_lightning.accelerators import CUDAAccelerator +from pytorch_lightning.accelerators.cuda import get_nvidia_gpu_stats from pytorch_lightning.demos.boring_classes import BoringModel from tests_pytorch.helpers.runif import RunIf @@ -26,7 +26,7 @@ @RunIf(min_cuda_gpus=1) def test_get_torch_gpu_stats(tmpdir): current_device = torch.device(f"cuda:{torch.cuda.current_device()}") - gpu_stats = GPUAccelerator().get_device_stats(current_device) + gpu_stats = CUDAAccelerator().get_device_stats(current_device) fields = ["allocated_bytes.all.freed", "inactive_split.all.peak", "reserved_bytes.large_pool.peak"] for f in fields: @@ -62,7 +62,7 @@ def test_set_cuda_device(set_device_mock, tmpdir): @RunIf(min_cuda_gpus=1) def test_gpu_availability(): - assert GPUAccelerator.is_available() + assert CUDAAccelerator.is_available() @RunIf(min_cuda_gpus=1) diff --git a/tests/tests_pytorch/callbacks/test_quantization.py b/tests/tests_pytorch/callbacks/test_quantization.py index 1a12728c7face..41d0810a0aec8 100644 --- a/tests/tests_pytorch/callbacks/test_quantization.py +++ b/tests/tests_pytorch/callbacks/test_quantization.py @@ -20,7 +20,7 @@ from torchmetrics.functional import mean_absolute_percentage_error as mape from pytorch_lightning import seed_everything, Trainer -from pytorch_lightning.accelerators import GPUAccelerator +from pytorch_lightning.accelerators import CUDAAccelerator from 
pytorch_lightning.callbacks import QuantizationAwareTraining from pytorch_lightning.demos.boring_classes import RandomDataset from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -38,9 +38,9 @@ @RunIf(quantization=True, max_torch="1.11") def test_quantization(tmpdir, observe: str, fuse: bool, convert: bool): """Parity test for quant model.""" - cuda_available = GPUAccelerator.is_available() + cuda_available = CUDAAccelerator.is_available() - if observe == "average" and not fuse and GPUAccelerator.is_available(): + if observe == "average" and not fuse and CUDAAccelerator.is_available(): pytest.xfail("TODO: flakiness in GPU CI") seed_everything(42) diff --git a/tests/tests_pytorch/models/test_gpu.py b/tests/tests_pytorch/models/test_gpu.py index ffd093e6ee0e3..bdc61ca399e12 100644 --- a/tests/tests_pytorch/models/test_gpu.py +++ b/tests/tests_pytorch/models/test_gpu.py @@ -23,7 +23,7 @@ import tests_pytorch.helpers.pipelines as tpipes import tests_pytorch.helpers.utils as tutils from pytorch_lightning import Trainer -from pytorch_lightning.accelerators import CPUAccelerator, GPUAccelerator +from pytorch_lightning.accelerators import CPUAccelerator, CUDAAccelerator from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.plugins.environments import TorchElasticEnvironment from pytorch_lightning.utilities import device_parser @@ -196,7 +196,7 @@ def test_torchelastic_gpu_parsing(mocked_device_count, mocked_is_available, gpus assert isinstance(trainer._accelerator_connector.cluster_environment, TorchElasticEnvironment) # when use gpu if device_parser.parse_gpu_ids(gpus, include_cuda=True) is not None: - assert isinstance(trainer.accelerator, GPUAccelerator) + assert isinstance(trainer.accelerator, CUDAAccelerator) assert trainer.num_devices == len(gpus) if isinstance(gpus, list) else gpus assert trainer.device_ids == device_parser.parse_gpu_ids(gpus, include_cuda=True) # fall back to cpu diff --git a/tests/tests_pytorch/plugins/test_cluster_integration.py b/tests/tests_pytorch/plugins/test_cluster_integration.py index a5c4f7e101761..c413b0015db61 100644 --- a/tests/tests_pytorch/plugins/test_cluster_integration.py +++ b/tests/tests_pytorch/plugins/test_cluster_integration.py @@ -59,7 +59,7 @@ def environment_combinations(): "strategy_cls", [DDPStrategy, DDPShardedStrategy, pytest.param(DeepSpeedStrategy, marks=RunIf(deepspeed=True))], ) -@mock.patch("pytorch_lightning.accelerators.gpu.GPUAccelerator.is_available", return_value=True) +@mock.patch("pytorch_lightning.accelerators.cuda.CUDAAccelerator.is_available", return_value=True) def test_ranks_available_manual_strategy_selection(mock_gpu_acc_available, strategy_cls): """Test that the rank information is readily available after Trainer initialization.""" num_nodes = 2 diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py index 58fa28559b97f..003fe2250b575 100644 --- a/tests/tests_pytorch/strategies/test_ddp.py +++ b/tests/tests_pytorch/strategies/test_ddp.py @@ -103,7 +103,7 @@ def test_torch_distributed_backend_env_variables(tmpdir): @mock.patch("torch.cuda.set_device") @mock.patch("torch.cuda.is_available", return_value=True) @mock.patch("torch.cuda.device_count", return_value=1) -@mock.patch("pytorch_lightning.accelerators.gpu.GPUAccelerator.is_available", return_value=True) +@mock.patch("pytorch_lightning.accelerators.cuda.CUDAAccelerator.is_available", return_value=True) @mock.patch.dict(os.environ, {"PL_TORCH_DISTRIBUTED_BACKEND": 
"gloo"}, clear=True) def test_ddp_torch_dist_is_available_in_setup( mock_gpu_is_available, mock_device_count, mock_cuda_available, mock_set_device, tmpdir diff --git a/tests/tests_pytorch/trainer/test_trainer.py b/tests/tests_pytorch/trainer/test_trainer.py index a0d20fc58ed1c..c46c0168db558 100644 --- a/tests/tests_pytorch/trainer/test_trainer.py +++ b/tests/tests_pytorch/trainer/test_trainer.py @@ -34,7 +34,7 @@ import pytorch_lightning import tests_pytorch.helpers.utils as tutils from pytorch_lightning import Callback, LightningDataModule, LightningModule, Trainer -from pytorch_lightning.accelerators import CPUAccelerator, GPUAccelerator +from pytorch_lightning.accelerators import CPUAccelerator, CUDAAccelerator from pytorch_lightning.callbacks import EarlyStopping, GradientAccumulationScheduler, ModelCheckpoint, Timer from pytorch_lightning.callbacks.fault_tolerance import _FaultToleranceCheckpoint from pytorch_lightning.callbacks.prediction_writer import BasePredictionWriter @@ -1967,21 +1967,21 @@ def training_step(self, batch, batch_idx): {"strategy": None, "accelerator": "gpu", "devices": 1}, SingleDeviceStrategy, "single_device", - GPUAccelerator, + CUDAAccelerator, 1, ), - ({"strategy": "dp", "accelerator": "gpu", "devices": 1}, DataParallelStrategy, "dp", GPUAccelerator, 1), - ({"strategy": "ddp", "accelerator": "gpu", "devices": 1}, DDPStrategy, "ddp", GPUAccelerator, 1), + ({"strategy": "dp", "accelerator": "gpu", "devices": 1}, DataParallelStrategy, "dp", CUDAAccelerator, 1), + ({"strategy": "ddp", "accelerator": "gpu", "devices": 1}, DDPStrategy, "ddp", CUDAAccelerator, 1), ( {"strategy": "ddp_spawn", "accelerator": "gpu", "devices": 1}, DDPSpawnStrategy, "ddp_spawn", - GPUAccelerator, + CUDAAccelerator, 1, ), - ({"strategy": None, "accelerator": "gpu", "devices": 2}, DDPSpawnStrategy, "ddp_spawn", GPUAccelerator, 2), - ({"strategy": "dp", "accelerator": "gpu", "devices": 2}, DataParallelStrategy, "dp", GPUAccelerator, 2), - ({"strategy": "ddp", "accelerator": "gpu", "devices": 2}, DDPStrategy, "ddp", GPUAccelerator, 2), + ({"strategy": None, "accelerator": "gpu", "devices": 2}, DDPSpawnStrategy, "ddp_spawn", CUDAAccelerator, 2), + ({"strategy": "dp", "accelerator": "gpu", "devices": 2}, DataParallelStrategy, "dp", CUDAAccelerator, 2), + ({"strategy": "ddp", "accelerator": "gpu", "devices": 2}, DDPStrategy, "ddp", CUDAAccelerator, 2), ({"strategy": "ddp", "accelerator": "cpu", "devices": 2}, DDPStrategy, "ddp", CPUAccelerator, 2), ( {"strategy": "ddp_spawn", "accelerator": "cpu", "devices": 2}, @@ -2001,7 +2001,7 @@ def training_step(self, batch, batch_idx): {"strategy": "ddp_fully_sharded", "accelerator": "gpu", "devices": 1}, DDPFullyShardedStrategy, "ddp_fully_sharded", - GPUAccelerator, + CUDAAccelerator, 1, ), ( @@ -2015,65 +2015,65 @@ def training_step(self, batch, batch_idx): {"strategy": DDPSpawnStrategy(), "accelerator": "gpu", "devices": 2}, DDPSpawnStrategy, "ddp_spawn", - GPUAccelerator, + CUDAAccelerator, 2, ), ({"strategy": DDPStrategy()}, DDPStrategy, "ddp", CPUAccelerator, 1), - ({"strategy": DDPStrategy(), "accelerator": "gpu", "devices": 2}, DDPStrategy, "ddp", GPUAccelerator, 2), + ({"strategy": DDPStrategy(), "accelerator": "gpu", "devices": 2}, DDPStrategy, "ddp", CUDAAccelerator, 2), ( {"strategy": DataParallelStrategy(), "accelerator": "gpu", "devices": 2}, DataParallelStrategy, "dp", - GPUAccelerator, + CUDAAccelerator, 2, ), ( {"strategy": DDPFullyShardedStrategy(), "accelerator": "gpu", "devices": 2}, DDPFullyShardedStrategy, "ddp_fully_sharded", - 
GPUAccelerator, + CUDAAccelerator, 2, ), ( {"strategy": DDPSpawnShardedStrategy(), "accelerator": "gpu", "devices": 2}, DDPSpawnShardedStrategy, "ddp_sharded_spawn", - GPUAccelerator, + CUDAAccelerator, 2, ), ( {"strategy": DDPShardedStrategy(), "accelerator": "gpu", "devices": 2}, DDPShardedStrategy, "ddp_sharded", - GPUAccelerator, + CUDAAccelerator, 2, ), ( {"strategy": "ddp_spawn", "accelerator": "gpu", "devices": 2, "num_nodes": 2}, DDPSpawnStrategy, "ddp_spawn", - GPUAccelerator, + CUDAAccelerator, 2, ), ( {"strategy": "ddp_fully_sharded", "accelerator": "gpu", "devices": 1, "num_nodes": 2}, DDPFullyShardedStrategy, "ddp_fully_sharded", - GPUAccelerator, + CUDAAccelerator, 1, ), ( {"strategy": "ddp_sharded", "accelerator": "gpu", "devices": 2, "num_nodes": 2}, DDPShardedStrategy, "ddp_sharded", - GPUAccelerator, + CUDAAccelerator, 2, ), ( {"strategy": "ddp_sharded_spawn", "accelerator": "gpu", "devices": 2, "num_nodes": 2}, DDPSpawnShardedStrategy, "ddp_sharded_spawn", - GPUAccelerator, + CUDAAccelerator, 2, ), ], From d630a2c8614cbe08385c1faf1890fb8344593587 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Tue, 19 Jul 2022 15:49:51 +0530 Subject: [PATCH 02/35] Add back GPUAccelerator and deprecate it --- .../accelerators/__init__.py | 1 + src/pytorch_lightning/accelerators/gpu.py | 31 +++++++++++++++++++ .../deprecated_api/test_remove_1-9.py | 11 +++++++ 3 files changed, 43 insertions(+) create mode 100644 src/pytorch_lightning/accelerators/gpu.py diff --git a/src/pytorch_lightning/accelerators/__init__.py b/src/pytorch_lightning/accelerators/__init__.py index b4521d931c734..1bba4a42879bc 100644 --- a/src/pytorch_lightning/accelerators/__init__.py +++ b/src/pytorch_lightning/accelerators/__init__.py @@ -13,6 +13,7 @@ from pytorch_lightning.accelerators.accelerator import Accelerator # noqa: F401 from pytorch_lightning.accelerators.cpu import CPUAccelerator # noqa: F401 from pytorch_lightning.accelerators.cuda import CUDAAccelerator # noqa: F401 +from pytorch_lightning.accelerators.gpu import GPUAccelerator # noqa: F401 from pytorch_lightning.accelerators.hpu import HPUAccelerator # noqa: F401 from pytorch_lightning.accelerators.ipu import IPUAccelerator # noqa: F401 from pytorch_lightning.accelerators.mps import MPSAccelerator # noqa: F401 diff --git a/src/pytorch_lightning/accelerators/gpu.py b/src/pytorch_lightning/accelerators/gpu.py new file mode 100644 index 0000000000000..a7d054b946393 --- /dev/null +++ b/src/pytorch_lightning/accelerators/gpu.py @@ -0,0 +1,31 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from pytorch_lightning.accelerators.cuda import CUDAAccelerator +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation + + +class GPUAccelerator(CUDAAccelerator): + """Accelerator for NVIDIA GPU devices. + + .. deprecated:: 1.9 + + Please use the ``CUDAAccelerator`` instead. 
+ """ + + def __init__(self) -> None: + rank_zero_deprecation( + "The `GPUAccelerator` has been renamed to `CUDAAccelerator` and will be removed in v1.9." + " Please use the `CUDAAccelerator` instead!" + ) + super().__init__() diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-9.py b/tests/tests_pytorch/deprecated_api/test_remove_1-9.py index 66bbf80d4e3ea..9c7d02d499ab4 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-9.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-9.py @@ -18,6 +18,7 @@ import pytorch_lightning.loggers.base as logger_base from pytorch_lightning import Trainer +from pytorch_lightning.accelerators.gpu import GPUAccelerator from pytorch_lightning.core.module import LightningModule from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.profiler.advanced import AdvancedProfiler @@ -195,3 +196,13 @@ def test_pytorch_profiler_schedule_wrapper_deprecation_warning(): def test_pytorch_profiler_register_record_function_deprecation_warning(): with pytest.deprecated_call(match="RegisterRecordFunction` is deprecated in v1.7 and will be removed in in v1.9."): _ = RegisterRecordFunction(None) + + +def test_gpu_accelerator_deprecation_warning(): + with pytest.deprecated_call( + match=( + "The `GPUAccelerator` has been renamed to `CUDAAccelerator` and will be removed in v1.9." + + " Please use the `CUDAAccelerator` instead!" + ) + ): + GPUAccelerator() From 94b68ec0bb88555a918559a1400f8c641f36f92f Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Tue, 19 Jul 2022 14:53:54 +0200 Subject: [PATCH 03/35] Remove temporary registration --- src/pytorch_lightning/accelerators/cuda.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/pytorch_lightning/accelerators/cuda.py b/src/pytorch_lightning/accelerators/cuda.py index 89d1a5b284b0c..2764b59321df1 100644 --- a/src/pytorch_lightning/accelerators/cuda.py +++ b/src/pytorch_lightning/accelerators/cuda.py @@ -97,12 +97,6 @@ def register_accelerators(cls, accelerator_registry: Dict) -> None: cls, description=f"{cls.__class__.__name__}", ) - # temporarily enable "gpu" to point to the CUDA Accelerator - accelerator_registry.register( - "gpu", - cls, - description=f"{cls.__class__.__name__}", - ) def teardown(self) -> None: # clean up memory From c1457554569440e70b66322054e6ac9ac5d6db4e Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Wed, 20 Jul 2022 14:41:11 +0200 Subject: [PATCH 04/35] accelerator connector reroute --- .../connectors/accelerator_connector.py | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index ece0e5d27bdce..7c54c0554744b 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -199,10 +199,15 @@ def __init__( devices=devices, num_nodes=num_nodes, num_processes=num_processes, gpus=gpus, ipus=ipus, tpu_cores=tpu_cores ) # 2. 
Instantiate Accelerator - # handle `auto` and `None` self._set_accelerator_if_ipu_strategy_is_passed() + + # handle "gpu" + if self._accelerator_flag == "gpu": + self._accelerator_flag = self._choose_gpu_accelerator_backend() + + # handle `auto` and `None` if self._accelerator_flag == "auto" or self._accelerator_flag is None: - self._accelerator_flag = self._choose_accelerator() + self._accelerator_flag = self._choose_auto_accelerator() self._set_parallel_devices_and_init_accelerator() # 3. Instantiate ClusterEnvironment @@ -485,7 +490,7 @@ def _set_accelerator_if_ipu_strategy_is_passed(self) -> None: if isinstance(self._strategy_flag, IPUStrategy): self._accelerator_flag = "ipu" - def _choose_accelerator(self) -> str: + def _choose_auto_accelerator(self) -> str: """Choose the accelerator type (str) based on availability when ``accelerator='auto'``.""" if self._accelerator_flag == "auto": if _TPU_AVAILABLE: @@ -496,10 +501,20 @@ def _choose_accelerator(self) -> str: return "hpu" if MPSAccelerator.is_available(): return "mps" - if torch.cuda.is_available() and torch.cuda.device_count() > 0: + if CUDAAccelerator.is_available(): return "cuda" return "cpu" + @staticmethod + def _choose_gpu_accelerator_backend() -> str: + if CUDAAccelerator.is_available(): + return "cuda" + + if MPSAccelerator.is_available(): + return "mps" + + raise RuntimeError('No supported gpu backend found!') + def _set_parallel_devices_and_init_accelerator(self) -> None: if isinstance(self._accelerator_flag, Accelerator): self.accelerator: Accelerator = self._accelerator_flag From 953d5511cb157cdf19ae78a080170cda18e42262 Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Wed, 20 Jul 2022 14:41:21 +0200 Subject: [PATCH 05/35] accelerator_connector tests --- .../accelerators/test_accelerator_connector.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/tests_pytorch/accelerators/test_accelerator_connector.py b/tests/tests_pytorch/accelerators/test_accelerator_connector.py index 33911bffb0eb7..9f3c4273e52fd 100644 --- a/tests/tests_pytorch/accelerators/test_accelerator_connector.py +++ b/tests/tests_pytorch/accelerators/test_accelerator_connector.py @@ -737,8 +737,18 @@ def test_plugin_only_one_instance_for_one_type(plugins, expected): Trainer(plugins=plugins) -@pytest.mark.parametrize("accelerator", ("cpu", "gpu", "tpu", "ipu")) +@pytest.mark.parametrize("accelerator", ("cpu", "cuda", "gpu", "tpu", "ipu")) @pytest.mark.parametrize("devices", ("0", 0, [])) def test_passing_zero_and_empty_list_to_devices_flag(accelerator, devices): with pytest.raises(MisconfigurationException, match="value is not a valid input using"): Trainer(accelerator=accelerator, devices=devices) + + +@pytest.marks.parametrize("expected_accelerator_flag,expected_accelerator_class", +[pytest.param(("cuda", CUDAAccelerator), marks=RunIf(min_cuda_gpus=1)), +pytest.param(("mps",MPSAccelerator), marks=RunIf(mps=True)),]) +def test_gpu_accelerator_backend_choice(expected_accelerator_flag, expected_accelerator_class): + + trainer = Trainer(accelerator='gpu') + assert trainer._accelerator_connector._accelerator_flag == expected_accelerator_flag + assert isinstance(trainer.accelerator, expected_accelerator_class) \ No newline at end of file From 7d443cf9c27daadf6fc8a49db0c85f7f8cb18751 Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Wed, 20 Jul 2022 14:41:29 +0200 Subject: [PATCH 06/35] update enums --- 
src/pytorch_lightning/utilities/enums.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pytorch_lightning/utilities/enums.py b/src/pytorch_lightning/utilities/enums.py index b7f714d230971..41e6e394984d4 100644 --- a/src/pytorch_lightning/utilities/enums.py +++ b/src/pytorch_lightning/utilities/enums.py @@ -242,7 +242,7 @@ class _AcceleratorType(LightningEnum): >>> _AcceleratorType.CPU == _AcceleratorType.from_str('cpu') True >>> # you can match the type with string - >>> _AcceleratorType.GPU == 'GPU' + >>> _AcceleratorType.CUDA == 'CUDA' True >>> # which is case invariant >>> _AcceleratorType.TPU in ('tpu', 'CPU') @@ -250,7 +250,7 @@ class _AcceleratorType(LightningEnum): """ CPU = "CPU" - GPU = "GPU" + CUDA = "CUDA" IPU = "IPU" TPU = "TPU" HPU = "HPU" From 729a8bce5a2d53e37663b1fedf18eb01c484ba6f Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Wed, 20 Jul 2022 14:41:39 +0200 Subject: [PATCH 07/35] lite support + tests --- src/pytorch_lightning/lite/lite.py | 6 +++--- tests/tests_pytorch/lite/test_lite.py | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/pytorch_lightning/lite/lite.py b/src/pytorch_lightning/lite/lite.py index f5cebcd89a63d..01db7263af33d 100644 --- a/src/pytorch_lightning/lite/lite.py +++ b/src/pytorch_lightning/lite/lite.py @@ -54,7 +54,7 @@ class LightningLite(ABC): - Multi-node support. Args: - accelerator: The hardware to run on. Possible choices are: ``"cpu"``, ``"gpu"``, ``"tpu"``, ``"auto"``. + accelerator: The hardware to run on. Possible choices are: ``"cpu"``, ``"cuda"``, ``"mps"``, ``"gpu"``, ``"tpu"``, ``"auto"``. strategy: Strategy for how to run across multiple devices. Possible choices are: ``"dp"``, ``"ddp"``, ``"ddp_spawn"``, ``"deepspeed"``, ``"ddp_sharded"``. devices: Number of devices to train on (``int``), which GPUs to train on (``list`` or ``str``), or ``"auto"``. 
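Illustrative sketch, not part of the patch: with the Lite changes in this patch, the same expanded accelerator strings are accepted through `LightningLite`. Assuming a machine where the requested backend is available, the intended usage looks roughly like:

    from pytorch_lightning.lite import LightningLite

    class Lite(LightningLite):
        def run(self):
            # `self.device` reflects the resolved backend, e.g. cuda:0 or mps:0.
            print(self.device)

    Lite(accelerator="cuda", devices=1).run()  # explicit CUDA
    Lite(accelerator="gpu", devices=1).run()   # rerouted to "cuda" or "mps"

The `test_to_device` parametrization added below checks exactly this mapping: "gpu" resolves to "cuda:0" on CUDA machines and to "mps:0" on Apple silicon.
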
@@ -436,7 +436,7 @@ def _get_distributed_sampler(dataloader: DataLoader, **kwargs: Any) -> Distribut return DistributedSamplerWrapper(dataloader.sampler, **kwargs) def _check_accelerator_support(self, accelerator: Optional[Union[str, Accelerator]]) -> None: - supported = [t.value.lower() for t in self._supported_device_types()] + ["auto"] + supported = [t.value.lower() for t in self._supported_device_types()] + ["gpu", "auto"] valid = accelerator is None or isinstance(accelerator, Accelerator) or accelerator in supported if not valid: raise MisconfigurationException( @@ -457,7 +457,7 @@ def _check_strategy_support(self, strategy: Optional[Union[str, Strategy]]) -> N def _supported_device_types() -> Sequence[_AcceleratorType]: return ( _AcceleratorType.CPU, - _AcceleratorType.GPU, + _AcceleratorType.CUDA, _AcceleratorType.TPU, _AcceleratorType.MPS, ) diff --git a/tests/tests_pytorch/lite/test_lite.py b/tests/tests_pytorch/lite/test_lite.py index 7166be0981846..4560cea6747cc 100644 --- a/tests/tests_pytorch/lite/test_lite.py +++ b/tests/tests_pytorch/lite/test_lite.py @@ -313,9 +313,11 @@ def test_setup_dataloaders_replace_standard_sampler(shuffle, strategy): "accelerator, expected", [ ("cpu", "cpu"), + pytest.param("cuda", "cuda:0", marks=RunIf(min_cuda_gpus=1)), pytest.param("gpu", "cuda:0", marks=RunIf(min_cuda_gpus=1)), pytest.param("tpu", "xla:0", marks=RunIf(tpu=True)), pytest.param("mps", "mps:0", marks=RunIf(mps=True)), + pytest.param("gpu", "mps:0", marks=RunIf(mps=True)) ], ) def test_to_device(accelerator, expected): From a170ae54d1eb9185c23d26b0658f5f9084d1f76d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 20 Jul 2022 12:42:48 +0000 Subject: [PATCH 08/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../trainer/connectors/accelerator_connector.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index 7c54c0554744b..60ac2b0279733 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -204,7 +204,7 @@ def __init__( # handle "gpu" if self._accelerator_flag == "gpu": self._accelerator_flag = self._choose_gpu_accelerator_backend() - + # handle `auto` and `None` if self._accelerator_flag == "auto" or self._accelerator_flag is None: self._accelerator_flag = self._choose_auto_accelerator() @@ -509,11 +509,11 @@ def _choose_auto_accelerator(self) -> str: def _choose_gpu_accelerator_backend() -> str: if CUDAAccelerator.is_available(): return "cuda" - + if MPSAccelerator.is_available(): return "mps" - raise RuntimeError('No supported gpu backend found!') + raise RuntimeError("No supported gpu backend found!") def _set_parallel_devices_and_init_accelerator(self) -> None: if isinstance(self._accelerator_flag, Accelerator): From 7ddc024e254c7a4f3018c78e52feead05d54e0db Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Wed, 20 Jul 2022 14:53:32 +0200 Subject: [PATCH 09/35] typo --- .../accelerators/test_accelerator_connector.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/tests_pytorch/accelerators/test_accelerator_connector.py b/tests/tests_pytorch/accelerators/test_accelerator_connector.py index 9f3c4273e52fd..e7a067237ab78 100644 
--- a/tests/tests_pytorch/accelerators/test_accelerator_connector.py +++ b/tests/tests_pytorch/accelerators/test_accelerator_connector.py @@ -744,9 +744,9 @@ def test_passing_zero_and_empty_list_to_devices_flag(accelerator, devices): Trainer(accelerator=accelerator, devices=devices) -@pytest.marks.parametrize("expected_accelerator_flag,expected_accelerator_class", -[pytest.param(("cuda", CUDAAccelerator), marks=RunIf(min_cuda_gpus=1)), -pytest.param(("mps",MPSAccelerator), marks=RunIf(mps=True)),]) +@pytest.mark.parametrize("expected_accelerator_flag,expected_accelerator_class", +[pytest.param("cuda", CUDAAccelerator, marks=RunIf(min_cuda_gpus=1)), +pytest.param("mps",MPSAccelerator, marks=RunIf(mps=True)),]) def test_gpu_accelerator_backend_choice(expected_accelerator_flag, expected_accelerator_class): trainer = Trainer(accelerator='gpu') From 2575c0101eefb2566047f13c0b5401833c331779 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 20 Jul 2022 12:55:07 +0000 Subject: [PATCH 10/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../accelerators/test_accelerator_connector.py | 16 ++++++++++------ tests/tests_pytorch/lite/test_lite.py | 2 +- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/tests_pytorch/accelerators/test_accelerator_connector.py b/tests/tests_pytorch/accelerators/test_accelerator_connector.py index e7a067237ab78..9b94cb7e4b1e1 100644 --- a/tests/tests_pytorch/accelerators/test_accelerator_connector.py +++ b/tests/tests_pytorch/accelerators/test_accelerator_connector.py @@ -744,11 +744,15 @@ def test_passing_zero_and_empty_list_to_devices_flag(accelerator, devices): Trainer(accelerator=accelerator, devices=devices) -@pytest.mark.parametrize("expected_accelerator_flag,expected_accelerator_class", -[pytest.param("cuda", CUDAAccelerator, marks=RunIf(min_cuda_gpus=1)), -pytest.param("mps",MPSAccelerator, marks=RunIf(mps=True)),]) +@pytest.mark.parametrize( + "expected_accelerator_flag,expected_accelerator_class", + [ + pytest.param("cuda", CUDAAccelerator, marks=RunIf(min_cuda_gpus=1)), + pytest.param("mps", MPSAccelerator, marks=RunIf(mps=True)), + ], +) def test_gpu_accelerator_backend_choice(expected_accelerator_flag, expected_accelerator_class): - - trainer = Trainer(accelerator='gpu') + + trainer = Trainer(accelerator="gpu") assert trainer._accelerator_connector._accelerator_flag == expected_accelerator_flag - assert isinstance(trainer.accelerator, expected_accelerator_class) \ No newline at end of file + assert isinstance(trainer.accelerator, expected_accelerator_class) diff --git a/tests/tests_pytorch/lite/test_lite.py b/tests/tests_pytorch/lite/test_lite.py index 4560cea6747cc..041138fc8bc24 100644 --- a/tests/tests_pytorch/lite/test_lite.py +++ b/tests/tests_pytorch/lite/test_lite.py @@ -317,7 +317,7 @@ def test_setup_dataloaders_replace_standard_sampler(shuffle, strategy): pytest.param("gpu", "cuda:0", marks=RunIf(min_cuda_gpus=1)), pytest.param("tpu", "xla:0", marks=RunIf(tpu=True)), pytest.param("mps", "mps:0", marks=RunIf(mps=True)), - pytest.param("gpu", "mps:0", marks=RunIf(mps=True)) + pytest.param("gpu", "mps:0", marks=RunIf(mps=True)), ], ) def test_to_device(accelerator, expected): From 708b4b47c733a66fbd5d77abf4a7e7945e57d63f Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Thu, 21 Jul 2022 11:07:03 +0200 Subject: [PATCH 11/35] move "gpu" support up before actual 
accelerator flag checks --- .../trainer/connectors/accelerator_connector.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index 60ac2b0279733..4d2075961043f 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -186,6 +186,10 @@ def __init__( self._amp_level_flag: Optional[str] = amp_level self._auto_select_gpus: bool = auto_select_gpus + # handle "gpu" + if self._accelerator_flag == "gpu": + self._accelerator_flag = self._choose_gpu_accelerator_backend() + self._check_config_and_set_final_flags( strategy=strategy, accelerator=accelerator, @@ -201,10 +205,6 @@ def __init__( # 2. Instantiate Accelerator self._set_accelerator_if_ipu_strategy_is_passed() - # handle "gpu" - if self._accelerator_flag == "gpu": - self._accelerator_flag = self._choose_gpu_accelerator_backend() - # handle `auto` and `None` if self._accelerator_flag == "auto" or self._accelerator_flag is None: self._accelerator_flag = self._choose_auto_accelerator() From 315fd052955a101d27ea4bd90994bf495ef746eb Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Thu, 21 Jul 2022 15:55:00 +0200 Subject: [PATCH 12/35] Stupid arguments --- .../trainer/connectors/accelerator_connector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index 4d2075961043f..23fe011f3dead 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -187,8 +187,8 @@ def __init__( self._auto_select_gpus: bool = auto_select_gpus # handle "gpu" - if self._accelerator_flag == "gpu": - self._accelerator_flag = self._choose_gpu_accelerator_backend() + if accelerator == "gpu": + accelerator = self._choose_gpu_accelerator_backend() self._check_config_and_set_final_flags( strategy=strategy, From d7365ff4d48ff1fe3b6f08fd779717bba2bbeecc Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Thu, 21 Jul 2022 17:14:57 +0200 Subject: [PATCH 13/35] fix tests --- .../test_accelerator_connector.py | 23 ++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/tests/tests_pytorch/accelerators/test_accelerator_connector.py b/tests/tests_pytorch/accelerators/test_accelerator_connector.py index 9b94cb7e4b1e1..401edf3782feb 100644 --- a/tests/tests_pytorch/accelerators/test_accelerator_connector.py +++ b/tests/tests_pytorch/accelerators/test_accelerator_connector.py @@ -268,7 +268,7 @@ def test_accelerator_cpu(_): MisconfigurationException, match="CUDAAccelerator can not run on your system since the accelerator is not available.", ): - Trainer(accelerator="gpu") + Trainer(accelerator="cuda") with pytest.deprecated_call(match=r"is deprecated in v1.7 and will be removed"): Trainer(accelerator="cpu", gpus=1) @@ -662,7 +662,7 @@ def test_devices_auto_choice_mps(): @pytest.mark.parametrize( ["parallel_devices", "accelerator"], - [([torch.device("cpu")], "gpu"), ([torch.device("cuda", i) for i in range(8)], ("tpu"))], + [([torch.device("cpu")], "cuda"), ([torch.device("cuda", i) for i in range(8)], ("tpu"))], ) def 
test_parallel_devices_in_strategy_confilict_with_accelerator(parallel_devices, accelerator): with pytest.raises(MisconfigurationException, match=r"parallel_devices set through"): @@ -737,7 +737,7 @@ def test_plugin_only_one_instance_for_one_type(plugins, expected): Trainer(plugins=plugins) -@pytest.mark.parametrize("accelerator", ("cpu", "cuda", "gpu", "tpu", "ipu")) +@pytest.mark.parametrize("accelerator", ("cpu", "cuda", "mps", "tpu", "ipu")) @pytest.mark.parametrize("devices", ("0", 0, [])) def test_passing_zero_and_empty_list_to_devices_flag(accelerator, devices): with pytest.raises(MisconfigurationException, match="value is not a valid input using"): @@ -756,3 +756,20 @@ def test_gpu_accelerator_backend_choice(expected_accelerator_flag, expected_acce trainer = Trainer(accelerator="gpu") assert trainer._accelerator_connector._accelerator_flag == expected_accelerator_flag assert isinstance(trainer.accelerator, expected_accelerator_class) + + +@mock.patch("torch.cuda.device_count", return_value=1) +def test_gpu_accelerator_backend_choice_cuda(_): + trainer = Trainer(accelerator="gpu") + + assert trainer._accelerator_connector._accelerator_flag == "cuda" + assert isinstance(trainer.accelerator, CUDAAccelerator) + + +@mock.patch("pytorch_lightning.accelerators.mps._MPS_AVAILABLE", return_value=True) +@mock.patch("torch.device", return_value="mps") # necessary because torch doesn't allow creation of mps devices +def test_gpu_accelerator_backend_choice_mps(*_): + trainer = Trainer(accelerator="gpu") + + assert trainer._accelerator_connector._accelerator_flag == "mps" + assert isinstance(trainer.accelerator, MPSAccelerator) From 50bcbdea5dc9b673136604636317935a0e574f65 Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Thu, 21 Jul 2022 17:15:13 +0200 Subject: [PATCH 14/35] change exception type --- .../trainer/connectors/accelerator_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index 23fe011f3dead..59ffcbb21a61c 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -513,7 +513,7 @@ def _choose_gpu_accelerator_backend() -> str: if MPSAccelerator.is_available(): return "mps" - raise RuntimeError("No supported gpu backend found!") + raise MisconfigurationException("No supported gpu backend found!") def _set_parallel_devices_and_init_accelerator(self) -> None: if isinstance(self._accelerator_flag, Accelerator): From b0f18f26c5d93522310621df80a0887783f95f45 Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Thu, 21 Jul 2022 17:15:23 +0200 Subject: [PATCH 15/35] fix registry test --- tests/tests_pytorch/accelerators/test_accelerator_registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tests_pytorch/accelerators/test_accelerator_registry.py b/tests/tests_pytorch/accelerators/test_accelerator_registry.py index 791d4c33dbbe8..004723c19eeb6 100644 --- a/tests/tests_pytorch/accelerators/test_accelerator_registry.py +++ b/tests/tests_pytorch/accelerators/test_accelerator_registry.py @@ -63,4 +63,4 @@ def is_available(): def test_available_accelerators_in_registry(): - assert AcceleratorRegistry.available_accelerators() == ["cpu", "cuda", "gpu", "hpu", "ipu", "mps", "tpu"] + assert 
From 7fe75bf69a705d249a7ca7b91def640a0a815037 Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Thu, 21 Jul 2022 19:28:05 +0200 Subject: [PATCH 16/35] pre-commit --- src/pytorch_lightning/lite/lite.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pytorch_lightning/lite/lite.py b/src/pytorch_lightning/lite/lite.py index 01db7263af33d..39b5dacaa9077 100644 --- a/src/pytorch_lightning/lite/lite.py +++ b/src/pytorch_lightning/lite/lite.py @@ -54,7 +54,8 @@ class LightningLite(ABC): - Multi-node support. Args: - accelerator: The hardware to run on. Possible choices are: ``"cpu"``, ``"cuda"``, ``"mps"``, ``"gpu"``, ``"tpu"``, ``"auto"``. + accelerator: The hardware to run on. Possible choices are: + ``"cpu"``, ``"cuda"``, ``"mps"``, ``"gpu"``, ``"tpu"``, ``"auto"``. strategy: Strategy for how to run across multiple devices. Possible choices are: ``"dp"``, ``"ddp"``, ``"ddp_spawn"``, ``"deepspeed"``, ``"ddp_sharded"``. devices: Number of devices to train on (``int``), which GPUs to train on (``list`` or ``str``), or ``"auto"``. From c84cbe9b8b9a7a6c2bb2801904028b6680ce3515 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 20 Jul 2022 12:35:01 +0200 Subject: [PATCH 17/35] CI: debug HPU flow (#13419) * Update the hpu-tests.yml to pull docker from vault * fire & sudo * habana-gaudi-hpus * Check the driver status on gaudi server (#13718) Co-authored-by: arao Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> --- .azure/hpu-tests.yml | 27 ++++++++-- .github/workflows/ci_pr-gatekeeper.yml | 2 +- .github/workflows/cicd-pytorch_dockers.yml | 2 +- dockers/ci-runner-hpu/Dockerfile | 63 ++++++++++++++++++---- dockers/ci-runner-hpu/start.sh | 2 +- 5 files changed, 79 insertions(+), 17 deletions(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index e530ace901bfa..f0b279bda3f60 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -20,23 +20,44 @@ jobs: timeoutInMinutes: "10" # how much time to give 'run always even if cancelled tasks' before stopping them cancelTimeoutInMinutes: "2" - pool: intel-hpus + pool: habana-gaudi-hpus + container: + image: "vault.habana.ai/gaudi-docker/1.5.0/ubuntu20.04/habanalabs/pytorch-installer-1.11.0:latest" + options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host --shm-size=4g --name cd-container -v /usr/bin/docker:/tmp/docker:ro" workspace: clean: all steps: + - script: | + /tmp/docker exec -t -u 0 cd-container \ + sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -o Dpkg::Options::="--force-confold" -y install sudo" + displayName: 'Install Sudo in container (thanks Microsoft!)' + - bash: | - apt-get install -y hwinfo + sudo apt-get install -y hwinfo hwinfo --short + python --version + sudo pip install pip -U displayName: 'Instance HW info' - bash: | - pip install -e .[extra] -r requirements/pytorch/test.txt + set -e + pip --version + sudo pip uninstall -y lightning pytorch-lightning + pip install fire + python .actions/assistant.py requirements-prune-pkgs torch,torchvision,torchtext + pip install ".[extra,test]" + pip list env: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 displayName: 'Install dependencies' + - bash: | + hl-smi -L + lsmod | grep habanalabs + displayName: 'Check 
the driver status' + - bash: | python -m pytest -sv accelerators/test_hpu.py --forked --junitxml=hpu1_test-results.xml workingDirectory: tests/tests_pytorch diff --git a/.github/workflows/ci_pr-gatekeeper.yml b/.github/workflows/ci_pr-gatekeeper.yml index d76801fd529a5..92215edd3c107 100644 --- a/.github/workflows/ci_pr-gatekeeper.yml +++ b/.github/workflows/ci_pr-gatekeeper.yml @@ -26,7 +26,7 @@ jobs: run: | patterns = ('docs/source-${{ matrix.pkg }}', 'src/lightning_${{ matrix.pkg }}', 'tests/tests_${{ matrix.pkg }}') changed = any(p in "${{steps.changed-files.outputs.all_changed_and_modified_files}}" for p in patterns) - print('::set-output name=files::' + int(changed)) + print(f'::set-output name=files::{int(changed)}') shell: python - uses: octodemo/pr-gatekeeper@main if: steps.touched.outputs.files == 1 diff --git a/.github/workflows/cicd-pytorch_dockers.yml b/.github/workflows/cicd-pytorch_dockers.yml index 3d7bb6fc363e9..4742f3579c274 100644 --- a/.github/workflows/cicd-pytorch_dockers.yml +++ b/.github/workflows/cicd-pytorch_dockers.yml @@ -225,7 +225,7 @@ jobs: build-args: | DIST=latest GAUDI_VERSION=${{ matrix.gaudi_version }} - PYTORCH_VERSION=${{ matrix.pytorch_version }} + PYTORCH_INSTALLER_VERSION=${{ matrix.pytorch_version }} file: dockers/ci-runner-hpu/Dockerfile push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:hpu-ci-runner-gaudi${{ matrix.gaudi_version }} diff --git a/dockers/ci-runner-hpu/Dockerfile b/dockers/ci-runner-hpu/Dockerfile index c4e37a5e2b41b..588d23702e9ff 100644 --- a/dockers/ci-runner-hpu/Dockerfile +++ b/dockers/ci-runner-hpu/Dockerfile @@ -1,24 +1,65 @@ +# Run command to build: +# gaudi_ver=$(curl -s "https://vault.habana.ai/artifactory/gaudi-docker/" | sed -n 's/.*href="\([^"]*\).*/\1/p' | tail -2 | head -1 | sed "s/\///1") +# pytorch_install_ver=$(curl -s "https://vault.habana.ai/artifactory/gaudi-docker/$gaudi_ver/ubuntu20.04/habanalabs/" | sed -n 's/.*href="\([^"]*\).*/\1/p'| sed "s/\///1" | grep pytorch-installer) +# pytorch_install_ver=${pytorch_install_ver/"pytorch-installer-"/""} +# docker build -t gaudi-docker-agent:latest \ +# --build-arg GAUDI_VERSION=$gaudi_ver \ +# --build-arg PYTORCH_INSTALLER_VERSION=$pytorch_install_ver \ +# -f Dockerfile . 
+# Run command: +# docker run --privileged \ +# -v /dev:/dev \ +# -e AZP_URL="https://dev.azure.com/ORGANIZATION/" \ +# -e AZP_TOKEN="XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" \ +# -e AZP_AGENT_NAME="hpu1" \ +# -e AZP_POOL="intel-hpus" \ +# gaudi-docker-agent:latest + ARG DIST="latest" ARG GAUDI_VERSION="1.5.0" -ARG PYTORCH_VERSION="1.11.0" - -FROM vault.habana.ai/gaudi-docker/${GAUDI_VERSION}/ubuntu20.04/habanalabs/pytorch-installer-${PYTORCH_VERSION}:${DIST} +ARG PYTORCH_INSTALLER_VERSION="1.11.0" +FROM vault.habana.ai/gaudi-docker/${GAUDI_VERSION}/ubuntu20.04/habanalabs/pytorch-installer-${PYTORCH_INSTALLER_VERSION}:${DIST} LABEL maintainer="https://vault.habana.ai/" +# update the base packages and add a non-sudo user +RUN \ + apt-get update -y && \ + apt-get upgrade -y && \ + useradd -m docker -RUN echo "ALL ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers - -WORKDIR /azp - -COPY ./dockers/ci-runner-hpu/start.sh /usr/local/bin/ +# To make it easier for build and release pipelines to run apt-get, +# configure apt to not require confirmation (assume the -y argument by default) +ENV DEBIAN_FRONTEND=noninteractive +RUN echo "APT::Get::Assume-Yes \"true\";" > /etc/apt/apt.conf.d/90assumeyes -RUN chmod +x /usr/local/bin/start.sh +RUN apt-get update --fix-missing && \ + apt-get install -y --no-install-recommends \ + ca-certificates \ + build-essential \ + curl \ + jq \ + git \ + iputils-ping \ + libcurl4 \ + libunwind8 \ + netcat \ + libssl1.0 \ + libssl-dev \ + libffi-dev \ + python3 \ + python3-venv \ + python3-dev \ + python3-pip RUN curl -fsSL https://get.docker.com -o get-docker.sh && \ sh get-docker.sh && \ rm get-docker.sh -#RUN docker --help +RUN pip uninstall pytorch-lightning -y + +WORKDIR /azp + +COPY ./dockers/ci-runner-hpu/start.sh /usr/local/bin/ +RUN chmod +x /usr/local/bin/start.sh ENTRYPOINT ["/usr/local/bin/start.sh"] -CMD ["bash"] diff --git a/dockers/ci-runner-hpu/start.sh b/dockers/ci-runner-hpu/start.sh index caa452b978c18..82472a817ab94 100644 --- a/dockers/ci-runner-hpu/start.sh +++ b/dockers/ci-runner-hpu/start.sh @@ -93,4 +93,4 @@ trap 'cleanup; exit 143' TERM # To be aware of TERM and INT signals call run.sh # Running it with the --once flag at the end will shut down the agent after the build is executed -./run.sh --once & wait $! +./run.sh & wait $! From c1a13b23d44cbe3ab9342a4a04797c857a6f46c6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 20 Jul 2022 11:01:39 +0000 Subject: [PATCH 18/35] Update typing-extensions requirement from <4.2.1,>=4.0.0 to >=4.0.0,<4.3.1 in /requirements (#13529) Update typing-extensions requirement in /requirements Updates the requirements on [typing-extensions](https://github.com/python/typing_extensions) to permit the latest version. - [Release notes](https://github.com/python/typing_extensions/releases) - [Changelog](https://github.com/python/typing_extensions/blob/main/CHANGELOG.md) - [Commits](https://github.com/python/typing_extensions/compare/4.0.0...4.3.0) --- updated-dependencies: - dependency-name: typing-extensions dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements/pytorch/base.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index f46863e779d9a..a0c1786362390 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -7,4 +7,4 @@ tensorboard>=2.9.1, <2.10.0 torchmetrics>=0.7.0, <0.9.2 # needed for using fixed compare_version pyDeprecate>=0.3.1, <=0.3.2 packaging>=17.0, <=21.3 -typing-extensions>=4.0.0, <4.2.1 +typing-extensions>=4.0.0, <4.3.1 From 039d3dd75e0be4e01cfa667596b2130bf2f50dc5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 20 Jul 2022 11:03:08 +0000 Subject: [PATCH 19/35] [pre-commit.ci] pre-commit suggestions (#13540) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/psf/black: 22.3.0 → 22.6.0](https://github.com/psf/black/compare/22.3.0...22.6.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3a34bab490a05..4b8ec5239615c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -73,7 +73,7 @@ repos: name: Format imports - repo: https://github.com/psf/black - rev: 22.3.0 + rev: 22.6.0 hooks: - id: black name: Format code From bb1552182008161b788206dbc9f649669641c816 Mon Sep 17 00:00:00 2001 From: Sean Naren Date: Wed, 20 Jul 2022 12:32:35 +0100 Subject: [PATCH 20/35] [FIX] Native FSDP precision + tests (#12985) --- .../precision/fully_sharded_native_amp.py | 26 ++++- .../strategies/fully_sharded_native.py | 109 ++++++++++++++---- .../connectors/accelerator_connector.py | 6 +- .../test_accelerator_connector.py | 21 +--- .../test_ddp_fully_sharded_native.py | 79 +++++++------ 5 files changed, 155 insertions(+), 86 deletions(-) diff --git a/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py b/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py index 870e658bfc9c3..8c693f2975bbd 100644 --- a/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py +++ b/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py @@ -11,10 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Any +from typing import Any, Optional + +import torch from pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin +from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12 + +if _TORCH_GREATER_EQUAL_1_12: + from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision +else: + MixedPrecision = None class FullyShardedNativeMixedPrecisionPlugin(ShardedNativeMixedPrecisionPlugin): @@ -29,3 +38,18 @@ def clip_grad_by_norm(self, *_: Any, **__: Any) -> None: raise MisconfigurationException( f"`gradient_clip_algorithm='norm'` is currently not supported for `{self.__class__.__name__}`" ) + + @property + def mixed_precision_config(self) -> Optional[MixedPrecision]: + assert MixedPrecision is not None + if self.precision == PrecisionType.HALF: + dtype = torch.float16 + elif self.precision == PrecisionType.BFLOAT: + dtype = torch.bfloat16 + else: + raise MisconfigurationException(f"Was unable to infer precision type, received {self.precision!r}.") + return MixedPrecision( + param_dtype=dtype, + reduce_dtype=dtype, + buffer_dtype=dtype, + ) diff --git a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py index d8cd66be2e02f..7528d5b95903e 100644 --- a/src/pytorch_lightning/strategies/fully_sharded_native.py +++ b/src/pytorch_lightning/strategies/fully_sharded_native.py @@ -23,6 +23,8 @@ from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin +from pytorch_lightning.plugins.precision.fully_sharded_native_amp import FullyShardedNativeMixedPrecisionPlugin +from pytorch_lightning.strategies.launchers.subprocess_script import _SubprocessScriptLauncher from pytorch_lightning.strategies.parallel import ParallelStrategy from pytorch_lightning.strategies.strategy import TBroadcast from pytorch_lightning.trainer.states import TrainerFn @@ -35,18 +37,23 @@ from pytorch_lightning.utilities.distributed import group as _group from pytorch_lightning.utilities.distributed import init_dist_connection, ReduceOp, sync_ddp_if_available from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_11 +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12 from pytorch_lightning.utilities.optimizer import optimizers_to_device +from pytorch_lightning.utilities.rank_zero import rank_zero_info from pytorch_lightning.utilities.seed import reset_seed -if _TORCH_GREATER_EQUAL_1_11: +if _TORCH_GREATER_EQUAL_1_12: from torch.distributed.fsdp.fully_sharded_data_parallel import ( BackwardPrefetch, CPUOffload, FullyShardedDataParallel, + MixedPrecision, ) from torch.distributed.fsdp.wrap import enable_wrap - +else: + MixedPrecision = None + BackwardPrefetch = None # type: ignore[misc,assignment] + CPUOffload = None # type: ignore[misc,assignment] log = logging.getLogger(__name__) @@ -56,7 +63,7 @@ class DDPFullyShardedNativeStrategy(ParallelStrategy): strategy_name = "fsdp_native" _registered_strategies: List[str] = [] - def __init__( # type: ignore[no-untyped-def] + def __init__( self, accelerator: Optional["pl.accelerators.accelerator.Accelerator"] = None, 
parallel_devices: Optional[List[torch.device]] = None, @@ -64,10 +71,12 @@ def __init__( # type: ignore[no-untyped-def] checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, process_group_backend: Optional[str] = None, - cpu_offload=None, - backward_prefetch=None, + cpu_offload: Optional[CPUOffload] = None, + backward_prefetch: Optional[BackwardPrefetch] = None, + mixed_precision: Optional[MixedPrecision] = None, + **kwargs: Any, ) -> None: - """Strategy for Fully Sharded Data Parallel provided by torch.Distributed. + r"""Strategy for Fully Sharded Data Parallel provided by torch.Distributed. Fully Sharded Training shards the entire model across all available GPUs, allowing you to scale model size, whilst using efficient communication to reduce overhead. In practice, this means we can remain @@ -84,7 +93,7 @@ def __init__( # type: ignore[no-untyped-def] `https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html` Arguments: - cpu_offload (Optional [CPUOffload]): + cpu_offload: CPU offloading config. Currently, only parameter and gradient CPU offload is supported. It can be enabled via passing in ``cpu_offload=CPUOffload(offload_params=True)``. Note that this @@ -92,14 +101,21 @@ def __init__( # type: ignore[no-untyped-def] params and grads to be on same device to work with optimizer. This API is subject to change. Default is ``None`` in which case there will be no offloading. - backward_prefetch: (Optional[BackwardPrefetch]): + backward_prefetch: This is an experimental feature that is subject to change in the near future. It allows users to enable two different backward_prefetch algorithms to help backward communication and computation overlapping. Pros and cons of each algorithm are explained in the class ``BackwardPrefetch``. + mixed_precision: + Mixed Precision config. By default, Lightning will enable FP16 if ``precision=16`` + or BF16 if ``precision=bf16`` unless a config is passed in. + This is only available in PyTorch 1.12 and later. + \**kwargs: Passed to the FSDP Context manager which will configure the FSDP class when wrapping modules. """ - if not _TORCH_GREATER_EQUAL_1_11: - raise MisconfigurationException("DDPFullyShardedNativeStrategy is supported from pytorch v1.11.0 onwards.") + if not _TORCH_GREATER_EQUAL_1_12: + raise MisconfigurationException( + "`DDPFullyShardedNativeStrategy` is supported from PyTorch v1.12.0 onwards." 
+ ) super().__init__( accelerator=accelerator, @@ -109,16 +125,23 @@ def __init__( # type: ignore[no-untyped-def] precision_plugin=precision_plugin, ) self._process_group = None - self.num_processes = len(self.parallel_devices) if self.parallel_devices is not None else 0 - self._process_group_backend: Optional[str] = process_group_backend - self.cpu_offload: Optional[CPUOffload] = cpu_offload - self.backward_prefetch: Optional[BackwardPrefetch] = backward_prefetch + self.num_nodes = 1 + self._process_group_backend = process_group_backend + self.cpu_offload = cpu_offload + self.backward_prefetch = backward_prefetch + self.mixed_precision = mixed_precision + self._rank_0_will_call_children_scripts: bool = False + self.kwargs = kwargs @property def root_device(self) -> torch.device: assert self.parallel_devices is not None return self.parallel_devices[self.local_rank] + @property + def num_processes(self) -> int: + return len(self.parallel_devices) if self.parallel_devices is not None else 0 + @property def process_group(self) -> Optional[ProcessGroup]: if self._process_group is None: @@ -130,10 +153,28 @@ def process_group(self) -> Optional[ProcessGroup]: def process_group_backend(self) -> Optional[str]: return self._process_group_backend + @property + def mixed_precision_config(self) -> Optional[MixedPrecision]: + if self.mixed_precision: + return self.mixed_precision + plugin = self.precision_plugin + if isinstance(plugin, FullyShardedNativeMixedPrecisionPlugin): + return plugin.mixed_precision_config + + @property + def distributed_sampler_kwargs(self) -> Dict: + return dict(num_replicas=(self.num_nodes * self.num_processes), rank=self.global_rank) + def setup_environment(self) -> None: + log.detail(f"{self.__class__.__name__}: setting up distributed...") reset_seed() + + # determine which process we are and world size + self.set_world_ranks() + # set warning rank rank_zero_only.rank = self.global_rank + self._process_group_backend = self._get_process_group_backend() assert self.cluster_environment is not None init_dist_connection(self.cluster_environment, self._process_group_backend) @@ -146,15 +187,31 @@ def _get_process_group_backend(self) -> str: or get_default_process_group_backend_for_device(self.root_device) ) + def set_world_ranks(self) -> None: + if self.cluster_environment is None: + return + self.cluster_environment.set_global_rank(self.node_rank * self.num_processes + self.local_rank) + self.cluster_environment.set_world_size(self.num_nodes * self.num_processes) + rank_zero_only.rank = self.cluster_environment.global_rank() + + def _configure_launcher(self) -> None: + assert self.cluster_environment is not None + if not self.cluster_environment.creates_processes_externally: + self._launcher = _SubprocessScriptLauncher(self.cluster_environment, self.num_processes, self.num_nodes) + self._rank_0_will_call_children_scripts = True + def setup(self, trainer: "pl.Trainer") -> None: self.accelerator.setup(trainer) + # share ddp pids to all processes + self._rank_0_will_call_children_scripts = self.broadcast(self._rank_0_will_call_children_scripts) if trainer.state.fn == TrainerFn.FITTING and self._layer_sync: assert self.model is not None self.model = self._layer_sync.apply(self.model) - if not self.cpu_offload: - self.model_to_device() + # we set the device so that optimizers can be created with distributed comms. 
+ assert self.lightning_module is not None + self.lightning_module._device = self.root_device self.barrier() self.setup_optimizers(trainer) @@ -162,20 +219,19 @@ def setup(self, trainer: "pl.Trainer") -> None: self.setup_precision_plugin() def model_to_device(self) -> None: - # ensure we update the device type in the lightning module - assert self.lightning_module is not None - log.info(f"{self.__class__.__name__}: moving model to device [{self.root_device}]...") - self.lightning_module.to(self.root_device) + pass @contextlib.contextmanager def model_sharded_context(self) -> Generator: log.detail(f"{self.__class__.__name__}: entered model_sharded_context.") - with enable_wrap( wrapper_cls=FullyShardedDataParallel, process_group=self.process_group, cpu_offload=self.cpu_offload, backward_prefetch=self.backward_prefetch, + mixed_precision=self.mixed_precision_config, + device_id=self.root_device.index, + **self.kwargs, ): yield @@ -219,7 +275,7 @@ def _determine_device_ids(self) -> List[int]: return [self.root_device.index] def teardown(self) -> None: - log.info(f"{self.__class__.__name__}: tearing down strategy...") + rank_zero_info(f"{self.__class__.__name__}: tearing down strategy...") if ( self.lightning_module is not None and self.lightning_module.trainer is not None @@ -229,7 +285,10 @@ def teardown(self) -> None: assert self.model is not None self.model = self._layer_sync.revert(self.model) - super().teardown() + assert self.cluster_environment is not None + self.cluster_environment.teardown() + self.precision_plugin.teardown() + self.accelerator.teardown() @classmethod def get_registered_strategies(cls) -> List[str]: @@ -237,7 +296,7 @@ def get_registered_strategies(cls) -> List[str]: @classmethod def register_strategies(cls, strategy_registry: Dict) -> None: - if _TORCH_GREATER_EQUAL_1_11: + if _TORCH_GREATER_EQUAL_1_12: strategy_registry.register( "fsdp_native", cls, diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index 59ffcbb21a61c..5d3f0d1b23e1a 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -715,17 +715,13 @@ def _check_and_init_precision(self) -> PrecisionPlugin: if self._precision_flag == 16 else "Using bfloat16 Automatic Mixed Precision (AMP)" ) - if isinstance(self.strategy, DDPFullyShardedNativeStrategy): - raise MisconfigurationException( - "DDPFullyShardedNativeStrategy currently doesn't support Mixed Precision" - ) if self._amp_type_flag == AMPType.NATIVE: device = "cpu" if self._accelerator_flag == "cpu" else "cuda" if isinstance(self.strategy, (DDPShardedStrategy, DDPSpawnShardedStrategy)): return ShardedNativeMixedPrecisionPlugin(self._precision_flag, device) - if isinstance(self.strategy, DDPFullyShardedStrategy): + if isinstance(self.strategy, (DDPFullyShardedStrategy, DDPFullyShardedNativeStrategy)): return FullyShardedNativeMixedPrecisionPlugin(self._precision_flag, device) return NativeMixedPrecisionPlugin(self._precision_flag, device) diff --git a/tests/tests_pytorch/accelerators/test_accelerator_connector.py b/tests/tests_pytorch/accelerators/test_accelerator_connector.py index 401edf3782feb..b0952c4cfeb41 100644 --- a/tests/tests_pytorch/accelerators/test_accelerator_connector.py +++ b/tests/tests_pytorch/accelerators/test_accelerator_connector.py @@ -574,7 +574,7 @@ def test_strategy_choice_ddp_cpu_slurm(device_count_mock, setup_distributed_mock assert 
trainer.strategy.local_rank == 0 -@RunIf(min_torch="1.11") +@RunIf(min_torch="1.12") def test_check_native_fsdp_strategy_and_fallback(): with pytest.raises( MisconfigurationException, @@ -584,25 +584,6 @@ def test_check_native_fsdp_strategy_and_fallback(): Trainer(accelerator="cpu", strategy="fsdp_native") -@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}) -@mock.patch("torch.cuda.device_count", return_value=1) -@mock.patch("torch.cuda.is_available", return_value=True) -@RunIf(min_torch="1.11") -def test_mixed_precision_support_with_native_fsdp_strategy(device_count_mock, mock_cuda_available, tmpdir): - with pytest.raises( - MisconfigurationException, match="DDPFullyShardedNativeStrategy currently doesn't support Mixed Precision" - ): - trainer = Trainer( - default_root_dir=tmpdir, - fast_dev_run=True, - strategy="fsdp_native", - accelerator="gpu", - devices=1, - precision=16, - ) - assert isinstance(trainer.strategy, DDPFullyShardedNativeStrategy) - - @mock.patch("pytorch_lightning.accelerators.tpu.TPUAccelerator.is_available", return_value=True) def test_unsupported_tpu_choice(mock_tpu_acc_avail): diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py index b6dbff1792668..1ac7ad0b6660b 100644 --- a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py +++ b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py @@ -1,6 +1,5 @@ import os from typing import Any, Dict, Optional -from unittest import mock import pytest import torch @@ -8,19 +7,21 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.demos.boring_classes import BoringModel +from pytorch_lightning.plugins.precision.fully_sharded_native_amp import FullyShardedNativeMixedPrecisionPlugin from pytorch_lightning.strategies import DDPFullyShardedNativeStrategy from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12 +from pytorch_lightning.utilities.types import STEP_OUTPUT from tests_pytorch.helpers.runif import RunIf if _TORCH_GREATER_EQUAL_1_12: - from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel + from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel, MixedPrecision from torch.distributed.fsdp.wrap import wrap -@RunIf(min_torch="1.12dev") +@RunIf(min_torch="1.12") def test_invalid_on_cpu(tmpdir): - """Test to ensure that to raise Misconfiguration for Native FSDP on CPU.""" + """Test to ensure that we raise Misconfiguration for Native FSDP on CPU.""" with pytest.raises( MisconfigurationException, match=f"You selected strategy to be `{DDPFullyShardedNativeStrategy.strategy_name}`, " @@ -31,29 +32,27 @@ def test_invalid_on_cpu(tmpdir): trainer.strategy.setup_environment() -@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}) -@mock.patch("torch.cuda.device_count", return_value=1) -@mock.patch("torch.cuda.is_available", return_value=True) -@RunIf(min_torch="1.12dev") -def test_fsdp_with_sharded_amp(device_count_mock, mock_cuda_available, tmpdir): - """Test to ensure that plugin native amp plugin raises Misconfiguration error.""" - with pytest.raises( - MisconfigurationException, match="DDPFullyShardedNativeStrategy currently doesn't support Mixed Precision" - ): - trainer = Trainer( - default_root_dir=tmpdir, - fast_dev_run=True, - strategy="fsdp_native", - accelerator="gpu", - 
devices=1, - precision=16, - ) - assert isinstance(trainer.strategy, DDPFullyShardedNativeStrategy) +@RunIf(min_torch="1.12", min_cuda_gpus=1) +@pytest.mark.parametrize("precision, expected", [(16, torch.float16), ("bf16", torch.bfloat16)]) +def test_precision_plugin_config(precision, expected): + plugin = FullyShardedNativeMixedPrecisionPlugin(precision=precision, device="cuda") + config = plugin.mixed_precision_config + assert config.param_dtype == expected + assert config.buffer_dtype == expected + assert config.reduce_dtype == expected + + +@RunIf(min_torch="1.12") +def test_fsdp_custom_mixed_precision(tmpdir): + """Test to ensure that passing a custom mixed precision config works.""" + config = MixedPrecision() + strategy = DDPFullyShardedNativeStrategy(mixed_precision=config) + assert strategy.mixed_precision_config == config class TestFSDPModel(BoringModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self): + super().__init__() self.layer: Optional[torch.nn.Module] = None def _init_model(self) -> None: @@ -79,16 +78,20 @@ def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None: def configure_optimizers(self): return torch.optim.SGD(self.layer.parameters(), lr=0.1) - def on_train_start(self) -> None: + def on_train_batch_end(self, outputs: STEP_OUTPUT, batch: Any, batch_idx: int) -> None: self._assert_layer_fsdp_instance() - def on_test_start(self) -> None: + def on_test_batch_end( + self, outputs: Optional[STEP_OUTPUT], batch: Any, batch_idx: int, dataloader_idx: int + ) -> None: self._assert_layer_fsdp_instance() - def on_validation_start(self) -> None: + def on_validation_batch_end( + self, outputs: Optional[STEP_OUTPUT], batch: Any, batch_idx: int, dataloader_idx: int + ) -> None: self._assert_layer_fsdp_instance() - def on_prediction_start(self) -> None: + def on_predict_batch_end(self, outputs: Optional[Any], batch: Any, batch_idx: int, dataloader_idx: int) -> None: self._assert_layer_fsdp_instance() def _assert_layer_fsdp_instance(self) -> None: @@ -101,8 +104,13 @@ def _assert_layer_fsdp_instance(self) -> None: assert self.layer.module[0].reshard_after_forward is True assert self.layer.module[2].reshard_after_forward is True + precision = torch.float16 if self.precision == 16 else torch.bfloat16 + assert self.layer.mixed_precision.param_dtype == precision + assert self.layer.mixed_precision.reduce_dtype == precision + assert self.layer.mixed_precision.buffer_dtype == precision + -@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True, min_torch="1.12dev") +@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True, min_torch="1.12") def test_fully_sharded_native_strategy_sync_batchnorm(tmpdir): """Test to ensure that sync_batchnorm works when using fsdp_native and GPU, and all stages can be run.""" @@ -119,18 +127,19 @@ def test_fully_sharded_native_strategy_sync_batchnorm(tmpdir): _run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt")) -@RunIf(min_cuda_gpus=1, skip_windows=True, standalone=True, min_torch="1.12dev") -def test_fully_sharded_native_strategy_checkpoint(tmpdir): +@RunIf(min_cuda_gpus=1, skip_windows=True, standalone=True, min_torch="1.12") +@pytest.mark.parametrize("precision", [16, "bf16"]) +def test_fully_sharded_native_strategy_checkpoint(tmpdir, precision): """Test to ensure that checkpoint is saved correctly when using a single GPU, and all stages can be run.""" model = TestFSDPModel() trainer = Trainer( - default_root_dir=tmpdir, accelerator="gpu", devices=1, 
strategy="fsdp_native", precision=16, max_epochs=1 + default_root_dir=tmpdir, accelerator="gpu", devices=1, strategy="fsdp_native", precision=precision, max_epochs=1 ) _run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt")) -@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True, min_torch="1.12dev") +@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True, min_torch="1.12") def test_fully_sharded_native_strategy_checkpoint_multi_gpus(tmpdir): """Test to ensure that checkpoint is saved correctly when using multiple GPUs, and all stages can be run.""" @@ -150,7 +159,7 @@ def test_fully_sharded_native_strategy_checkpoint_multi_gpus(tmpdir): def _run_multiple_stages(trainer, model, model_path: Optional[str] = None): trainer.fit(model) - + model_path = trainer.strategy.broadcast(model_path) model_path = model_path if model_path else trainer.checkpoint_callback.last_model_path trainer.save_checkpoint(model_path, weights_only=True) @@ -158,7 +167,7 @@ def _run_multiple_stages(trainer, model, model_path: Optional[str] = None): _assert_save_equality(trainer, model_path, cls=TestFSDPModel) # Test entry point - trainer.test(model) # model is wrapped, will not call configure_shared_model + trainer.test(model) # model is wrapped, will not call `configure_sharded_model` # provide model path, will create a new unwrapped model and load and then call configure_shared_model to wrap trainer.test(ckpt_path=model_path) From a49e8c5b5b9f659f887c3a728771584cab437f16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 20 Jul 2022 14:15:24 +0200 Subject: [PATCH 21/35] Simplify fetching's loader types (#13111) --- src/pytorch_lightning/utilities/fetching.py | 17 ++++++----------- .../utilities/test_auto_restart.py | 17 ++++------------- tests/tests_pytorch/utilities/test_fetching.py | 8 +++----- 3 files changed, 13 insertions(+), 29 deletions(-) diff --git a/src/pytorch_lightning/utilities/fetching.py b/src/pytorch_lightning/utilities/fetching.py index ff7e6080bad7b..a4518e147da02 100644 --- a/src/pytorch_lightning/utilities/fetching.py +++ b/src/pytorch_lightning/utilities/fetching.py @@ -144,23 +144,18 @@ def _store_dataloader_iter_state( dataloader_iter.state.update(iter_name, state) @property - def loaders(self) -> List[DataLoader]: + def loaders(self) -> Any: if isinstance(self.dataloader, CombinedLoader): - loaders = self.dataloader.loaders - else: - loaders = [self.dataloader] - return loaders + return self.dataloader.loaders + return self.dataloader @property - def loader_iters(self) -> List[Iterator]: + def loader_iters(self) -> Any: if self.dataloader_iter is None: raise MisconfigurationException("The `dataloader_iter` isn't available outside the __iter__ context.") - if isinstance(self.dataloader, CombinedLoader): - loader_iters = self.dataloader_iter.loader_iters - else: - loader_iters = [self.dataloader_iter] - return loader_iters + return self.dataloader_iter.loader_iters + return self.dataloader_iter @property def state(self) -> List[MergedIteratorState]: diff --git a/tests/tests_pytorch/utilities/test_auto_restart.py b/tests/tests_pytorch/utilities/test_auto_restart.py index 47051d4efd098..5a5982ad009f9 100644 --- a/tests/tests_pytorch/utilities/test_auto_restart.py +++ b/tests/tests_pytorch/utilities/test_auto_restart.py @@ -700,16 +700,8 @@ def __len__(self): return self.len -# TODO: test with `RandomGeneratorGetItemDataset` @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) -@pytest.mark.parametrize( - "dataset_class", - [ - 
From a49e8c5b5b9f659f887c3a728771584cab437f16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 20 Jul 2022 14:15:24 +0200 Subject: [PATCH 21/35] Simplify fetching's loader types (#13111) --- src/pytorch_lightning/utilities/fetching.py | 17 ++++++----------- .../utilities/test_auto_restart.py | 17 ++++------------- tests/tests_pytorch/utilities/test_fetching.py | 8 +++----- 3 files changed, 13 insertions(+), 29 deletions(-) diff --git a/src/pytorch_lightning/utilities/fetching.py b/src/pytorch_lightning/utilities/fetching.py index ff7e6080bad7b..a4518e147da02 100644 --- a/src/pytorch_lightning/utilities/fetching.py +++ b/src/pytorch_lightning/utilities/fetching.py @@ -144,23 +144,18 @@ def _store_dataloader_iter_state( dataloader_iter.state.update(iter_name, state) @property - def loaders(self) -> List[DataLoader]: + def loaders(self) -> Any: if isinstance(self.dataloader, CombinedLoader): - loaders = self.dataloader.loaders - else: - loaders = [self.dataloader] - return loaders + return self.dataloader.loaders + return self.dataloader @property - def loader_iters(self) -> List[Iterator]: + def loader_iters(self) -> Any: if self.dataloader_iter is None: raise MisconfigurationException("The `dataloader_iter` isn't available outside the __iter__ context.") - if isinstance(self.dataloader, CombinedLoader): - loader_iters = self.dataloader_iter.loader_iters - else: - loader_iters = [self.dataloader_iter] - return loader_iters + return self.dataloader_iter.loader_iters + return self.dataloader_iter @property def state(self) -> List[MergedIteratorState]: diff --git a/tests/tests_pytorch/utilities/test_auto_restart.py b/tests/tests_pytorch/utilities/test_auto_restart.py index 47051d4efd098..5a5982ad009f9 100644 --- a/tests/tests_pytorch/utilities/test_auto_restart.py +++ b/tests/tests_pytorch/utilities/test_auto_restart.py @@ -700,16 +700,8 @@ def __len__(self): return self.len -# TODO: test with `RandomGeneratorGetItemDataset` @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) -@pytest.mark.parametrize( - "dataset_class", - [ - SequentialGetItemDataset, - RandomGetItemDataset, - # RandomGeneratorGetItemDataset, - ], -) +@pytest.mark.parametrize("dataset_class", [SequentialGetItemDataset, RandomGetItemDataset]) @pytest.mark.parametrize("num_workers", [0, pytest.param(2, marks=RunIf(slow=True))]) @pytest.mark.parametrize("batch_size", [1, 2, 3]) def test_dataset_rng_states_restart(dataset_class, num_workers, batch_size): @@ -732,12 +724,11 @@ def fetch(fetcher, prefetch_iter, num_batches_fetched): _ = next(prefetch_iter) state: List[MergedIteratorState] = fetcher.state - assert len(state) == 1 - assert isinstance(state[0], MergedIteratorState) + assert isinstance(state, MergedIteratorState) assert len(fetcher.dataloader_iter.cache_states) == 1 if num_workers == 0: - assert state[0].state[0].num_batches_fetched == num_batches_fetched + assert state.state[0].num_batches_fetched == num_batches_fetched return state dataset, random_sampler = create_dataset_sampler() @@ -754,7 +745,7 @@ def fetch(fetcher, prefetch_iter, num_batches_fetched): # (A) capture the state after fetching 4 batches state = fetch(fetcher, prefetch_iter, 4) - state = deepcopy(state[0]) + state = deepcopy(state) # (B) simulate 2 additional batches batch05 = next(prefetch_iter) diff --git a/tests/tests_pytorch/utilities/test_fetching.py b/tests/tests_pytorch/utilities/test_fetching.py index e9ab01387f7f6..2d5e3954c7061 100644 --- a/tests/tests_pytorch/utilities/test_fetching.py +++ b/tests/tests_pytorch/utilities/test_fetching.py @@ -101,16 +101,14 @@ def test_empty_prefetch_iterator(dataset_cls, prefetch_batches): def test_misconfiguration_error(): - fetcher = DataFetcher() + loader = DataLoader(range(10)) + fetcher.setup(loader) + assert fetcher.loaders == loader with pytest.raises( MisconfigurationException, match="The `dataloader_iter` isn't available outside the __iter__ context." 
): - loader = DataLoader(range(10)) - fetcher.setup(loader) - assert fetcher.loaders[0] == loader fetcher.loader_iters - iter(fetcher) assert fetcher.loader_iters From 3129d97dac882471e24caa520df5c571e3cc3508 Mon Sep 17 00:00:00 2001 From: Mansy Date: Wed, 20 Jul 2022 15:59:28 +0200 Subject: [PATCH 22/35] Include app templates to the lightning and app packages (#13731) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Include app templates to the package Co-authored-by: mansy Co-authored-by: Adrian Wälchli --- src/lightning/__setup__.py | 1 + src/lightning_app/__setup__.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/lightning/__setup__.py b/src/lightning/__setup__.py index 22c3fd2b847cd..83f060b9b2e23 100644 --- a/src/lightning/__setup__.py +++ b/src/lightning/__setup__.py @@ -39,6 +39,7 @@ def _adjust_manifest(**kwargs: Any) -> None: lines += [ "recursive-include src *.md" + os.linesep, "recursive-include requirements *.txt" + os.linesep, + "recursive-include src/lightning_app/cli/*-template *" + os.linesep, # Add templates ] with open(manifest_path, "w") as fp: fp.writelines(lines) diff --git a/src/lightning_app/__setup__.py b/src/lightning_app/__setup__.py index 0e892fa047ab7..9fe01a0ebe0e5 100644 --- a/src/lightning_app/__setup__.py +++ b/src/lightning_app/__setup__.py @@ -50,6 +50,7 @@ def _adjust_manifest(**__: Any) -> None: "recursive-exclude requirements *.txt" + os.linesep, "recursive-include src/lightning_app *.md" + os.linesep, "recursive-include requirements/app *.txt" + os.linesep, + "recursive-include src/lightning_app/cli/*-template *" + os.linesep, # Add templates ] # TODO: remove this once lightning-ui package is ready as a dependency From 74ab87821f4bff751f498d55269d95cfa9b44261 Mon Sep 17 00:00:00 2001 From: Lee Jungwon <33821003+BongYang@users.noreply.github.com> Date: Thu, 21 Jul 2022 02:07:38 +0900 Subject: [PATCH 23/35] Fix mypy typing errors in pytorch_lightning/callbacks/model_checkpoint.py (#13617) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- pyproject.toml | 1 - .../callbacks/early_stopping.py | 2 +- .../callbacks/model_checkpoint.py | 42 +++++++++++-------- src/pytorch_lightning/core/module.py | 2 +- src/pytorch_lightning/strategies/strategy.py | 2 +- src/pytorch_lightning/strategies/tpu_spawn.py | 12 ++---- .../connectors/logger_connector/result.py | 2 +- src/pytorch_lightning/trainer/trainer.py | 4 +- .../utilities/distributed.py | 2 +- 9 files changed, 35 insertions(+), 34 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 989e63122f640..b6cbbbda15006 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,6 @@ warn_no_return = "False" # the list can be generated with: # mypy --no-error-summary 2>&1 | tr ':' ' ' | awk '{print $1}' | sort | uniq | sed 's/\.py//g; s|src/||g; s|\/|\.|g' | xargs -I {} echo '"{}",' module = [ - "pytorch_lightning.callbacks.model_checkpoint", "pytorch_lightning.callbacks.progress.rich_progress", "pytorch_lightning.callbacks.quantization", "pytorch_lightning.callbacks.stochastic_weight_avg", diff --git a/src/pytorch_lightning/callbacks/early_stopping.py b/src/pytorch_lightning/callbacks/early_stopping.py index 2fd730482fcc4..72d8445d84407 100644 --- a/src/pytorch_lightning/callbacks/early_stopping.py +++ b/src/pytorch_lightning/callbacks/early_stopping.py @@ -135,7 +135,7 @@ def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: O # validation, then we run 
after validation instead of on train epoch end self._check_on_train_epoch_end = trainer.val_check_interval == 1.0 and trainer.check_val_every_n_epoch == 1 - def _validate_condition_metric(self, logs: Dict[str, float]) -> bool: + def _validate_condition_metric(self, logs: Dict[str, Tensor]) -> bool: monitor_val = logs.get(self.monitor) error_msg = ( diff --git a/src/pytorch_lightning/callbacks/model_checkpoint.py b/src/pytorch_lightning/callbacks/model_checkpoint.py index bb6d0a9a9b0b6..9b49b9d44bb10 100644 --- a/src/pytorch_lightning/callbacks/model_checkpoint.py +++ b/src/pytorch_lightning/callbacks/model_checkpoint.py @@ -39,7 +39,7 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.logger import _name, _version from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_info, rank_zero_warn -from pytorch_lightning.utilities.types import _METRIC, _PATH, STEP_OUTPUT +from pytorch_lightning.utilities.types import _PATH, STEP_OUTPUT from pytorch_lightning.utilities.warnings import WarningCache log = logging.getLogger(__name__) @@ -231,13 +231,14 @@ def __init__( self._save_on_train_epoch_end = save_on_train_epoch_end self._last_global_step_saved = 0 # no need to save when no steps were taken self._last_time_checked: Optional[float] = None - self.current_score = None - self.best_k_models = {} + self.current_score: Optional[Tensor] = None + self.best_k_models: Dict[str, Tensor] = {} self.kth_best_model_path = "" - self.best_model_score = None + self.best_model_score: Optional[Tensor] = None self.best_model_path = "" self.last_model_path = "" + self.kth_value: Tensor self.__init_monitor_mode(mode) self.__init_ckpt_dir(dirpath, filename) self.__init_triggers(every_n_train_steps, every_n_epochs, train_time_interval) @@ -256,6 +257,7 @@ def state_key(self) -> str: def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: Optional[str] = None) -> None: self.__resolve_ckpt_dir(trainer) + assert self.dirpath is not None if trainer.is_global_zero and stage == "fit": self.__warn_if_dir_not_empty(self.dirpath) @@ -362,7 +364,7 @@ def save_checkpoint(self, trainer: "pl.Trainer") -> None: # pragma: no-cover self._save_topk_checkpoint(trainer, monitor_candidates) self._save_last_checkpoint(trainer, monitor_candidates) - def _save_topk_checkpoint(self, trainer: "pl.Trainer", monitor_candidates: Dict[str, _METRIC]) -> None: + def _save_topk_checkpoint(self, trainer: "pl.Trainer", monitor_candidates: Dict[str, Tensor]) -> None: if self.save_top_k == 0: return @@ -395,7 +397,7 @@ def _should_skip_saving_checkpoint(self, trainer: "pl.Trainer") -> bool: from pytorch_lightning.trainer.states import TrainerFn return ( - trainer.fast_dev_run # disable checkpointing with fast_dev_run + bool(trainer.fast_dev_run) # disable checkpointing with fast_dev_run or trainer.state.fn != TrainerFn.FITTING # don't save anything during non-fit or trainer.sanity_checking # don't save anything during sanity check or self._last_global_step_saved == trainer.global_step # already saved at the last step @@ -493,7 +495,7 @@ def check_monitor_top_k(self, trainer: "pl.Trainer", current: Optional[Tensor] = should_update_best_and_save = monitor_op(current, self.best_k_models[self.kth_best_model_path]) # If using multiple devices, make sure all processes are unanimous on the decision. 
- should_update_best_and_save = trainer.strategy.reduce_boolean_decision(should_update_best_and_save) + should_update_best_and_save = trainer.strategy.reduce_boolean_decision(bool(should_update_best_and_save)) return should_update_best_and_save @@ -501,7 +503,7 @@ def check_monitor_top_k(self, trainer: "pl.Trainer", current: Optional[Tensor] = def _format_checkpoint_name( cls, filename: Optional[str], - metrics: Dict[str, _METRIC], + metrics: Dict[str, Tensor], prefix: str = "", auto_insert_metric_name: bool = True, ) -> str: @@ -522,7 +524,7 @@ def _format_checkpoint_name( filename = filename.replace(group, f"{{0[{name}]") if name not in metrics: - metrics[name] = 0 + metrics[name] = torch.tensor(0) filename = filename.format(metrics) if prefix: @@ -531,7 +533,7 @@ def _format_checkpoint_name( return filename def format_checkpoint_name( - self, metrics: Dict[str, _METRIC], filename: Optional[str] = None, ver: Optional[int] = None + self, metrics: Dict[str, Tensor], filename: Optional[str] = None, ver: Optional[int] = None ) -> str: """Generate a filename according to the defined template. @@ -591,6 +593,7 @@ def __resolve_ckpt_dir(self, trainer: "pl.Trainer") -> None: ckpt_path = os.path.join(trainer._weights_save_path_internal, "checkpoints") elif trainer.loggers: if len(trainer.loggers) == 1: + assert trainer.logger is not None save_dir = trainer.logger.save_dir or trainer.default_root_dir else: save_dir = trainer.default_root_dir @@ -613,7 +616,7 @@ def __warn_if_dir_not_empty(self, dirpath: _PATH) -> None: rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.") def _get_metric_interpolated_filepath_name( - self, monitor_candidates: Dict[str, _METRIC], trainer: "pl.Trainer", del_filepath: Optional[str] = None + self, monitor_candidates: Dict[str, Tensor], trainer: "pl.Trainer", del_filepath: Optional[str] = None ) -> str: filepath = self.format_checkpoint_name(monitor_candidates) @@ -624,7 +627,7 @@ def _get_metric_interpolated_filepath_name( return filepath - def _monitor_candidates(self, trainer: "pl.Trainer") -> Dict[str, _METRIC]: + def _monitor_candidates(self, trainer: "pl.Trainer") -> Dict[str, Tensor]: monitor_candidates = deepcopy(trainer.callback_metrics) # cast to int if necessary because `self.log("epoch", 123)` will convert it to float. 
if it's not a tensor # or does not exist we overwrite it as it's likely an error @@ -634,7 +637,7 @@ def _monitor_candidates(self, trainer: "pl.Trainer") -> Dict[str, _METRIC]: monitor_candidates["step"] = step.int() if isinstance(step, Tensor) else torch.tensor(trainer.global_step) return monitor_candidates - def _save_last_checkpoint(self, trainer: "pl.Trainer", monitor_candidates: Dict[str, _METRIC]) -> None: + def _save_last_checkpoint(self, trainer: "pl.Trainer", monitor_candidates: Dict[str, Tensor]) -> None: if not self.save_last: return @@ -651,16 +654,18 @@ def _save_last_checkpoint(self, trainer: "pl.Trainer", monitor_candidates: Dict[ if previous and previous != filepath: trainer.strategy.remove_checkpoint(previous) - def _save_monitor_checkpoint(self, trainer: "pl.Trainer", monitor_candidates: Dict[str, _METRIC]) -> None: + def _save_monitor_checkpoint(self, trainer: "pl.Trainer", monitor_candidates: Dict[str, Tensor]) -> None: + assert self.monitor current = monitor_candidates.get(self.monitor) if self.check_monitor_top_k(trainer, current): + assert current is not None self._update_best_and_save(current, trainer, monitor_candidates) elif self.verbose: epoch = monitor_candidates["epoch"] step = monitor_candidates["step"] rank_zero_info(f"Epoch {epoch:d}, global step {step:d}: {self.monitor!r} was not in top {self.save_top_k}") - def _save_none_monitor_checkpoint(self, trainer: "pl.Trainer", monitor_candidates: Dict[str, _METRIC]) -> None: + def _save_none_monitor_checkpoint(self, trainer: "pl.Trainer", monitor_candidates: Dict[str, Tensor]) -> None: filepath = self._get_metric_interpolated_filepath_name(monitor_candidates, trainer) # set the best model path before saving because it will be part of the state. previous, self.best_model_path = self.best_model_path, filepath @@ -669,7 +674,7 @@ def _save_none_monitor_checkpoint(self, trainer: "pl.Trainer", monitor_candidate trainer.strategy.remove_checkpoint(previous) def _update_best_and_save( - self, current: Tensor, trainer: "pl.Trainer", monitor_candidates: Dict[str, _METRIC] + self, current: Tensor, trainer: "pl.Trainer", monitor_candidates: Dict[str, Tensor] ) -> None: k = len(self.best_k_models) + 1 if self.save_top_k == -1 else self.save_top_k @@ -691,11 +696,11 @@ def _update_best_and_save( if len(self.best_k_models) == k: # monitor dict has reached k elements _op = max if self.mode == "min" else min - self.kth_best_model_path = _op(self.best_k_models, key=self.best_k_models.get) + self.kth_best_model_path = _op(self.best_k_models, key=self.best_k_models.get) # type: ignore[arg-type] self.kth_value = self.best_k_models[self.kth_best_model_path] _op = min if self.mode == "min" else max - self.best_model_path = _op(self.best_k_models, key=self.best_k_models.get) + self.best_model_path = _op(self.best_k_models, key=self.best_k_models.get) # type: ignore[arg-type] self.best_model_score = self.best_k_models[self.best_model_path] if self.verbose: @@ -715,6 +720,7 @@ def to_yaml(self, filepath: Optional[_PATH] = None) -> None: file.""" best_k = {k: v.item() for k, v in self.best_k_models.items()} if filepath is None: + assert self.dirpath filepath = os.path.join(self.dirpath, "best_k_models.yaml") with self._fs.open(filepath, "w") as fp: yaml.dump(best_k, fp) diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py index 4b8770dd89a4e..d07b272c171c7 100644 --- a/src/pytorch_lightning/core/module.py +++ b/src/pytorch_lightning/core/module.py @@ -532,7 +532,7 @@ def __to_tensor(self, value: 
numbers.Number) -> Tensor: return torch.tensor(value, device=self.device) @staticmethod - def __check_numel_1(value: torch.Tensor, name: str) -> None: + def __check_numel_1(value: Tensor, name: str) -> None: if not torch.numel(value) == 1: raise ValueError( f"`self.log({name}, {value})` was called, but the tensor must have a single element." diff --git a/src/pytorch_lightning/strategies/strategy.py b/src/pytorch_lightning/strategies/strategy.py index 2cbf14760f83f..0a9b19376bcd4 100644 --- a/src/pytorch_lightning/strategies/strategy.py +++ b/src/pytorch_lightning/strategies/strategy.py @@ -285,7 +285,7 @@ def all_gather(self, tensor: Tensor, group: Optional[Any] = None, sync_grads: bo """ def reduce_boolean_decision(self, decision: bool) -> bool: - """Reduce the early stopping decision across all processes.""" + """Reduce a boolean decision across all processes.""" return decision def pre_backward(self, closure_loss: Tensor) -> None: diff --git a/src/pytorch_lightning/strategies/tpu_spawn.py b/src/pytorch_lightning/strategies/tpu_spawn.py index 178fe638cc0a3..b27f299fb3722 100644 --- a/src/pytorch_lightning/strategies/tpu_spawn.py +++ b/src/pytorch_lightning/strategies/tpu_spawn.py @@ -169,19 +169,13 @@ def broadcast(self, obj: object, src: int = 0) -> object: obj = torch.load(buffer) return obj - def reduce_boolean_decision(self, decision: bool) -> bool: - decision = torch.tensor(int(decision), device=self.root_device) - decision = self.reduce(decision, reduce_op="sum") - decision = bool(decision == self.world_size) - return decision - def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): if not isinstance(output, Tensor): output = torch.tensor(output, device=self.root_device) - _invalid_reduce_op = isinstance(reduce_op, ReduceOp) and reduce_op != ReduceOp.SUM - _invalid_reduce_op_str = isinstance(reduce_op, str) and reduce_op.lower() not in ("sum", "mean", "avg") - if _invalid_reduce_op or _invalid_reduce_op_str: + invalid_reduce_op = isinstance(reduce_op, ReduceOp) and reduce_op != ReduceOp.SUM + invalid_reduce_op_str = isinstance(reduce_op, str) and reduce_op.lower() not in ("sum", "mean", "avg") + if invalid_reduce_op or invalid_reduce_op_str: raise MisconfigurationException( "Currently, TPUSpawn Strategy only support `sum`, `mean`, `avg` reduce operation." ) diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py index a33359a3fe5e9..27cb3cb0323b2 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -529,7 +529,7 @@ def _get_cache(result_metric: _ResultMetric, on_step: bool) -> Optional[Tensor]: result_metric.meta.sync.should = should cache = result_metric._computed if cache is not None: - if not isinstance(cache, torch.Tensor): + if not isinstance(cache, Tensor): raise ValueError( f"The `.compute()` return of the metric logged as {result_metric.meta.name!r} must be a tensor." 
f" Found {cache}" diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index 882326f870de6..b53e19a11e9f6 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -2705,7 +2705,9 @@ def loggers(self, loggers: Optional[List[Logger]]) -> None: self._loggers = loggers if loggers else [] @property - def callback_metrics(self) -> dict: + def callback_metrics(self) -> Dict[str, Tensor]: + # TODO: the true typing return can include dictionaries as defined in + # `pytorch_lightning.trainer.connectors.logger_connector.result._OUT_DICT` return self._logger_connector.callback_metrics @property diff --git a/src/pytorch_lightning/utilities/distributed.py b/src/pytorch_lightning/utilities/distributed.py index bc7ed3debaf90..361c6dd12beeb 100644 --- a/src/pytorch_lightning/utilities/distributed.py +++ b/src/pytorch_lightning/utilities/distributed.py @@ -99,7 +99,7 @@ def gather_all_tensors(result: Tensor, group: Optional[Any] = None) -> List[Tens return gathered_result -def _simple_gather_all_tensors(result: torch.Tensor, group: Any, world_size: int) -> List[torch.Tensor]: +def _simple_gather_all_tensors(result: Tensor, group: Any, world_size: int) -> List[Tensor]: gathered_result = [torch.zeros_like(result) for _ in range(world_size)] torch.distributed.all_gather(gathered_result, result, group) return gathered_result From 588e831df1512935bd5c7676a8be875118b72a75 Mon Sep 17 00:00:00 2001 From: Nathaniel D'Amours <88633026+NathanielDamours@users.noreply.github.com> Date: Wed, 20 Jul 2022 19:29:31 +0000 Subject: [PATCH 24/35] Fix typos initialize in docs (#13557) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí Co-authored-by: Adrian Wälchli --- docs/source-app/code_samples/quickstart/app/app_1.py | 2 +- docs/source-app/code_samples/quickstart/hello_world/app.py | 2 +- docs/source-app/get_started/go_beyond_training_content.rst | 2 +- docs/source-pytorch/common/checkpointing_basic.rst | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source-app/code_samples/quickstart/app/app_1.py b/docs/source-app/code_samples/quickstart/app/app_1.py index ac41c5ef83fa1..29d8db245a170 100644 --- a/docs/source-app/code_samples/quickstart/app/app_1.py +++ b/docs/source-app/code_samples/quickstart/app/app_1.py @@ -88,5 +88,5 @@ def run(self): # Step 4: download a dataset to your local directory under `/data` download_data("https://pl-flash-data.s3.amazonaws.com/hymenoptera_data.zip", "./data") -# Initalize your Lightning app with 5 epochs +# Initialize your Lightning app with 5 epochs app = L.LightningApp(RootFlow(5, "./data/hymenoptera_data")) diff --git a/docs/source-app/code_samples/quickstart/hello_world/app.py b/docs/source-app/code_samples/quickstart/hello_world/app.py index 7514d9085604b..07a764cb98538 100644 --- a/docs/source-app/code_samples/quickstart/hello_world/app.py +++ b/docs/source-app/code_samples/quickstart/hello_world/app.py @@ -12,5 +12,5 @@ def run(self): print("Hello World!") -# Step 3: Initalize a LightningApp with the LightningFlow you defined (in step 1) +# Step 3: Initialize a LightningApp with the LightningFlow you defined (in step 1) app = L.LightningApp(HelloWorld()) diff --git a/docs/source-app/get_started/go_beyond_training_content.rst b/docs/source-app/get_started/go_beyond_training_content.rst index a471e91d85a9c..c8baeb8fab77b 100644 --- a/docs/source-app/get_started/go_beyond_training_content.rst +++ 
b/docs/source-app/get_started/go_beyond_training_content.rst @@ -308,7 +308,7 @@ Implement the ``configure_layout`` method to connect them together: 5: Init the ``app`` object ^^^^^^^^^^^^^^^^^^^^^^^^^^ -Initalize an ``app`` object with the ``TrainDeploy`` component (this won't run the App yet): +Initialize an ``app`` object with the ``TrainDeploy`` component (this won't run the App yet): .. code:: python :emphasize-lines: 29 diff --git a/docs/source-pytorch/common/checkpointing_basic.rst b/docs/source-pytorch/common/checkpointing_basic.rst index 6ff54c94245d2..8a4834096c44d 100644 --- a/docs/source-pytorch/common/checkpointing_basic.rst +++ b/docs/source-pytorch/common/checkpointing_basic.rst @@ -106,8 +106,8 @@ The LightningModule also has access to the Hyperparameters ---- -Initalize with other parameters -=============================== +Initialize with other parameters +================================ If you used the *self.save_hyperparameters()* method in the init of the LightningModule, you can initialize the model with different hyperparameters. .. code-block:: python From 94cb5906983e2cbaea96b321efaeea7c06d1c8b0 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Thu, 21 Jul 2022 02:03:00 +0530 Subject: [PATCH 25/35] Fix main progress bar counter when `val_check_interval=int` and `check_val_every_n_epoch=None` (#12832) --- docs/source-pytorch/common/trainer.rst | 10 ++++-- src/pytorch_lightning/CHANGELOG.md | 5 ++- .../callbacks/progress/base.py | 21 ++++++++++++ .../callbacks/progress/rich_progress.py | 12 ++----- .../callbacks/progress/tqdm_progress.py | 8 +---- .../loops/epoch/training_epoch_loop.py | 13 ++++--- src/pytorch_lightning/trainer/trainer.py | 3 +- .../callbacks/progress/test_base_progress.py | 34 +++++++++++++++++++ .../{ => progress}/test_rich_progress_bar.py | 0 .../{ => progress}/test_tqdm_progress_bar.py | 0 .../trainer/flags/test_val_check_interval.py | 11 +++--- 11 files changed, 85 insertions(+), 32 deletions(-) create mode 100644 tests/tests_pytorch/callbacks/progress/test_base_progress.py rename tests/tests_pytorch/callbacks/{ => progress}/test_rich_progress_bar.py (100%) rename tests/tests_pytorch/callbacks/{ => progress}/test_tqdm_progress_bar.py (100%) diff --git a/docs/source-pytorch/common/trainer.rst b/docs/source-pytorch/common/trainer.rst index b7c4c21018dcc..c3251917f8d3b 100644 --- a/docs/source-pytorch/common/trainer.rst +++ b/docs/source-pytorch/common/trainer.rst @@ -1479,7 +1479,8 @@ How often within one training epoch to check the validation set. Can specify as float or int. - pass a ``float`` in the range [0.0, 1.0] to check after a fraction of the training epoch. -- pass an ``int`` to check after a fixed number of training batches. +- pass an ``int`` to check after a fixed number of training batches. An ``int`` value can only be higher than the number of training + batches when ``check_val_every_n_epoch=None``, which validates after every ``N`` training batches across epochs or iteration-based training. .. testcode:: @@ -1489,10 +1490,13 @@ Can specify as float or int. 
# check validation set 4 times during a training epoch trainer = Trainer(val_check_interval=0.25) - # check validation set every 1000 training batches + # check validation set every 1000 training batches in the current epoch + trainer = Trainer(val_check_interval=1000) + + # check validation set every 1000 training batches across complete epochs or during iteration-based training # use this when using iterableDataset and your dataset has no length # (ie: production cases with streaming data) - trainer = Trainer(val_check_interval=1000) + trainer = Trainer(val_check_interval=1000, check_val_every_n_epoch=None) .. code-block:: python diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 6aed707726079..66c249db1456a 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -144,7 +144,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Enabled using any Sampler in distributed environment in Lite ([#13646](https://github.com/PyTorchLightning/pytorch-lightning/pull/13646)) -- +- Updated `val_check_interval` (int) to consider total train batches processed instead of `_batches_that_stepped` for validation check during training ([#12832](https://github.com/Lightning-AI/lightning/pull/12832)) ### Deprecated @@ -345,6 +345,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed `Trainer.predict(return_predictions=False)` to track prediction's batch_indices ([#13629](https://github.com/Lightning-AI/lightning/pull/13629)) +- Fixed main progress bar counter when `val_check_interval=int` and `check_val_every_n_epoch=None` ([#12832](https://github.com/Lightning-AI/lightning/pull/12832)) + + ## [1.6.5] - 2022-07-13 ### Fixed diff --git a/src/pytorch_lightning/callbacks/progress/base.py b/src/pytorch_lightning/callbacks/progress/base.py index 91a662455545f..8baab0c65fccd 100644 --- a/src/pytorch_lightning/callbacks/progress/base.py +++ b/src/pytorch_lightning/callbacks/progress/base.py @@ -172,6 +172,27 @@ def total_val_batches(self) -> Union[int, float]: assert self._trainer is not None return sum(self.trainer.num_val_batches) if self._trainer.fit_loop.epoch_loop._should_check_val_epoch() else 0 + @property + def total_batches_current_epoch(self) -> Union[int, float]: + total_train_batches = self.total_train_batches + total_val_batches = self.total_val_batches + assert self._trainer is not None + + if total_train_batches != float("inf") and total_val_batches != float("inf"): + # val can be checked multiple times per epoch + val_check_batch = self.trainer.val_check_batch + if self.trainer.check_val_every_n_epoch is None: + train_batches_processed = self.trainer.fit_loop.total_batch_idx + 1 + val_checks_per_epoch = ((train_batches_processed + total_train_batches) // val_check_batch) - ( + train_batches_processed // val_check_batch + ) + else: + val_checks_per_epoch = total_train_batches // val_check_batch + + total_val_batches = total_val_batches * val_checks_per_epoch + + return total_train_batches + total_val_batches + def has_dataloader_changed(self, dataloader_idx: int) -> bool: old_dataloader_idx = self._current_eval_dataloader_idx self._current_eval_dataloader_idx = dataloader_idx diff --git a/src/pytorch_lightning/callbacks/progress/rich_progress.py b/src/pytorch_lightning/callbacks/progress/rich_progress.py index d2cdb8411b3d2..ac27397640d4c 100644 --- a/src/pytorch_lightning/callbacks/progress/rich_progress.py +++ 
b/src/pytorch_lightning/callbacks/progress/rich_progress.py @@ -324,16 +324,9 @@ def on_sanity_check_end(self, trainer, pl_module): self.refresh() def on_train_epoch_start(self, trainer, pl_module): - total_train_batches = self.total_train_batches - total_val_batches = self.total_val_batches - if total_train_batches != float("inf"): - # val can be checked multiple times per epoch - val_checks_per_epoch = total_train_batches // trainer.val_check_batch - total_val_batches = total_val_batches * val_checks_per_epoch - - total_batches = total_train_batches + total_val_batches - + total_batches = self.total_batches_current_epoch train_description = self._get_train_description(trainer.current_epoch) + if self.main_progress_bar_id is not None and self._leave: self._stop_progress() self._init_progress(trainer) @@ -343,6 +336,7 @@ def on_train_epoch_start(self, trainer, pl_module): self.progress.reset( self.main_progress_bar_id, total=total_batches, description=train_description, visible=True ) + self.refresh() def on_validation_batch_start( diff --git a/src/pytorch_lightning/callbacks/progress/tqdm_progress.py b/src/pytorch_lightning/callbacks/progress/tqdm_progress.py index 204ee574e11a4..ff203c666216f 100644 --- a/src/pytorch_lightning/callbacks/progress/tqdm_progress.py +++ b/src/pytorch_lightning/callbacks/progress/tqdm_progress.py @@ -252,13 +252,7 @@ def on_train_start(self, *_: Any) -> None: self.main_progress_bar = self.init_train_tqdm() def on_train_epoch_start(self, trainer: "pl.Trainer", *_: Any) -> None: - total_train_batches = self.total_train_batches - total_val_batches = self.total_val_batches - if total_train_batches != float("inf") and total_val_batches != float("inf"): - # val can be checked multiple times per epoch - val_checks_per_epoch = total_train_batches // trainer.val_check_batch - total_val_batches = total_val_batches * val_checks_per_epoch - total_batches = total_train_batches + total_val_batches + total_batches = self.total_batches_current_epoch self.main_progress_bar.reset(convert_inf(total_batches)) self.main_progress_bar.set_description(f"Epoch {trainer.current_epoch}") diff --git a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py index 33ee2ce484d8b..039ff51f0596b 100644 --- a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -163,7 +163,7 @@ def advance(self, data_fetcher: AbstractDataFetcher) -> None: # type: ignore[ov Raises: StopIteration: When the epoch is canceled by the user returning -1 """ - if self.restarting and self._should_check_val_fx(self.batch_idx, self.batch_progress.is_last_batch): + if self.restarting and self._should_check_val_fx(): # skip training and run validation in `on_advance_end` return # we are going to train first so the val loop does not need to restart @@ -235,7 +235,7 @@ def on_advance_end(self) -> None: # ----------------------------------------- # VALIDATE IF NEEDED # ----------------------------------------- - should_check_val = self._should_check_val_fx(self.batch_idx, self.batch_progress.is_last_batch) + should_check_val = self._should_check_val_fx() if should_check_val: self.trainer.validating = True self._run_validation() @@ -496,13 +496,14 @@ def _should_check_val_epoch(self) -> bool: or (self.trainer.current_epoch + 1) % self.trainer.check_val_every_n_epoch == 0 ) - def _should_check_val_fx(self, batch_idx: int, is_last_batch: bool) -> bool: + def _should_check_val_fx(self) -> bool: 
"""Decide if we should run validation.""" if not self._should_check_val_epoch(): return False # val_check_batch is inf for iterable datasets with no length defined is_infinite_dataset = self.trainer.val_check_batch == float("inf") + is_last_batch = self.batch_progress.is_last_batch if is_last_batch and is_infinite_dataset: return True @@ -512,13 +513,11 @@ def _should_check_val_fx(self, batch_idx: int, is_last_batch: bool) -> bool: # TODO(@awaelchli): let training/eval loop handle logic around limit_*_batches and val_check_batch is_val_check_batch = is_last_batch if isinstance(self.trainer.limit_train_batches, int) and is_infinite_dataset: - is_val_check_batch = (batch_idx + 1) % self.trainer.limit_train_batches == 0 + is_val_check_batch = (self.batch_idx + 1) % self.trainer.limit_train_batches == 0 elif self.trainer.val_check_batch != float("inf"): # if `check_val_every_n_epoch is `None`, run a validation loop every n training batches # else condition it based on the batch_idx of the current epoch - current_iteration = ( - self._batches_that_stepped if self.trainer.check_val_every_n_epoch is None else batch_idx - ) + current_iteration = self.total_batch_idx if self.trainer.check_val_every_n_epoch is None else self.batch_idx is_val_check_batch = (current_iteration + 1) % self.trainer.val_check_batch == 0 return is_val_check_batch diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index b53e19a11e9f6..46e991d1bbbab 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -394,7 +394,8 @@ def __init__( val_check_interval: How often to check the validation set. Pass a ``float`` in the range [0.0, 1.0] to check after a fraction of the training epoch. Pass an ``int`` to check after a fixed number of training batches. An ``int`` value can only be higher than the number of training batches when - ``check_val_every_n_epoch=None``. + ``check_val_every_n_epoch=None``, which validates after every ``N`` training batches + across epochs or during iteration-based training. Default: ``1.0``. enable_model_summary: Whether to enable model summarization by default. diff --git a/tests/tests_pytorch/callbacks/progress/test_base_progress.py b/tests/tests_pytorch/callbacks/progress/test_base_progress.py new file mode 100644 index 0000000000000..75f276a6b913d --- /dev/null +++ b/tests/tests_pytorch/callbacks/progress/test_base_progress.py @@ -0,0 +1,34 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from pytorch_lightning.demos.boring_classes import BoringModel +from pytorch_lightning.trainer.trainer import Trainer + + +def test_main_progress_bar_with_val_check_interval_int(): + """Test the main progress bar count when val_check_interval=int and check_val_every_n_epoch=None.""" + train_batches = 5 + trainer = Trainer( + limit_train_batches=train_batches, limit_val_batches=10, val_check_interval=3, check_val_every_n_epoch=None + ) + model = BoringModel() + trainer.progress_bar_callback.setup(trainer, model) + trainer.strategy.connect(model) + trainer._data_connector.attach_data(model) + trainer.reset_train_dataloader() + trainer.reset_val_dataloader() + expected = [15, 25, 25, 15] + + for count in expected: + assert trainer.progress_bar_callback.total_batches_current_epoch == count + trainer.fit_loop.epoch_loop.batch_progress.total.ready += train_batches diff --git a/tests/tests_pytorch/callbacks/test_rich_progress_bar.py b/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py similarity index 100% rename from tests/tests_pytorch/callbacks/test_rich_progress_bar.py rename to tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py diff --git a/tests/tests_pytorch/callbacks/test_tqdm_progress_bar.py b/tests/tests_pytorch/callbacks/progress/test_tqdm_progress_bar.py similarity index 100% rename from tests/tests_pytorch/callbacks/test_tqdm_progress_bar.py rename to tests/tests_pytorch/callbacks/progress/test_tqdm_progress_bar.py diff --git a/tests/tests_pytorch/trainer/flags/test_val_check_interval.py b/tests/tests_pytorch/trainer/flags/test_val_check_interval.py index b0559b9daa11f..9414fd1c5096f 100644 --- a/tests/tests_pytorch/trainer/flags/test_val_check_interval.py +++ b/tests/tests_pytorch/trainer/flags/test_val_check_interval.py @@ -63,10 +63,12 @@ def test_val_check_interval_info_message(caplog, value): @pytest.mark.parametrize("use_infinite_dataset", [True, False]) -def test_validation_check_interval_exceed_data_length_correct(tmpdir, use_infinite_dataset): +@pytest.mark.parametrize("accumulate_grad_batches", [1, 2]) +def test_validation_check_interval_exceed_data_length_correct(tmpdir, use_infinite_dataset, accumulate_grad_batches): data_samples_train = 4 max_epochs = 3 max_steps = data_samples_train * max_epochs + max_opt_steps = max_steps // accumulate_grad_batches class TestModel(BoringModel): def __init__(self): @@ -74,7 +76,7 @@ def __init__(self): self.validation_called_at_step = set() def validation_step(self, *args): - self.validation_called_at_step.add(self.global_step) + self.validation_called_at_step.add(self.trainer.fit_loop.total_batch_idx + 1) return super().validation_step(*args) def train_dataloader(self): @@ -89,16 +91,17 @@ def train_dataloader(self): trainer = Trainer( default_root_dir=tmpdir, limit_val_batches=1, - max_steps=max_steps, + max_steps=max_opt_steps, val_check_interval=3, check_val_every_n_epoch=None, num_sanity_val_steps=0, + accumulate_grad_batches=accumulate_grad_batches, ) trainer.fit(model) assert trainer.current_epoch == 1 if use_infinite_dataset else max_epochs - assert trainer.global_step == max_steps + assert trainer.global_step == max_opt_steps assert sorted(list(model.validation_called_at_step)) == [3, 6, 9, 12] From 054bf136e594e67dcc5f80012501832ca9713dd2 Mon Sep 17 00:00:00 2001 From: Justin Goheen <26209687+JustinGoheen@users.noreply.github.com> Date: Wed, 20 Jul 2022 20:01:10 -0400 Subject: [PATCH 26/35] Fix mypy errors attributed to `pytorch_lightning.loggers.tensorboard.py` (#13688) MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli Co-authored-by: Rohit Gupta Co-authored-by: Carlos Mocholí --- pyproject.toml | 1 - src/pytorch_lightning/loggers/tensorboard.py | 18 +++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b6cbbbda15006..2820ee1334884 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,7 +60,6 @@ module = [ "pytorch_lightning.loggers.comet", "pytorch_lightning.loggers.mlflow", "pytorch_lightning.loggers.neptune", - "pytorch_lightning.loggers.tensorboard", "pytorch_lightning.loggers.wandb", "pytorch_lightning.profilers.advanced", "pytorch_lightning.profilers.base", diff --git a/src/pytorch_lightning/loggers/tensorboard.py b/src/pytorch_lightning/loggers/tensorboard.py index 9f3023ee443ef..12ec2e21b84ce 100644 --- a/src/pytorch_lightning/loggers/tensorboard.py +++ b/src/pytorch_lightning/loggers/tensorboard.py @@ -96,7 +96,7 @@ def __init__( sub_dir: Optional[str] = None, agg_key_funcs: Optional[Mapping[str, Callable[[Sequence[float]], float]]] = None, agg_default_func: Optional[Callable[[Sequence[float]], float]] = None, - **kwargs, + **kwargs: Any, ): super().__init__(agg_key_funcs=agg_key_funcs, agg_default_func=agg_default_func) self._save_dir = save_dir @@ -108,8 +108,8 @@ def __init__( self._prefix = prefix self._fs = get_filesystem(save_dir) - self._experiment = None - self.hparams = {} + self._experiment: Optional["SummaryWriter"] = None + self.hparams: Union[Dict[str, Any], Namespace] = {} self._kwargs = kwargs @property @@ -138,7 +138,7 @@ def log_dir(self) -> str: return log_dir @property - def save_dir(self) -> Optional[str]: + def save_dir(self) -> str: """Gets the save directory where the TensorBoard experiments are saved. Returns: @@ -155,7 +155,7 @@ def sub_dir(self) -> Optional[str]: """ return self._sub_dir - @property + @property # type: ignore[misc] @rank_zero_experiment def experiment(self) -> SummaryWriter: r""" @@ -236,7 +236,7 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> raise ValueError(m) from ex @rank_zero_only - def log_graph(self, model: "pl.LightningModule", input_array=None): + def log_graph(self, model: "pl.LightningModule", input_array: Optional[Tensor] = None) -> None: if self._log_graph: if input_array is None: input_array = model.example_input_array @@ -281,7 +281,7 @@ def name(self) -> str: return self._name @property - def version(self) -> int: + def version(self) -> Union[int, str]: """Get the experiment version. 
Returns: @@ -291,7 +291,7 @@ def version(self) -> int: self._version = self._get_next_version() return self._version - def _get_next_version(self): + def _get_next_version(self) -> int: root_dir = self.root_dir try: @@ -318,7 +318,7 @@ def _sanitize_params(params: Dict[str, Any]) -> Dict[str, Any]: # logging of arrays with dimension > 1 is not supported, sanitize as string return {k: str(v) if isinstance(v, (Tensor, np.ndarray)) and v.ndim > 1 else v for k, v in params.items()} - def __getstate__(self): + def __getstate__(self) -> Dict[str, Any]: state = self.__dict__.copy() state["_experiment"] = None return state From e36fd771e62955c471b117f2861892ff66e06885 Mon Sep 17 00:00:00 2001 From: Justin Goheen <26209687+JustinGoheen@users.noreply.github.com> Date: Wed, 20 Jul 2022 20:03:36 -0400 Subject: [PATCH 27/35] Fix mypy errors attributed to `pytorch_lightning.loggers.mlflow` (#13691) Co-authored-by: Jirka Borovec Co-authored-by: otaj <6065855+otaj@users.noreply.github.com> --- pyproject.toml | 1 - src/pytorch_lightning/loggers/logger.py | 4 ++-- src/pytorch_lightning/loggers/mlflow.py | 24 +++++++++++++++++------- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2820ee1334884..50b67dec5a04b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,7 +58,6 @@ module = [ "pytorch_lightning.demos.boring_classes", "pytorch_lightning.demos.mnist_datamodule", "pytorch_lightning.loggers.comet", - "pytorch_lightning.loggers.mlflow", "pytorch_lightning.loggers.neptune", "pytorch_lightning.loggers.wandb", "pytorch_lightning.profilers.advanced", diff --git a/src/pytorch_lightning/loggers/logger.py b/src/pytorch_lightning/loggers/logger.py index 03d934aa58760..56bf4660c29dd 100644 --- a/src/pytorch_lightning/loggers/logger.py +++ b/src/pytorch_lightning/loggers/logger.py @@ -203,12 +203,12 @@ def group_separator(self) -> str: @property @abstractmethod - def name(self) -> str: + def name(self) -> Optional[str]: """Return the experiment name.""" @property @abstractmethod - def version(self) -> Union[int, str]: + def version(self) -> Optional[Union[int, str]]: """Return the experiment version.""" diff --git a/src/pytorch_lightning/loggers/mlflow.py b/src/pytorch_lightning/loggers/mlflow.py index b8ce0ef423a31..313fcfe07f10e 100644 --- a/src/pytorch_lightning/loggers/mlflow.py +++ b/src/pytorch_lightning/loggers/mlflow.py @@ -50,7 +50,17 @@ from mlflow.tracking.context.registry import resolve_tags else: - def resolve_tags(tags=None): + def resolve_tags(tags: Optional[Dict] = None) -> Optional[Dict]: + """ + Args: + tags: A dictionary of tags to override. If specified, tags passed in this argument will + override those inferred from the context. + + Returns: A dictionary of resolved tags. + + Note: + See ``mlflow.tracking.context.registry`` for more details. 
+ """ return tags @@ -129,7 +139,7 @@ def __init__( tracking_uri = f"{LOCAL_FILE_URI_PREFIX}{save_dir}" self._experiment_name = experiment_name - self._experiment_id = None + self._experiment_id: Optional[str] = None self._tracking_uri = tracking_uri self._run_name = run_name self._run_id = run_id @@ -141,7 +151,7 @@ def __init__( self._mlflow_client = MlflowClient(tracking_uri) - @property + @property # type: ignore[misc] @rank_zero_experiment def experiment(self) -> MlflowClient: r""" @@ -187,7 +197,7 @@ def experiment(self) -> MlflowClient: return self._mlflow_client @property - def run_id(self) -> str: + def run_id(self) -> Optional[str]: """Create the experiment if it does not exist to get the run id. Returns: @@ -197,7 +207,7 @@ def run_id(self) -> str: return self._run_id @property - def experiment_id(self) -> str: + def experiment_id(self) -> Optional[str]: """Create the experiment if it does not exist to get the experiment id. Returns: @@ -261,7 +271,7 @@ def save_dir(self) -> Optional[str]: return self._tracking_uri.lstrip(LOCAL_FILE_URI_PREFIX) @property - def name(self) -> str: + def name(self) -> Optional[str]: """Get the experiment id. Returns: @@ -270,7 +280,7 @@ def name(self) -> str: return self.experiment_id @property - def version(self) -> str: + def version(self) -> Optional[str]: """Get the run id. Returns: From 86341ba660f22657dffe0599c021f2e87429400c Mon Sep 17 00:00:00 2001 From: Gautier Dagan Date: Thu, 21 Jul 2022 03:07:24 +0200 Subject: [PATCH 28/35] fix mypy errors for loggers/wandb.py (#13483) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí Co-authored-by: Rohit Gupta Co-authored-by: Akihiro Nitta --- environment.yml | 2 +- pyproject.toml | 1 - requirements/pytorch/extra.txt | 2 +- requirements/pytorch/loggers.txt | 2 +- src/pytorch_lightning/loggers/wandb.py | 35 ++++++++++++----------- src/pytorch_lightning/utilities/cli.py | 2 +- tests/tests_pytorch/loggers/test_all.py | 5 +++- tests/tests_pytorch/loggers/test_wandb.py | 29 +++++++++++-------- 8 files changed, 43 insertions(+), 35 deletions(-) diff --git a/environment.yml b/environment.yml index d6f885f00bc72..f26e93031770e 100644 --- a/environment.yml +++ b/environment.yml @@ -50,5 +50,5 @@ dependencies: - test-tube>=0.7.5 - mlflow>=1.0.0 - comet_ml>=3.1.12 - - wandb>=0.8.21 + - wandb>=0.10.22 - neptune-client>=0.10.0 diff --git a/pyproject.toml b/pyproject.toml index 50b67dec5a04b..eb9b025e36811 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,6 @@ module = [ "pytorch_lightning.demos.mnist_datamodule", "pytorch_lightning.loggers.comet", "pytorch_lightning.loggers.neptune", - "pytorch_lightning.loggers.wandb", "pytorch_lightning.profilers.advanced", "pytorch_lightning.profilers.base", "pytorch_lightning.profilers.pytorch", diff --git a/requirements/pytorch/extra.txt b/requirements/pytorch/extra.txt index 7d13068d814a8..440a80594d5d9 100644 --- a/requirements/pytorch/extra.txt +++ b/requirements/pytorch/extra.txt @@ -3,6 +3,6 @@ matplotlib>3.1, <3.5.3 torchtext>=0.10.*, <=0.12.0 omegaconf>=2.0.5, <2.3.0 hydra-core>=1.0.5, <1.3.0 -jsonargparse[signatures]>=4.10.0, <=4.10.0 +jsonargparse[signatures]>=4.10.2, <=4.10.2 gcsfs>=2021.5.0, <2022.6.0 rich>=10.14.0, !=10.15.0.a, <13.0.0 diff --git a/requirements/pytorch/loggers.txt b/requirements/pytorch/loggers.txt index 2abcb4b2df31f..a857ab5660d54 100644 --- a/requirements/pytorch/loggers.txt +++ b/requirements/pytorch/loggers.txt @@ -4,4 +4,4 @@ neptune-client>=0.10.0, <0.16.4 
comet-ml>=3.1.12, <3.31.6 mlflow>=1.0.0, <1.27.0 test_tube>=0.7.5, <=0.7.5 -wandb>=0.8.21, <0.12.20 +wandb>=0.10.22, <0.12.20 diff --git a/src/pytorch_lightning/loggers/wandb.py b/src/pytorch_lightning/loggers/wandb.py index 88439cd9435db..53fbd2b1097f8 100644 --- a/src/pytorch_lightning/loggers/wandb.py +++ b/src/pytorch_lightning/loggers/wandb.py @@ -32,10 +32,11 @@ try: import wandb + from wandb.sdk.lib import RunDisabled from wandb.wandb_run import Run except ModuleNotFoundError: # needed for test mocks, these tests shall be updated - wandb, Run = None, None + wandb, Run, RunDisabled = None, None, None # type: ignore class WandbLogger(Logger): @@ -251,18 +252,18 @@ def __init__( self, name: Optional[str] = None, save_dir: Optional[str] = None, - offline: Optional[bool] = False, + offline: bool = False, id: Optional[str] = None, anonymous: Optional[bool] = None, version: Optional[str] = None, project: Optional[str] = None, log_model: Union[str, bool] = False, - experiment=None, - prefix: Optional[str] = "", + experiment: Union[Run, RunDisabled, None] = None, + prefix: str = "", agg_key_funcs: Optional[Mapping[str, Callable[[Sequence[float]], float]]] = None, agg_default_func: Optional[Callable[[Sequence[float]], float]] = None, - **kwargs, - ): + **kwargs: Any, + ) -> None: if wandb is None: raise ModuleNotFoundError( "You want to use `wandb` logger which is not installed yet," @@ -288,17 +289,16 @@ def __init__( self._log_model = log_model self._prefix = prefix self._experiment = experiment - self._logged_model_time = {} - self._checkpoint_callback = None + self._logged_model_time: Dict[str, float] = {} + self._checkpoint_callback: Optional["ReferenceType[Checkpoint]"] = None # set wandb init arguments - anonymous_lut = {True: "allow", False: None} - self._wandb_init = dict( + self._wandb_init: Dict[str, Any] = dict( name=name or project, project=project, id=version or id, dir=save_dir, resume="allow", - anonymous=anonymous_lut.get(anonymous, anonymous), + anonymous=("allow" if anonymous else None), ) self._wandb_init.update(**kwargs) # extract parameters @@ -310,7 +310,7 @@ def __init__( wandb.require("service") _ = self.experiment - def __getstate__(self): + def __getstate__(self) -> Dict[str, Any]: state = self.__dict__.copy() # args needed to reload correct experiment if self._experiment is not None: @@ -322,7 +322,7 @@ def __getstate__(self): state["_experiment"] = None return state - @property + @property # type: ignore[misc] @rank_zero_experiment def experiment(self) -> Run: r""" @@ -357,13 +357,14 @@ def experiment(self) -> Run: self._experiment = wandb.init(**self._wandb_init) # define default x-axis - if getattr(self._experiment, "define_metric", None): + if isinstance(self._experiment, Run) and getattr(self._experiment, "define_metric", None): self._experiment.define_metric("trainer/global_step") self._experiment.define_metric("*", step_metric="trainer/global_step", step_sync=True) + assert isinstance(self._experiment, Run) return self._experiment - def watch(self, model: nn.Module, log: str = "gradients", log_freq: int = 100, log_graph: bool = True): + def watch(self, model: nn.Module, log: str = "gradients", log_freq: int = 100, log_graph: bool = True) -> None: self.experiment.watch(model, log=log, log_freq=log_freq, log_graph=log_graph) @rank_zero_only @@ -379,7 +380,7 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> metrics = _add_prefix(metrics, self._prefix, self.LOGGER_JOIN_CHAR) if step is not None: - self.experiment.log({**metrics, 
"trainer/global_step": step}) + self.experiment.log(dict(metrics, **{"trainer/global_step": step})) else: self.experiment.log(metrics) @@ -417,7 +418,7 @@ def log_text( self.log_table(key, columns, data, dataframe, step) @rank_zero_only - def log_image(self, key: str, images: List[Any], step: Optional[int] = None, **kwargs: str) -> None: + def log_image(self, key: str, images: List[Any], step: Optional[int] = None, **kwargs: Any) -> None: """Log images (tensors, numpy arrays, PIL Images or file paths). Optional kwargs are lists passed to each image (ex: caption, masks, boxes). diff --git a/src/pytorch_lightning/utilities/cli.py b/src/pytorch_lightning/utilities/cli.py index f9d3375a6c6d8..a66cd6c0899cd 100644 --- a/src/pytorch_lightning/utilities/cli.py +++ b/src/pytorch_lightning/utilities/cli.py @@ -31,7 +31,7 @@ from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.rank_zero import _warn, rank_zero_deprecation, rank_zero_warn -_JSONARGPARSE_SIGNATURES_AVAILABLE = _RequirementAvailable("jsonargparse[signatures]>=4.10.0") +_JSONARGPARSE_SIGNATURES_AVAILABLE = _RequirementAvailable("jsonargparse[signatures]>=4.10.2") if _JSONARGPARSE_SIGNATURES_AVAILABLE: import docstring_parser diff --git a/tests/tests_pytorch/loggers/test_all.py b/tests/tests_pytorch/loggers/test_all.py index 96d1016cc612b..d613296abccf5 100644 --- a/tests/tests_pytorch/loggers/test_all.py +++ b/tests/tests_pytorch/loggers/test_all.py @@ -45,6 +45,7 @@ mock.patch("pytorch_lightning.loggers.mlflow.MlflowClient"), mock.patch("pytorch_lightning.loggers.neptune.neptune", new_callable=create_neptune_mock), mock.patch("pytorch_lightning.loggers.wandb.wandb"), + mock.patch("pytorch_lightning.loggers.wandb.Run", new=mock.Mock), ) ALL_LOGGER_CLASSES = ( CometLogger, @@ -363,7 +364,9 @@ def test_logger_with_prefix_all(tmpdir, monkeypatch): logger.experiment.add_scalar.assert_called_once_with("tmp-test", 1.0, 0) # WandB - with mock.patch("pytorch_lightning.loggers.wandb.wandb") as wandb: + with mock.patch("pytorch_lightning.loggers.wandb.wandb") as wandb, mock.patch( + "pytorch_lightning.loggers.wandb.Run", new=mock.Mock + ): logger = _instantiate_logger(WandbLogger, save_dir=tmpdir, prefix=prefix) wandb.run = None wandb.init().step = 0 diff --git a/tests/tests_pytorch/loggers/test_wandb.py b/tests/tests_pytorch/loggers/test_wandb.py index f62ebff9e719a..48162e6d9d2e2 100644 --- a/tests/tests_pytorch/loggers/test_wandb.py +++ b/tests/tests_pytorch/loggers/test_wandb.py @@ -24,6 +24,7 @@ from tests_pytorch.helpers.utils import no_warning_call +@mock.patch("pytorch_lightning.loggers.wandb.Run", new=mock.Mock) @mock.patch("pytorch_lightning.loggers.wandb.wandb") def test_wandb_logger_init(wandb, monkeypatch): """Verify that basic functionality of wandb logger works. 
@@ -111,20 +112,21 @@ class Experiment: def name(self): return "the_run_name" - wandb.run = None - wandb.init.return_value = Experiment() - logger = WandbLogger(id="the_id", offline=True) + with mock.patch("pytorch_lightning.loggers.wandb.Run", new=Experiment): + wandb.run = None + wandb.init.return_value = Experiment() + logger = WandbLogger(id="the_id", offline=True) - trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, logger=logger) - # Access the experiment to ensure it's created - assert trainer.logger.experiment, "missing experiment" - assert trainer.log_dir == logger.save_dir - pkl_bytes = pickle.dumps(trainer) - trainer2 = pickle.loads(pkl_bytes) + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, logger=logger) + # Access the experiment to ensure it's created + assert trainer.logger.experiment, "missing experiment" + assert trainer.log_dir == logger.save_dir + pkl_bytes = pickle.dumps(trainer) + trainer2 = pickle.loads(pkl_bytes) - assert os.environ["WANDB_MODE"] == "dryrun" - assert trainer2.logger.__class__.__name__ == WandbLogger.__name__ - assert trainer2.logger.experiment, "missing experiment" + assert os.environ["WANDB_MODE"] == "dryrun" + assert trainer2.logger.__class__.__name__ == WandbLogger.__name__ + assert trainer2.logger.experiment, "missing experiment" wandb.init.assert_called() assert "id" in wandb.init.call_args[1] @@ -133,6 +135,7 @@ def name(self): del os.environ["WANDB_MODE"] +@mock.patch("pytorch_lightning.loggers.wandb.Run", new=mock.Mock) @mock.patch("pytorch_lightning.loggers.wandb.wandb") def test_wandb_logger_dirs_creation(wandb, monkeypatch, tmpdir): """Test that the logger creates the folders and files in the right place.""" @@ -169,6 +172,7 @@ def test_wandb_logger_dirs_creation(wandb, monkeypatch, tmpdir): assert trainer.log_dir == logger.save_dir +@mock.patch("pytorch_lightning.loggers.wandb.Run", new=mock.Mock) @mock.patch("pytorch_lightning.loggers.wandb.wandb") def test_wandb_log_model(wandb, monkeypatch, tmpdir): """Test that the logger creates the folders and files in the right place.""" @@ -234,6 +238,7 @@ def test_wandb_log_model(wandb, monkeypatch, tmpdir): ) +@mock.patch("pytorch_lightning.loggers.wandb.Run", new=mock.Mock) @mock.patch("pytorch_lightning.loggers.wandb.wandb") def test_wandb_log_media(wandb, tmpdir): """Test that the logger creates the folders and files in the right place.""" From 2181a163699b5802760fa23828d4f52f358f3de4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 21 Jul 2022 15:53:46 +0200 Subject: [PATCH 29/35] Fix gatekeeper minimum check (#13769) --- .github/gatekeeper-config_app.yml | 3 +-- .github/gatekeeper-config_pytorch.yml | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/gatekeeper-config_app.yml b/.github/gatekeeper-config_app.yml index 045a4f7c9e941..9e51c23458fc0 100644 --- a/.github/gatekeeper-config_app.yml +++ b/.github/gatekeeper-config_app.yml @@ -1,8 +1,7 @@ approvals: - # check will fail if there is no approval - minimum: 1 groups: - name: 'Lightning Apps' + minimum: 1 from: - alecmerdler - awaelchli diff --git a/.github/gatekeeper-config_pytorch.yml b/.github/gatekeeper-config_pytorch.yml index 8a7b4d02793ff..75201f796c9ac 100644 --- a/.github/gatekeeper-config_pytorch.yml +++ b/.github/gatekeeper-config_pytorch.yml @@ -1,8 +1,7 @@ approvals: - # check will fail if there is no approval - minimum: 1 groups: - name: 'PyTorch Lightning' + minimum: 1 from: - awaelchli - Borda From 9f257e50a96e0e4dcefb15ba05a5c56913028b49 Mon Sep 17 
00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Thu, 21 Jul 2022 20:07:09 +0200 Subject: [PATCH 30/35] changelog --- src/pytorch_lightning/CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 66c249db1456a..a7a9158e75dc6 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -102,6 +102,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed +- `accelerator="gpu"` now automatically selects an available GPU backend (CUDA and MPS currently) ([#13642](https://github.com/Lightning-AI/lightning/pull/13642)) - Enable validation during overfitting ([#12527](https://github.com/PyTorchLightning/pytorch-lightning/pull/12527)) From 323271c92a91d595d56450b2464ac21470b45c42 Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Thu, 21 Jul 2022 20:07:18 +0200 Subject: [PATCH 31/35] changelog --- src/pytorch_lightning/CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index a7a9158e75dc6..6a10f97201313 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -103,6 +103,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed - `accelerator="gpu"` now automatically selects an available GPU backend (CUDA and MPS currently) ([#13642](https://github.com/Lightning-AI/lightning/pull/13642)) + + - Enable validation during overfitting ([#12527](https://github.com/PyTorchLightning/pytorch-lightning/pull/12527)) @@ -150,6 +152,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Deprecated +- Deprecated `pytorch_lightning.accelerators.gpu.GPUAccelerator` in favor of `pytorch_lightning.accelerators.cuda.CUDAAccelerator` ([#13636](https://github.com/Lightning-AI/lightning/pull/13636)) + + - Deprecated `pytorch_lightning.loggers.base.LightningLoggerBase` in favor of `pytorch_lightning.loggers.logger.Logger`, and deprecated `pytorch_lightning.loggers.base` in favor of `pytorch_lightning.loggers.logger` ([#12014](https://github.com/PyTorchLightning/pytorch-lightning/pull/12014)) From 90c996bf0bad4087de0f705c2e0c7e447dd84d75 Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Fri, 22 Jul 2022 11:23:49 +0200 Subject: [PATCH 32/35] fix order --- .../trainer/connectors/accelerator_connector.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index 5d3f0d1b23e1a..a0f22b4b12642 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -507,11 +507,10 @@ def _choose_auto_accelerator(self) -> str: @staticmethod def _choose_gpu_accelerator_backend() -> str: - if CUDAAccelerator.is_available(): - return "cuda" - if MPSAccelerator.is_available(): return "mps" + if CUDAAccelerator.is_available(): + return "cuda" raise MisconfigurationException("No supported gpu backend found!") From a0c76b9dc9d5f60b859f5281fb9edb41288276da Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Fri, 22 Jul 2022 15:33:06 +0200 Subject: [PATCH 33/35] move up again --- .../trainer/connectors/accelerator_connector.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index a0f22b4b12642..f19a40996f266 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -186,10 +186,6 @@ def __init__( self._amp_level_flag: Optional[str] = amp_level self._auto_select_gpus: bool = auto_select_gpus - # handle "gpu" - if accelerator == "gpu": - accelerator = self._choose_gpu_accelerator_backend() - self._check_config_and_set_final_flags( strategy=strategy, accelerator=accelerator, @@ -205,9 +201,12 @@ def __init__( # 2. Instantiate Accelerator self._set_accelerator_if_ipu_strategy_is_passed() - # handle `auto` and `None` + # handle `auto`, `None` and `gpu` if self._accelerator_flag == "auto" or self._accelerator_flag is None: self._accelerator_flag = self._choose_auto_accelerator() + elif self._accelerator_flag == "gpu": + self._accelerator_flag = self._choose_gpu_accelerator_backend() + self._set_parallel_devices_and_init_accelerator() # 3. 
Instantiate ClusterEnvironment @@ -283,7 +282,7 @@ def _check_config_and_set_final_flags( if ( accelerator is not None and accelerator not in self._accelerator_types - and accelerator != "auto" + and accelerator not in ("auto", "gpu") and not isinstance(accelerator, Accelerator) ): raise ValueError( From c9dc3065d94b5153db41260785c2795b3dcbf7b9 Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Mon, 25 Jul 2022 11:17:12 +0200 Subject: [PATCH 34/35] add missing test --- .../accelerators/test_accelerator_connector.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/tests_pytorch/accelerators/test_accelerator_connector.py b/tests/tests_pytorch/accelerators/test_accelerator_connector.py index b0952c4cfeb41..345b8e23f6481 100644 --- a/tests/tests_pytorch/accelerators/test_accelerator_connector.py +++ b/tests/tests_pytorch/accelerators/test_accelerator_connector.py @@ -754,3 +754,10 @@ def test_gpu_accelerator_backend_choice_mps(*_): assert trainer._accelerator_connector._accelerator_flag == "mps" assert isinstance(trainer.accelerator, MPSAccelerator) + + +@mock.patch("pytorch_lightning.accelerators.mps.MPSAccelerator.is_available", return_value=False) +@mock.patch("pytorch_lightning.accelerators.cuda.CUDAAccelerator.is_available", return_value=False) +def test_gpu_accelerator_misconfiguration_exception(*_): + with pytest.raises(MisconfigurationException, match="No supported gpu backend found!"): + Trainer(accelerator="gpu") From fe66ab3bcae7c8eeece3d3c5ce189e6e39f795fe Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 25 Jul 2022 16:10:30 +0200 Subject: [PATCH 35/35] fix pickling issue --- tests/tests_pytorch/accelerators/test_accelerator_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tests_pytorch/accelerators/test_accelerator_connector.py b/tests/tests_pytorch/accelerators/test_accelerator_connector.py index 999f33fd712c8..06f088e87ea4d 100644 --- a/tests/tests_pytorch/accelerators/test_accelerator_connector.py +++ b/tests/tests_pytorch/accelerators/test_accelerator_connector.py @@ -767,7 +767,7 @@ def test_gpu_accelerator_backend_choice(expected_accelerator_flag, expected_acce assert isinstance(trainer.accelerator, expected_accelerator_class) -@mock.patch("torch.cuda.device_count", return_value=1) +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1) def test_gpu_accelerator_backend_choice_cuda(_): trainer = Trainer(accelerator="gpu")
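
Note on the progress-bar arithmetic in [PATCH 25/35]: the standalone Python sketch below mirrors the new `total_batches_current_epoch` property from `src/pytorch_lightning/callbacks/progress/base.py`, under the same configuration as the new `test_main_progress_bar_with_val_check_interval_int` (5 train batches, 10 val batches, `val_check_interval=3`, `check_val_every_n_epoch=None`). The module-level constants and the helper's name are illustrative assumptions, not Lightning API; only the floor-division formula is taken from the patch itself.

TRAIN_BATCHES = 5    # limit_train_batches
VAL_BATCHES = 10     # limit_val_batches
VAL_CHECK_BATCH = 3  # val_check_interval=3, with check_val_every_n_epoch=None


def total_batches_current_epoch(total_batch_idx: int) -> int:
    """Main progress bar total for the epoch that starts after ``total_batch_idx``.

    With ``check_val_every_n_epoch=None``, validation runs every
    ``VAL_CHECK_BATCH`` train batches counted *across* epochs, so the number of
    validation loops that land inside a given epoch depends on how many train
    batches were processed before it started.
    """
    processed = total_batch_idx + 1  # train batches already processed, globally
    val_checks_this_epoch = (processed + TRAIN_BATCHES) // VAL_CHECK_BATCH - processed // VAL_CHECK_BATCH
    return TRAIN_BATCHES + VAL_BATCHES * val_checks_this_epoch


if __name__ == "__main__":
    # total_batch_idx is -1 before training starts and grows by TRAIN_BATCHES
    # per epoch, matching how the test advances `batch_progress.total.ready`.
    totals = [total_batches_current_epoch(epoch * TRAIN_BATCHES - 1) for epoch in range(4)]
    print(totals)  # -> [15, 25, 25, 15], the `expected` list in the new test

Because validation is triggered every 3 train batches counted globally, the second and third epochs each absorb two validation loops while the first and fourth absorb one — which is why the test expects [15, 25, 25, 15] rather than a constant per-epoch total, and why the old per-epoch `val_checks_per_epoch = total_train_batches // val_check_batch` computation had to be replaced.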