diff --git a/CHANGELOG.md b/CHANGELOG.md index c1ac8a689ce15..f562e7fad0a49 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -111,6 +111,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Add support for calling scripts using the module syntax (`python -m package.script`) ([#8073](https://github.com/PyTorchLightning/pytorch-lightning/pull/8073)) +- Add torchelastic check when sanitizing GPUs ([#8095](https://github.com/PyTorchLightning/pytorch-lightning/pull/8095)) + + ### Changed diff --git a/pytorch_lightning/utilities/device_parser.py b/pytorch_lightning/utilities/device_parser.py index ecb5d6ac00a03..bfbd1847a3092 100644 --- a/pytorch_lightning/utilities/device_parser.py +++ b/pytorch_lightning/utilities/device_parser.py @@ -16,6 +16,7 @@ import torch +from pytorch_lightning.plugins.environments import TorchElasticEnvironment from pytorch_lightning.utilities import _TPU_AVAILABLE, rank_zero_deprecation from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _compare_version @@ -78,6 +79,11 @@ def parse_gpu_ids(gpus: Optional[Union[int, str, List[int]]]) -> Optional[List[i gpus = _normalize_parse_gpu_input_to_list(gpus) if not gpus: raise MisconfigurationException("GPUs requested but none are available.") + + if TorchElasticEnvironment.is_using_torchelastic() and len(gpus) != 1 and len(_get_all_available_gpus()) == 1: + # omit the sanity check under torchelastic, as by default it exposes one visible GPU per process + return gpus + gpus = _sanitize_gpu_ids(gpus) return gpus diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 65a1e093a9e96..cd7c90552ab2e 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import operator +import os from collections import namedtuple +from unittest import mock from unittest.mock import patch import pytest @@ -21,6 +23,7 @@ import tests.helpers.pipelines as tpipes import tests.helpers.utils as tutils from pytorch_lightning import Trainer +from pytorch_lightning.plugins.environments import TorchElasticEnvironment from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _compare_version @@ -219,6 +222,29 @@ def test_parse_gpu_returns_none_when_no_devices_are_available(mocked_device_coun device_parser.parse_gpu_ids(gpus) +@mock.patch.dict( + os.environ, { + "CUDA_VISIBLE_DEVICES": "0", + "LOCAL_RANK": "1", + "GROUP_RANK": "1", + "RANK": "3", + "WORLD_SIZE": "4", + "LOCAL_WORLD_SIZE": "2", + } +) +@mock.patch('torch.cuda.device_count', return_value=1) +@pytest.mark.parametrize("gpus", [[0, 1, 2], 2, '0']) +def test_torchelastic_gpu_parsing(mocked_device_count, gpus): + """ + Ensure that, when using torchelastic with nproc_per_node set to the default of 1 per GPU device, + we omit sanitizing the gpus, as only one of the GPUs is visible per process. + """ + trainer = Trainer(gpus=gpus) + assert isinstance(trainer.accelerator_connector.cluster_environment, TorchElasticEnvironment) + assert trainer.accelerator_connector.parallel_device_ids == device_parser.parse_gpu_ids(gpus) + assert trainer.gpus == gpus + + @RunIf(min_gpus=1) def test_single_gpu_batch_parse(): trainer = Trainer(gpus=1)