Skip to content

Commit

Permalink
cuda gpu devices clarification
Browse files Browse the repository at this point in the history
From the context, it should hopefully be clear that we are talking about nvidia cuda gpus
update error message


update
  • Loading branch information
awaelchli committed Dec 22, 2022
1 parent 38e8cb8 commit 6500156
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 13 deletions.
21 changes: 10 additions & 11 deletions src/lightning_lite/accelerators/cuda.py
Expand Up @@ -79,17 +79,17 @@ def register_accelerators(cls, accelerator_registry: Dict) -> None:


def find_usable_cuda_devices(num_devices: int = -1) -> List[int]:
"""Returns a list of all available and usable CUDA GPUs.
"""Returns a list of all available and usable CUDA GPU devices.
A GPU is considered usable if we can successfully move a tensor to the device, and this is what this function
tests for each GPU on the system until the target number of usable GPUs is found.
tests for each GPU on the system until the target number of usable devices is found.
A subset of GPUs on the system might be used by other processes, and if the GPU is configured to operate in
'exclusive' mode (configurable by the admin), then only one process is allowed to occupy it.
Args:
num_devices: The number of GPUs you want to request. By default, this function will return as many as there are
usable GPUs available.
num_devices: The number of devices you want to request. By default, this function will return as many as there
are usable CUDA GPU devices available.
Warning:
If multiple processes call this function at the same time, there can be race conditions in the case where
Expand All @@ -98,11 +98,11 @@ def find_usable_cuda_devices(num_devices: int = -1) -> List[int]:
visible_devices = _get_all_visible_cuda_devices()
if not visible_devices:
raise ValueError(
f"You requested to find {num_devices} GPUs but there are no visible CUDA devices on this machine."
f"You requested to find {num_devices} devices but there are no visible CUDA devices on this machine."
)
if num_devices > len(visible_devices):
raise ValueError(
f"You requested to find {num_devices} GPUs but this machine only has {len(visible_devices)} GPUs."
f"You requested to find {num_devices} devices but this machine only has {len(visible_devices)} GPUs."
)

available_devices = []
Expand All @@ -122,17 +122,16 @@ def find_usable_cuda_devices(num_devices: int = -1) -> List[int]:

if len(available_devices) != num_devices:
raise RuntimeError(
f"You requested to find {num_devices} GPUs but only {len(available_devices)} are currently available."
f" GPUs {unavailable_devices} are occupied by other processes and can't be"
" used at the moment."
f"You requested to find {num_devices} devices but only {len(available_devices)} are currently available."
f" The devices {unavailable_devices} are occupied by other processes and can't be used at the moment."
)
return available_devices


def _get_all_visible_cuda_devices() -> List[int]:
"""Returns a list of all visible CUDA GPUs.
"""Returns a list of all visible CUDA GPU devices.
GPUs masked by the environment variable ``CUDA_VISIBLE_DEVICES`` won't be returned here. For example, assume you
Devices masked by the environment variable ``CUDA_VISIBLE_DEVICES`` won't be returned here. For example, assume you
have 8 physical GPUs. If ``CUDA_VISIBLE_DEVICES="1,3,6"``, then this function will return the list ``[0, 1, 2]``
because these are the three visible GPUs after applying the mask ``CUDA_VISIBLE_DEVICES``.
"""
Expand Down
4 changes: 2 additions & 2 deletions tests/tests_lite/accelerators/test_cuda.py
Expand Up @@ -123,7 +123,7 @@ def test_find_usable_cuda_devices_error_handling():

# Asking for GPUs if no GPUs visible
with mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=0), pytest.raises(
ValueError, match="You requested to find 2 GPUs but there are no visible CUDA"
ValueError, match="You requested to find 2 devices but there are no visible CUDA"
):
find_usable_cuda_devices(2)

Expand All @@ -137,5 +137,5 @@ def test_find_usable_cuda_devices_error_handling():
tensor_mock = Mock(side_effect=RuntimeError) # simulate device placement fails
with mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=2), mock.patch(
"lightning_lite.accelerators.cuda.torch.tensor", tensor_mock
), pytest.raises(RuntimeError, match=escape("GPUs [0, 1] are occupied by other processes")):
), pytest.raises(RuntimeError, match=escape("The devices [0, 1] are occupied by other processes")):
find_usable_cuda_devices(2)

0 comments on commit 6500156

Please sign in to comment.