Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ repos:
rev: "3e8a8703264a2f4a69428a0aa4dcb512790b2c8c" # frozen: v6.0.0
hooks:
- id: check-added-large-files
exclude: cuda_bindings/cuda/bindings/nvml.pyx
- id: check-case-conflict
- id: check-docstring-first
- id: check-merge-conflict
Expand Down
360 changes: 360 additions & 0 deletions cuda_bindings/cuda/bindings/_internal/_nvml.pxd

Large diffs are not rendered by default.

7,400 changes: 7,400 additions & 0 deletions cuda_bindings/cuda/bindings/_internal/_nvml_linux.pyx

Large diffs are not rendered by default.

6,054 changes: 6,054 additions & 0 deletions cuda_bindings/cuda/bindings/_internal/_nvml_windows.pyx

Large diffs are not rendered by default.

447 changes: 447 additions & 0 deletions cuda_bindings/cuda/bindings/_nvml.pxd

Large diffs are not rendered by default.

26,040 changes: 26,040 additions & 0 deletions cuda_bindings/cuda/bindings/_nvml.pyx

Large diffs are not rendered by default.

2,038 changes: 2,038 additions & 0 deletions cuda_bindings/cuda/bindings/cy_nvml.pxd

Large diffs are not rendered by default.

1,399 changes: 1,399 additions & 0 deletions cuda_bindings/cuda/bindings/cy_nvml.pyx

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions cuda_bindings/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ test = [
"pytest>=6.2.4",
"pytest-benchmark>=3.4.1",
"pyglet>=2.1.9",
"looseversion>=1.3.0",
]

[project.urls]
Expand Down
Empty file.
2 changes: 2 additions & 0 deletions cuda_bindings/tests/nvml/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
139 changes: 139 additions & 0 deletions cuda_bindings/tests/nvml/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

from collections import namedtuple

import pytest
from cuda.bindings import _nvml as nvml


class NVMLInitializer:
    """Context manager that brackets a block of code with NVML init/shutdown.

    ``nvml.init_v2()`` is called when the ``with`` block is entered and
    ``nvml.shutdown()`` when it is left.  Nothing is bound by ``as`` and
    exceptions raised inside the block are not suppressed.
    """

    def __enter__(self):
        # Entering the block brings NVML up; no value is handed to ``as``.
        nvml.init_v2()

    def __exit__(self, exception_type, exception, trace):
        # Always release NVML; the implicit ``None`` lets any exception propagate.
        nvml.shutdown()


@pytest.fixture
def nvml_init():
    """Keep NVML initialized for the duration of the requesting test."""
    initializer = NVMLInitializer()
    with initializer:
        yield


@pytest.fixture(scope="session", autouse=True)
def device_info():
    """Collect static details for every accessible GPU once per test session.

    NVML is initialized only while the details are gathered and is shut down
    again before the fixture returns.

    Returns:
        dict: Maps each device UUID to a ``BoardDetails`` named tuple with the
        fields ``name``, ``board``, ``arch_id``, ``bus_id``, ``device_id`` and
        ``serial``.  ``serial`` is ``None`` when the serial number cannot be
        queried.  Devices whose handle lookup raises
        ``nvml.NoPermissionError`` are left out.
    """
    # Build the record types once rather than on every loop iteration.
    BoardCfg = namedtuple("BoardCfg", "name, ids_arr")
    BoardDetails = namedtuple("BoardDetails", "name, board, arch_id, bus_id, device_id, serial")

    # NOTE: despite its name, this mapping is keyed by device UUID.
    bus_id_to_board_details = {}

    with NVMLInitializer():
        dev_count = nvml.device_get_count_v2()

        # Store some details for each device now when we know NVML is in known state
        for i in range(dev_count):
            try:
                dev = nvml.device_get_handle_by_index_v2(i)
            except nvml.NoPermissionError:
                continue
            pci_info = nvml.device_get_pci_info_v3(dev)

            name = nvml.device_get_name(dev)
            # Get architecture name ex: Ampere, Kepler
            arch_id = nvml.device_get_architecture(dev)
            # 1 = NVML_DEVICE_ARCH_KEPLER and 12 = NVML_DEVICE_ARCH_COUNT
            assert 1 <= arch_id <= 12, "Architecture not found, presumably something newer"
            # TODO: derive a human-readable architecture name from arch_id.

            board = BoardCfg(name, ids_arr=[(pci_info.pci_device_id, pci_info.pci_sub_system_id)])

            # The serial number is optional: fall back to None on any NVML
            # error, but let KeyboardInterrupt / SystemExit propagate.
            try:
                serial = nvml.device_get_serial(dev)
            except Exception:
                serial = None

            bus_id = pci_info.bus_id
            device_id = pci_info.device_
            uuid = nvml.device_get_uuid(dev)

            bus_id_to_board_details[uuid] = BoardDetails(name, board, arch_id, bus_id, device_id, serial)

    return bus_id_to_board_details


def get_devices(device_info):
    """Yield an NVML device handle for each UUID key of ``device_info``.

    UUIDs whose lookup raises ``nvml.NoPermissionError`` are silently skipped.
    """
    # Snapshot the keys so the mapping is not iterated while suspended.
    known_uuids = list(device_info)
    for device_uuid in known_uuids:
        try:
            yield nvml.device_get_handle_by_uuid(device_uuid)
        except nvml.NoPermissionError:
            # ignore devices that can't be accessed
            continue


@pytest.fixture
def for_all_devices(device_info):
    """Yield each distinct device handle obtained from ``device_info``.

    NVML is initialized through ``NVMLInitializer`` for as long as the
    generator is alive inside the ``with`` block.

    NOTE(review): this generator yields once per distinct handle, whereas
    pytest yield-fixtures are expected to yield exactly once -- confirm the
    intended behaviour on machines with more than one accessible device.
    """
    with NVMLInitializer():
        seen = set()
        for device_id in get_devices(device_info):
            if device_id in seen:
                continue
            seen.add(device_id)
            yield device_id
    # RestoreDefaultEnvironment.restore()


@pytest.fixture
def driver(nvml_init, request):
    """Return the major component of the NVML driver version as an ``int``."""
    version_text = nvml.system_get_driver_version()
    # Everything before the first "." is the major version.
    major_text = version_text.split(".")[0]
    return int(major_text)


@pytest.fixture
def ngpus(nvml_init):
    """Return the NVML device count, asserting that at least one device exists."""
    device_count = nvml.device_get_count_v2()
    assert device_count > 0
    return device_count


@pytest.fixture
def handles(ngpus):
    """Return the NVML handle of every device index below ``ngpus``."""
    device_handles = []
    for index in range(ngpus):
        device_handles.append(nvml.device_get_handle_by_index_v2(index))
    assert len(device_handles) == ngpus
    return device_handles


@pytest.fixture
def nmigs(handles):
    """Return the maximum MIG device count reported for the first GPU handle."""
    first_gpu = handles[0]
    return nvml.device_get_max_mig_device_count(first_gpu)


@pytest.fixture
def mig_handles(handles, nmigs):
    """Return the MIG device handles of the first GPU, one per index below ``nmigs``.

    NVML looks up a MIG device by parent device *and* index, so the parent
    handle (``handles[0]``, the same device ``nmigs`` was queried for) is
    passed along with each index.

    NOTE(review): assumes the binding mirrors the C signature
    ``nvmlDeviceGetMigDeviceHandleByIndex(device, index)`` -- confirm against
    the generated ``_nvml`` module.
    """
    parent = handles[0]
    mig_devices = [nvml.device_get_mig_device_handle_by_index(parent, i) for i in range(nmigs)]
    assert len(mig_devices) == nmigs
    return mig_devices


@pytest.fixture
def serials(ngpus, handles):
    """Return the serial number of each of the first ``ngpus`` device handles."""
    collected = []
    for index in range(ngpus):
        collected.append(nvml.device_get_serial(handles[index]))
    assert len(collected) == ngpus
    return collected


@pytest.fixture
def uuids(ngpus, handles):
    """Return the UUID of each of the first ``ngpus`` device handles."""
    collected = []
    for index in range(ngpus):
        collected.append(nvml.device_get_uuid(handles[index]))
    assert len(collected) == ngpus
    return collected


@pytest.fixture
def pci_info(ngpus, handles):
    """Return the PCI info record of each of the first ``ngpus`` device handles."""
    records = []
    for index in range(ngpus):
        records.append(nvml.device_get_pci_info_v3(handles[index]))
    assert len(records) == ngpus
    return records
29 changes: 29 additions & 0 deletions cuda_bindings/tests/nvml/test_compute_mode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE


import sys

import pytest
from cuda.bindings import _nvml as nvml

# Compute modes that test_compute_mode_supported_nonroot attempts to set on each device.
COMPUTE_MODES = [
nvml.ComputeMode.COMPUTEMODE_DEFAULT,
nvml.ComputeMode.COMPUTEMODE_PROHIBITED,
nvml.ComputeMode.COMPUTEMODE_EXCLUSIVE_PROCESS,
]


@pytest.mark.skipif(sys.platform == "win32", reason="Test not supported on Windows")
def test_compute_mode_supported_nonroot(for_all_devices):
    """Verify that an unprivileged user cannot change a device's compute mode.

    Every mode in ``COMPUTE_MODES`` must be rejected with
    ``nvml.NoPermissionError`` and the device must keep its original mode.
    The test is skipped when the compute mode cannot be queried or when it
    runs as root.
    """
    import os

    # Setting the compute mode requires root, so as root the calls below would
    # succeed and actually reconfigure the GPU instead of raising.
    if hasattr(os, "geteuid") and os.geteuid() == 0:
        pytest.skip("Test requires an unprivileged (non-root) user")

    device = for_all_devices

    try:
        original_compute_mode = nvml.device_get_compute_mode(device)
    except nvml.NotSupportedError:
        pytest.skip("nvmlDeviceGetComputeMode not supported")

    for cm in COMPUTE_MODES:
        with pytest.raises(nvml.NoPermissionError):
            nvml.device_set_compute_mode(device, cm)
        assert original_compute_mode == nvml.device_get_compute_mode(device), "Compute mode shouldn't have changed"
57 changes: 57 additions & 0 deletions cuda_bindings/tests/nvml/test_cuda.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

import cuda.bindings.driver as cuda
from cuda.bindings import _nvml as nvml

from .conftest import NVMLInitializer


def get_nvml_device_names():
    """Return one ``{"name": ..., "id": ...}`` record per NVML device, in index order.

    ``id`` is the ``bus`` field of the device's PCI info.  NVML is initialized
    only for the duration of the query.
    """
    devices = []
    with NVMLInitializer():
        # uses NVML Library to get the device count, device id and device pci id
        for index in range(nvml.device_get_count_v2()):
            handle = nvml.device_get_handle_by_index_v2(index)
            device_name = nvml.device_get_name(handle)
            pci = nvml.device_get_pci_info_v3(handle)
            assert isinstance(pci.bus, int)
            assert isinstance(device_name, str)
            devices.append({"name": device_name, "id": pci.bus})

    return devices


def get_cuda_device_names(sort_by_bus_id=True):
    """Return one ``{"name": ..., "id": ...}`` record per CUDA device.

    ``id`` is the device's ``CU_DEVICE_ATTRIBUTE_PCI_BUS_ID`` attribute.

    Args:
        sort_by_bus_id: When true (the default) the records are ordered by
            their ``id``; otherwise they stay in CUDA ordinal order.

    Returns:
        list[dict]: The collected records.

    Raises:
        AssertionError: If any CUDA driver call does not return
            ``CUDA_SUCCESS``.
    """
    result = []

    (err,) = cuda.cuInit(0)
    assert err == cuda.CUresult.CUDA_SUCCESS

    err, device_count = cuda.cuDeviceGetCount()
    assert err == cuda.CUresult.CUDA_SUCCESS

    for dev in range(device_count):
        size = 256
        err, raw_name = cuda.cuDeviceGetName(size, dev)
        # Check the status before touching the buffer so a failed call is
        # reported as such rather than as a decoding error.
        assert err == cuda.CUresult.CUDA_SUCCESS
        # The name comes back as a NUL-padded byte buffer.
        name = raw_name.split(b"\x00")[0].decode()

        err, pci_bus_id = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev)
        assert err == cuda.CUresult.CUDA_SUCCESS
        assert isinstance(pci_bus_id, int)

        result.append({"name": name, "id": pci_bus_id})

    if sort_by_bus_id:
        result = sorted(result, key=lambda k: k["id"])

    return result


def test_cuda_device_order():
    """CUDA devices sorted by PCI bus id must match the NVML device listing."""
    from_cuda = get_cuda_device_names()
    from_nvml = get_nvml_device_names()

    assert from_cuda == from_nvml, "CUDA and NVML device lists do not match"
40 changes: 40 additions & 0 deletions cuda_bindings/tests/nvml/test_gpu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

import pytest
from cuda.bindings import _nvml as nvml

from . import util


def test_gpu_get_module_id(nvml_init):
    """Every non-vGPU device must report an integer module id."""
    # Unique module IDs cannot exceed the number of GPUs on the system
    for index in range(nvml.device_get_count_v2()):
        device = nvml.device_get_handle_by_index_v2(index)
        # The UUID itself is not inspected; the query is kept from the original test.
        nvml.device_get_uuid(device)

        if not util.is_vgpu(device):
            module_id = nvml.device_get_module_id(device)
            assert isinstance(module_id, int)


def test_gpu_get_platform_info(for_all_devices):
    """Platform info, when available, must be an ``nvml.PlatformInfo_v2``.

    The test is skipped for vGPU devices and when NVML reports the query as
    not supported.
    """
    device = for_all_devices

    if util.is_vgpu(device):
        pytest.skip("Not supported on vGPU device")

    # TODO
    # if device.feature_dict.board.chip < board_class.Architecture.Blackwell:
    #     test_utils.skip_test("Not supported on chip before Blackwell")

    try:
        platform_info = nvml.device_get_platform_info(device)
    except nvml.NotSupportedError:
        pytest.skip("Not supported returned, likely NVLink is disabled.")
    else:
        assert isinstance(platform_info, nvml.PlatformInfo_v2)
53 changes: 53 additions & 0 deletions cuda_bindings/tests/nvml/test_init.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

import sys

import pytest
from cuda.bindings import _nvml as nvml


def assert_nvml_is_initialized():
    """Assert NVML is usable by checking that it reports at least one device."""
    device_count = nvml.device_get_count_v2()
    assert device_count > 0


def assert_nvml_is_uninitialized():
    """Assert that querying the device count raises ``nvml.UninitializedError``."""
    expect_uninitialized = pytest.raises(nvml.UninitializedError)
    with expect_uninitialized:
        nvml.device_get_count_v2()


@pytest.mark.skipif(sys.platform == "win32", reason="Test not supported on Windows")
def test_init_ref_count():
    """
    Verifies that we can call NVML shutdown and init(2) multiple times, and that ref counting works
    """
    rounds = 3
    inits_per_round = 5

    # Shutting down before any init must be rejected.
    with pytest.raises(nvml.UninitializedError):
        nvml.shutdown()

    assert_nvml_is_uninitialized()

    for _ in range(rounds):
        # Init 5 times
        for _ in range(inits_per_round):
            nvml.init_v2()
            assert_nvml_is_initialized()

        # Shutdown 4 times, NVML should remain initialized
        for _ in range(inits_per_round - 1):
            nvml.shutdown()
            assert_nvml_is_initialized()

        # Shutdown the final time
        nvml.shutdown()
        assert_nvml_is_uninitialized()


def test_init_check_index(nvml_init):
    """
    Verifies that the index from nvmlDeviceGetIndex is correct
    """
    for expected_index in range(nvml.device_get_count_v2()):
        handle = nvml.device_get_handle_by_index_v2(expected_index)
        # The handle obtained for an index must report that same index back.
        reported_index = nvml.device_get_index(handle)
        assert expected_index == reported_index
30 changes: 30 additions & 0 deletions cuda_bindings/tests/nvml/test_nvlink.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE


from cuda.bindings import _nvml as nvml


def test_nvlink_get_link_count(for_all_devices):
    """
    Queries the NVLink link-count field through both accepted argument forms
    of device_get_field_values and checks the returned status and value.
    """
    device = for_all_devices
    acceptable = (nvml.Return.SUCCESS, nvml.Return.ERROR_NOT_SUPPORTED)

    # First form: a pre-allocated FieldValue array with the field id filled in.
    request = nvml.FieldValue(1)
    request[0].field_id = nvml.FI.DEV_NVLINK_LINK_COUNT
    value = nvml.device_get_field_values(device, request)[0]
    assert value.nvml_return in acceptable, f"Unexpected return {value.nvml_return} for link count field query"

    # Use the alternative argument to device_get_field_values
    value = nvml.device_get_field_values(device, [nvml.FI.DEV_NVLINK_LINK_COUNT])[0]
    assert value.nvml_return in acceptable, f"Unexpected return {value.nvml_return} for link count field query"

    # The feature_nvlink_supported detection is not robust, so we
    # can't be more specific about how many links we should find.
    if value.nvml_return == nvml.Return.SUCCESS:
        assert value.value.ui_val <= nvml.NVLINK_MAX_LINKS, f"Unexpected link count {value.value.ui_val}"
Loading