Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 32 additions & 21 deletions cuda_core/cuda/core/experimental/_device.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -948,9 +948,16 @@ class Device:
The default value of `None` returns the currently used device.

"""
__slots__ = ("_id", "_mr", "_has_inited", "_properties", "_uuid")
__slots__ = ("_id", "_memory_resource", "_has_inited", "_properties", "_uuid")
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I renamed this to match a request in a recent change to _memory/_buffer.*


def __new__(cls, device_id: int | None = None):
def __new__(cls, device_id: Device | int | None = None):
# Handle device_id argument.
if isinstance(device_id, Device):
return device_id
else:
device_id = getattr(device_id, 'device_id', device_id)

# Initialize CUDA.
global _is_cuInit
if _is_cuInit is False:
with _lock, nogil:
Expand All @@ -976,7 +983,7 @@ class Device:
raise ValueError(f"device_id must be >= 0, got {device_id}")

# ensure Device is singleton
cdef int total, attr
cdef int total
try:
devices = _tls.devices
except AttributeError:
Expand All @@ -986,21 +993,7 @@ class Device:
for dev_id in range(total):
device = super().__new__(cls)
device._id = dev_id
# If the device is in TCC mode, or does not support memory pools for some other reason,
# use the SynchronousMemoryResource which does not use memory pools.
with nogil:
HANDLE_RETURN(
cydriver.cuDeviceGetAttribute(
&attr, cydriver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev_id
)
)
if attr == 1:
from cuda.core.experimental._memory import DeviceMemoryResource
device._mr = DeviceMemoryResource(dev_id)
else:
from cuda.core.experimental._memory import _SynchronousMemoryResource
device._mr = _SynchronousMemoryResource(dev_id)

Comment on lines -989 to -1003
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moved to the `memory_resource` property (lazy init). Creating the memory resource eagerly in `Device.__new__` resulted in a circular dependency, because `DeviceMemoryResource` now constructs a `Device`.

device._memory_resource = None
device._has_inited = False
device._properties = None
device._uuid = None
Expand Down Expand Up @@ -1128,13 +1121,31 @@ class Device:
@property
def memory_resource(self) -> MemoryResource:
"""Return :obj:`~_memory.MemoryResource` associated with this device."""
return self._mr
cdef int attr, device_id
if self._memory_resource is None:
# If the device is in TCC mode, or does not support memory pools for some other reason,
# use the SynchronousMemoryResource which does not use memory pools.
device_id = self._id
with nogil:
HANDLE_RETURN(
cydriver.cuDeviceGetAttribute(
&attr, cydriver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, device_id
)
)
if attr == 1:
from cuda.core.experimental._memory import DeviceMemoryResource
self._memory_resource = DeviceMemoryResource(self._id)
else:
from cuda.core.experimental._memory import _SynchronousMemoryResource
self._memory_resource = _SynchronousMemoryResource(self._id)

return self._memory_resource

@memory_resource.setter
def memory_resource(self, mr):
from cuda.core.experimental._memory import MemoryResource
assert_type(mr, MemoryResource)
self._mr = mr
self._memory_resource = mr

@property
def default_stream(self) -> Stream:
Expand Down Expand Up @@ -1324,7 +1335,7 @@ class Device:
self._check_context_initialized()
if stream is None:
stream = default_stream()
return self._mr.allocate(size, stream)
return self.memory_resource.allocate(size, stream)

def sync(self):
"""Synchronize the device.
Expand Down
1 change: 0 additions & 1 deletion cuda_core/cuda/core/experimental/_event.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ from cuda.core.experimental._utils.cuda_utils import (
)
if TYPE_CHECKING:
import cuda.bindings
from cuda.core.experimental._device import Device


@dataclass
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@ from cuda.core.experimental._utils.cuda_utils cimport (
HANDLE_RETURN,
)

import cython
from dataclasses import dataclass
from typing import Optional, TYPE_CHECKING
import cython
import platform # no-cython-lint
import uuid
import weakref
Expand Down Expand Up @@ -131,7 +131,7 @@ cdef class DeviceMemoryResource(MemoryResource):

Parameters
----------
device_id : int | Device
device_id : Device | int
Device or Device ordinal for which a memory resource is constructed.

options : DeviceMemoryResourceOptions
Expand Down Expand Up @@ -211,8 +211,9 @@ cdef class DeviceMemoryResource(MemoryResource):
self._ipc_data = None
self._attributes = None

def __init__(self, device_id: int | Device, options=None):
cdef int dev_id = getattr(device_id, 'device_id', device_id)
def __init__(self, device_id: Device | int, options=None):
from .._device import Device
cdef int dev_id = Device(device_id).device_id
opts = check_or_create_options(
DeviceMemoryResourceOptions, options, "DeviceMemoryResource options",
keep_none=True
Expand Down Expand Up @@ -261,7 +262,7 @@ cdef class DeviceMemoryResource(MemoryResource):

@classmethod
def from_allocation_handle(
cls, device_id: int | Device, alloc_handle: int | IPCAllocationHandle
cls, device_id: Device | int, alloc_handle: int | IPCAllocationHandle
) -> DeviceMemoryResource:
"""Create a device memory resource from an allocation handle.

Expand Down
3 changes: 2 additions & 1 deletion cuda_core/cuda/core/experimental/_memory/_ipc.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,8 @@ cdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handl

# Construct a new DMR.
cdef DeviceMemoryResource self = DeviceMemoryResource.__new__(cls)
self._dev_id = getattr(device_id, 'device_id', device_id)
from .._device import Device
self._dev_id = Device(device_id).device_id
self._mempool_owned = True
self._ipc_data = IPCData(alloc_handle, mapped=True)

Expand Down
4 changes: 3 additions & 1 deletion cuda_core/cuda/core/experimental/_memory/_legacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,9 @@ class _SynchronousMemoryResource(MemoryResource):
__slots__ = ("_dev_id",)

def __init__(self, device_id):
self._dev_id = getattr(device_id, "device_id", device_id)
from .._device import Device

self._dev_id = Device(device_id).device_id

def allocate(self, size, stream=None) -> Buffer:
if stream is None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from dataclasses import dataclass, field
from typing import Iterable, Literal, Union

from cuda.core.experimental._device import Device
from cuda.core.experimental._memory._buffer import Buffer, MemoryResource
from cuda.core.experimental._stream import Stream
from cuda.core.experimental._utils.cuda_utils import (
Expand Down Expand Up @@ -140,15 +141,15 @@ class VirtualMemoryResource(MemoryResource):

Parameters
----------
device_id : int
Device ordinal for which a memory resource is constructed.
device_id : Device | int
Device for which a memory resource is constructed.

config : VirtualMemoryResourceOptions
A configuration object for the VirtualMemoryResource
"""

def __init__(self, device, config: VirtualMemoryResourceOptions = None):
self.device = device
def __init__(self, device_id: Device | int, config: VirtualMemoryResourceOptions = None):
self.device = Device(device_id)
self.config = check_or_create_options(
VirtualMemoryResourceOptions, config, "VirtualMemoryResource options", keep_none=False
)
Expand Down
36 changes: 19 additions & 17 deletions cuda_core/cuda/core/experimental/_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from typing import Union
from warnings import warn

from cuda.core.experimental._device import Device
from cuda.core.experimental._launch_config import LaunchConfig, _to_native_launch_config
from cuda.core.experimental._stream import Stream
from cuda.core.experimental._utils.clear_error_support import (
Expand Down Expand Up @@ -73,8 +74,9 @@ def _init(cls, kernel):
self._loader = _backend[self._backend_version]
return self

def _get_cached_attribute(self, device_id: int, attribute: driver.CUfunction_attribute) -> int:
def _get_cached_attribute(self, device_id: Device | int, attribute: driver.CUfunction_attribute) -> int:
"""Helper function to get a cached attribute or fetch and cache it if not present."""
device_id = Device(device_id).device_id
cache_key = device_id, attribute
result = self._cache.get(cache_key, cache_key)
if result is not cache_key:
Expand All @@ -94,62 +96,62 @@ def _get_cached_attribute(self, device_id: int, attribute: driver.CUfunction_att
self._cache[cache_key] = result
return result

def max_threads_per_block(self, device_id: int = None) -> int:
def max_threads_per_block(self, device_id: Device | int = None) -> int:
"""int : The maximum number of threads per block.
This attribute is read-only."""
return self._get_cached_attribute(
device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
)

def shared_size_bytes(self, device_id: int = None) -> int:
def shared_size_bytes(self, device_id: Device | int = None) -> int:
"""int : The size in bytes of statically-allocated shared memory required by this function.
This attribute is read-only."""
return self._get_cached_attribute(device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES)

def const_size_bytes(self, device_id: int = None) -> int:
def const_size_bytes(self, device_id: Device | int = None) -> int:
"""int : The size in bytes of user-allocated constant memory required by this function.
This attribute is read-only."""
return self._get_cached_attribute(device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES)

def local_size_bytes(self, device_id: int = None) -> int:
def local_size_bytes(self, device_id: Device | int = None) -> int:
"""int : The size in bytes of local memory used by each thread of this function.
This attribute is read-only."""
return self._get_cached_attribute(device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES)

def num_regs(self, device_id: int = None) -> int:
def num_regs(self, device_id: Device | int = None) -> int:
"""int : The number of registers used by each thread of this function.
This attribute is read-only."""
return self._get_cached_attribute(device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NUM_REGS)

def ptx_version(self, device_id: int = None) -> int:
def ptx_version(self, device_id: Device | int = None) -> int:
"""int : The PTX virtual architecture version for which the function was compiled.
This attribute is read-only."""
return self._get_cached_attribute(device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_PTX_VERSION)

def binary_version(self, device_id: int = None) -> int:
def binary_version(self, device_id: Device | int = None) -> int:
"""int : The binary architecture version for which the function was compiled.
This attribute is read-only."""
return self._get_cached_attribute(device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_BINARY_VERSION)

def cache_mode_ca(self, device_id: int = None) -> bool:
def cache_mode_ca(self, device_id: Device | int = None) -> bool:
"""bool : Whether the function has been compiled with user specified option "-Xptxas --dlcm=ca" set.
This attribute is read-only."""
return bool(self._get_cached_attribute(device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CACHE_MODE_CA))

def max_dynamic_shared_size_bytes(self, device_id: int = None) -> int:
def max_dynamic_shared_size_bytes(self, device_id: Device | int = None) -> int:
"""int : The maximum size in bytes of dynamically-allocated shared memory that can be used
by this function."""
return self._get_cached_attribute(
device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES
)

def preferred_shared_memory_carveout(self, device_id: int = None) -> int:
def preferred_shared_memory_carveout(self, device_id: Device | int = None) -> int:
"""int : The shared memory carveout preference, in percent of the total shared memory."""
return self._get_cached_attribute(
device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT
)

def cluster_size_must_be_set(self, device_id: int = None) -> bool:
def cluster_size_must_be_set(self, device_id: Device | int = None) -> bool:
"""bool : The kernel must launch with a valid cluster size specified.
This attribute is read-only."""
return bool(
Expand All @@ -158,33 +160,33 @@ def cluster_size_must_be_set(self, device_id: int = None) -> bool:
)
)

def required_cluster_width(self, device_id: int = None) -> int:
def required_cluster_width(self, device_id: Device | int = None) -> int:
"""int : The required cluster width in blocks."""
return self._get_cached_attribute(
device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH
)

def required_cluster_height(self, device_id: int = None) -> int:
def required_cluster_height(self, device_id: Device | int = None) -> int:
"""int : The required cluster height in blocks."""
return self._get_cached_attribute(
device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT
)

def required_cluster_depth(self, device_id: int = None) -> int:
def required_cluster_depth(self, device_id: Device | int = None) -> int:
"""int : The required cluster depth in blocks."""
return self._get_cached_attribute(
device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH
)

def non_portable_cluster_size_allowed(self, device_id: int = None) -> bool:
def non_portable_cluster_size_allowed(self, device_id: Device | int = None) -> bool:
"""bool : Whether the function can be launched with non-portable cluster size."""
return bool(
self._get_cached_attribute(
device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED
)
)

def cluster_scheduling_policy_preference(self, device_id: int = None) -> int:
def cluster_scheduling_policy_preference(self, device_id: Device | int = None) -> int:
"""int : The block scheduling policy of a function."""
return self._get_cached_attribute(
device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE
Expand Down
6 changes: 4 additions & 2 deletions cuda_core/tests/test_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,8 @@ def test_device_memory_resource_initialization(mempool_device, use_device_object
buffer.close()


def test_vmm_allocator_basic_allocation():
@pytest.mark.parametrize("use_device_object", [True, False])
def test_vmm_allocator_basic_allocation(use_device_object):
"""Test basic VMM allocation functionality.

This test verifies that VirtualMemoryResource can allocate memory
Expand All @@ -327,7 +328,8 @@ def test_vmm_allocator_basic_allocation():

options = VirtualMemoryResourceOptions()
# Create VMM allocator with default config
vmm_mr = VirtualMemoryResource(device, config=options)
device_arg = device if use_device_object else device.device_id
vmm_mr = VirtualMemoryResource(device_arg, config=options)

# Test basic allocation
buffer = vmm_mr.allocate(4096)
Expand Down
3 changes: 2 additions & 1 deletion cuda_core/tests/test_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,9 @@ def test_read_only_kernel_attributes(get_saxpy_kernel_cubin, attr, expected_type
value = method()
assert value is not None

# get the value for each device on the system
# get the value for each device on the system, using either the device object or ordinal
for device in system.devices:
value = method(device)
value = method(device.device_id)
assert isinstance(value, expected_type), f"Expected {attr} to be of type {expected_type}, but got {type(value)}"

Expand Down
Loading