diff --git a/ci/test_cuda_cccl_examples_python.sh b/ci/test_cuda_cccl_examples_python.sh index 68181aca622..87a73e63dc9 100755 --- a/ci/test_cuda_cccl_examples_python.sh +++ b/ci/test_cuda_cccl_examples_python.sh @@ -23,7 +23,7 @@ fi # Install cuda_cccl CUDA_CCCL_WHEEL_PATH="$(ls /home/coder/cccl/wheelhouse/cuda_cccl-*.whl)" -python -m pip install "${CUDA_CCCL_WHEEL_PATH}[test-cu${cuda_major_version}]" +python -m pip install "${CUDA_CCCL_WHEEL_PATH}[test-with-examples-cu${cuda_major_version}]" # Run tests for parallel module cd "/home/coder/cccl/python/cuda_cccl/tests/" diff --git a/ci/test_cuda_compute_minimal_python.sh b/ci/test_cuda_compute_minimal_python.sh index a43bbac7bfa..03392736012 100755 --- a/ci/test_cuda_compute_minimal_python.sh +++ b/ci/test_cuda_compute_minimal_python.sh @@ -30,7 +30,7 @@ fi # full cu* extras because those pull in numba/numba-cuda. CUDA_CCCL_WHEEL_PATH="$(ls "${wheelhouse_dir}"/cuda_cccl-*.whl)" python -m pip install "${CUDA_CCCL_WHEEL_PATH}[minimal-cu${cuda_major_version}]" -python -m pip install pytest pytest-xdist "cupy-cuda${cuda_major_version}x" +python -m pip install pytest pytest-xdist cd "${repo_root}/python/cuda_cccl/tests/" python -m pytest -n 6 -v compute/test_no_numba.py diff --git a/ci/windows/test_cuda_cccl_examples_python.ps1 b/ci/windows/test_cuda_cccl_examples_python.ps1 index 0c108328822..16e01b4443c 100644 --- a/ci/windows/test_cuda_cccl_examples_python.ps1 +++ b/ci/windows/test_cuda_cccl_examples_python.ps1 @@ -18,7 +18,7 @@ $repoRoot = Get-RepoRoot ${wheelPath} = Get-CudaCcclWheel & $python -m pip install -U pip pytest pytest-xdist -& $python -m pip install "${wheelPath}[test-cu$cudaMajor]" +& $python -m pip install "${wheelPath}[test-with-examples-cu$cudaMajor]" Push-Location (Join-Path $repoRoot "python/cuda_cccl/tests") try { diff --git a/docs/python/setup.rst b/docs/python/setup.rst index 15c9510ce5e..b44c5ea2064 100644 --- a/docs/python/setup.rst +++ b/docs/python/setup.rst @@ -72,6 +72,10 @@ For development or to access the latest features: cd cccl/python/cuda_cccl pip install -e .[test-cu13] # or .[test-cu12], .[test-sysctk13], .[test-sysctk12] +The standard test extras do not install CuPy. To also run the CuPy-based +``cuda.compute`` examples, use the corresponding ``test-with-examples-*`` extra, +for example ``pip install -e .[test-with-examples-cu13]``. + Development Setup ~~~~~~~~~~~~~~~~~~ diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml index 98e0e6533c0..1acf7b8dcc5 100644 --- a/python/cuda_cccl/pyproject.toml +++ b/python/cuda_cccl/pyproject.toml @@ -83,11 +83,22 @@ test-cu12 = [ "cuda-cccl[cu12]", "pytest", "pytest-xdist", - "cupy-cuda12x", ] -test-cu13 = ["cuda-cccl[cu13]", "pytest", "pytest-xdist", "cupy-cuda13x"] -test-sysctk12 = ["cuda-cccl[sysctk12]", "pytest", "pytest-xdist", "cupy-cuda12x"] -test-sysctk13 = ["cuda-cccl[sysctk13]", "pytest", "pytest-xdist", "cupy-cuda13x"] +test-cu13 = ["cuda-cccl[cu13]", "pytest", "pytest-xdist"] +test-sysctk12 = [ + "cuda-cccl[sysctk12]", + "pytest", + "pytest-xdist", +] +test-sysctk13 = [ + "cuda-cccl[sysctk13]", + "pytest", + "pytest-xdist", +] +test-with-examples-cu12 = ["cuda-cccl[test-cu12]", "cupy-cuda12x"] +test-with-examples-cu13 = ["cuda-cccl[test-cu13]", "cupy-cuda13x"] +test-with-examples-sysctk12 = ["cuda-cccl[test-sysctk12]", "cupy-cuda12x"] +test-with-examples-sysctk13 = ["cuda-cccl[test-sysctk13]", "cupy-cuda13x"] bench-cu12 = [ "cuda-cccl[cu12]", "cuda-bench[cu12]", @@ -173,6 +184,8 @@ known-first-party = [ ] [tool.pytest.ini_options] +# Make shared helpers in tests/_utils importable when pytest runs outside tests/. +pythonpath = ["tests"] markers = [ "no_verify_sass: skip SASS verification check", "large: tests requiring large device memory allocations", diff --git a/python/cuda_cccl/tests/_utils/__init__.py b/python/cuda_cccl/tests/_utils/__init__.py new file mode 100644 index 00000000000..b221129eb3a --- /dev/null +++ b/python/cuda_cccl/tests/_utils/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +"""Shared test utilities for cuda-cccl.""" diff --git a/python/cuda_cccl/tests/_utils/device_array.py b/python/cuda_cccl/tests/_utils/device_array.py new file mode 100644 index 00000000000..b2496173a28 --- /dev/null +++ b/python/cuda_cccl/tests/_utils/device_array.py @@ -0,0 +1,252 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from __future__ import annotations + +import math +import operator +from collections.abc import Iterable + +import numpy as np +from numpy.typing import DTypeLike + +from cuda.core import Buffer, Device, Stream + + +def get_compute_capability() -> tuple[int, int]: + return Device().compute_capability + + +def _normalize_shape(shape: int | Iterable[int]) -> tuple[int, ...]: + try: + dimensions = (operator.index(shape),) # type: ignore[arg-type] + except TypeError: + dimensions = tuple(operator.index(dimension) for dimension in shape) # type: ignore[union-attr] + + if any(dimension < 0 for dimension in dimensions): + raise ValueError("negative dimensions are not allowed") + + return dimensions + + +def _contiguous_strides( + shape: tuple[int, ...], itemsize: int, order: str +) -> tuple[int, ...]: + if any(dimension == 0 for dimension in shape): + return (0,) * len(shape) + + strides = [0] * len(shape) + stride = itemsize + + if order == "C": + for index in range(len(shape) - 1, -1, -1): + strides[index] = stride + stride *= shape[index] + else: + for index, dimension in enumerate(shape): + strides[index] = stride + stride *= dimension + + return tuple(strides) + + +def _resolve_device_and_stream( + device: Device | None, stream: Stream | None +) -> tuple[Device, Stream]: + if device is None: + device = stream.device if stream is not None else Device() + + if stream is not None and stream.device.device_id != device.device_id: + raise ValueError("device and stream must refer to the same device") + + device.set_current() + return device, device.default_stream if stream is None else stream + + +class DeviceArray: + """A small, Buffer-backed device array for cuda-cccl tests. + + The class intentionally provides only allocation, NumPy transfers, array + metadata, and the CUDA Array Interface. Array operations and initialization + belong on the NumPy host arrays used by the tests. + """ + + def __init__( + self, + buffer: Buffer, + device: Device, + stream: Stream, + shape: tuple[int, ...], + dtype: np.dtype, + strides: tuple[int, ...], + order: str, + ) -> None: + self._buffer = buffer + self._device = device + self._stream = stream + self._order = order + self._shape = shape + self._dtype = dtype + self._strides = strides + + @classmethod + def empty( + cls, + shape: int | Iterable[int], + dtype: DTypeLike, + *, + order: str = "C", + device: Device | None = None, + stream: Stream | None = None, + ) -> DeviceArray: + """Allocate an uninitialized device array.""" + shape = _normalize_shape(shape) + dtype = np.dtype(dtype) + order = order.upper() + if order not in ("C", "F"): + raise ValueError("order must be either 'C' or 'F'") + if dtype.itemsize == 0: + raise ValueError("zero-sized dtypes are not supported") + + device, stream = _resolve_device_and_stream(device, stream) + buffer = device.allocate(math.prod(shape) * dtype.itemsize, stream=stream) + result = cls( + buffer, + device, + stream, + shape, + dtype, + _contiguous_strides(shape, dtype.itemsize, order), + order, + ) + + # Device allocation is stream ordered. Synchronizing makes an empty array + # safe to hand to a test that subsequently uses a different stream. + stream.sync() + return result + + @classmethod + def from_numpy( + cls, + array: np.ndarray, + *, + device: Device | None = None, + stream: Stream | None = None, + ) -> DeviceArray: + """Allocate a device array and initialize it from a NumPy array.""" + host_array = np.asarray(array) + if host_array.dtype.itemsize == 0: + raise ValueError("zero-sized dtypes are not supported") + + if host_array.flags.c_contiguous: + order = "C" + elif host_array.flags.f_contiguous: + order = "F" + else: + host_array = np.ascontiguousarray(host_array) + order = "C" + + device, stream = _resolve_device_and_stream(device, stream) + buffer = device.allocate(host_array.nbytes, stream=stream) + result = cls( + buffer, + device, + stream, + host_array.shape, + host_array.dtype, + host_array.strides, + order, + ) + result._copy_from_host_array(host_array, stream) + stream.sync() + return result + + @property + def nbytes(self) -> int: + return self._buffer.size + + @property + def dtype(self) -> np.dtype: + return self._dtype + + def __len__(self) -> int: + if not self._shape: + raise TypeError("len() of unsized object") + return self._shape[0] + + @property + def __cuda_array_interface__(self) -> dict[str, object]: + interface: dict[str, object] = { + "data": (0 if self.nbytes == 0 else int(self._buffer.handle), False), + "shape": self._shape, + "strides": None if self._is_c_contiguous() else self._strides, + "typestr": self._dtype.str, + "version": 3, + } + if self._dtype.fields is not None: + interface["descr"] = self._dtype.descr + return interface + + def _is_c_contiguous(self) -> bool: + return ( + self._order == "C" + or self.nbytes == 0 + or sum(dimension > 1 for dimension in self._shape) <= 1 + ) + + @staticmethod + def _host_buffer(array: np.ndarray) -> Buffer: + # Buffer.from_handle does not own the host memory. `owner` ties the NumPy + # allocation to this temporary Buffer; the caller also retains the array + # and synchronizes the copy stream before returning. + return Buffer.from_handle( + ptr=int(array.ctypes.data), size=array.nbytes, owner=array + ) + + def _copy_stream(self, stream: Stream | None) -> Stream: + if stream is None: + # The allocation stream is not necessarily the last stream to have + # used the array. Synchronize the device when that stream is unknown. + self._device.sync() + return self._stream + if stream.device.device_id != self._device.device_id: + raise ValueError("copy stream must belong to the array's device") + return stream + + def _copy_from_host_array(self, array: np.ndarray, stream: Stream) -> None: + self._buffer.copy_from(self._host_buffer(array), stream=stream) + + def copy_from_host( + self, array: np.ndarray, *, stream: Stream | None = None + ) -> None: + """Replace the array's contents from a shape- and dtype-matched NumPy array.""" + host_array = np.asarray(array) + if host_array.shape != self._shape: + raise ValueError( + f"source shape {host_array.shape} does not match {self._shape}" + ) + if host_array.dtype != self._dtype: + raise TypeError( + f"source dtype {host_array.dtype} does not match {self._dtype}" + ) + + if self._order == "F": + host_array = np.asfortranarray(host_array) + else: + host_array = np.ascontiguousarray(host_array) + + self._device.set_current() + stream = self._copy_stream(stream) + self._copy_from_host_array(host_array, stream) + stream.sync() + + def copy_to_host(self, *, stream: Stream | None = None) -> np.ndarray: + """Return an owning NumPy copy of the array.""" + self._device.set_current() + stream = self._copy_stream(stream) + + result = np.empty(self._shape, dtype=self._dtype, order=self._order) + self._buffer.copy_to(self._host_buffer(result), stream=stream) + stream.sync() + return result diff --git a/python/cuda_cccl/tests/compute/conftest.py b/python/cuda_cccl/tests/compute/conftest.py index 1fa66c48360..ef72dc0fc54 100644 --- a/python/cuda_cccl/tests/compute/conftest.py +++ b/python/cuda_cccl/tests/compute/conftest.py @@ -1,9 +1,11 @@ import builtins +from collections.abc import Generator -import cupy as cp import numpy as np import pytest +from cuda.core import Device, Stream + check_ldl_stl_in_sass = False @@ -37,15 +39,15 @@ def input_array(request): low_inclusive, high_exclusive = 0, 8 else: low_inclusive, high_exclusive = -5, 6 - array = cp.random.randint( + array = np.random.randint( low=low_inclusive, high=high_exclusive, size=sample_size, dtype=dtype ) elif np.issubdtype(dtype, np.floating): # For floating-point types, use np.random.random and cast to the required dtype - array = cp.random.random(sample_size).astype(dtype) + array = np.random.random(sample_size).astype(dtype) elif np.issubdtype(dtype, np.complexfloating): # For complex types, generate random real and imaginary parts - packed = cp.random.random(2 * sample_size) + packed = np.random.random(2 * sample_size) real_part = packed[:sample_size] imag_part = packed[sample_size:] array = (real_part + 1j * imag_part).astype(dtype) @@ -65,29 +67,19 @@ def floating_array(request): sample_size = 1000 # Generate random floating-point values - array = cp.random.random(sample_size).astype(dtype) + array = np.random.random(sample_size).astype(dtype) return array -class Stream: - """ - Simple cupy stream wrapper that implements the __cuda_stream__ protocol. - """ - - def __init__(self, cp_stream): - self.cp_stream = cp_stream - - def __cuda_stream__(self): - return (0, self.cp_stream.ptr) - - @property - def ptr(self): - return self.cp_stream.ptr - - @pytest.fixture(scope="function") -def cuda_stream() -> Stream: - return Stream(cp.cuda.Stream()) +def cuda_stream() -> Generator[Stream, None, None]: + device = Device() + device.set_current() + stream = device.create_stream() + try: + yield stream + finally: + stream.close() @pytest.fixture(scope="function", autouse=True) diff --git a/python/cuda_cccl/tests/compute/test_binary_search.py b/python/cuda_cccl/tests/compute/test_binary_search.py index 585a998c44c..ac38d92fc1d 100644 --- a/python/cuda_cccl/tests/compute/test_binary_search.py +++ b/python/cuda_cccl/tests/compute/test_binary_search.py @@ -1,9 +1,9 @@ # Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import cupy as cp import numpy as np import pytest +from _utils.device_array import DeviceArray import cuda.compute from cuda.compute import OpKind @@ -51,21 +51,21 @@ def test_binary_search_explicit_opkind_less(search, side): h_data = np.array([1, 3, 3, 7, 9], dtype=np.int32) h_values = np.array([0, 3, 4, 10], dtype=np.int32) - d_data = cp.asarray(h_data) - d_values = cp.asarray(h_values) - d_out = cp.empty(len(h_values), dtype=np.uintp) + d_data = DeviceArray.from_numpy(h_data) + d_values = DeviceArray.from_numpy(h_values) + d_out = DeviceArray.empty(len(h_values), np.uintp) search( d_data=d_data, - num_items=len(d_data), + num_items=len(h_data), d_values=d_values, - num_values=len(d_values), + num_values=len(h_values), d_out=d_out, comp=OpKind.LESS, ) expected = np.searchsorted(h_data, h_values, side=side).astype(np.uintp) - np.testing.assert_array_equal(d_out.get(), expected) + np.testing.assert_array_equal(d_out.copy_to_host(), expected) @pytest.mark.parametrize( @@ -82,21 +82,21 @@ def test_binary_search_custom_comparator(search, side): def greater(lhs, rhs): return lhs > rhs - d_data = cp.asarray(h_data) - d_values = cp.asarray(h_values) - d_out = cp.empty(len(h_values), dtype=np.uintp) + d_data = DeviceArray.from_numpy(h_data) + d_values = DeviceArray.from_numpy(h_values) + d_out = DeviceArray.empty(len(h_values), np.uintp) search( d_data=d_data, - num_items=len(d_data), + num_items=len(h_data), d_values=d_values, - num_values=len(d_values), + num_values=len(h_values), d_out=d_out, comp=greater, ) expected = np.searchsorted(-h_data, -h_values, side=side).astype(np.uintp) - np.testing.assert_array_equal(d_out.get(), expected) + np.testing.assert_array_equal(d_out.copy_to_host(), expected) @pytest.mark.parametrize("dtype", DTYPE_LIST) @@ -107,9 +107,9 @@ def test_lower_bound_basic(dtype, num_items, num_values): h_data = random_sorted_array(num_items, dtype) h_values = random_sorted_array(num_values, dtype) - d_data = cp.asarray(h_data) - d_values = cp.asarray(h_values) - d_out = cp.empty(num_values, dtype=np.uintp) + d_data = DeviceArray.from_numpy(h_data) + d_values = DeviceArray.from_numpy(h_values) + d_out = DeviceArray.empty(num_values, np.uintp) cuda.compute.lower_bound( d_data=d_data, @@ -120,7 +120,7 @@ def test_lower_bound_basic(dtype, num_items, num_values): ) expected = np.searchsorted(h_data, h_values, side="left").astype(np.uintp) - got = cp.asnumpy(d_out) + got = d_out.copy_to_host() assert np.array_equal(got, expected) @@ -132,9 +132,9 @@ def test_upper_bound_basic(dtype, num_items, num_values): h_data = random_sorted_array(num_items, dtype) h_values = random_sorted_array(num_values, dtype) - d_data = cp.asarray(h_data) - d_values = cp.asarray(h_values) - d_out = cp.empty(num_values, dtype=np.uintp) + d_data = DeviceArray.from_numpy(h_data) + d_values = DeviceArray.from_numpy(h_values) + d_out = DeviceArray.empty(num_values, np.uintp) cuda.compute.upper_bound( d_data=d_data, @@ -145,7 +145,7 @@ def test_upper_bound_basic(dtype, num_items, num_values): ) expected = np.searchsorted(h_data, h_values, side="right").astype(np.uintp) - got = cp.asnumpy(d_out) + got = d_out.copy_to_host() assert np.array_equal(got, expected) @@ -164,9 +164,9 @@ def test_binary_search_with_duplicates(dtype): else rng.random(128, dtype=dtype) ) - d_data = cp.asarray(h_data) - d_values = cp.asarray(h_values) - d_out = cp.empty(len(h_values), dtype=np.uintp) + d_data = DeviceArray.from_numpy(h_data) + d_values = DeviceArray.from_numpy(h_values) + d_out = DeviceArray.empty(len(h_values), np.uintp) cuda.compute.lower_bound( d_data=d_data, @@ -176,7 +176,7 @@ def test_binary_search_with_duplicates(dtype): d_out=d_out, ) expected = np.searchsorted(h_data, h_values, side="left").astype(np.uintp) - got = cp.asnumpy(d_out) + got = d_out.copy_to_host() assert np.array_equal(got, expected) cuda.compute.upper_bound( @@ -187,39 +187,43 @@ def test_binary_search_with_duplicates(dtype): d_out=d_out, ) expected = np.searchsorted(h_data, h_values, side="right").astype(np.uintp) - got = cp.asnumpy(d_out) + got = d_out.copy_to_host() assert np.array_equal(got, expected) def test_binary_search_requires_unsigned_output(): """Output must be unsigned integer dtype for indices.""" - d_data = cp.asarray(np.array([1, 2, 3, 4], dtype=np.int32)) - d_values = cp.asarray(np.array([2, 3], dtype=np.int32)) - d_out = cp.empty(len(d_values), dtype=np.int32) # signed, should fail + h_data = np.array([1, 2, 3, 4], dtype=np.int32) + h_values = np.array([2, 3], dtype=np.int32) + d_data = DeviceArray.from_numpy(h_data) + d_values = DeviceArray.from_numpy(h_values) + d_out = DeviceArray.empty(len(h_values), np.int32) # signed, should fail with pytest.raises(TypeError, match="unsigned integer"): cuda.compute.lower_bound( d_data=d_data, - num_items=len(d_data), + num_items=len(h_data), d_values=d_values, - num_values=len(d_values), + num_values=len(h_values), d_out=d_out, ) def test_binary_search_requires_pointer_sized_output(): """Output must be pointer-sized (np.uintp) to hold any valid index.""" - d_data = cp.asarray(np.array([1, 2, 3, 4], dtype=np.int32)) - d_values = cp.asarray(np.array([2, 3], dtype=np.int32)) - d_out = cp.empty( - len(d_values), dtype=np.uint32 + h_data = np.array([1, 2, 3, 4], dtype=np.int32) + h_values = np.array([2, 3], dtype=np.int32) + d_data = DeviceArray.from_numpy(h_data) + d_values = DeviceArray.from_numpy(h_values) + d_out = DeviceArray.empty( + len(h_values), np.uint32 ) # unsigned but not pointer-sized (on 64-bit) with pytest.raises(ValueError, match="pointer-sized"): cuda.compute.lower_bound( d_data=d_data, - num_items=len(d_data), + num_items=len(h_data), d_values=d_values, - num_values=len(d_values), + num_values=len(h_values), d_out=d_out, ) diff --git a/python/cuda_cccl/tests/compute/test_deferred_annotations.py b/python/cuda_cccl/tests/compute/test_deferred_annotations.py index c30aeda4068..757c5d6e71f 100644 --- a/python/cuda_cccl/tests/compute/test_deferred_annotations.py +++ b/python/cuda_cccl/tests/compute/test_deferred_annotations.py @@ -5,8 +5,8 @@ from __future__ import annotations -import cupy as cp import numpy as np +from _utils.device_array import DeviceArray from cuda.compute import OpKind, TransformIterator, gpu_struct, reduce_into @@ -25,18 +25,19 @@ def test_transform_iterator_future_annotations(): def add_one(x: "np.int32") -> "np.int32": return x + np.int32(1) - d_in = cp.arange(8, dtype=np.int32) - d_out = cp.empty(1, dtype=np.int32) + h_in = np.arange(8, dtype=np.int32) + d_in = DeviceArray.from_numpy(h_in) + d_out = DeviceArray.empty(1, np.int32) h_init = np.array([0], dtype=np.int32) transform_it = TransformIterator(d_in, add_one) reduce_into( d_in=transform_it, d_out=d_out, - num_items=d_in.size, + num_items=h_in.size, op=OpKind.PLUS, h_init=h_init, ) - expected = int(cp.sum(d_in + 1).get()) - assert int(d_out.get()[0]) == expected + expected = int(np.sum(h_in + 1)) + assert int(d_out.copy_to_host()[0]) == expected diff --git a/python/cuda_cccl/tests/compute/test_histogram.py b/python/cuda_cccl/tests/compute/test_histogram.py index a002b0f9131..cea8a2519cf 100644 --- a/python/cuda_cccl/tests/compute/test_histogram.py +++ b/python/cuda_cccl/tests/compute/test_histogram.py @@ -4,9 +4,9 @@ import math -import cupy as cp import numpy as np import pytest +from _utils.device_array import DeviceArray import cuda.compute from cuda.compute import ( @@ -106,9 +106,9 @@ def test_device_histogram_basic_use(dtype, num_samples): upper_level = dtype(max_level) h_samples = random_int_array(num_samples, dtype) - d_samples = cp.asarray(h_samples) + d_samples = DeviceArray.from_numpy(h_samples) - d_histogram = cp.zeros(num_levels - 1, dtype=np.int32) + d_histogram = DeviceArray.from_numpy(np.zeros(num_levels - 1, dtype=np.int32)) cuda.compute.histogram_even( d_samples=d_samples, @@ -122,7 +122,7 @@ def test_device_histogram_basic_use(dtype, num_samples): h_expected = compute_reference_histogram( h_samples, num_levels, lower_level, upper_level ) - h_result = cp.asnumpy(d_histogram) + h_result = d_histogram.copy_to_host() np.testing.assert_array_equal(h_result, h_expected) @@ -138,7 +138,7 @@ def test_device_histogram_sample_iterator(): counting_it = CountingIterator(np.int32(0)) - d_histogram = cp.zeros(num_levels - 1, dtype=np.int32) + d_histogram = DeviceArray.from_numpy(np.zeros(num_levels - 1, dtype=np.int32)) # Set up levels so that values 0 to adjusted_total_samples-1 are evenly distributed lower_level = np.int32(0.0) @@ -155,20 +155,20 @@ def test_device_histogram_sample_iterator(): # Each bin should have exactly samples_per_bin elements h_expected = np.full(num_bins, samples_per_bin, dtype=np.int32) - h_result = cp.asnumpy(d_histogram) + h_result = d_histogram.copy_to_host() np.testing.assert_array_equal(h_result, h_expected) def test_device_histogram_single_sample(): h_samples = np.array([5.0], dtype=np.float32) - d_samples = cp.asarray(h_samples) + d_samples = DeviceArray.from_numpy(h_samples) num_levels = 5 lower_level = np.float32(0.0) upper_level = np.float32(10.0) - d_histogram = cp.zeros(num_levels - 1, dtype=np.int32) + d_histogram = DeviceArray.from_numpy(np.zeros(num_levels - 1, dtype=np.int32)) cuda.compute.histogram_even( d_samples=d_samples, @@ -181,20 +181,20 @@ def test_device_histogram_single_sample(): # Sample 5.0 should go into bin 2 (bins: [0,2.5), [2.5,5), [5,7.5), [7.5,10)) h_expected = np.array([0, 0, 1, 0], dtype=np.int32) - h_result = cp.asnumpy(d_histogram) + h_result = d_histogram.copy_to_host() np.testing.assert_array_equal(h_result, h_expected) def test_device_histogram_out_of_range(): h_samples = np.array([-1.0, 0.5, 5.5, 10.5, 15.0], dtype=np.float32) - d_samples = cp.asarray(h_samples) + d_samples = DeviceArray.from_numpy(h_samples) num_levels = 3 # 2 bins: [0,5), [5,10) lower_level = np.float32(0.0) upper_level = np.float32(10.0) - d_histogram = cp.zeros(num_levels - 1, dtype=np.int32) + d_histogram = DeviceArray.from_numpy(np.zeros(num_levels - 1, dtype=np.int32)) cuda.compute.histogram_even( d_samples=d_samples, @@ -208,26 +208,22 @@ def test_device_histogram_out_of_range(): # Only 0.5 (bin 0) and 5.5 (bin 1) should be counted # -1.0, 10.5, and 15.0 are out of range h_expected = np.array([1, 1], dtype=np.int32) - h_result = cp.asnumpy(d_histogram) + h_result = d_histogram.copy_to_host() np.testing.assert_array_equal(h_result, h_expected) def test_device_histogram_with_stream(cuda_stream): - cp_stream = cp.cuda.ExternalStream(cuda_stream.ptr) - h_samples = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], dtype=np.float32) - d_samples = cp.asarray(h_samples) + d_samples = DeviceArray.from_numpy(h_samples, stream=cuda_stream) num_levels = 5 # 4 bins: [0,2), [2,4), [4,6), [6,8) lower_level = np.float32(0.0) upper_level = np.float32(8.0) - d_histogram = cp.zeros(num_levels - 1, dtype=np.int32) - - with cp_stream: - d_samples = cp.asarray(h_samples) - d_histogram = cp.zeros(num_levels - 1, dtype=np.int32) + d_histogram = DeviceArray.from_numpy( + np.zeros(num_levels - 1, dtype=np.int32), stream=cuda_stream + ) cuda.compute.histogram_even( d_samples=d_samples, @@ -239,8 +235,7 @@ def test_device_histogram_with_stream(cuda_stream): stream=cuda_stream, ) - with cp_stream: - h_result = cp.asnumpy(d_histogram) + h_result = d_histogram.copy_to_host(stream=cuda_stream) # Expected: bin 0: [1.0, 2.0), bin 1: [2.0, 4.0), bin 2: [4.0, 6.0), bin 3: [6.0, 8.0) # Values: 1.0->bin0, 2.0->bin1, 3.0->bin1, 4.0->bin2, 5.0->bin2, 6.0->bin3, 7.0->bin3, 8.0->out_of_range @@ -258,7 +253,7 @@ def test_device_histogram_with_constant_iterator(): lower_level = np.float32(0.0) upper_level = np.float32(8.0) - d_histogram = cp.zeros(num_levels - 1, dtype=np.int32) + d_histogram = DeviceArray.from_numpy(np.zeros(num_levels - 1, dtype=np.int32)) cuda.compute.histogram_even( d_samples=constant_it, @@ -269,7 +264,7 @@ def test_device_histogram_with_constant_iterator(): num_samples=num_samples, ) - h_result = cp.asnumpy(d_histogram) + h_result = d_histogram.copy_to_host() # Expected: All 10 samples have value 3.0, which falls in bin 1 [2,4) h_expected = np.array([0, 10, 0, 0], dtype=np.int32) @@ -278,16 +273,13 @@ def test_device_histogram_with_constant_iterator(): def test_histogram_even(): - import cupy as cp - import numpy as np - num_samples = 10 h_samples = np.array( [2.2, 6.1, 7.1, 2.9, 3.5, 0.3, 2.9, 2.1, 6.1, 999.5], dtype="float32" ) - d_samples = cp.asarray(h_samples) + d_samples = DeviceArray.from_numpy(h_samples) num_levels = 7 - d_histogram = cp.empty(num_levels - 1, dtype="int32") + d_histogram = DeviceArray.empty(num_levels - 1, np.int32) lower_level = np.float32(0) upper_level = np.float32(12) @@ -302,7 +294,7 @@ def test_histogram_even(): ) # Check the result is correct - h_actual_histogram = cp.asnumpy(d_histogram) + h_actual_histogram = d_histogram.copy_to_host() # Calculate expected histogram using numpy h_expected_histogram, _ = np.histogram( h_samples, bins=num_levels - 1, range=(lower_level, upper_level) @@ -319,8 +311,6 @@ def test_histogram_cache_bug_crosses_256_bin_threshold(): # in invalid shared memory accesses, because a different shared # memory strategy is used for num_bins > 256. num_samples = 128 - d_samples = cp.empty(num_samples, dtype=np.int32) - d_histogram = cp.empty(2048, dtype=np.int32) h_num_output_levels = np.array([0], dtype=np.int32) h_lower_level = np.array([0], dtype=np.int32) h_upper_level = np.array([0], dtype=np.int32) @@ -331,12 +321,13 @@ def test_histogram_cache_bug_crosses_256_bin_threshold(): h_lower_level[0] = 0 h_upper_level[0] = num_bins_1 - d_samples[:] = cp.random.randint(0, num_bins_1, size=num_samples, dtype=np.int32) - d_histogram[:num_bins_1] = 0 + h_samples = np.random.randint(0, num_bins_1, size=num_samples, dtype=np.int32) + d_samples = DeviceArray.from_numpy(h_samples) + d_histogram = DeviceArray.from_numpy(np.zeros(num_bins_1, dtype=np.int32)) hist = cuda.compute.make_histogram_even( d_samples=d_samples, - d_histogram=d_histogram[:num_bins_1], + d_histogram=d_histogram, h_num_output_levels=h_num_output_levels, h_lower_level=h_lower_level, h_upper_level=h_upper_level, @@ -345,36 +336,36 @@ def test_histogram_cache_bug_crosses_256_bin_threshold(): temp_bytes = hist( temp_storage=None, d_samples=d_samples, - d_histogram=d_histogram[:num_bins_1], + d_histogram=d_histogram, h_num_output_levels=h_num_output_levels, h_lower_level=h_lower_level, h_upper_level=h_upper_level, num_samples=num_samples, ) - temp_storage = cp.empty(temp_bytes, dtype=np.uint8) + temp_storage = DeviceArray.empty(temp_bytes, np.uint8) hist( temp_storage=temp_storage, d_samples=d_samples, - d_histogram=d_histogram[:num_bins_1], + d_histogram=d_histogram, h_num_output_levels=h_num_output_levels, h_lower_level=h_lower_level, h_upper_level=h_upper_level, num_samples=num_samples, ) - cp.cuda.Device().synchronize() - assert int(d_histogram[:num_bins_1].sum()) == num_samples + assert int(d_histogram.copy_to_host().sum()) == num_samples num_bins_2 = 2048 h_num_output_levels[0] = num_bins_2 + 1 h_lower_level[0] = 0 h_upper_level[0] = num_bins_2 - d_samples[:] = cp.random.randint(0, num_bins_2, size=num_samples, dtype=np.int32) - d_histogram[:num_bins_2] = 0 + h_samples = np.random.randint(0, num_bins_2, size=num_samples, dtype=np.int32) + d_samples = DeviceArray.from_numpy(h_samples) + d_histogram = DeviceArray.from_numpy(np.zeros(num_bins_2, dtype=np.int32)) hist2 = cuda.compute.make_histogram_even( d_samples=d_samples, - d_histogram=d_histogram[:num_bins_2], + d_histogram=d_histogram, h_num_output_levels=h_num_output_levels, h_lower_level=h_lower_level, h_upper_level=h_upper_level, @@ -384,24 +375,23 @@ def test_histogram_cache_bug_crosses_256_bin_threshold(): temp_bytes2 = hist2( temp_storage=None, d_samples=d_samples, - d_histogram=d_histogram[:num_bins_2], + d_histogram=d_histogram, h_num_output_levels=h_num_output_levels, h_lower_level=h_lower_level, h_upper_level=h_upper_level, num_samples=num_samples, ) - temp_storage2 = cp.empty(temp_bytes2, dtype=np.uint8) + temp_storage2 = DeviceArray.empty(temp_bytes2, np.uint8) hist2( temp_storage=temp_storage2, d_samples=d_samples, - d_histogram=d_histogram[:num_bins_2], + d_histogram=d_histogram, h_num_output_levels=h_num_output_levels, h_lower_level=h_lower_level, h_upper_level=h_upper_level, num_samples=num_samples, ) - cp.cuda.Device().synchronize() - assert int(d_histogram[:num_bins_2].sum()) == num_samples + assert int(d_histogram.copy_to_host().sum()) == num_samples def test_histogram_cache_reuses_artifact_when_bounds_change(): @@ -409,8 +399,8 @@ def test_histogram_cache_reuses_artifact_when_bounds_change(): num_samples = 8 num_levels = 5 - d_samples = cp.asarray(np.arange(num_samples, dtype=np.float32)) - d_histogram = cp.empty(num_levels - 1, dtype=np.int32) + d_samples = DeviceArray.from_numpy(np.arange(num_samples, dtype=np.float32)) + d_histogram = DeviceArray.empty(num_levels - 1, np.int32) h_num_output_levels = np.array([num_levels], dtype=np.int32) h_lower_level_1 = np.array([0], dtype=np.float32) @@ -436,8 +426,8 @@ def test_histogram_cache_reuses_artifact_when_bounds_change(): ) assert hist1 is hist2 - d_samples = cp.asarray(np.arange(10, 18, dtype=np.float32)) - d_histogram.fill(0) + d_samples = DeviceArray.from_numpy(np.arange(10, 18, dtype=np.float32)) + d_histogram.copy_from_host(np.zeros(num_levels - 1, dtype=np.int32)) temp_bytes = hist2( temp_storage=None, d_samples=d_samples, @@ -447,7 +437,7 @@ def test_histogram_cache_reuses_artifact_when_bounds_change(): h_upper_level=h_upper_level_2, num_samples=num_samples, ) - temp_storage = cp.empty(temp_bytes, dtype=np.uint8) + temp_storage = DeviceArray.empty(temp_bytes, np.uint8) hist2( temp_storage=temp_storage, d_samples=d_samples, @@ -457,10 +447,8 @@ def test_histogram_cache_reuses_artifact_when_bounds_change(): h_upper_level=h_upper_level_2, num_samples=num_samples, ) - cp.cuda.Device().synchronize() - np.testing.assert_array_equal( - cp.asnumpy(d_histogram), np.array([2, 2, 2, 2], dtype=np.int32) + d_histogram.copy_to_host(), np.array([2, 2, 2, 2], dtype=np.int32) ) @@ -468,13 +456,13 @@ def test_histogram_cache_reuses_artifact_for_same_offset_width(): cuda.compute.clear_all_caches() num_levels = 5 - d_histogram = cp.empty(num_levels - 1, dtype=np.int32) + d_histogram = DeviceArray.empty(num_levels - 1, np.int32) h_num_output_levels = np.array([num_levels], dtype=np.int32) h_lower_level = np.array([0], dtype=np.float32) h_upper_level = np.array([12], dtype=np.float32) hist1 = cuda.compute.make_histogram_even( - d_samples=cp.asarray(np.arange(8, dtype=np.float32)), + d_samples=DeviceArray.from_numpy(np.arange(8, dtype=np.float32)), d_histogram=d_histogram, h_num_output_levels=h_num_output_levels, h_lower_level=h_lower_level, @@ -482,7 +470,7 @@ def test_histogram_cache_reuses_artifact_for_same_offset_width(): num_samples=8, ) hist2 = cuda.compute.make_histogram_even( - d_samples=cp.asarray(np.arange(12, dtype=np.float32)), + d_samples=DeviceArray.from_numpy(np.arange(12, dtype=np.float32)), d_histogram=d_histogram, h_num_output_levels=h_num_output_levels, h_lower_level=h_lower_level, @@ -497,7 +485,7 @@ def test_histogram_cache_reuses_artifact_for_same_offset_width(): np.iinfo(np.int32).max / np.dtype(np.float32).itemsize ) hist3 = cuda.compute.make_histogram_even( - d_samples=cp.asarray(np.arange(12, dtype=np.float32)), + d_samples=DeviceArray.from_numpy(np.arange(12, dtype=np.float32)), d_histogram=d_histogram, h_num_output_levels=h_num_output_levels, h_lower_level=h_lower_level, @@ -506,8 +494,8 @@ def test_histogram_cache_reuses_artifact_for_same_offset_width(): ) assert hist3 is not hist1 - d_samples = cp.asarray(np.arange(12, dtype=np.float32)) - d_histogram.fill(0) + d_samples = DeviceArray.from_numpy(np.arange(12, dtype=np.float32)) + d_histogram.copy_from_host(np.zeros(num_levels - 1, dtype=np.int32)) temp_bytes = hist2( temp_storage=None, d_samples=d_samples, @@ -517,7 +505,7 @@ def test_histogram_cache_reuses_artifact_for_same_offset_width(): h_upper_level=h_upper_level, num_samples=12, ) - temp_storage = cp.empty(temp_bytes, dtype=np.uint8) + temp_storage = DeviceArray.empty(temp_bytes, np.uint8) hist2( temp_storage=temp_storage, d_samples=d_samples, @@ -527,17 +515,15 @@ def test_histogram_cache_reuses_artifact_for_same_offset_width(): h_upper_level=h_upper_level, num_samples=12, ) - cp.cuda.Device().synchronize() - np.testing.assert_array_equal( - cp.asnumpy(d_histogram), np.array([3, 3, 3, 3], dtype=np.int32) + d_histogram.copy_to_host(), np.array([3, 3, 3, 3], dtype=np.int32) ) def test_make_histogram_even_rejects_mismatched_bound_dtypes(): num_samples = 8 - d_samples = cp.asarray(np.arange(num_samples, dtype=np.int32)) - d_histogram = cp.empty(4, dtype=np.int32) + d_samples = DeviceArray.from_numpy(np.arange(num_samples, dtype=np.int32)) + d_histogram = DeviceArray.empty(4, np.int32) with pytest.raises(TypeError, match="must have the same dtype"): cuda.compute.make_histogram_even( diff --git a/python/cuda_cccl/tests/compute/test_iterators.py b/python/cuda_cccl/tests/compute/test_iterators.py index 49e2ab39d15..c5b57e0581d 100644 --- a/python/cuda_cccl/tests/compute/test_iterators.py +++ b/python/cuda_cccl/tests/compute/test_iterators.py @@ -3,10 +3,9 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import cupy as cp -import numba.cuda import numpy as np import pytest +from _utils.device_array import DeviceArray import cuda.compute from cuda.compute import OpKind @@ -43,9 +42,9 @@ def test_counting_iterator_equality(): def test_cache_modified_input_iterator_equality(): - ary1 = cp.asarray([0, 1, 2], dtype="int32") - ary2 = cp.asarray([3, 4, 5], dtype="int32") - ary3 = cp.asarray([0, 1, 2], dtype="int64") + ary1 = DeviceArray.from_numpy(np.asarray([0, 1, 2], dtype="int32")) + ary2 = DeviceArray.from_numpy(np.asarray([3, 4, 5], dtype="int32")) + ary3 = DeviceArray.from_numpy(np.asarray([0, 1, 2], dtype="int64")) it1 = CacheModifiedInputIterator(ary1, "stream") it2 = CacheModifiedInputIterator(ary1, "stream") @@ -76,8 +75,8 @@ def op3(x): # op3 has a different name than op1, so should have a different kind assert it1.kind != it3.kind - ary1 = cp.asarray([0, 1, 2]) - ary2 = cp.asarray([3, 4, 5]) + ary1 = DeviceArray.from_numpy(np.asarray([0, 1, 2])) + ary2 = DeviceArray.from_numpy(np.asarray([3, 4, 5])) it4 = TransformIterator(ary1, op1) it5 = TransformIterator(ary1, op1) it6 = TransformIterator(ary1, op2) @@ -91,46 +90,10 @@ def op3(x): assert it4.kind != it7.kind -@pytest.fixture( - params=[ - # Each tuple is (shape, layout, array_type) - ((5,), "C", "cupy"), - ((5,), "F", "cupy"), - ((5,), "C", "numba"), - ((5,), "F", "numba"), - ((4, 3), "C", "cupy"), - ((4, 3), "F", "cupy"), - ((4, 3), "C", "numba"), - ((4, 3), "F", "numba"), - ((3, 4, 2), "C", "cupy"), - ((3, 4, 2), "F", "cupy"), - ((3, 4, 2), "C", "numba"), - ((3, 4, 2), "F", "numba"), - ], - ids=lambda param: f"{param[2]}_{param[1]}_{len(param[0])}D", -) -def reverse_iterator_array(request): - shape, layout, array_type = request.param - - # Create base numpy array - base_array = np.arange(np.prod(shape)) - base_array[-1] = -999 - base_array = base_array.reshape(shape) - if layout == "F": - base_array = np.asfortranarray(base_array) - - if array_type == "cupy": - array = cp.array(base_array) - else: - array = numba.cuda.to_device(base_array) - - return array - - def test_reverse_input_iterator_equality(): - ary1 = cp.asarray([0, 1, 2], dtype="int32") - ary2 = cp.asarray([3, 4, 5], dtype="int32") - ary3 = cp.asarray([0, 1, 2], dtype="int64") + ary1 = DeviceArray.from_numpy(np.asarray([0, 1, 2], dtype="int32")) + ary2 = DeviceArray.from_numpy(np.asarray([3, 4, 5], dtype="int32")) + ary3 = DeviceArray.from_numpy(np.asarray([0, 1, 2], dtype="int64")) it1 = ReverseIterator(ary1) it2 = ReverseIterator(ary1) @@ -142,9 +105,9 @@ def test_reverse_input_iterator_equality(): def test_reverse_output_iterator_equality(): - ary1 = cp.asarray([0, 1, 2], dtype="int32") - ary2 = cp.asarray([3, 4, 5], dtype="int32") - ary3 = cp.asarray([0, 1, 2], dtype="int64") + ary1 = DeviceArray.from_numpy(np.asarray([0, 1, 2], dtype="int32")) + ary2 = DeviceArray.from_numpy(np.asarray([3, 4, 5], dtype="int32")) + ary3 = DeviceArray.from_numpy(np.asarray([0, 1, 2], dtype="int64")) it1 = ReverseIterator(ary1) it2 = ReverseIterator(ary1) @@ -205,7 +168,7 @@ def test_transform_iterator_with_lambda(): CountingIterator(np.int32(first_item)), lambda x: x * 2 ) h_init = np.array([0], dtype=np.int32) - d_output = cp.empty(1, dtype=np.int32) + d_output = DeviceArray.empty(1, np.int32) # Perform reduction on the transformed iterator cuda.compute.reduce_into( @@ -218,7 +181,7 @@ def test_transform_iterator_with_lambda(): # Expected: sum of (10*2, 11*2, ..., 109*2) = 2 * sum(10..109) expected = 2 * sum(range(first_item, first_item + num_items)) - assert d_output.get()[0] == expected + assert d_output.copy_to_host()[0] == expected def test_transform_iterator_with_zip_iterator(): @@ -226,8 +189,10 @@ def test_transform_iterator_with_zip_iterator(): from cuda.compute.iterators import ZipIterator # Create a ZipIterator with two int32 arrays - d_a = cp.arange(10, dtype=np.int32) - d_b = cp.arange(100, 110, dtype=np.int32) + h_a = np.arange(10, dtype=np.int32) + h_b = np.arange(100, 110, dtype=np.int32) + d_a = DeviceArray.from_numpy(h_a) + d_b = DeviceArray.from_numpy(h_b) zip_it = ZipIterator(d_a, d_b) @@ -242,17 +207,17 @@ def sum_fields(pair): # Use it in a reduction h_init = np.array([0], dtype=np.int32) - d_output = cp.empty(1, dtype=np.int32) + d_output = DeviceArray.empty(1, np.int32) cuda.compute.reduce_into( d_in=transform_it, d_out=d_output, - num_items=len(d_a), + num_items=len(h_a), op=OpKind.PLUS, h_init=h_init, ) - result = d_output.get()[0] - expected = (d_a + d_b).sum().get() + result = d_output.copy_to_host()[0] + expected = (h_a + h_b).sum() assert result == expected, f"Expected {expected}, got {result}" diff --git a/python/cuda_cccl/tests/compute/test_merge_sort.py b/python/cuda_cccl/tests/compute/test_merge_sort.py index 33d7d15de58..0d929ae6f14 100644 --- a/python/cuda_cccl/tests/compute/test_merge_sort.py +++ b/python/cuda_cccl/tests/compute/test_merge_sort.py @@ -4,10 +4,9 @@ from typing import List -import cupy as cp -import numba.cuda import numpy as np import pytest +from _utils.device_array import DeviceArray import cuda.compute from cuda.compute import ( @@ -82,7 +81,7 @@ def compare_op(lhs, rhs): def test_merge_sort_keys(dtype, num_items, op): h_in_keys = random_array(num_items, dtype) - d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_keys = DeviceArray.from_numpy(h_in_keys) merge_sort_device(d_in_keys, None, d_in_keys, None, op, num_items) @@ -102,8 +101,8 @@ def test_merge_sort_pairs(dtype, num_items, op, monkeypatch): h_in_keys = random_array(num_items, dtype) h_in_items = random_array(num_items, np.float32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_items = numba.cuda.to_device(h_in_items) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_in_items = DeviceArray.from_numpy(h_in_items) merge_sort_device(d_in_keys, d_in_items, d_in_keys, d_in_items, op, num_items) @@ -123,8 +122,8 @@ def test_merge_sort_keys_copy(dtype, num_items, op): h_in_keys = random_array(num_items, dtype) h_out_keys = np.empty(num_items, dtype=dtype) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_out_keys = numba.cuda.to_device(h_out_keys) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_out_keys = DeviceArray.empty(h_out_keys.shape, h_out_keys.dtype) merge_sort_device(d_in_keys, None, d_out_keys, None, op, num_items) @@ -146,10 +145,10 @@ def test_merge_sort_pairs_copy(dtype, num_items, op, monkeypatch): h_out_keys = np.empty(num_items, dtype=dtype) h_out_items = np.empty(num_items, dtype=np.float32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_items = numba.cuda.to_device(h_in_items) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_items = numba.cuda.to_device(h_out_items) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_in_items = DeviceArray.from_numpy(h_in_items) + d_out_keys = DeviceArray.empty(h_out_keys.shape, h_out_keys.dtype) + d_out_items = DeviceArray.empty(h_out_items.shape, h_out_items.dtype) merge_sort_device(d_in_keys, d_in_items, d_out_keys, d_out_items, op, num_items) @@ -195,17 +194,15 @@ def struct_compare_op(lhs, rhs): h_in_items["a"] = a_items h_in_items["b"] = b_items - d_in_keys = cp.empty_like(h_in_keys) - d_in_items = cp.empty_like(h_in_items) - d_in_keys.set(h_in_keys) - d_in_items.set(h_in_items) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_in_items = DeviceArray.from_numpy(h_in_items) merge_sort_device( d_in_keys, d_in_items, d_in_keys, d_in_items, struct_compare_op, num_items ) - h_out_keys = d_in_keys.get() - h_out_items = d_in_items.get() + h_out_keys = d_in_keys.copy_to_host() + h_out_items = d_in_items.copy_to_host() argsort = np.argsort(h_in_keys, stable=True) h_in_keys = np.array(h_in_keys)[argsort] @@ -225,7 +222,7 @@ def compare_complex(lhs, rhs): imaginary = random_array(num_items, np.int64, max_value) h_in_keys = real + 1j * imaginary - d_in_keys = numba.cuda.to_device(h_in_keys) + d_in_keys = DeviceArray.from_numpy(h_in_keys) merge_sort_device(d_in_keys, None, d_in_keys, None, compare_complex, num_items) @@ -240,8 +237,8 @@ def test_merge_sort_keys_copy_iterator_input(dtype, num_items, op): h_in_keys = random_array(num_items, dtype) h_out_keys = np.empty(num_items, dtype=dtype) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_out_keys = numba.cuda.to_device(h_out_keys) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_out_keys = DeviceArray.empty(h_out_keys.shape, h_out_keys.dtype) i_input = CacheModifiedInputIterator(d_in_keys, modifier="stream") @@ -265,10 +262,10 @@ def test_merge_sort_pairs_copy_iterator_input(dtype, num_items, op, monkeypatch) h_out_keys = np.empty(num_items, dtype=dtype) h_out_items = np.empty(num_items, dtype=np.float32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_items = numba.cuda.to_device(h_in_items) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_items = numba.cuda.to_device(h_out_items) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_in_items = DeviceArray.from_numpy(h_in_items) + d_out_keys = DeviceArray.empty(h_out_keys.shape, h_out_keys.dtype) + d_out_items = DeviceArray.empty(h_out_items.shape, h_out_items.dtype) i_input_keys = CacheModifiedInputIterator(d_in_keys, modifier="stream") i_input_items = CacheModifiedInputIterator(d_in_items, modifier="stream") @@ -289,19 +286,17 @@ def test_merge_sort_pairs_copy_iterator_input(dtype, num_items, op, monkeypatch) def test_merge_sort_with_stream(cuda_stream): - cp_stream = cp.cuda.ExternalStream(cuda_stream.ptr) num_items = 10000 - with cp_stream: - h_in_keys = random_array(num_items, np.int32) - d_in_keys = cp.asarray(h_in_keys) - d_out_keys = cp.empty_like(d_in_keys) + h_in_keys = random_array(num_items, np.int32) + d_in_keys = DeviceArray.from_numpy(h_in_keys, stream=cuda_stream) + d_out_keys = DeviceArray.empty(h_in_keys.shape, h_in_keys.dtype, stream=cuda_stream) merge_sort_device( d_in_keys, None, d_out_keys, None, compare_op, num_items, stream=cuda_stream ) - got = d_out_keys.get() + got = d_out_keys.copy_to_host(stream=cuda_stream) h_in_keys.sort() np.testing.assert_array_equal(got, h_in_keys) @@ -310,39 +305,41 @@ def test_merge_sort_with_stream(cuda_stream): def test_merge_sort_well_known_less(): dtype = np.int32 - d_in_keys = cp.array([5, 2, 8, 1, 9, 3], dtype=dtype) - d_out_keys = cp.empty_like(d_in_keys) + h_in_keys = np.array([5, 2, 8, 1, 9, 3], dtype=dtype) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_out_keys = DeviceArray.empty(h_in_keys.shape, h_in_keys.dtype) cuda.compute.merge_sort( d_in_keys=d_in_keys, d_in_values=None, d_out_keys=d_out_keys, d_out_values=None, - num_items=len(d_in_keys), + num_items=len(h_in_keys), op=OpKind.LESS, ) expected = np.array([1, 2, 3, 5, 8, 9]) - np.testing.assert_equal(d_out_keys.get(), expected) + np.testing.assert_equal(d_out_keys.copy_to_host(), expected) def test_merge_sort_well_known_greater(): dtype = np.int32 - d_in_keys = cp.array([5, 2, 8, 1, 9, 3], dtype=dtype) - d_out_keys = cp.empty_like(d_in_keys) + h_in_keys = np.array([5, 2, 8, 1, 9, 3], dtype=dtype) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_out_keys = DeviceArray.empty(h_in_keys.shape, h_in_keys.dtype) cuda.compute.merge_sort( d_in_keys=d_in_keys, d_in_values=None, d_out_keys=d_out_keys, d_out_values=None, - num_items=len(d_in_keys), + num_items=len(h_in_keys), op=OpKind.GREATER, ) expected = np.array([9, 8, 5, 3, 2, 1]) - np.testing.assert_equal(d_out_keys.get(), expected) + np.testing.assert_equal(d_out_keys.copy_to_host(), expected) def test_merge_sort_large_temp_storage_not_negative(): @@ -353,8 +350,8 @@ def test_merge_sort_large_temp_storage_not_negative(): """ num_items = 2**28 dtype = np.int64 - d_in_keys = cp.zeros(num_items, dtype=dtype) - d_out_keys = cp.empty(num_items, dtype=dtype) + d_in_keys = DeviceArray.empty(num_items, dtype) + d_out_keys = DeviceArray.empty(num_items, dtype) sorter = cuda.compute.make_merge_sort( d_in_keys=d_in_keys, @@ -380,21 +377,23 @@ def test_merge_sort_large_temp_storage_not_negative(): def test_merge_sort_with_values_well_known(): dtype = np.int32 - d_in_keys = cp.array([3, 1, 4, 2], dtype=dtype) - d_in_values = cp.array([30, 10, 40, 20], dtype=dtype) - d_out_keys = cp.empty_like(d_in_keys) - d_out_values = cp.empty_like(d_in_values) + h_in_keys = np.array([3, 1, 4, 2], dtype=dtype) + h_in_values = np.array([30, 10, 40, 20], dtype=dtype) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_in_values = DeviceArray.from_numpy(h_in_values) + d_out_keys = DeviceArray.empty(h_in_keys.shape, h_in_keys.dtype) + d_out_values = DeviceArray.empty(h_in_values.shape, h_in_values.dtype) cuda.compute.merge_sort( d_in_keys=d_in_keys, d_in_values=d_in_values, d_out_keys=d_out_keys, d_out_values=d_out_values, - num_items=len(d_in_keys), + num_items=len(h_in_keys), op=OpKind.LESS, ) expected_keys = np.array([1, 2, 3, 4]) expected_values = np.array([10, 20, 30, 40]) - np.testing.assert_equal(d_out_keys.get(), expected_keys) - np.testing.assert_equal(d_out_values.get(), expected_values) + np.testing.assert_equal(d_out_keys.copy_to_host(), expected_keys) + np.testing.assert_equal(d_out_values.copy_to_host(), expected_values) diff --git a/python/cuda_cccl/tests/compute/test_nested_struct.py b/python/cuda_cccl/tests/compute/test_nested_struct.py index 9cbfa40e393..d9aa986bf9c 100644 --- a/python/cuda_cccl/tests/compute/test_nested_struct.py +++ b/python/cuda_cccl/tests/compute/test_nested_struct.py @@ -1,8 +1,8 @@ # Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import cupy as cp import numpy as np +from _utils.device_array import DeviceArray import cuda.compute from cuda.compute import ZipIterator, gpu_struct @@ -25,13 +25,8 @@ def sum_nested(s1, s2): h_data[i]["inner"]["a"] = i * 2 h_data[i]["inner"]["b"] = float(i * 3) - itemsize = h_data.dtype.itemsize - d_input = cp.empty(num_items * itemsize, dtype=np.uint8) - d_input.set(h_data.view(np.uint8)) - d_input = d_input.view(Outer.dtype) - - d_output = cp.empty(itemsize, dtype=np.uint8) - d_output = d_output.view(Outer.dtype) + d_input = DeviceArray.from_numpy(h_data) + d_output = DeviceArray.empty(1, Outer.dtype) h_init = Outer(0, Inner(0, 0.0)) @@ -39,7 +34,7 @@ def sum_nested(s1, s2): d_in=d_input, d_out=d_output, num_items=num_items, op=sum_nested, h_init=h_init ) - result = d_output.view(np.uint8).get().view(Outer.dtype)[0] + result = d_output.copy_to_host()[0] expected_x = sum(range(num_items)) expected_a = sum(i * 2 for i in range(num_items)) @@ -71,13 +66,8 @@ def sum_nested(s1, s2): h_data[i]["inner"]["a"] = i * 2 h_data[i]["inner"]["b"] = float(i * 3) - itemsize = h_data.dtype.itemsize - d_input = cp.empty(num_items * itemsize, dtype=np.uint8) - d_input.set(h_data.view(np.uint8)) - d_input = d_input.view(Outer.dtype) - - d_output = cp.empty(itemsize, dtype=np.uint8) - d_output = d_output.view(Outer.dtype) + d_input = DeviceArray.from_numpy(h_data) + d_output = DeviceArray.empty(1, Outer.dtype) h_init = Outer(0, Inner(0, 0.0)) @@ -85,7 +75,7 @@ def sum_nested(s1, s2): d_in=d_input, d_out=d_output, num_items=num_items, op=sum_nested, h_init=h_init ) - result = d_output.view(np.uint8).get().view(Outer.dtype)[0] + result = d_output.copy_to_host()[0] expected_x = sum(range(num_items)) expected_a = sum(i * 2 for i in range(num_items)) @@ -113,28 +103,25 @@ def sum_pixels(p1, p2): num_items = 100 - d_points = cp.empty(num_items, dtype=Point.dtype) - d_colors = cp.empty(num_items, dtype=Color.dtype) - h_points = np.array([(i, i * 2) for i in range(num_items)], dtype=Point.dtype) h_colors = np.array( [(i % 256, (i * 2) % 256, (i * 3) % 256) for i in range(num_items)], dtype=Color.dtype, ) - d_points.set(h_points) - d_colors.set(h_colors) + d_points = DeviceArray.from_numpy(h_points) + d_colors = DeviceArray.from_numpy(h_colors) zip_it = ZipIterator(d_points, d_colors) - d_output = cp.empty(1, dtype=Pixel.dtype) + d_output = DeviceArray.empty(1, Pixel.dtype) h_init = Pixel(Point(0, 0), Color(0, 0, 0)) cuda.compute.reduce_into( d_in=zip_it, d_out=d_output, num_items=num_items, op=sum_pixels, h_init=h_init ) - result = d_output.get()[0] + result = d_output.copy_to_host()[0] expected_x = sum(i for i in range(num_items)) expected_y = sum(i * 2 for i in range(num_items)) @@ -224,13 +211,8 @@ def sum_nested(s1, s2): h_data[i]["inner"]["a"] = i * 2 h_data[i]["inner"]["b"] = float(i * 3) - itemsize = h_data.dtype.itemsize - d_input = cp.empty(num_items * itemsize, dtype=np.uint8) - d_input.set(h_data.view(np.uint8)) - d_input = d_input.view(Outer.dtype) - - d_output = cp.empty(itemsize, dtype=np.uint8) - d_output = d_output.view(Outer.dtype) + d_input = DeviceArray.from_numpy(h_data) + d_output = DeviceArray.empty(1, Outer.dtype) # Use dictionary initialization for the init value h_init = Outer({"x": 0, "inner": {"a": 0, "b": 0.0}}) @@ -239,7 +221,7 @@ def sum_nested(s1, s2): d_in=d_input, d_out=d_output, num_items=num_items, op=sum_nested, h_init=h_init ) - result = d_output.view(np.uint8).get().view(Outer.dtype)[0] + result = d_output.copy_to_host()[0] expected_x = sum(range(num_items)) expected_a = sum(i * 2 for i in range(num_items)) @@ -267,13 +249,8 @@ def sum_nested_with_tuples(s1, s2): h_data[i]["inner"]["a"] = i * 2 h_data[i]["inner"]["b"] = float(i * 3) - itemsize = h_data.dtype.itemsize - d_input = cp.empty(num_items * itemsize, dtype=np.uint8) - d_input.set(h_data.view(np.uint8)) - d_input = d_input.view(Outer.dtype) - - d_output = cp.empty(itemsize, dtype=np.uint8) - d_output = d_output.view(Outer.dtype) + d_input = DeviceArray.from_numpy(h_data) + d_output = DeviceArray.empty(1, Outer.dtype) h_init = Outer(0, Inner(0, 0.0)) @@ -285,7 +262,7 @@ def sum_nested_with_tuples(s1, s2): h_init=h_init, ) - result = d_output.view(np.uint8).get().view(Outer.dtype)[0] + result = d_output.copy_to_host()[0] expected_x = sum(range(num_items)) expected_a = sum(i * 2 for i in range(num_items)) @@ -320,13 +297,8 @@ def sum_deeply_nested(v1, v2): h_data[i]["middle"]["data"] = float(i * 2.5) h_data[i]["middle"]["nested"]["value"] = i * 3 - itemsize = h_data.dtype.itemsize - d_input = cp.empty(num_items * itemsize, dtype=np.uint8) - d_input.set(h_data.view(np.uint8)) - d_input = d_input.view(Level3.dtype) - - d_output = cp.empty(itemsize, dtype=np.uint8) - d_output = d_output.view(Level3.dtype) + d_input = DeviceArray.from_numpy(h_data) + d_output = DeviceArray.empty(1, Level3.dtype) h_init = Level3(0, Level2(0.0, Level1(0))) @@ -338,7 +310,7 @@ def sum_deeply_nested(v1, v2): h_init=h_init, ) - result = d_output.view(np.uint8).get().view(Level3.dtype)[0] + result = d_output.copy_to_host()[0] expected_id = sum(i * 10 for i in range(num_items)) expected_data = sum(float(i * 2.5) for i in range(num_items)) @@ -373,13 +345,8 @@ def sum_mixed(s1, s2): h_data[i]["inner2"]["c"] = float(i * 4) h_data[i]["inner2"]["d"] = float(i * 5) - itemsize = h_data.dtype.itemsize - d_input = cp.empty(num_items * itemsize, dtype=np.uint8) - d_input.set(h_data.view(np.uint8)) - d_input = d_input.view(Outer.dtype) - - d_output = cp.empty(itemsize, dtype=np.uint8) - d_output = d_output.view(Outer.dtype) + d_input = DeviceArray.from_numpy(h_data) + d_output = DeviceArray.empty(1, Outer.dtype) h_init = Outer(0, Inner1(0, 0), Inner2(0.0, 0.0)) @@ -387,7 +354,7 @@ def sum_mixed(s1, s2): d_in=d_input, d_out=d_output, num_items=num_items, op=sum_mixed, h_init=h_init ) - result = d_output.view(np.uint8).get().view(Outer.dtype)[0] + result = d_output.copy_to_host()[0] expected_x = sum(range(num_items)) expected_a = sum(i * 2 for i in range(num_items)) @@ -421,21 +388,18 @@ def sum_pixels_with_tuples(p1, p2): num_items = 100 - d_points = cp.empty(num_items, dtype=Point.dtype) - d_colors = cp.empty(num_items, dtype=Color.dtype) - h_points = np.array([(i, i * 2) for i in range(num_items)], dtype=Point.dtype) h_colors = np.array( [(i % 256, (i * 2) % 256, (i * 3) % 256) for i in range(num_items)], dtype=Color.dtype, ) - d_points.set(h_points) - d_colors.set(h_colors) + d_points = DeviceArray.from_numpy(h_points) + d_colors = DeviceArray.from_numpy(h_colors) zip_it = ZipIterator(d_points, d_colors) - d_output = cp.empty(1, dtype=Pixel.dtype) + d_output = DeviceArray.empty(1, Pixel.dtype) h_init = Pixel(Point(0, 0), Color(0, 0, 0)) cuda.compute.reduce_into( @@ -446,7 +410,7 @@ def sum_pixels_with_tuples(p1, p2): h_init=h_init, ) - result = d_output.get()[0] + result = d_output.copy_to_host()[0] expected_x = sum(i for i in range(num_items)) expected_y = sum(i * 2 for i in range(num_items)) @@ -478,13 +442,8 @@ def sum_all_tuples(s1, s2): h_data[i]["field1"]["a"] = i h_data[i]["field2"]["b"] = float(i * 2) - itemsize = h_data.dtype.itemsize - d_input = cp.empty(num_items * itemsize, dtype=np.uint8) - d_input.set(h_data.view(np.uint8)) - d_input = d_input.view(Outer.dtype) - - d_output = cp.empty(itemsize, dtype=np.uint8) - d_output = d_output.view(Outer.dtype) + d_input = DeviceArray.from_numpy(h_data) + d_output = DeviceArray.empty(1, Outer.dtype) h_init = Outer(Inner1(0), Inner2(0.0)) @@ -496,7 +455,7 @@ def sum_all_tuples(s1, s2): h_init=h_init, ) - result = d_output.view(np.uint8).get().view(Outer.dtype)[0] + result = d_output.copy_to_host()[0] expected_a = sum(range(num_items)) expected_b = sum(float(i * 2) for i in range(num_items)) diff --git a/python/cuda_cccl/tests/compute/test_no_numba.py b/python/cuda_cccl/tests/compute/test_no_numba.py index 6fb1ef0e811..264cf117aa0 100644 --- a/python/cuda_cccl/tests/compute/test_no_numba.py +++ b/python/cuda_cccl/tests/compute/test_no_numba.py @@ -2,9 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import cupy as cp import numpy as np import pytest +from _utils.device_array import DeviceArray import cuda.compute from cuda.compute import ( @@ -112,24 +112,25 @@ def test_import_numba_raises(): def test_reduce_well_known_plus(): h_input = np.arange(1, 14, dtype=np.int32) - d_input = cp.asarray(h_input) - d_output = cp.empty(1, dtype=np.int32) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, np.int32) h_init = np.array([5], dtype=np.int32) cuda.compute.reduce_into( d_in=d_input, d_out=d_output, - num_items=d_input.size, + num_items=h_input.size, op=OpKind.PLUS, h_init=h_init, ) - assert d_output.get()[0] == np.sum(h_input, initial=h_init[0]) + assert d_output.copy_to_host()[0] == np.sum(h_input, initial=h_init[0]) def test_exclusive_scan_well_known_plus(): - d_input = cp.asarray([2, 4, 6, 8, 10, 12], dtype=np.uint16) - d_output = cp.empty_like(d_input) + h_input = np.asarray([2, 4, 6, 8, 10, 12], dtype=np.uint16) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(h_input.shape, h_input.dtype) h_init = np.array([1], dtype=np.uint16) cuda.compute.exclusive_scan( @@ -137,41 +138,46 @@ def test_exclusive_scan_well_known_plus(): d_out=d_output, op=OpKind.PLUS, init_value=h_init, - num_items=d_input.size, + num_items=h_input.size, ) expected = np.asarray([1, 3, 7, 13, 21, 31], dtype=np.uint16) - np.testing.assert_array_equal(d_output.get(), expected) + np.testing.assert_array_equal(d_output.copy_to_host(), expected) def test_binary_transform_well_known_plus(): - d_lhs = cp.asarray([1.5, 2.5, 3.5, 4.5], dtype=np.float32) - d_rhs = cp.asarray([10.0, 20.0, 30.0, 40.0], dtype=np.float32) - d_output = cp.empty_like(d_lhs) + h_lhs = np.asarray([1.5, 2.5, 3.5, 4.5], dtype=np.float32) + h_rhs = np.asarray([10.0, 20.0, 30.0, 40.0], dtype=np.float32) + d_lhs = DeviceArray.from_numpy(h_lhs) + d_rhs = DeviceArray.from_numpy(h_rhs) + d_output = DeviceArray.empty(h_lhs.shape, h_lhs.dtype) cuda.compute.binary_transform( d_in1=d_lhs, d_in2=d_rhs, d_out=d_output, op=OpKind.PLUS, - num_items=d_lhs.size, + num_items=h_lhs.size, ) - np.testing.assert_allclose(d_output.get(), d_lhs.get() + d_rhs.get()) + np.testing.assert_allclose(d_output.copy_to_host(), h_lhs + h_rhs) def test_unary_transform_well_known_negate(): - d_input = cp.asarray([-4, -2, 0, 2, 4], dtype=np.int8) - d_output = cp.empty_like(d_input) + h_input = np.asarray([-4, -2, 0, 2, 4], dtype=np.int8) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(h_input.shape, h_input.dtype) cuda.compute.unary_transform( d_in=d_input, d_out=d_output, op=OpKind.NEGATE, - num_items=d_input.size, + num_items=h_input.size, ) - np.testing.assert_array_equal(d_output.get(), np.asarray([4, 2, 0, -2, -4])) + np.testing.assert_array_equal( + d_output.copy_to_host(), np.asarray([4, 2, 0, -2, -4]) + ) @pytest.mark.parametrize( @@ -184,28 +190,31 @@ def test_unary_transform_well_known_negate(): def test_binary_search_explicit_opkind_less(search, side): h_data = np.asarray([1, 3, 3, 7, 9, 11], dtype=np.int64) h_values = np.asarray([0, 3, 4, 10, 12], dtype=np.int64) - d_out = cp.empty(h_values.size, dtype=np.uintp) + d_out = DeviceArray.empty(h_values.shape, np.uintp) search( - d_data=cp.asarray(h_data), + d_data=DeviceArray.from_numpy(h_data), num_items=h_data.size, - d_values=cp.asarray(h_values), + d_values=DeviceArray.from_numpy(h_values), num_values=h_values.size, d_out=d_out, comp=OpKind.LESS, ) expected = np.searchsorted(h_data, h_values, side=side).astype(np.uintp) - np.testing.assert_array_equal(d_out.get(), expected) + np.testing.assert_array_equal(d_out.copy_to_host(), expected) def test_segmented_reduce_well_known_plus(monkeypatch): monkeypatch.setattr(cuda.compute._cccl_interop, "_check_sass", False) - d_input = cp.asarray([1, 2, 3, 4, 5, 6, 7, 8], dtype=np.uint32) - d_starts = cp.asarray([0, 3, 5], dtype=np.int32) - d_ends = cp.asarray([3, 5, 8], dtype=np.int32) - d_output = cp.empty(3, dtype=np.uint32) + h_input = np.asarray([1, 2, 3, 4, 5, 6, 7, 8], dtype=np.uint32) + h_starts = np.asarray([0, 3, 5], dtype=np.int32) + h_ends = np.asarray([3, 5, 8], dtype=np.int32) + d_input = DeviceArray.from_numpy(h_input) + d_starts = DeviceArray.from_numpy(h_starts) + d_ends = DeviceArray.from_numpy(h_ends) + d_output = DeviceArray.empty(3, np.uint32) h_init = np.array([0], dtype=np.uint32) cuda.compute.segmented_reduce( @@ -218,64 +227,65 @@ def test_segmented_reduce_well_known_plus(monkeypatch): h_init=h_init, ) - np.testing.assert_array_equal(d_output.get(), np.asarray([6, 9, 21])) + np.testing.assert_array_equal(d_output.copy_to_host(), np.asarray([6, 9, 21])) def test_merge_sort_well_known_less(): - d_input = cp.asarray([3.5, -1.0, 2.25, 2.0, 7.0], dtype=np.float64) - d_output = cp.empty_like(d_input) + h_input = np.asarray([3.5, -1.0, 2.25, 2.0, 7.0], dtype=np.float64) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(h_input.shape, h_input.dtype) cuda.compute.merge_sort( d_in_keys=d_input, d_in_values=None, d_out_keys=d_output, d_out_values=None, - num_items=d_input.size, + num_items=h_input.size, op=OpKind.LESS, ) - np.testing.assert_array_equal(d_output.get(), np.sort(d_input.get())) + np.testing.assert_array_equal(d_output.copy_to_host(), np.sort(h_input)) def test_radix_sort_key_value_pairs(): h_keys = np.asarray([4, -2, 7, 1, -2, 0], dtype=np.int16) h_values = np.asarray([40, 20, 70, 10, 21, 0], dtype=np.uint8) - d_out_keys = cp.empty_like(cp.asarray(h_keys)) - d_out_values = cp.empty_like(cp.asarray(h_values)) + d_out_keys = DeviceArray.empty(h_keys.shape, h_keys.dtype) + d_out_values = DeviceArray.empty(h_values.shape, h_values.dtype) cuda.compute.radix_sort( - d_in_keys=cp.asarray(h_keys), + d_in_keys=DeviceArray.from_numpy(h_keys), d_out_keys=d_out_keys, - d_in_values=cp.asarray(h_values), + d_in_values=DeviceArray.from_numpy(h_values), d_out_values=d_out_values, num_items=h_keys.size, order=SortOrder.ASCENDING, ) order = np.argsort(h_keys, stable=True) - np.testing.assert_array_equal(d_out_keys.get(), h_keys[order]) - np.testing.assert_array_equal(d_out_values.get(), h_values[order]) + np.testing.assert_array_equal(d_out_keys.copy_to_host(), h_keys[order]) + np.testing.assert_array_equal(d_out_values.copy_to_host(), h_values[order]) def test_segmented_sort_keys(): h_keys = np.asarray([3, 1, 2, 9, 7, 8, 6, 5], dtype=np.uint64) h_offsets = np.asarray([0, 3, 6, 8], dtype=np.int64) - d_output = cp.empty_like(cp.asarray(h_keys)) + d_output = DeviceArray.empty(h_keys.shape, h_keys.dtype) cuda.compute.segmented_sort( - d_in_keys=cp.asarray(h_keys), + d_in_keys=DeviceArray.from_numpy(h_keys), d_out_keys=d_output, d_in_values=None, d_out_values=None, num_items=h_keys.size, num_segments=h_offsets.size - 1, - start_offsets_in=cp.asarray(h_offsets[:-1]), - end_offsets_in=cp.asarray(h_offsets[1:]), + start_offsets_in=DeviceArray.from_numpy(h_offsets[:-1]), + end_offsets_in=DeviceArray.from_numpy(h_offsets[1:]), order=SortOrder.ASCENDING, ) expected = np.asarray([1, 2, 3, 7, 8, 9, 5, 6], dtype=np.uint64) - np.testing.assert_array_equal(d_output.get(), expected) + np.testing.assert_array_equal(d_output.copy_to_host(), expected) def test_unique_by_key_well_known_equal_to(monkeypatch): @@ -283,11 +293,13 @@ def test_unique_by_key_well_known_equal_to(monkeypatch): if cc_major >= 9: monkeypatch.setattr(cuda.compute._cccl_interop, "_check_sass", False) - d_keys = cp.asarray([1, 1, 2, 2, 2, 3, 4, 4], dtype=np.int16) - d_values = cp.asarray([10, 11, 20, 21, 22, 30, 40, 41], dtype=np.int8) - d_out_keys = cp.empty_like(d_keys) - d_out_values = cp.empty_like(d_values) - d_num_selected = cp.empty(1, dtype=np.int64) + h_keys = np.asarray([1, 1, 2, 2, 2, 3, 4, 4], dtype=np.int16) + h_values = np.asarray([10, 11, 20, 21, 22, 30, 40, 41], dtype=np.int8) + d_keys = DeviceArray.from_numpy(h_keys) + d_values = DeviceArray.from_numpy(h_values) + d_out_keys = DeviceArray.empty(h_keys.shape, h_keys.dtype) + d_out_values = DeviceArray.empty(h_values.shape, h_values.dtype) + d_num_selected = DeviceArray.empty(1, np.int64) cuda.compute.unique_by_key( d_in_keys=d_keys, @@ -296,20 +308,24 @@ def test_unique_by_key_well_known_equal_to(monkeypatch): d_out_items=d_out_values, d_out_num_selected=d_num_selected, op=OpKind.EQUAL_TO, - num_items=d_keys.size, + num_items=h_keys.size, ) - num_selected = int(d_num_selected.get()[0]) - np.testing.assert_array_equal(d_out_keys.get()[:num_selected], [1, 2, 3, 4]) - np.testing.assert_array_equal(d_out_values.get()[:num_selected], [10, 20, 30, 40]) + num_selected = int(d_num_selected.copy_to_host()[0]) + np.testing.assert_array_equal( + d_out_keys.copy_to_host()[:num_selected], [1, 2, 3, 4] + ) + np.testing.assert_array_equal( + d_out_values.copy_to_host()[:num_selected], [10, 20, 30, 40] + ) def test_histogram_even_small_range(): h_samples = np.asarray([0.5, 1.5, 2.5, 2.75, 3.0, 3.5], dtype=np.float32) - d_histogram = cp.empty(4, dtype=np.int32) + d_histogram = DeviceArray.empty(4, np.int32) cuda.compute.histogram_even( - d_samples=cp.asarray(h_samples), + d_samples=DeviceArray.from_numpy(h_samples), d_histogram=d_histogram, num_output_levels=5, lower_level=np.float32(0.0), @@ -318,35 +334,35 @@ def test_histogram_even_small_range(): ) expected, _ = np.histogram(h_samples, bins=4, range=(0.0, 4.0)) - np.testing.assert_array_equal(d_histogram.get(), expected.astype(np.int32)) + np.testing.assert_array_equal(d_histogram.copy_to_host(), expected.astype(np.int32)) def test_select_raw_op(): h_input = np.arange(12, dtype=np.int32) - d_output = cp.empty_like(cp.asarray(h_input)) - d_num_selected = cp.empty(1, dtype=np.uint64) + d_output = DeviceArray.empty(h_input.shape, h_input.dtype) + d_num_selected = DeviceArray.empty(1, np.uint64) cuda.compute.select( - d_in=cp.asarray(h_input), + d_in=DeviceArray.from_numpy(h_input), d_out=d_output, d_num_selected_out=d_num_selected, cond=_raw_even_i32_op(), num_items=h_input.size, ) - num_selected = int(d_num_selected.get()[0]) - np.testing.assert_array_equal(d_output.get()[:num_selected], h_input[::2]) + num_selected = int(d_num_selected.copy_to_host()[0]) + np.testing.assert_array_equal(d_output.copy_to_host()[:num_selected], h_input[::2]) def test_three_way_partition_raw_op(): h_input = np.arange(12, dtype=np.int32) - d_first = cp.empty_like(cp.asarray(h_input)) - d_second = cp.empty_like(cp.asarray(h_input)) - d_unselected = cp.empty_like(cp.asarray(h_input)) - d_num_selected = cp.empty(2, dtype=np.uint64) + d_first = DeviceArray.empty(h_input.shape, h_input.dtype) + d_second = DeviceArray.empty(h_input.shape, h_input.dtype) + d_unselected = DeviceArray.empty(h_input.shape, h_input.dtype) + d_num_selected = DeviceArray.empty(2, np.uint64) cuda.compute.three_way_partition( - d_in=cp.asarray(h_input), + d_in=DeviceArray.from_numpy(h_input), d_first_part_out=d_first, d_second_part_out=d_second, d_unselected_out=d_unselected, @@ -356,51 +372,57 @@ def test_three_way_partition_raw_op(): num_items=h_input.size, ) - selected = d_num_selected.get() + selected = d_num_selected.copy_to_host() first_count = int(selected[0]) second_count = int(selected[1]) unselected_count = h_input.size - first_count - second_count - np.testing.assert_array_equal(d_first.get()[:first_count], h_input[:4]) - np.testing.assert_array_equal(d_second.get()[:second_count], h_input[4:8]) - np.testing.assert_array_equal(d_unselected.get()[:unselected_count], h_input[8:]) + np.testing.assert_array_equal(d_first.copy_to_host()[:first_count], h_input[:4]) + np.testing.assert_array_equal(d_second.copy_to_host()[:second_count], h_input[4:8]) + np.testing.assert_array_equal( + d_unselected.copy_to_host()[:unselected_count], h_input[8:] + ) def test_raw_op_reduce(): h_input = np.asarray([10, 20, 30, 40], dtype=np.int64) - d_output = cp.empty(1, dtype=np.int64) + d_output = DeviceArray.empty(1, np.int64) cuda.compute.reduce_into( - d_in=cp.asarray(h_input), + d_in=DeviceArray.from_numpy(h_input), d_out=d_output, num_items=h_input.size, op=_raw_plus_i64_op(), h_init=np.array([5], dtype=np.int64), ) - assert d_output.get()[0] == 105 + assert d_output.copy_to_host()[0] == 105 def test_stream_argument(cuda_stream): - d_lhs = cp.asarray([2, 4, 6, 8, 10], dtype=np.int32) - d_rhs = cp.asarray([1, 3, 5, 7, 9], dtype=np.int32) - d_output = cp.empty_like(d_lhs) + h_lhs = np.asarray([2, 4, 6, 8, 10], dtype=np.int32) + h_rhs = np.asarray([1, 3, 5, 7, 9], dtype=np.int32) + d_lhs = DeviceArray.from_numpy(h_lhs, stream=cuda_stream) + d_rhs = DeviceArray.from_numpy(h_rhs, stream=cuda_stream) + d_output = DeviceArray.empty(h_lhs.shape, h_lhs.dtype, stream=cuda_stream) cuda.compute.binary_transform( d_in1=d_lhs, d_in2=d_rhs, d_out=d_output, op=OpKind.PLUS, - num_items=d_lhs.size, + num_items=h_lhs.size, stream=cuda_stream, ) - cp.cuda.Device().synchronize() - np.testing.assert_array_equal(d_output.get(), np.asarray([3, 7, 11, 15, 19])) + np.testing.assert_array_equal( + d_output.copy_to_host(stream=cuda_stream), + np.asarray([3, 7, 11, 15, 19]), + ) def test_counting_iterator_reduce(): - d_output = cp.empty(1, dtype=np.int32) + d_output = DeviceArray.empty(1, np.int32) cuda.compute.reduce_into( d_in=CountingIterator(np.int32(3)), @@ -410,11 +432,11 @@ def test_counting_iterator_reduce(): h_init=np.array([0], dtype=np.int32), ) - assert d_output.get()[0] == 52 + assert d_output.copy_to_host()[0] == 52 def test_constant_iterator_reduce(): - d_output = cp.empty(1, dtype=np.float32) + d_output = DeviceArray.empty(1, np.float32) cuda.compute.reduce_into( d_in=ConstantIterator(np.float32(1.5)), @@ -424,73 +446,82 @@ def test_constant_iterator_reduce(): h_init=np.array([0], dtype=np.float32), ) - np.testing.assert_allclose(d_output.get()[0], np.float32(12.0)) + np.testing.assert_allclose(d_output.copy_to_host()[0], np.float32(12.0)) def test_cache_modified_input_iterator_reduce(): - d_input = cp.asarray([2, 4, 6, 8, 10], dtype=np.uint16) - d_output = cp.empty(1, dtype=np.uint16) + h_input = np.asarray([2, 4, 6, 8, 10], dtype=np.uint16) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, np.uint16) iterator = CacheModifiedInputIterator(d_input, modifier="stream") cuda.compute.reduce_into( d_in=iterator, d_out=d_output, - num_items=d_input.size, + num_items=h_input.size, op=OpKind.PLUS, h_init=np.array([0], dtype=np.uint16), ) - assert d_output.get()[0] == 30 + assert d_output.copy_to_host()[0] == 30 def test_reverse_input_iterator_scan(): - d_input = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32) - d_output = cp.empty_like(d_input) + h_input = np.asarray([1, 2, 3, 4, 5], dtype=np.int32) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(h_input.shape, h_input.dtype) cuda.compute.inclusive_scan( d_in=ReverseIterator(d_input), d_out=d_output, op=OpKind.PLUS, init_value=np.array([0], dtype=np.int32), - num_items=d_input.size, + num_items=h_input.size, ) - np.testing.assert_array_equal(d_output.get(), np.asarray([5, 9, 12, 14, 15])) + np.testing.assert_array_equal( + d_output.copy_to_host(), np.asarray([5, 9, 12, 14, 15]) + ) def test_reverse_output_iterator_scan(): - d_input = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32) - d_output = cp.empty_like(d_input) + h_input = np.asarray([1, 2, 3, 4, 5], dtype=np.int32) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(h_input.shape, h_input.dtype) cuda.compute.inclusive_scan( d_in=d_input, d_out=ReverseIterator(d_output), op=OpKind.PLUS, init_value=np.array([0], dtype=np.int32), - num_items=d_input.size, + num_items=h_input.size, ) - np.testing.assert_array_equal(d_output.get(), np.asarray([15, 10, 6, 3, 1])) + np.testing.assert_array_equal( + d_output.copy_to_host(), np.asarray([15, 10, 6, 3, 1]) + ) def test_permutation_iterator_reduce(): - d_values = cp.asarray([10, 20, 30, 40, 50, 60], dtype=np.int64) - d_indices = cp.asarray([4, 2, 5, 1], dtype=np.int32) - d_output = cp.empty(1, dtype=np.int64) + h_values = np.asarray([10, 20, 30, 40, 50, 60], dtype=np.int64) + h_indices = np.asarray([4, 2, 5, 1], dtype=np.int32) + d_values = DeviceArray.from_numpy(h_values) + d_indices = DeviceArray.from_numpy(h_indices) + d_output = DeviceArray.empty(1, np.int64) cuda.compute.reduce_into( d_in=PermutationIterator(d_values, d_indices), d_out=d_output, - num_items=d_indices.size, + num_items=h_indices.size, op=OpKind.PLUS, h_init=np.array([0], dtype=np.int64), ) - assert d_output.get()[0] == 160 + assert d_output.copy_to_host()[0] == 160 def test_transform_iterator_reduce(): - d_output = cp.empty(1, dtype=np.int32) + d_output = DeviceArray.empty(1, np.int32) iterator = TransformIterator( CountingIterator(np.int32(1)), _raw_square_i32_op(), value_type=cccl_int32 ) @@ -503,12 +534,13 @@ def test_transform_iterator_reduce(): h_init=np.array([0], dtype=np.int32), ) - assert d_output.get()[0] == 91 + assert d_output.copy_to_host()[0] == 91 def test_transform_output_iterator_reduce(): - d_input = cp.asarray([1, 2, 3, 4], dtype=np.int16) - d_output = cp.empty(1, dtype=np.int16) + h_input = np.asarray([1, 2, 3, 4], dtype=np.int16) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, np.int16) output_iterator = TransformOutputIterator( d_output, _raw_negate_i16_op(), output_value_type=cccl_int16 ) @@ -516,32 +548,34 @@ def test_transform_output_iterator_reduce(): cuda.compute.reduce_into( d_in=d_input, d_out=output_iterator, - num_items=d_input.size, + num_items=h_input.size, op=OpKind.PLUS, h_init=np.array([0], dtype=np.int16), ) - assert d_output.get()[0] == -10 + assert d_output.copy_to_host()[0] == -10 def test_zip_iterator_transform(): - d_lhs = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32) - d_rhs = cp.asarray([10, 20, 30, 40, 50], dtype=np.int32) - d_output = cp.empty_like(d_lhs) + h_lhs = np.asarray([1, 2, 3, 4, 5], dtype=np.int32) + h_rhs = np.asarray([10, 20, 30, 40, 50], dtype=np.int32) + d_lhs = DeviceArray.from_numpy(h_lhs) + d_rhs = DeviceArray.from_numpy(h_rhs) + d_output = DeviceArray.empty(h_lhs.shape, h_lhs.dtype) cuda.compute.unary_transform( d_in=ZipIterator(d_lhs, d_rhs), d_out=d_output, op=_raw_zip_sum_i32_op(), - num_items=d_lhs.size, + num_items=h_lhs.size, ) - np.testing.assert_array_equal(d_output.get(), d_lhs.get() + d_rhs.get()) + np.testing.assert_array_equal(d_output.copy_to_host(), h_lhs + h_rhs) def test_shuffle_iterator_transform(): num_items = 17 - d_output = cp.empty(num_items, dtype=np.int64) + d_output = DeviceArray.empty(num_items, np.int64) cuda.compute.unary_transform( d_in=ShuffleIterator(num_items, seed=123), @@ -550,19 +584,23 @@ def test_shuffle_iterator_transform(): num_items=num_items, ) - result = d_output.get() + result = d_output.copy_to_host() assert sorted(result.tolist()) == list(range(num_items)) def test_discard_iterator_transform(): - d_input = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32) - d_reference = cp.full_like(d_input, -1) + h_input = np.asarray([1, 2, 3, 4, 5], dtype=np.int32) + h_reference = np.full_like(h_input, -1) + d_input = DeviceArray.from_numpy(h_input) + d_reference = DeviceArray.from_numpy(h_reference) cuda.compute.unary_transform( d_in=d_input, d_out=DiscardIterator(d_reference), op=OpKind.IDENTITY, - num_items=d_input.size, + num_items=h_input.size, ) - np.testing.assert_array_equal(d_reference.get(), np.full(5, -1, dtype=np.int32)) + np.testing.assert_array_equal( + d_reference.copy_to_host(), np.full(5, -1, dtype=np.int32) + ) diff --git a/python/cuda_cccl/tests/compute/test_permutation_iterator.py b/python/cuda_cccl/tests/compute/test_permutation_iterator.py index 74d74a7b4e6..862f058f56b 100644 --- a/python/cuda_cccl/tests/compute/test_permutation_iterator.py +++ b/python/cuda_cccl/tests/compute/test_permutation_iterator.py @@ -1,8 +1,8 @@ # Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import cupy as cp import numpy as np +from _utils.device_array import DeviceArray import cuda.compute from cuda.compute.iterators import ( @@ -13,65 +13,68 @@ def test_permutation_iterator_equality(): - values1 = cp.asarray([10, 20, 30, 40, 50], dtype="int32") - values2 = cp.asarray([100, 200, 300], dtype="int32") - values3 = cp.asarray([10, 20, 30, 40, 50], dtype="int64") + d_values1 = DeviceArray.from_numpy(np.asarray([10, 20, 30, 40, 50], dtype="int32")) + d_values2 = DeviceArray.from_numpy(np.asarray([100, 200, 300], dtype="int32")) + d_values3 = DeviceArray.from_numpy(np.asarray([10, 20, 30, 40, 50], dtype="int64")) - indices1 = cp.asarray([0, 2, 1], dtype="int32") - indices2 = cp.asarray([1, 0, 2], dtype="int32") - indices3 = cp.asarray([0, 2, 1], dtype="int64") + d_indices1 = DeviceArray.from_numpy(np.asarray([0, 2, 1], dtype="int32")) + d_indices2 = DeviceArray.from_numpy(np.asarray([1, 0, 2], dtype="int32")) + d_indices3 = DeviceArray.from_numpy(np.asarray([0, 2, 1], dtype="int64")) # Same value and index types should have same kind - it1 = PermutationIterator(values1, indices1) - it2 = PermutationIterator(values1, indices2) - it3 = PermutationIterator(values2, indices1) + it1 = PermutationIterator(d_values1, d_indices1) + it2 = PermutationIterator(d_values1, d_indices2) + it3 = PermutationIterator(d_values2, d_indices1) assert it1.kind == it2.kind == it3.kind # Different value type should have different kind - it4 = PermutationIterator(values3, indices1) + it4 = PermutationIterator(d_values3, d_indices1) assert it1.kind != it4.kind # Different index type should have different kind - it5 = PermutationIterator(values1, indices3) + it5 = PermutationIterator(d_values1, d_indices3) assert it1.kind != it5.kind def test_permutation_iterator_with_array_values(): - values = cp.asarray([10, 20, 30, 40, 50], dtype="int32") - indices = cp.asarray([2, 0, 4, 1], dtype="int32") - perm_it = PermutationIterator(values, indices) + h_values = np.asarray([10, 20, 30, 40, 50], dtype="int32") + h_indices = np.asarray([2, 0, 4, 1], dtype="int32") + d_values = DeviceArray.from_numpy(h_values) + d_indices = DeviceArray.from_numpy(h_indices) + perm_it = PermutationIterator(d_values, d_indices) h_init = np.array([0], dtype="int32") - d_output = cp.empty(1, dtype="int32") + d_output = DeviceArray.empty(1, np.int32) cuda.compute.reduce_into( d_in=perm_it, d_out=d_output, - num_items=len(indices), + num_items=len(h_indices), op=cuda.compute.OpKind.PLUS, h_init=h_init, ) - assert d_output[0] == values[indices].sum() + assert d_output.copy_to_host()[0] == h_values[h_indices].sum() def test_permutation_iterator_with_iterator_values(): values_it = CountingIterator(np.int32(10)) - indices = cp.asarray([2, 0, 4, 1], dtype="int32") - perm_it = PermutationIterator(values_it, indices) + h_indices = np.asarray([2, 0, 4, 1], dtype="int32") + d_indices = DeviceArray.from_numpy(h_indices) + perm_it = PermutationIterator(values_it, d_indices) h_init = np.array([0], dtype="int32") - d_output = cp.empty(1, dtype="int32") + d_output = DeviceArray.empty(1, np.int32) cuda.compute.reduce_into( d_in=perm_it, d_out=d_output, - num_items=len(indices), + num_items=len(h_indices), op=cuda.compute.OpKind.PLUS, h_init=h_init, ) - expected = cp.arange(10, 20)[indices].sum() - assert d_output[0] == expected + expected = np.arange(10, 20)[h_indices].sum() + assert d_output.copy_to_host()[0] == expected def test_permutation_iterator_of_zip_iterator(): @@ -80,29 +83,32 @@ class Pair: value_0: np.int32 value_1: np.int32 - d_values1 = cp.asarray([10, 20, 30, 40, 50], dtype="int32") - d_values2 = cp.asarray([1, 2, 3, 4, 5], dtype="int32") + h_values1 = np.asarray([10, 20, 30, 40, 50], dtype="int32") + h_values2 = np.asarray([1, 2, 3, 4, 5], dtype="int32") + d_values1 = DeviceArray.from_numpy(h_values1) + d_values2 = DeviceArray.from_numpy(h_values2) zip_it = ZipIterator(d_values1, d_values2) - indices = cp.asarray([2, 0, 4], dtype="int32") - perm_it = PermutationIterator(zip_it, indices) + h_indices = np.asarray([2, 0, 4], dtype="int32") + d_indices = DeviceArray.from_numpy(h_indices) + perm_it = PermutationIterator(zip_it, d_indices) def sum_both_fields(a, b): return Pair(a.value_0 + b.value_0, a.value_1 + b.value_1) h_init = Pair(0, 0) - d_output = cp.empty(1, dtype=Pair.dtype) + d_output = DeviceArray.empty(1, Pair.dtype) cuda.compute.reduce_into( d_in=perm_it, d_out=d_output, - num_items=len(indices), + num_items=len(h_indices), op=sum_both_fields, h_init=h_init, ) - result = d_output.get()[0] - assert result["value_0"] == d_values1[indices].sum() - assert result["value_1"] == d_values2[indices].sum() + result = d_output.copy_to_host()[0] + assert result["value_0"] == h_values1[h_indices].sum() + assert result["value_1"] == h_values2[h_indices].sum() def test_zip_iterator_of_permutation_iterators(): @@ -111,12 +117,16 @@ class Pair: value_0: np.int32 value_1: np.int32 - d_values1 = cp.asarray([10, 20, 30, 40, 50], dtype="int32") - d_values2 = cp.asarray([100, 200, 300, 400, 500], dtype="int32") - indices1 = cp.asarray([4, 1, 3, 0], dtype="int32") - indices2 = cp.asarray([2, 4, 0, 1], dtype="int32") - perm_it1 = PermutationIterator(d_values1, indices1) - perm_it2 = PermutationIterator(d_values2, indices2) + h_values1 = np.asarray([10, 20, 30, 40, 50], dtype="int32") + h_values2 = np.asarray([100, 200, 300, 400, 500], dtype="int32") + h_indices1 = np.asarray([4, 1, 3, 0], dtype="int32") + h_indices2 = np.asarray([2, 4, 0, 1], dtype="int32") + d_values1 = DeviceArray.from_numpy(h_values1) + d_values2 = DeviceArray.from_numpy(h_values2) + d_indices1 = DeviceArray.from_numpy(h_indices1) + d_indices2 = DeviceArray.from_numpy(h_indices2) + perm_it1 = PermutationIterator(d_values1, d_indices1) + perm_it2 = PermutationIterator(d_values2, d_indices2) zip_it = ZipIterator(perm_it1, perm_it2) @@ -124,9 +134,9 @@ def sum_both_fields(a, b): return Pair(a.value_0 + b.value_0, a.value_1 + b.value_1) h_init = Pair(0, 0) - d_output = cp.empty(1, dtype=Pair.dtype) + d_output = DeviceArray.empty(1, Pair.dtype) - num_items = len(indices1) + num_items = len(h_indices1) cuda.compute.reduce_into( d_in=zip_it, d_out=d_output, @@ -135,26 +145,28 @@ def sum_both_fields(a, b): h_init=h_init, ) - result = d_output.get()[0] - assert result["value_0"] == d_values1[indices1].sum() - assert result["value_1"] == d_values2[indices2].sum() + result = d_output.copy_to_host()[0] + assert result["value_0"] == h_values1[h_indices1].sum() + assert result["value_1"] == h_values2[h_indices2].sum() def test_unary_transform_of_permutation_iterator(): - values = cp.asarray([10, 20, 30, 40, 50], dtype="int32") - indices = cp.asarray([2, 0, 4, 1], dtype="int32") - perm_it = PermutationIterator(values, indices) + h_values = np.asarray([10, 20, 30, 40, 50], dtype="int32") + h_indices = np.asarray([2, 0, 4, 1], dtype="int32") + d_values = DeviceArray.from_numpy(h_values) + d_indices = DeviceArray.from_numpy(h_indices) + perm_it = PermutationIterator(d_values, d_indices) def op(a): return a + 1 - d_out = cp.empty_like(values, shape=(len(indices),)) + d_out = DeviceArray.empty(len(h_indices), h_values.dtype) cuda.compute.unary_transform( - d_in=perm_it, d_out=d_out, op=op, num_items=len(indices) + d_in=perm_it, d_out=d_out, op=op, num_items=len(h_indices) ) - expected = values[indices] + 1 - assert cp.all(d_out == expected) + expected = h_values[h_indices] + 1 + np.testing.assert_array_equal(d_out.copy_to_host(), expected) def test_caching_permutation_iterator(): @@ -163,22 +175,26 @@ def test_caching_permutation_iterator(): # Test 1: Same structure → same kind it1 = PermutationIterator( - cp.arange(10, dtype=np.int32), cp.arange(10, dtype=np.int32) + DeviceArray.from_numpy(np.arange(10, dtype=np.int32)), + DeviceArray.from_numpy(np.arange(10, dtype=np.int32)), ) it2 = PermutationIterator( - cp.arange(20, dtype=np.int32), cp.arange(5, dtype=np.int32) + DeviceArray.from_numpy(np.arange(20, dtype=np.int32)), + DeviceArray.from_numpy(np.arange(5, dtype=np.int32)), ) assert it1.kind == it2.kind, "Same structure should have same kind" # Test 2: Different index type → different kind it3 = PermutationIterator( - cp.arange(10, dtype=np.int32), cp.arange(10, dtype=np.int64) + DeviceArray.from_numpy(np.arange(10, dtype=np.int32)), + DeviceArray.from_numpy(np.arange(10, dtype=np.int64)), ) assert it1.kind != it3.kind, "Different index type should have different kind" # Test 3: Different value type → different kind it4 = PermutationIterator( - cp.arange(10, dtype=np.int64), cp.arange(10, dtype=np.int32) + DeviceArray.from_numpy(np.arange(10, dtype=np.int64)), + DeviceArray.from_numpy(np.arange(10, dtype=np.int32)), ) assert it1.kind != it4.kind, "Different value type should have different kind" @@ -189,8 +205,8 @@ def test_caching_permutation_iterator(): iterators = [] for i in range(3): it = PermutationIterator( - cp.arange(i * 10, (i + 1) * 10, dtype=np.float32), - cp.arange(5, dtype=np.int32), + DeviceArray.from_numpy(np.arange(i * 10, (i + 1) * 10, dtype=np.float32)), + DeviceArray.from_numpy(np.arange(5, dtype=np.int32)), ) # Trigger compilation by accessing Op objects it.get_advance_op() @@ -207,7 +223,8 @@ def test_caching_permutation_iterator(): def test_permutation_iterator_advance(): """Test PermutationIterator.__add__ only advances indices, not values.""" # Create values array [10, 20, 30, 40, 50, 60, 70] - values = cp.asarray([10, 20, 30, 40, 50, 60, 70], dtype="int32") + h_values = np.asarray([10, 20, 30, 40, 50, 60, 70], dtype="int32") + d_values = DeviceArray.from_numpy(h_values) # Create indices array [2, 0, 4, 1, 3, 5] # indices[0] = 2 -> values[2] = 30 @@ -216,9 +233,10 @@ def test_permutation_iterator_advance(): # indices[3] = 1 -> values[1] = 20 # indices[4] = 3 -> values[3] = 40 # indices[5] = 5 -> values[5] = 60 - indices = cp.asarray([2, 0, 4, 1, 3, 5], dtype="int32") + h_indices = np.asarray([2, 0, 4, 1, 3, 5], dtype="int32") + d_indices = DeviceArray.from_numpy(h_indices) - perm_it = PermutationIterator(values, indices) + perm_it = PermutationIterator(d_values, d_indices) # Advance by 2 positions (should skip first 2 indices) offset = 2 @@ -228,9 +246,9 @@ def test_permutation_iterator_advance(): # Should process indices[2:] = [4, 1, 3, 5] # Which accesses values[4, 1, 3, 5] = [50, 20, 40, 60] h_init = np.array([0], dtype="int32") - d_output = cp.empty(1, dtype="int32") + d_output = DeviceArray.empty(1, np.int32) - remaining_items = len(indices) - offset + remaining_items = len(h_indices) - offset cuda.compute.reduce_into( d_in=advanced_perm_it, d_out=d_output, @@ -240,7 +258,6 @@ def test_permutation_iterator_advance(): ) # Expected: values[indices[2:]] = values[[4, 1, 3, 5]] = [50, 20, 40, 60] - expected = values[indices[offset:]].sum().get() - assert d_output[0].get() == expected, ( - f"Expected {expected}, got {d_output[0].get()}" - ) + expected = h_values[h_indices[offset:]].sum() + result = d_output.copy_to_host()[0] + assert result == expected, f"Expected {expected}, got {result}" diff --git a/python/cuda_cccl/tests/compute/test_radix_sort.py b/python/cuda_cccl/tests/compute/test_radix_sort.py index 62b8e7dcb44..61e407c556b 100644 --- a/python/cuda_cccl/tests/compute/test_radix_sort.py +++ b/python/cuda_cccl/tests/compute/test_radix_sort.py @@ -5,10 +5,9 @@ import itertools from typing import Tuple -import cupy as cp -import numba import numpy as np import pytest +from _utils.device_array import DeviceArray, get_compute_capability import cuda.compute from cuda.compute import ( @@ -148,7 +147,7 @@ def host_sort(h_in_keys, h_in_values, order, begin_bit=None, end_bit=None) -> Tu DTYPE_SIZE, ) def test_radix_sort_keys(dtype, num_items, monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = get_compute_capability() # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -164,8 +163,8 @@ def test_radix_sort_keys(dtype, num_items, monkeypatch): h_in_keys = random_array(num_items, dtype, max_value=20) h_out_keys = np.empty(num_items, dtype=dtype) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_out_keys = numba.cuda.to_device(h_out_keys) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_out_keys = DeviceArray.empty(h_out_keys.shape, h_out_keys.dtype) radix_sort_device(d_in_keys, d_out_keys, None, None, order, num_items) @@ -195,10 +194,10 @@ def test_radix_sort_pairs(dtype, num_items, monkeypatch): h_out_keys = np.empty(num_items, dtype=dtype) h_out_values = np.empty(num_items, dtype=np.float32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_values = numba.cuda.to_device(h_in_values) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_values = numba.cuda.to_device(h_out_values) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_in_values = DeviceArray.from_numpy(h_in_values) + d_out_keys = DeviceArray.empty(h_out_keys.shape, h_out_keys.dtype) + d_out_values = DeviceArray.empty(h_out_values.shape, h_out_values.dtype) radix_sort_device( d_in_keys, d_out_keys, d_in_values, d_out_values, order, num_items @@ -218,7 +217,7 @@ def test_radix_sort_pairs(dtype, num_items, monkeypatch): DTYPE_SIZE, ) def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = get_compute_capability() # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -234,8 +233,8 @@ def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch): h_in_keys = random_array(num_items, dtype, max_value=20) h_out_keys = np.empty(num_items, dtype=dtype) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_out_keys = numba.cuda.to_device(h_out_keys) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_out_keys = DeviceArray.empty(h_out_keys.shape, h_out_keys.dtype) keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys) @@ -253,7 +252,7 @@ def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch): DTYPE_SIZE, ) def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = get_compute_capability() # NOTE: int16 failures seen only with NVRTC 13.1: if cc_major >= 9 or np.isdtype(dtype, (np.int16, np.uint32)): import cuda.compute._cccl_interop @@ -270,10 +269,10 @@ def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch): h_out_keys = np.empty(num_items, dtype=dtype) h_out_values = np.empty(num_items, dtype=np.float32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_values = numba.cuda.to_device(h_in_values) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_values = numba.cuda.to_device(h_out_values) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_in_values = DeviceArray.from_numpy(h_in_values) + d_out_keys = DeviceArray.empty(h_out_keys.shape, h_out_keys.dtype) + d_out_values = DeviceArray.empty(h_out_values.shape, h_out_values.dtype) keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys) values_double_buffer = DoubleBuffer(d_in_values, d_out_values) @@ -304,7 +303,7 @@ def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch): DTYPE_SIZE_BIT_WINDOW, ) def test_radix_sort_pairs_bit_window(dtype, num_items, monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = get_compute_capability() # NOTE: int16 failures seen only with NVRTC 13.1: if cc_major >= 9 or np.isdtype(dtype, (np.int16, np.uint32)): import cuda.compute._cccl_interop @@ -329,10 +328,10 @@ def test_radix_sort_pairs_bit_window(dtype, num_items, monkeypatch): h_out_keys = np.empty(num_items, dtype=dtype) h_out_values = np.empty(num_items, dtype=np.float32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_values = numba.cuda.to_device(h_in_values) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_values = numba.cuda.to_device(h_out_values) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_in_values = DeviceArray.from_numpy(h_in_values) + d_out_keys = DeviceArray.empty(h_out_keys.shape, h_out_keys.dtype) + d_out_values = DeviceArray.empty(h_out_values.shape, h_out_values.dtype) radix_sort_device( d_in_keys, @@ -384,10 +383,10 @@ def test_radix_sort_pairs_double_buffer_bit_window(dtype, num_items, monkeypatch h_out_keys = np.empty(num_items, dtype=dtype) h_out_values = np.empty(num_items, dtype=np.float32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_values = numba.cuda.to_device(h_in_values) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_values = numba.cuda.to_device(h_out_values) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_in_values = DeviceArray.from_numpy(h_in_values) + d_out_keys = DeviceArray.empty(h_out_keys.shape, h_out_keys.dtype) + d_out_values = DeviceArray.empty(h_out_values.shape, h_out_values.dtype) keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys) values_double_buffer = DoubleBuffer(d_in_values, d_out_values) @@ -434,8 +433,8 @@ def test_radix_sort_large_num_items(dtype, monkeypatch): h_in_keys = np.arange(num_items - 1, -1, -1, dtype=dtype) - d_in_keys = cp.asarray(h_in_keys) - d_out_keys = cp.empty(num_items, dtype=dtype) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_out_keys = DeviceArray.empty(num_items, dtype) cuda.compute.radix_sort( d_in_keys=d_in_keys, @@ -446,31 +445,37 @@ def test_radix_sort_large_num_items(dtype, monkeypatch): order=SortOrder.ASCENDING, ) - h_out_keys = d_out_keys.get() + h_out_keys = d_out_keys.copy_to_host() h_expected, _ = host_sort(h_in_keys, None, SortOrder.ASCENDING) np.testing.assert_array_equal(h_out_keys, h_expected) def test_radix_sort_with_stream(cuda_stream): - cp_stream = cp.cuda.ExternalStream(cuda_stream.ptr) num_items = 10000 - with cp_stream: - h_in_keys = random_array(num_items, np.int32) - d_in_keys = cp.asarray(h_in_keys) - d_out_keys = cp.empty_like(d_in_keys) + h_in_keys = random_array(num_items, np.int32) + d_in_keys = DeviceArray.from_numpy(h_in_keys, stream=cuda_stream) + d_out_keys = DeviceArray.empty(h_in_keys.shape, h_in_keys.dtype, stream=cuda_stream) - radix_sort_device(d_in_keys, d_out_keys, None, None, SortOrder.ASCENDING, num_items) + radix_sort_device( + d_in_keys, + d_out_keys, + None, + None, + SortOrder.ASCENDING, + num_items, + stream=cuda_stream, + ) - got = d_out_keys.get() + got = d_out_keys.copy_to_host(stream=cuda_stream) h_in_keys.sort() np.testing.assert_array_equal(got, h_in_keys) def test_radix_sort(monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = get_compute_capability() # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -482,19 +487,16 @@ def test_radix_sort(monkeypatch): False, ) - import cupy as cp - import numpy as np - h_in_keys = np.array([-5, 0, 2, -3, 2, 4, 0, -1, 2, 8], dtype="int32") h_in_values = np.array( [-3.2, 2.2, 1.9, 4.0, -3.9, 2.7, 0, 8.3 - 1, 2.9, 5.4], dtype="float32" ) - d_in_keys = cp.asarray(h_in_keys) - d_in_values = cp.asarray(h_in_values) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_in_values = DeviceArray.from_numpy(h_in_values) - d_out_keys = cp.empty_like(d_in_keys) - d_out_values = cp.empty_like(d_in_values) + d_out_keys = DeviceArray.empty(h_in_keys.shape, h_in_keys.dtype) + d_out_values = DeviceArray.empty(h_in_values.shape, h_in_values.dtype) # Call single-phase API directly with num_items parameter cuda.compute.radix_sort( @@ -502,13 +504,13 @@ def test_radix_sort(monkeypatch): d_out_keys=d_out_keys, d_in_values=d_in_values, d_out_values=d_out_values, - num_items=d_in_keys.size, + num_items=h_in_keys.size, order=SortOrder.ASCENDING, ) # Check the result is correct - h_out_keys = cp.asnumpy(d_out_keys) - h_out_items = cp.asnumpy(d_out_values) + h_out_keys = d_out_keys.copy_to_host() + h_out_items = d_out_values.copy_to_host() argsort = np.argsort(h_in_keys, stable=True) h_in_keys = np.array(h_in_keys)[argsort] @@ -519,7 +521,7 @@ def test_radix_sort(monkeypatch): def test_radix_sort_double_buffer(monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = get_compute_capability() # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -531,19 +533,16 @@ def test_radix_sort_double_buffer(monkeypatch): False, ) - import cupy as cp - import numpy as np - h_in_keys = np.array([-5, 0, 2, -3, 2, 4, 0, -1, 2, 8], dtype="int32") h_in_values = np.array( [-3.2, 2.2, 1.9, 4.0, -3.9, 2.7, 0, 8.3 - 1, 2.9, 5.4], dtype="float32" ) - d_in_keys = cp.asarray(h_in_keys) - d_in_values = cp.asarray(h_in_values) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_in_values = DeviceArray.from_numpy(h_in_values) - d_out_keys = cp.empty_like(d_in_keys) - d_out_values = cp.empty_like(d_in_values) + d_out_keys = DeviceArray.empty(h_in_keys.shape, h_in_keys.dtype) + d_out_values = DeviceArray.empty(h_in_values.shape, h_in_values.dtype) keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys) values_double_buffer = DoubleBuffer(d_in_values, d_out_values) @@ -554,13 +553,13 @@ def test_radix_sort_double_buffer(monkeypatch): d_out_keys=None, d_in_values=values_double_buffer, d_out_values=None, - num_items=d_in_keys.size, + num_items=h_in_keys.size, order=SortOrder.ASCENDING, ) # Check the result is correct - h_out_keys = cp.asnumpy(keys_double_buffer.current()) - h_out_values = cp.asnumpy(values_double_buffer.current()) + h_out_keys = keys_double_buffer.current().copy_to_host() + h_out_values = values_double_buffer.current().copy_to_host() argsort = np.argsort(h_in_keys, stable=True) h_in_keys = np.array(h_in_keys)[argsort] diff --git a/python/cuda_cccl/tests/compute/test_raw_op.py b/python/cuda_cccl/tests/compute/test_raw_op.py index 9ad88fe6182..b8544658a01 100644 --- a/python/cuda_cccl/tests/compute/test_raw_op.py +++ b/python/cuda_cccl/tests/compute/test_raw_op.py @@ -6,15 +6,15 @@ import re import struct -import cupy as cp import numpy as np import pytest +from _utils.device_array import DeviceArray, get_compute_capability import cuda.compute from cuda.compute import types from cuda.compute._cpp_compile import _get_include_paths from cuda.compute.op import RawOp -from cuda.core import Device, Program, ProgramOptions +from cuda.core import Program, ProgramOptions # Mark all tests in this module as no_numba pytestmark = pytest.mark.no_numba @@ -22,9 +22,7 @@ def get_arch(): """Get the SM architecture string for the current device.""" - device = Device() - device.set_current() - cc_major, cc_minor = device.compute_capability + cc_major, cc_minor = get_compute_capability() return f"sm_{cc_major}{cc_minor}" @@ -115,8 +113,8 @@ def test_cpp_op_basic_add(): # Create test data num_items = 100 h_input = np.arange(num_items, dtype=np.int32) - d_input = cp.array(h_input) - d_output = cp.empty(1, dtype=np.int32) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, np.int32) # Use the custom op with reduce_into h_init = np.array(0, dtype=np.int32) @@ -125,7 +123,7 @@ def test_cpp_op_basic_add(): ) # Verify result - result = d_output.get()[0] + result = d_output.copy_to_host()[0] expected = np.sum(h_input) assert result == expected, f"Expected {expected}, got {result}" @@ -148,8 +146,8 @@ def test_cpp_op_max(): # Create test data num_items = 100 h_input = np.random.randn(num_items).astype(np.float32) - d_input = cp.array(h_input) - d_output = cp.empty(1, dtype=np.float32) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, np.float32) # Use the custom op with reduce_into h_init = np.array(-np.inf, dtype=np.float32) @@ -158,7 +156,7 @@ def test_cpp_op_max(): ) # Verify result - result = d_output.get()[0] + result = d_output.copy_to_host()[0] expected = np.max(h_input) assert np.isclose(result, expected), f"Expected {expected}, got {result}" @@ -176,8 +174,8 @@ def test_cpp_op_multiply(): # Create test data - use small numbers to avoid overflow num_items = 5 h_input = np.array([1, 2, 3, 4, 5], dtype=np.int32) - d_input = cp.array(h_input) - d_output = cp.empty(1, dtype=np.int32) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, np.int32) # Use the custom op with reduce_into h_init = np.array(1, dtype=np.int32) @@ -186,7 +184,7 @@ def test_cpp_op_multiply(): ) # Verify result - result = d_output.get()[0] + result = d_output.copy_to_host()[0] expected = np.prod(h_input) assert result == expected, f"Expected {expected}, got {result}" @@ -207,8 +205,8 @@ def test_cpp_op_complex_logic(): # Create test data with specific bit patterns num_items = 5 h_input = np.array([1, 2, 4, 8, 16], dtype=np.int32) # Powers of 2 - d_input = cp.array(h_input) - d_output = cp.empty(1, dtype=np.int32) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, np.int32) # Use the custom op with reduce_into h_init = np.array(0, dtype=np.int32) @@ -217,7 +215,7 @@ def test_cpp_op_complex_logic(): ) # Expected: 1 | 2 | 4 | 8 | 16 = 31 (all bits set) - result = d_output.get()[0] + result = d_output.copy_to_host()[0] expected = 31 assert result == expected, f"Expected {expected}, got {result}" @@ -235,8 +233,8 @@ def test_cpp_op_different_types(): # Create test data num_items = 50 h_input = np.random.randn(num_items).astype(np.float64) - d_input = cp.array(h_input) - d_output = cp.empty(1, dtype=np.float64) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, np.float64) # Use the custom op with reduce_into h_init = np.array(0.0, dtype=np.float64) @@ -245,7 +243,7 @@ def test_cpp_op_different_types(): ) # Verify result - result = d_output.get()[0] + result = d_output.copy_to_host()[0] expected = np.sum(h_input) assert np.isclose(result, expected), f"Expected {expected}, got {result}" @@ -264,8 +262,8 @@ def test_cpp_op_name_extraction(): # Create test data num_items = 10 h_input = np.arange(num_items, dtype=np.int32) - d_input = cp.array(h_input) - d_output = cp.empty(1, dtype=np.int32) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, np.int32) # Use the custom op with reduce_into h_init = np.array(0, dtype=np.int32) @@ -274,7 +272,7 @@ def test_cpp_op_name_extraction(): ) # Verify result - result = d_output.get()[0] + result = d_output.copy_to_host()[0] expected = np.sum(h_input) assert result == expected, f"Expected {expected}, got {result}" @@ -294,8 +292,8 @@ def test_cpp_op_min(): # Create test data num_items = 100 h_input = np.random.randint(-1000, 1000, num_items, dtype=np.int32) - d_input = cp.array(h_input) - d_output = cp.empty(1, dtype=np.int32) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, np.int32) # Use the custom op with reduce_into h_init = np.array(np.iinfo(np.int32).max, dtype=np.int32) @@ -304,7 +302,7 @@ def test_cpp_op_min(): ) # Verify result - result = d_output.get()[0] + result = d_output.copy_to_host()[0] expected = np.min(h_input) assert result == expected, f"Expected {expected}, got {result}" @@ -341,14 +339,8 @@ def test_cpp_op_with_struct(): h_data[i]["x"] = i h_data[i]["y"] = i * 2 - # Convert to device arrays using uint8 view - itemsize = h_data.dtype.itemsize - d_input = cp.empty(num_items * itemsize, dtype=np.uint8) - d_input.set(h_data.view(np.uint8)) - d_input = d_input.view(Point.dtype) - - d_output = cp.empty(itemsize, dtype=np.uint8) - d_output = d_output.view(Point.dtype) + d_input = DeviceArray.from_numpy(h_data) + d_output = DeviceArray.empty(1, Point.dtype) # Initial point (0, 0) h_init = Point(0, 0) @@ -359,7 +351,7 @@ def test_cpp_op_with_struct(): ) # Verify result - result = d_output.view(np.uint8).get().view(Point.dtype)[0] + result = d_output.copy_to_host()[0] expected_x = sum(range(num_items)) # 0+1+2+...+9 = 45 expected_y = sum(i * 2 for i in range(num_items)) # 0+2+4+...+18 = 90 @@ -383,13 +375,13 @@ def test_cpp_op_with_transform_iterator(): # Create input data num_items = 10 h_input = np.arange(num_items, dtype=np.int32) - d_input = cp.array(h_input) + d_input = DeviceArray.from_numpy(h_input) # Create transform iterator with RawOp transform_iter = TransformIterator(d_input, op, value_type=types.int32) # Use the transform iterator with reduce - d_output = cp.empty(1, dtype=np.int32) + d_output = DeviceArray.empty(1, np.int32) h_init = np.array(0, dtype=np.int32) # Sum the doubled values using built-in PLUS operator @@ -402,7 +394,7 @@ def test_cpp_op_with_transform_iterator(): ) # Verify result: sum of (0*2, 1*2, 2*2, ..., 9*2) = 2 * sum(0..9) = 2 * 45 = 90 - result = d_output.get()[0] + result = d_output.copy_to_host()[0] expected = 2 * np.sum(h_input) assert result == expected, f"Expected {expected}, got {result}" @@ -410,7 +402,7 @@ def test_cpp_op_with_transform_iterator(): def test_cpp_stateful_op_reduce_with_constant(): """Test stateful RawOp with a simple stateful reduce.""" # State: a single int32 constant value (10) on device - d_constant = cp.array([10], dtype=np.int32) + d_constant = DeviceArray.from_numpy(np.array([10], dtype=np.int32)) constant_ptr = d_constant.__cuda_array_interface__["data"][0] state_data = struct.pack("P", constant_ptr) state_alignment = np.dtype(np.intp).alignment @@ -435,8 +427,8 @@ def test_cpp_stateful_op_reduce_with_constant(): # Create test data num_items = 5 h_input = np.array([1, 2, 3, 4, 5], dtype=np.int32) - d_input = cp.array(h_input) - d_output = cp.empty(1, dtype=np.int32) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, np.int32) # Use the stateful op with reduce_into h_init = np.array(0, dtype=np.int32) @@ -445,7 +437,7 @@ def test_cpp_stateful_op_reduce_with_constant(): ) # Get result - result = d_output.get()[0] + result = d_output.copy_to_host()[0] # Each reduction adds 10, so we expect input sum + some multiple of 10 # The exact value depends on tree structure, but should be > sum(inputs) sum_inputs = np.sum(h_input) @@ -455,7 +447,7 @@ def test_cpp_stateful_op_reduce_with_constant(): def test_cpp_stateful_op_select_with_counter(): """Test stateful RawOp with select_if that atomically updates a counter.""" # Create a device counter initialized to 0 - d_counter = cp.zeros(1, dtype=np.int32) + d_counter = DeviceArray.from_numpy(np.zeros(1, dtype=np.int32)) # State: pointer to the counter counter_ptr = d_counter.__cuda_array_interface__["data"][0] @@ -497,11 +489,11 @@ def test_cpp_stateful_op_select_with_counter(): # Create test data: 0 to 19 num_items = 20 h_input = np.arange(num_items, dtype=np.int32) - d_input = cp.array(h_input) + d_input = DeviceArray.from_numpy(h_input) # Allocate output arrays - d_output = cp.empty(num_items, dtype=np.int32) - d_num_selected = cp.empty(1, dtype=np.int32) + d_output = DeviceArray.empty(num_items, np.int32) + d_num_selected = DeviceArray.empty(1, np.int32) # Run select cuda.compute.select( @@ -513,8 +505,8 @@ def test_cpp_stateful_op_select_with_counter(): ) # Get results - num_selected = d_num_selected.get()[0] - counter_value = d_counter.get()[0] + num_selected = d_num_selected.copy_to_host()[0] + counter_value = d_counter.copy_to_host()[0] # Verify: should have selected 10 even numbers (0, 2, 4, ..., 18) expected_count = 10 @@ -526,7 +518,7 @@ def test_cpp_stateful_op_select_with_counter(): ) # Verify the selected values are correct - selected_values = d_output.get()[:num_selected] + selected_values = d_output.copy_to_host()[:num_selected] expected_selected = np.arange(0, 20, 2, dtype=np.int32) assert np.array_equal(selected_values, expected_selected), ( "Selected values don't match" diff --git a/python/cuda_cccl/tests/compute/test_reduce.py b/python/cuda_cccl/tests/compute/test_reduce.py index df3f024f0ee..5af40f95ad7 100644 --- a/python/cuda_cccl/tests/compute/test_reduce.py +++ b/python/cuda_cccl/tests/compute/test_reduce.py @@ -5,11 +5,9 @@ import functools import random -import cupy as cp -import numba.cuda import numpy as np import pytest -from cupy.cuda import runtime +from _utils.device_array import DeviceArray import cuda.compute from cuda.compute import ( @@ -73,13 +71,13 @@ def add_op(a, b): def test_device_reduce(dtype, num_items, op): init_value = 42 h_init = np.array([init_value], dtype=dtype) - d_output = numba.cuda.device_array(1, dtype=dtype) + d_output = DeviceArray.empty(1, dtype) h_input = random_int(num_items, dtype) - d_input = numba.cuda.to_device(h_input) + d_input = DeviceArray.from_numpy(h_input) cuda.compute.reduce_into( - d_in=d_input, d_out=d_output, num_items=d_input.size, op=op, h_init=h_init + d_in=d_input, d_out=d_output, num_items=h_input.size, op=op, h_init=h_init ) h_output = d_output.copy_to_host() assert h_output[0] == pytest.approx( @@ -94,16 +92,16 @@ def test_device_reduce_with_lambda(): num_items = 1024 h_init = np.array([init_value], dtype=dtype) - d_output = numba.cuda.device_array(1, dtype=dtype) + d_output = DeviceArray.empty(1, dtype) h_input = random_int(num_items, dtype) - d_input = numba.cuda.to_device(h_input) + d_input = DeviceArray.from_numpy(h_input) # Use a lambda function directly as the reducer cuda.compute.reduce_into( d_in=d_input, d_out=d_output, - num_items=d_input.size, + num_items=h_input.size, op=lambda a, b: a + b, h_init=h_init, ) @@ -118,16 +116,16 @@ def test_device_reduce_with_lambda_variable(): num_items = 1024 h_init = np.array([init_value], dtype=dtype) - d_output = numba.cuda.device_array(1, dtype=dtype) + d_output = DeviceArray.empty(1, dtype) h_input = random_int(num_items, dtype) - d_input = numba.cuda.to_device(h_input) + d_input = DeviceArray.from_numpy(h_input) # Use a lambda function assigned to a variable as the reducer cuda.compute.reduce_into( d_in=d_input, d_out=d_output, - num_items=d_input.size, + num_items=h_input.size, op=add_op_lambda, h_init=h_init, ) @@ -137,13 +135,13 @@ def test_device_reduce_with_lambda_variable(): def test_complex_device_reduce(): h_init = np.array([40.0 + 2.0j], dtype=complex) - d_output = numba.cuda.device_array(1, dtype=complex) + d_output = DeviceArray.empty(1, complex) for num_items in [42, 420000]: real_imag = np.random.random((2, num_items)) h_input = real_imag[0] + 1j * real_imag[1] - d_input = numba.cuda.to_device(h_input) - assert d_input.size == num_items + d_input = DeviceArray.from_numpy(h_input) + assert h_input.size == num_items cuda.compute.reduce_into( d_in=d_input, d_out=d_output, num_items=num_items, op=add_op, h_init=h_init ) @@ -162,11 +160,11 @@ def _test_device_sum_with_iterator( if use_numpy_array: h_input = np.array(l_varr, dtype_inp) - d_input = numba.cuda.to_device(h_input) + d_input = DeviceArray.from_numpy(h_input) else: d_input = i_input - d_output = numba.cuda.device_array(1, dtype_out) # to store device sum + d_output = DeviceArray.empty(1, dtype_out) # to store device sum h_init = np.array([start_sum_with], dtype_out) @@ -216,7 +214,7 @@ def test_device_sum_cache_modified_input_it( l_varr = [rng.randrange(100) for _ in range(num_items)] dtype_inp = np.dtype(supported_value_type) dtype_out = dtype_inp - input_devarr = numba.cuda.to_device(np.array(l_varr, dtype=dtype_inp)) + input_devarr = DeviceArray.from_numpy(np.array(l_varr, dtype=dtype_inp)) i_input = CacheModifiedInputIterator(input_devarr, modifier="stream") _test_device_sum_with_iterator( l_varr, start_sum_with, i_input, dtype_inp, dtype_out, use_numpy_array @@ -316,7 +314,7 @@ def test_device_sum_map_mul_map_mul_count_it( ("int32", "int64"), ], ) -def test_device_sum_map_mul2_cp_array_it( +def test_device_sum_map_mul2_device_array_it( use_numpy_array, value_type_name_pair, num_items=3, start_sum_with=10 ): vtn_out, vtn_inp = value_type_name_pair @@ -324,7 +322,7 @@ def test_device_sum_map_mul2_cp_array_it( dtype_out = np.dtype(vtn_out) rng = random.Random(0) l_d_in = [rng.randrange(100) for _ in range(num_items)] - a_d_in = cp.array(l_d_in, dtype_inp) + a_d_in = DeviceArray.from_numpy(np.asarray(l_d_in, dtype=dtype_inp)) i_input = TransformIterator(a_d_in, mul2) l_varr = [mul2(v) for v in l_d_in] _test_device_sum_with_iterator( @@ -338,14 +336,14 @@ def sum_op(x, y): # inputs are device arrays reducer_1 = cuda.compute.make_reduce_into( - d_in=cp.zeros(3, dtype="int64"), - d_out=cp.zeros(1, dtype="int64"), + d_in=DeviceArray.empty(3, dtype="int64"), + d_out=DeviceArray.empty(1, dtype="int64"), op=sum_op, h_init=np.zeros(1, dtype="int64"), ) reducer_2 = cuda.compute.make_reduce_into( - d_in=cp.zeros(3, dtype="int64"), - d_out=cp.zeros(1, dtype="int64"), + d_in=DeviceArray.empty(3, dtype="int64"), + d_out=DeviceArray.empty(1, dtype="int64"), op=sum_op, h_init=np.zeros(1, dtype="int64"), ) @@ -353,14 +351,14 @@ def sum_op(x, y): # inputs are device arrays of different dtype: reducer_1 = cuda.compute.make_reduce_into( - d_in=cp.zeros(3, dtype="int64"), - d_out=cp.zeros(1, dtype="int64"), + d_in=DeviceArray.empty(3, dtype="int64"), + d_out=DeviceArray.empty(1, dtype="int64"), op=sum_op, h_init=np.zeros(1, dtype="int64"), ) reducer_2 = cuda.compute.make_reduce_into( - d_in=cp.zeros(3, dtype="int32"), - d_out=cp.zeros(1, dtype="int64"), + d_in=DeviceArray.empty(3, dtype="int32"), + d_out=DeviceArray.empty(1, dtype="int64"), op=sum_op, h_init=np.zeros(1, dtype="int64"), ) @@ -368,14 +366,14 @@ def sum_op(x, y): # outputs are of different dtype: reducer_1 = cuda.compute.make_reduce_into( - d_in=cp.zeros(3, dtype="int64"), - d_out=cp.zeros(1, dtype="int64"), + d_in=DeviceArray.empty(3, dtype="int64"), + d_out=DeviceArray.empty(1, dtype="int64"), op=sum_op, h_init=np.zeros(1, dtype="int64"), ) reducer_2 = cuda.compute.make_reduce_into( - d_in=cp.zeros(3, dtype="int64"), - d_out=cp.zeros(1, dtype="int32"), + d_in=DeviceArray.empty(3, dtype="int64"), + d_out=DeviceArray.empty(1, dtype="int32"), op=sum_op, h_init=np.zeros(1, dtype="int64"), ) @@ -384,14 +382,14 @@ def sum_op(x, y): # inputs are of same dtype but different size # (should still use cached reducer): reducer_1 = cuda.compute.make_reduce_into( - d_in=cp.zeros(3, dtype="int64"), - d_out=cp.zeros(1, dtype="int64"), + d_in=DeviceArray.empty(3, dtype="int64"), + d_out=DeviceArray.empty(1, dtype="int64"), op=sum_op, h_init=np.zeros(1, dtype="int64"), ) reducer_2 = cuda.compute.make_reduce_into( - d_in=cp.zeros(5, dtype="int64"), - d_out=cp.zeros(1, dtype="int64"), + d_in=DeviceArray.empty(5, dtype="int64"), + d_out=DeviceArray.empty(1, dtype="int64"), op=sum_op, h_init=np.zeros(1, dtype="int64"), ) @@ -401,13 +399,13 @@ def sum_op(x, y): # same value type: reducer_1 = cuda.compute.make_reduce_into( d_in=CountingIterator(np.int32(0)), - d_out=cp.zeros(1, dtype="int64"), + d_out=DeviceArray.empty(1, dtype="int64"), op=sum_op, h_init=np.zeros(1, dtype="int64"), ) reducer_2 = cuda.compute.make_reduce_into( d_in=CountingIterator(np.int32(0)), - d_out=cp.zeros(1, dtype="int64"), + d_out=DeviceArray.empty(1, dtype="int64"), op=sum_op, h_init=np.zeros(1, dtype="int64"), ) @@ -416,13 +414,13 @@ def sum_op(x, y): # inputs are counting iterators of different value type: reducer_1 = cuda.compute.make_reduce_into( d_in=CountingIterator(np.int32(0)), - d_out=cp.zeros(1, dtype="int64"), + d_out=DeviceArray.empty(1, dtype="int64"), op=sum_op, h_init=np.zeros(1, dtype="int64"), ) reducer_2 = cuda.compute.make_reduce_into( d_in=CountingIterator(np.int64(0)), - d_out=cp.zeros(1, dtype="int64"), + d_out=DeviceArray.empty(1, dtype="int64"), op=sum_op, h_init=np.zeros(1, dtype="int64"), ) @@ -440,13 +438,13 @@ def op3(x): # inputs are TransformIterators reducer_1 = cuda.compute.make_reduce_into( d_in=TransformIterator(CountingIterator(np.int32(0)), op1), - d_out=cp.zeros(1, dtype="int64"), + d_out=DeviceArray.empty(1, dtype="int64"), op=sum_op, h_init=np.zeros(1, dtype="int64"), ) reducer_2 = cuda.compute.make_reduce_into( d_in=TransformIterator(CountingIterator(np.int32(0)), op1), - d_out=cp.zeros(1, dtype="int64"), + d_out=DeviceArray.empty(1, dtype="int64"), op=sum_op, h_init=np.zeros(1, dtype="int64"), ) @@ -456,13 +454,13 @@ def op3(x): # op: reducer_1 = cuda.compute.make_reduce_into( d_in=TransformIterator(CountingIterator(np.int32(0)), op1), - d_out=cp.zeros(1, dtype="int64"), + d_out=DeviceArray.empty(1, dtype="int64"), op=sum_op, h_init=np.zeros(1, dtype="int64"), ) reducer_2 = cuda.compute.make_reduce_into( d_in=TransformIterator(CountingIterator(np.int32(0)), op2), - d_out=cp.zeros(1, dtype="int64"), + d_out=DeviceArray.empty(1, dtype="int64"), op=sum_op, h_init=np.zeros(1, dtype="int64"), ) @@ -472,13 +470,13 @@ def op3(x): # but different name: reducer_1 = cuda.compute.make_reduce_into( d_in=TransformIterator(CountingIterator(np.int32(0)), op1), - d_out=cp.zeros(1, dtype="int64"), + d_out=DeviceArray.empty(1, dtype="int64"), op=sum_op, h_init=np.zeros(1, dtype="int64"), ) reducer_2 = cuda.compute.make_reduce_into( d_in=TransformIterator(CountingIterator(np.int32(0)), op3), - d_out=cp.zeros(1, dtype="int64"), + d_out=DeviceArray.empty(1, dtype="int64"), op=sum_op, h_init=np.zeros(1, dtype="int64"), ) @@ -487,13 +485,13 @@ def op3(x): # but different state: reducer_1 = cuda.compute.make_reduce_into( d_in=CountingIterator(np.int32(0)), - d_out=cp.zeros(1, dtype="int64"), + d_out=DeviceArray.empty(1, dtype="int64"), op=sum_op, h_init=np.zeros(1, dtype="int64"), ) reducer_2 = cuda.compute.make_reduce_into( d_in=CountingIterator(np.int32(1)), - d_out=cp.zeros(1, dtype="int64"), + d_out=DeviceArray.empty(1, dtype="int64"), op=sum_op, h_init=np.zeros(1, dtype="int64"), ) @@ -502,17 +500,17 @@ def op3(x): # inputs are TransformIterators of same kind # but different state: - ary1 = cp.asarray([0, 1, 2], dtype="int64") - ary2 = cp.asarray([0, 1], dtype="int64") + ary1 = DeviceArray.from_numpy(np.asarray([0, 1, 2], dtype="int64")) + ary2 = DeviceArray.from_numpy(np.asarray([0, 1], dtype="int64")) reducer_1 = cuda.compute.make_reduce_into( d_in=TransformIterator(ary1, op1), - d_out=cp.zeros(1, dtype="int64"), + d_out=DeviceArray.empty(1, dtype="int64"), op=sum_op, h_init=np.zeros(1, dtype="int64"), ) reducer_2 = cuda.compute.make_reduce_into( d_in=TransformIterator(ary2, op1), - d_out=cp.zeros(1, dtype="int64"), + d_out=DeviceArray.empty(1, dtype="int64"), op=sum_op, h_init=np.zeros(1, dtype="int64"), ) @@ -522,13 +520,13 @@ def op3(x): # but different state: reducer_1 = cuda.compute.make_reduce_into( d_in=TransformIterator(CountingIterator(np.int32(0)), op1), - d_out=cp.zeros(1, dtype="int64"), + d_out=DeviceArray.empty(1, dtype="int64"), op=sum_op, h_init=np.zeros(1, dtype="int64"), ) reducer_2 = cuda.compute.make_reduce_into( d_in=TransformIterator(CountingIterator(np.int32(1)), op1), - d_out=cp.zeros(1, dtype="int64"), + d_out=DeviceArray.empty(1, dtype="int64"), op=sum_op, h_init=np.zeros(1, dtype="int64"), ) @@ -537,13 +535,13 @@ def op3(x): # inputs are TransformIterators with different kind: reducer_1 = cuda.compute.make_reduce_into( d_in=TransformIterator(CountingIterator(np.int32(0)), op1), - d_out=cp.zeros(1, dtype="int64"), + d_out=DeviceArray.empty(1, dtype="int64"), op=sum_op, h_init=np.zeros(1, dtype="int64"), ) reducer_2 = cuda.compute.make_reduce_into( d_in=TransformIterator(CountingIterator(np.int64(0)), op1), - d_out=cp.zeros(1, dtype="int64"), + d_out=DeviceArray.empty(1, dtype="int64"), op=sum_op, h_init=np.zeros(1, dtype="int64"), ) @@ -553,27 +551,25 @@ def op3(x): @pytest.fixture(params=[True, False]) def array_2d(request): f_contiguous = request.param - arr = cp.random.rand(5, 10) - if f_contiguous: - try: - return cp.asfortranarray(arr) - except ImportError: # cublas unavailable - return arr - else: - return arr + array = np.random.rand(5, 10) + return np.asfortranarray(array) if f_contiguous else array def test_reduce_2d_array(array_2d): def binary_op(x, y): return x + y - d_out = cp.empty(1, dtype=array_2d.dtype) + d_in = DeviceArray.from_numpy(array_2d) + d_out = DeviceArray.empty(1, dtype=array_2d.dtype) h_init = np.asarray([0], dtype=array_2d.dtype) - d_in = array_2d cuda.compute.reduce_into( - d_in=d_in, d_out=d_out, num_items=d_in.size, op=binary_op, h_init=h_init + d_in=d_in, + d_out=d_out, + num_items=array_2d.size, + op=binary_op, + h_init=h_init, ) - np.testing.assert_allclose(d_in.sum().get(), d_out.get()) + np.testing.assert_allclose(array_2d.sum(), d_out.copy_to_host()) def test_reduce_non_contiguous(): @@ -581,16 +577,28 @@ def binary_op(x, y): return x + y size = 10 - d_out = cp.empty(1, dtype="int64") + + class DeviceArrayView: + def __init__(self, base, host_view): + self._base = base + self.__cuda_array_interface__ = { + **base.__cuda_array_interface__, + "shape": host_view.shape, + "strides": host_view.strides, + } + + d_out = DeviceArray.empty(1, dtype="int64") h_init = np.asarray([0], dtype="int64") - d_in = cp.zeros((size, 2))[:, 0] + h_base = np.zeros((size, 2)) + d_in = DeviceArrayView(DeviceArray.from_numpy(h_base), h_base[:, 0]) with pytest.raises(ValueError, match="Non-contiguous arrays are not supported."): _ = cuda.compute.make_reduce_into( d_in=d_in, d_out=d_out, op=binary_op, h_init=h_init ) - d_in = cp.zeros(size)[::2] + h_base = np.zeros(size) + d_in = DeviceArrayView(DeviceArray.from_numpy(h_base), h_base[::2]) with pytest.raises(ValueError, match="Non-contiguous arrays are not supported."): _ = cuda.compute.make_reduce_into( d_in=d_in, d_out=d_out, op=binary_op, h_init=h_init @@ -604,21 +612,18 @@ def add_op(x, y): h_init = np.asarray([0], dtype=np.int32) h_in = random_int(5, np.int32) - cp_stream = cp.cuda.ExternalStream(cuda_stream.ptr) - with cp_stream: - d_in = cp.asarray(h_in) - d_out = cp.empty(1, dtype=np.int32) + d_in = DeviceArray.from_numpy(h_in, stream=cuda_stream) + d_out = DeviceArray.empty(1, np.int32, stream=cuda_stream) cuda.compute.reduce_into( d_in=d_in, d_out=d_out, - num_items=d_in.size, + num_items=h_in.size, op=add_op, h_init=h_init, stream=cuda_stream, ) - with cp_stream: - cp.testing.assert_allclose(d_in.sum().get(), d_out.get()) + np.testing.assert_allclose(h_in.sum(), d_out.copy_to_host(stream=cuda_stream)) def test_reduce_invalid_stream(): @@ -646,9 +651,9 @@ def __cuda_stream__(self): def add_op(x, y): return x + y - d_out = cp.empty(1) + d_out = DeviceArray.empty(1, np.float64) h_init = np.empty(1) - d_in = cp.empty(1) + d_in = DeviceArray.empty(1, np.float64) reduce_into = cuda.compute.make_reduce_into( d_in=d_in, d_out=d_out, op=add_op, h_init=h_init ) @@ -661,7 +666,7 @@ def add_op(x, y): d_in=d_in, d_out=d_out, op=add_op, - num_items=d_in.size, + num_items=1, h_init=h_init, stream=Stream1(), ) @@ -674,7 +679,7 @@ def add_op(x, y): d_in=d_in, d_out=d_out, op=add_op, - num_items=d_in.size, + num_items=1, h_init=h_init, stream=Stream2(), ) @@ -685,7 +690,7 @@ def add_op(x, y): d_in=d_in, d_out=d_out, op=add_op, - num_items=d_in.size, + num_items=1, h_init=h_init, stream=Stream3(), ) @@ -694,55 +699,58 @@ def add_op(x, y): def test_device_reduce_well_known_plus(): dtype = np.int32 h_init = np.array([0], dtype=dtype) - d_input = cp.array([1, 2, 3, 4, 5], dtype=dtype) - d_output = cp.empty(1, dtype=dtype) + h_input = np.array([1, 2, 3, 4, 5], dtype=dtype) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, dtype=dtype) cuda.compute.reduce_into( d_in=d_input, d_out=d_output, - num_items=len(d_input), + num_items=len(h_input), op=OpKind.PLUS, h_init=h_init, ) expected_output = 15 - assert (d_output == expected_output).all() + assert d_output.copy_to_host()[0] == expected_output def test_device_reduce_well_known_minimum(): dtype = np.int32 h_init = np.array([100], dtype=dtype) - d_input = cp.array([8, 6, 7, 5, 3, 0, 9], dtype=dtype) - d_output = cp.empty(1, dtype=dtype) + h_input = np.array([8, 6, 7, 5, 3, 0, 9], dtype=dtype) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, dtype=dtype) cuda.compute.reduce_into( d_in=d_input, d_out=d_output, - num_items=len(d_input), + num_items=len(h_input), op=OpKind.MINIMUM, h_init=h_init, ) expected_output = 0 - assert (d_output == expected_output).all() + assert d_output.copy_to_host()[0] == expected_output def test_device_reduce_well_known_maximum(): dtype = np.int32 h_init = np.array([-100], dtype=dtype) - d_input = cp.array([8, 6, 7, 5, 3, 0, 9], dtype=dtype) - d_output = cp.empty(1, dtype=dtype) + h_input = np.array([8, 6, 7, 5, 3, 0, 9], dtype=dtype) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, dtype=dtype) cuda.compute.reduce_into( d_in=d_input, d_out=d_output, - num_items=len(d_input), + num_items=len(h_input), op=OpKind.MAXIMUM, h_init=h_init, ) expected_output = 9 - assert (d_output == expected_output).all() + assert d_output.copy_to_host()[0] == expected_output def test_cache_modified_input_iterator(): @@ -750,19 +758,18 @@ def add_op(a, b): return a + b values = [8, 6, 7, 5, 3, 0, 9] - d_input = cp.array(values, dtype=np.int32) - d_output = cp.empty(1, dtype=np.int32) + d_input = DeviceArray.from_numpy(np.asarray(values, dtype=np.int32)) iterator = CacheModifiedInputIterator(d_input, modifier="stream") h_init = np.array([0], dtype=np.int32) - d_output = cp.empty(1, dtype=np.int32) + d_output = DeviceArray.empty(1, dtype=np.int32) cuda.compute.reduce_into( d_in=iterator, d_out=d_output, num_items=len(values), op=add_op, h_init=h_init ) expected_output = functools.reduce(lambda a, b: a + b, values) - assert (d_output == expected_output).all() + assert d_output.copy_to_host()[0] == expected_output def test_constant_iterator(): @@ -774,14 +781,14 @@ def add_op(a, b): constant_it = ConstantIterator(np.int32(value)) h_init = np.array([0], dtype=np.int32) - d_output = cp.empty(1, dtype=np.int32) + d_output = DeviceArray.empty(1, dtype=np.int32) cuda.compute.reduce_into( d_in=constant_it, d_out=d_output, num_items=num_items, op=add_op, h_init=h_init ) expected_output = functools.reduce(lambda a, b: a + b, [value] * num_items) - assert (d_output == expected_output).all() + assert d_output.copy_to_host()[0] == expected_output def test_counting_iterator(): @@ -793,7 +800,7 @@ def add_op(a, b): first_it = CountingIterator(np.int32(first_item)) # Input sequence h_init = np.array([0], dtype=np.int32) # Initial value for the reduction - d_output = cp.empty(1, dtype=np.int32) # Storage for output + d_output = DeviceArray.empty(1, dtype=np.int32) # Storage for output cuda.compute.reduce_into( d_in=first_it, d_out=d_output, num_items=num_items, op=add_op, h_init=h_init @@ -802,7 +809,7 @@ def add_op(a, b): expected_output = functools.reduce( lambda a, b: a + b, range(first_item, first_item + num_items) ) - assert (d_output == expected_output).all() + assert d_output.copy_to_host()[0] == expected_output def test_transform_iterator(): @@ -817,7 +824,7 @@ def square_op(a): transform_it = TransformIterator(CountingIterator(np.int32(first_item)), square_op) h_init = np.array([0], dtype=np.int32) - d_output = cp.empty(1, dtype=np.int32) + d_output = DeviceArray.empty(1, dtype=np.int32) cuda.compute.reduce_into( d_in=transform_it, d_out=d_output, num_items=num_items, op=add_op, h_init=h_init @@ -826,7 +833,7 @@ def square_op(a): expected_output = functools.reduce( lambda a, b: a + b, [a**2 for a in range(first_item, first_item + num_items)] ) - assert (d_output == expected_output).all() + assert d_output.copy_to_host()[0] == expected_output def test_reduce_struct_type(): @@ -839,19 +846,19 @@ class Pixel: def max_g_value(x, y): return x if x.g > y.g else y - d_rgb = cp.random.randint(0, 256, (10, 3), dtype=np.int32).view(Pixel.dtype) - d_out = cp.empty(1, Pixel.dtype) + h_rgb = np.random.randint(0, 256, (10, 3), dtype=np.int32).view(Pixel.dtype) + d_rgb = DeviceArray.from_numpy(h_rgb) + d_out = DeviceArray.empty(1, Pixel.dtype) h_init = Pixel(0, 0, 0) cuda.compute.reduce_into( - d_in=d_rgb, d_out=d_out, num_items=d_rgb.size, op=max_g_value, h_init=h_init + d_in=d_rgb, d_out=d_out, num_items=h_rgb.size, op=max_g_value, h_init=h_init ) - h_rgb = d_rgb.get() expected = h_rgb[h_rgb.view("int32")[:, 1].argmax()] - np.testing.assert_equal(expected["g"], d_out.get()["g"]) + np.testing.assert_equal(expected["g"], d_out.copy_to_host()["g"]) @pytest.mark.no_verify_sass(reason="LDL/STL instructions emitted for this test.") @@ -872,14 +879,15 @@ def transform_op(v): nelems = 4096 - d_in = cp.random.randn(nelems) + h_in = np.random.randn(nelems) + d_in = DeviceArray.from_numpy(h_in) # input values must be transformed to MinMax structures # in-place to map computation to data-parallel reduction # algorithm that requires commutative binary operation # with both operands having the same type. tr_it = TransformIterator(d_in, transform_op) - d_out = cp.empty(tuple(), dtype=MinMax.dtype) + d_out = DeviceArray.empty(tuple(), dtype=MinMax.dtype) # initial value set with identity elements of # minimum and maximum operators @@ -891,9 +899,9 @@ def transform_op(v): ) # display values computed on the device - actual = d_out.get() + actual = d_out.copy_to_host() - h = np.abs(d_in.get()) + h = np.abs(h_in) expected = np.asarray([(h.min(), h.max())], dtype=MinMax.dtype) assert actual == expected @@ -905,8 +913,8 @@ def test_reduce_transform_output_iterator(floating_array): h_init = np.array([0], dtype=dtype) # Use the floating_array fixture which provides random floating-point data of size 1000 - d_input = floating_array - d_output = cp.empty(1, dtype=dtype) + d_input = DeviceArray.from_numpy(floating_array) + d_output = DeviceArray.empty(1, dtype=dtype) def sqrt(x: dtype) -> dtype: return x**0.5 @@ -916,26 +924,26 @@ def sqrt(x: dtype) -> dtype: cuda.compute.reduce_into( d_in=d_input, d_out=d_out_it, - num_items=len(d_input), + num_items=len(floating_array), op=OpKind.PLUS, h_init=h_init, ) - expected = cp.sqrt(cp.sum(d_input)) - np.testing.assert_allclose(d_output.get(), expected.get(), atol=1e-6) + expected = np.sqrt(np.sum(floating_array)) + np.testing.assert_allclose(d_output.copy_to_host(), expected, atol=1e-6) def test_reduce_with_not_guaranteed_determinism(floating_array): dtype = floating_array.dtype h_init = np.array([0], dtype=dtype) - d_input = floating_array - d_output = cp.empty(1, dtype=dtype) + d_input = DeviceArray.from_numpy(floating_array) + d_output = DeviceArray.empty(1, dtype=dtype) cuda.compute.reduce_into( d_in=d_input, d_out=d_output, - num_items=len(d_input), + num_items=len(floating_array), op=OpKind.PLUS, h_init=h_init, determinism=Determinism.NOT_GUARANTEED, @@ -944,20 +952,21 @@ def test_reduce_with_not_guaranteed_determinism(floating_array): def test_reduce_bool(): h_init = np.array([False]) - d_input = cp.array([True, False, True]) - d_output = cp.empty_like(d_input, shape=(1,)) + h_input = np.array([True, False, True]) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, h_input.dtype) # Perform the reduction. cuda.compute.reduce_into( d_in=d_input, d_out=d_output, - num_items=len(d_input), + num_items=len(h_input), op=OpKind.MAXIMUM, h_init=h_init, ) expected = True - assert d_output.get()[0] == expected + assert d_output.copy_to_host()[0] == expected def test_reduce_input_and_accumulator_type_mismatch(): @@ -969,29 +978,16 @@ class AccumulatorType: def op(foo1: AccumulatorType, foo2: AccumulatorType): return AccumulatorType(foo1.x + foo2.x, foo1.y + foo2.y) - def to_cupy_record(h_array): - # a helper function to copy a numpy array of record type - # into a cupy array. The cupy `asarray` function doesn't - # work for record types. - d_array = cp.empty(h_array.nbytes, dtype=np.uint8) - runtime.memcpy( - d_array.data.ptr, - h_array.ctypes.data, - h_array.nbytes, - runtime.memcpyHostToDevice, - ) - return d_array.view(h_array.dtype).reshape(h_array.shape) - # input data is {int32, int64} dtype = np.dtype([("x", np.int32), ("y", np.int64)], align=True) h_data = np.asarray([(1, 2), (3, 4), (5, 6)], dtype=dtype) - d_data = to_cupy_record(h_data) + d_data = DeviceArray.from_numpy(h_data) # output and h_init, both are AccumulatorType - d_out = cp.empty(1, AccumulatorType.dtype) + d_out = DeviceArray.empty(1, AccumulatorType.dtype) h_init = AccumulatorType(0, 0) # Init is AccumulatorType with pytest.raises(TypeError, match="reduce_into dtype mismatch: input dtype"): cuda.compute.reduce_into( - d_in=d_data, d_out=d_out, op=op, num_items=d_data.size, h_init=h_init + d_in=d_data, d_out=d_out, op=op, num_items=h_data.size, h_init=h_init ) diff --git a/python/cuda_cccl/tests/compute/test_scan.py b/python/cuda_cccl/tests/compute/test_scan.py index af6d941b11b..d2f2a8ad14a 100644 --- a/python/cuda_cccl/tests/compute/test_scan.py +++ b/python/cuda_cccl/tests/compute/test_scan.py @@ -3,10 +3,9 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import cupy as cp -import numba.cuda import numpy as np import pytest +from _utils.device_array import DeviceArray, get_compute_capability import cuda.compute from cuda.compute import ( @@ -52,7 +51,7 @@ def scan_device(d_input, d_output, num_items, op, h_init, force_inclusive, strea [True, False], ) def test_scan_array_input(force_inclusive, input_array, monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = get_compute_capability() # Skip sass verification if input is complex # as LDL/STL instructions are emitted for complex types. # Also skip for: @@ -88,15 +87,16 @@ def op(a, b): is_short_dtype = dtype.itemsize < 16 # for small range data types make input small to assure that # accumulation does not overflow - d_input = input_array[:31] if is_short_dtype else input_array + h_input = input_array[:31] if is_short_dtype else input_array + d_input = DeviceArray.from_numpy(h_input) h_init = np.array([42], dtype=dtype) - d_output = cp.empty_like(d_input) + d_output = DeviceArray.empty(h_input.shape, h_input.dtype) - scan_device(d_input, d_output, len(d_input), reduce_op, h_init, force_inclusive) + scan_device(d_input, d_output, h_input.size, reduce_op, h_init, force_inclusive) - got = d_output.get() - expected = scan_host(d_input.get(), op, h_init, force_inclusive) + got = d_output.copy_to_host() + expected = scan_host(h_input, op, h_init, force_inclusive) if np.isdtype(dtype, ("real floating", "complex floating")): real_dt = np.finfo(dtype).dtype @@ -119,11 +119,11 @@ def op(a, b): num_items = 1024 dtype = np.dtype("int32") h_init = np.array([42], dtype=dtype) - d_output = cp.empty(num_items, dtype=dtype) + d_output = DeviceArray.empty(num_items, dtype) scan_device(d_input, d_output, num_items, op, h_init, force_inclusive) - got = d_output.get() + got = d_output.copy_to_host() expected = scan_host( np.arange(1, num_items + 1, dtype=dtype), op, h_init, force_inclusive ) @@ -143,11 +143,11 @@ def op(a, b): d_input = ReverseIterator(CountingIterator(np.int32(num_items))) dtype = np.dtype("int32") h_init = np.array([0], dtype=dtype) - d_output = cp.empty(num_items, dtype=dtype) + d_output = DeviceArray.empty(num_items, dtype) scan_device(d_input, d_output, num_items, op, h_init, force_inclusive) - got = d_output.get() + got = d_output.copy_to_host() expected = scan_host( np.arange(num_items, 0, -1, dtype=dtype), op, h_init, force_inclusive ) @@ -169,19 +169,20 @@ class XY: def op(a, b): return XY(a.x + b.x, a.y + b.y) - d_input = cp.random.randint(0, 256, (10, 2), dtype=np.int32).view(XY.dtype) - d_output = cp.empty_like(d_input) + h_input = np.random.randint(0, 256, (10, 2), dtype=np.int32).view(XY.dtype) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(h_input.shape, h_input.dtype) h_init = XY(0, 0) - scan_device(d_input, d_output, len(d_input), op, h_init, force_inclusive) + scan_device(d_input, d_output, len(h_input), op, h_init, force_inclusive) - got = d_output.get() + got = d_output.copy_to_host() expected_x = scan_host( - d_input.get()["x"], lambda a, b: a + b, np.asarray([h_init.x]), force_inclusive + h_input["x"], lambda a, b: a + b, np.asarray([h_init.x]), force_inclusive ) expected_y = scan_host( - d_input.get()["y"], lambda a, b: a + b, np.asarray([h_init.y]), force_inclusive + h_input["y"], lambda a, b: a + b, np.asarray([h_init.y]), force_inclusive ) np.testing.assert_allclose(expected_x, got["x"], rtol=1e-5) @@ -196,20 +197,18 @@ def test_scan_with_stream(force_inclusive, cuda_stream): def op(a, b): return a + b - cp_stream = cp.cuda.ExternalStream(cuda_stream.ptr) - - with cp_stream: - d_input = cp.random.randint(0, 256, 1024, dtype=np.int32) - d_output = cp.empty_like(d_input) + h_input = np.random.randint(0, 256, 1024, dtype=np.int32) + d_input = DeviceArray.from_numpy(h_input, stream=cuda_stream) + d_output = DeviceArray.empty(h_input.shape, h_input.dtype, stream=cuda_stream) h_init = np.array([42], dtype=np.int32) scan_device( - d_input, d_output, len(d_input), op, h_init, force_inclusive, stream=cuda_stream + d_input, d_output, h_input.size, op, h_init, force_inclusive, stream=cuda_stream ) - got = d_output.get() - expected = scan_host(d_input.get(), op, h_init, force_inclusive) + got = d_output.copy_to_host(stream=cuda_stream) + expected = scan_host(h_input, op, h_init, force_inclusive) np.testing.assert_allclose(expected, got, rtol=1e-5) @@ -217,23 +216,24 @@ def op(a, b): def test_exclusive_scan_well_known_plus(): dtype = np.int32 h_init = np.array([0], dtype=dtype) - d_input = cp.array([1, 2, 3, 4, 5], dtype=dtype) - d_output = cp.empty_like(d_input, dtype=dtype) + h_input = np.array([1, 2, 3, 4, 5], dtype=dtype) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(h_input.shape, dtype) cuda.compute.exclusive_scan( d_in=d_input, d_out=d_output, op=OpKind.PLUS, init_value=h_init, - num_items=d_input.size, + num_items=h_input.size, ) expected = np.array([0, 1, 3, 6, 10]) - np.testing.assert_equal(d_output.get(), expected) + np.testing.assert_equal(d_output.copy_to_host(), expected) def test_inclusive_scan_well_known_plus(monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = get_compute_capability() # Skip SASS check for CC 9.0+, due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -247,19 +247,20 @@ def test_inclusive_scan_well_known_plus(monkeypatch): dtype = np.int32 h_init = np.array([0], dtype=dtype) - d_input = cp.array([1, 2, 3, 4, 5], dtype=dtype) - d_output = cp.empty_like(d_input, dtype=dtype) + h_input = np.array([1, 2, 3, 4, 5], dtype=dtype) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(h_input.shape, dtype) cuda.compute.inclusive_scan( d_in=d_input, d_out=d_output, op=OpKind.PLUS, init_value=h_init, - num_items=d_input.size, + num_items=h_input.size, ) expected = np.array([1, 3, 6, 10, 15]) - np.testing.assert_equal(d_output.get(), expected) + np.testing.assert_equal(d_output.copy_to_host(), expected) @pytest.mark.xfail( @@ -268,19 +269,20 @@ def test_inclusive_scan_well_known_plus(monkeypatch): def test_exclusive_scan_well_known_maximum(): dtype = np.int32 h_init = np.array([1], dtype=dtype) - d_input = cp.array([-5, 0, 2, -3, 2, 4, 0, -1, 2, 8], dtype=dtype) - d_output = cp.empty_like(d_input, dtype=dtype) + h_input = np.array([-5, 0, 2, -3, 2, 4, 0, -1, 2, 8], dtype=dtype) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(h_input.shape, dtype) cuda.compute.exclusive_scan( d_in=d_input, d_out=d_output, op=OpKind.MAXIMUM, init_value=h_init, - num_items=d_input.size, + num_items=h_input.size, ) expected = np.array([1, 1, 1, 2, 2, 2, 4, 4, 4, 4]) - np.testing.assert_equal(d_output.get(), expected) + np.testing.assert_equal(d_output.copy_to_host(), expected) def test_scan_transform_output_iterator(floating_array): @@ -289,8 +291,9 @@ def test_scan_transform_output_iterator(floating_array): h_init = np.array([0], dtype=dtype) # Use the floating_array fixture which provides random floating-point data of size 1000 - d_input = floating_array - d_output = cp.empty_like(d_input, dtype=dtype) + h_input = floating_array + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(h_input.shape, dtype) def square(x: dtype) -> dtype: return x * x @@ -302,15 +305,17 @@ def square(x: dtype) -> dtype: d_out=d_out_it, op=OpKind.PLUS, init_value=h_init, - num_items=d_input.size, + num_items=h_input.size, ) - expected = cp.cumsum(d_input) ** 2 + expected = np.cumsum(h_input) ** 2 # Use more lenient tolerance for float32 due to precision differences if dtype == np.float32: - np.testing.assert_allclose(d_output.get(), expected.get(), atol=1e-4, rtol=1e-4) + np.testing.assert_allclose( + d_output.copy_to_host(), expected, atol=1e-4, rtol=1e-4 + ) else: - np.testing.assert_allclose(d_output.get(), expected.get(), atol=1e-6) + np.testing.assert_allclose(d_output.copy_to_host(), expected, atol=1e-6) def test_exclusive_scan_max(): @@ -318,19 +323,20 @@ def max_op(a, b): return max(a, b) h_init = np.array([1], dtype="int32") - d_input = cp.array([-5, 0, 2, -3, 2, 4, 0, -1, 2, 8], dtype="int32") - d_output = cp.empty_like(d_input, dtype="int32") + h_input = np.array([-5, 0, 2, -3, 2, 4, 0, -1, 2, 8], dtype=np.int32) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(h_input.shape, h_input.dtype) cuda.compute.exclusive_scan( d_in=d_input, d_out=d_output, op=max_op, init_value=h_init, - num_items=d_input.size, + num_items=h_input.size, ) expected = np.asarray([1, 1, 1, 2, 2, 2, 4, 4, 4, 4]) - np.testing.assert_equal(d_output.get(), expected) + np.testing.assert_equal(d_output.copy_to_host(), expected) def test_inclusive_scan_add(): @@ -338,23 +344,24 @@ def add_op(a, b): return a + b h_init = np.array([0], dtype="int32") - d_input = cp.array([-5, 0, 2, -3, 2, 4, 0, -1, 2, 8], dtype="int32") - d_output = cp.empty_like(d_input, dtype="int32") + h_input = np.array([-5, 0, 2, -3, 2, 4, 0, -1, 2, 8], dtype=np.int32) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(h_input.shape, h_input.dtype) cuda.compute.inclusive_scan( d_in=d_input, d_out=d_output, op=add_op, init_value=h_init, - num_items=d_input.size, + num_items=h_input.size, ) expected = np.asarray([-5, -5, -3, -6, -4, 0, 0, -1, 1, 9]) - np.testing.assert_equal(d_output.get(), expected) + np.testing.assert_equal(d_output.copy_to_host(), expected) def test_reverse_input_iterator(monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = get_compute_capability() # Skip SASS check for CC 9.0+, due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -370,8 +377,9 @@ def add_op(a, b): return a + b h_init = np.array([0], dtype="int32") - d_input = cp.array([-5, 0, 2, -3, 2, 4, 0, -1, 2, 8], dtype="int32") - d_output = cp.empty_like(d_input, dtype="int32") + h_input = np.array([-5, 0, 2, -3, 2, 4, 0, -1, 2, 8], dtype=np.int32) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(h_input.shape, h_input.dtype) reverse_it = ReverseIterator(d_input) cuda.compute.inclusive_scan( @@ -379,12 +387,12 @@ def add_op(a, b): d_out=d_output, op=add_op, init_value=h_init, - num_items=len(d_input), + num_items=h_input.size, ) # Check the result is correct expected = np.asarray([8, 10, 9, 9, 13, 15, 12, 14, 14, 9]) - np.testing.assert_equal(d_output.get(), expected) + np.testing.assert_equal(d_output.copy_to_host(), expected) @pytest.mark.no_verify_sass(reason="LDL/STL instructions emitted for this test.") @@ -393,8 +401,9 @@ def add_op(a, b): return a + b h_init = np.array([0], dtype="int32") - d_input = cp.array([-5, 0, 2, -3, 2, 4, 0, -1, 2, 8], dtype="int32") - d_output = cp.empty_like(d_input, dtype="int32") + h_input = np.array([-5, 0, 2, -3, 2, 4, 0, -1, 2, 8], dtype=np.int32) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(h_input.shape, h_input.dtype) reverse_it = ReverseIterator(d_output) cuda.compute.inclusive_scan( @@ -402,11 +411,11 @@ def add_op(a, b): d_out=reverse_it, op=add_op, init_value=h_init, - num_items=len(d_input), + num_items=h_input.size, ) expected = np.asarray([9, 1, -1, 0, 0, -4, -6, -3, -5, -5]) - np.testing.assert_equal(d_output.get(), expected) + np.testing.assert_equal(d_output.copy_to_host(), expected) @pytest.mark.parametrize( @@ -417,16 +426,16 @@ def test_future_init_value(force_inclusive): num_items = 1024 dtype = np.dtype("int32") - d_input = cp.random.randint(0, 256, num_items, dtype=dtype) - d_output = cp.empty_like(d_input) - init_value = cp.array([42], dtype=dtype) + h_input = np.random.randint(0, 256, num_items, dtype=dtype) + h_init = np.array([42], dtype=dtype) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(h_input.shape, h_input.dtype) + init_value = DeviceArray.from_numpy(h_init) scan_device(d_input, d_output, num_items, OpKind.PLUS, init_value, force_inclusive) - got = d_output.get() - expected = scan_host( - d_input.get(), lambda a, b: a + b, init_value.get(), force_inclusive - ) + got = d_output.copy_to_host() + expected = scan_host(h_input, lambda a, b: a + b, h_init, force_inclusive) np.testing.assert_array_equal(expected, got) @@ -436,7 +445,7 @@ def test_no_init_value(monkeypatch): dtype = np.dtype("int32") # Skip SASS check for CC 9.0 due to LDL/STL CI failure. - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = get_compute_capability() if cc_major >= 9: import cuda.compute._cccl_interop @@ -446,13 +455,14 @@ def test_no_init_value(monkeypatch): False, ) - d_input = cp.random.randint(0, 256, num_items, dtype=dtype) - d_output = cp.empty_like(d_input) + h_input = np.random.randint(0, 256, num_items, dtype=dtype) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(h_input.shape, h_input.dtype) scan_device(d_input, d_output, num_items, OpKind.PLUS, None, force_inclusive) - got = d_output.get() - expected = scan_host(d_input.get(), lambda a, b: a + b, [0], force_inclusive) + got = d_output.copy_to_host() + expected = scan_host(h_input, lambda a, b: a + b, [0], force_inclusive) np.testing.assert_array_equal(expected, got) @@ -462,11 +472,11 @@ def test_no_init_value_iterator(): dtype = np.dtype("float64") d_input = CountingIterator(np.float64(0)) - d_output = cp.empty(num_items, dtype=dtype) + d_output = DeviceArray.empty(num_items, dtype) scan_device(d_input, d_output, num_items, OpKind.PLUS, None, force_inclusive) - got = d_output.get() + got = d_output.copy_to_host() expected = scan_host( np.arange(0, num_items, dtype=dtype), lambda a, b: a + b, [0], force_inclusive ) @@ -477,8 +487,9 @@ def test_no_init_value_iterator(): def test_inclusive_scan_with_lambda(): """Test inclusive_scan with a lambda function as the scan operator.""" h_init = np.array([0], dtype=np.int32) - d_input = cp.array([1, 2, 3, 4, 5], dtype=np.int32) - d_output = cp.empty_like(d_input) + h_input = np.array([1, 2, 3, 4, 5], dtype=np.int32) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(h_input.shape, h_input.dtype) # Use a lambda function directly as the scan operator cuda.compute.inclusive_scan( @@ -486,21 +497,22 @@ def test_inclusive_scan_with_lambda(): d_out=d_output, op=lambda a, b: a + b, init_value=h_init, - num_items=len(d_input), + num_items=h_input.size, ) expected = np.array([1, 3, 6, 10, 15], dtype=np.int32) - np.testing.assert_array_equal(d_output.get(), expected) + np.testing.assert_array_equal(d_output.copy_to_host(), expected) @pytest.mark.parametrize("force_inclusive", [True, False]) def test_scan_bool_maximum(force_inclusive): h_init = np.array([False], dtype=np.bool_) - d_input = cp.array([False, True, False, True], dtype=np.bool_) - d_output = cp.empty_like(d_input) + h_input = np.array([False, True, False, True], dtype=np.bool_) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(h_input.shape, h_input.dtype) scan_device( - d_input, d_output, len(d_input), OpKind.MAXIMUM, h_init, force_inclusive + d_input, d_output, h_input.size, OpKind.MAXIMUM, h_init, force_inclusive ) if force_inclusive: @@ -508,4 +520,4 @@ def test_scan_bool_maximum(force_inclusive): else: expected = np.array([False, False, True, True], dtype=np.bool_) - np.testing.assert_array_equal(d_output.get(), expected) + np.testing.assert_array_equal(d_output.copy_to_host(), expected) diff --git a/python/cuda_cccl/tests/compute/test_segmented_reduce.py b/python/cuda_cccl/tests/compute/test_segmented_reduce.py index 950d860a14d..105e4ff895d 100644 --- a/python/cuda_cccl/tests/compute/test_segmented_reduce.py +++ b/python/cuda_cccl/tests/compute/test_segmented_reduce.py @@ -2,9 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import cupy as cp import numpy as np import pytest +from _utils.device_array import DeviceArray import cuda.compute from cuda.compute import ( @@ -17,6 +17,19 @@ ) +def is_out_of_memory_error(error): + # cuda-core exception types vary by memory resource, so classify by message. + message = str(error).lower() + return any( + marker in message + for marker in ( + "out of memory", + "out_of_memory", + "failed to allocate memory from pool", + ) + ) + + @pytest.fixture(params=["i4", "u4", "i8", "u8"]) def offset_dtype(request): return np.dtype(request.param) @@ -36,22 +49,24 @@ def binary_op(a, b): assert input_array.ndim == 1 sz = input_array.size - rng = cp.random + rng = np.random.default_rng() n_segments = 16 - h_offsets = cp.zeros(n_segments + 1, dtype="int64") + h_offsets = np.zeros(n_segments + 1, dtype="int64") h_offsets[1:] = rng.multinomial(sz, [1 / n_segments] * n_segments) - offsets = cp.cumsum(cp.asarray(h_offsets, dtype=offset_dtype), dtype=offset_dtype) + offsets = np.cumsum(np.asarray(h_offsets, dtype=offset_dtype), dtype=offset_dtype) - start_offsets = offsets[:-1] - end_offsets = offsets[1:] + h_start_offsets = offsets[:-1] + h_end_offsets = offsets[1:] assert offsets.dtype == np.dtype(offset_dtype) - assert cp.all(start_offsets <= end_offsets) - assert end_offsets[-1] == sz + assert np.all(h_start_offsets <= h_end_offsets) + assert h_end_offsets[-1] == sz - d_in = cp.asarray(input_array) - d_out = cp.empty(n_segments, dtype=d_in.dtype) + d_in = DeviceArray.from_numpy(input_array) + d_out = DeviceArray.empty(n_segments, input_array.dtype) + start_offsets = DeviceArray.from_numpy(h_start_offsets) + end_offsets = DeviceArray.from_numpy(h_end_offsets) h_init = np.zeros(tuple(), dtype=input_array.dtype) @@ -71,11 +86,16 @@ def binary_op(a, b): h_init=h_init, ) - d_expected = cp.empty_like(d_out) + expected = np.empty(n_segments, dtype=input_array.dtype) for i in range(n_segments): - d_expected[i] = cp.sum(d_in[start_offsets[i] : end_offsets[i]]) + expected[i] = np.sum(input_array[h_start_offsets[i] : h_end_offsets[i]]) - assert cp.all(d_out == d_expected) + result = d_out.copy_to_host() + if np.issubdtype(input_array.dtype, np.inexact): + tolerance = 4 * np.finfo(input_array.dtype).eps + np.testing.assert_allclose(result, expected, rtol=tolerance, atol=tolerance) + else: + np.testing.assert_array_equal(result, expected) def test_segmented_reduce_struct_type(monkeypatch): @@ -85,8 +105,6 @@ def test_segmented_reduce_struct_type(monkeypatch): "_check_sass", False, ) - import cupy as cp - import numpy as np @gpu_struct class Pixel: @@ -102,13 +120,18 @@ def align_up(n, m): segment_size = 64 n_pixels = align_up(4000, 64) - offsets = cp.arange(n_pixels + segment_size - 1, step=segment_size, dtype=np.int64) - start_offsets = offsets[:-1] - end_offsets = offsets[1:] - n_segments = start_offsets.size - - d_rgb = cp.random.randint(0, 256, (n_pixels, 3), dtype=np.int32).view(Pixel.dtype) - d_out = cp.empty(n_segments, Pixel.dtype) + offsets = np.arange(n_pixels + segment_size - 1, step=segment_size, dtype=np.int64) + h_start_offsets = offsets[:-1] + h_end_offsets = offsets[1:] + n_segments = h_start_offsets.size + + rng = np.random.default_rng() + h_rgb = rng.integers(0, 256, (n_pixels, 3), dtype=np.int32) + h_rgb = h_rgb.view(Pixel.dtype).reshape(n_pixels) + d_rgb = DeviceArray.from_numpy(h_rgb) + d_out = DeviceArray.empty(n_segments, Pixel.dtype) + start_offsets = DeviceArray.from_numpy(h_start_offsets) + end_offsets = DeviceArray.from_numpy(h_end_offsets) h_init = Pixel(0, 0, 0) @@ -123,10 +146,10 @@ def align_up(n, m): h_init=h_init, ) - h_rgb = np.reshape(d_rgb.get(), (n_segments, -1)) + h_rgb = np.reshape(h_rgb, (n_segments, -1)) expected = h_rgb[np.arange(h_rgb.shape[0]), h_rgb["g"].argmax(axis=-1)] - np.testing.assert_equal(expected["g"], d_out.get()["g"]) + np.testing.assert_equal(expected["g"], d_out.copy_to_host()["g"]) @pytest.mark.large @@ -173,10 +196,12 @@ def scale(row_id): num_segments = (2**15 + 2**3) * 2**16 try: - res = cp.full(num_segments, fill_value=127, dtype=cp.uint8) - except cp.cuda.memory.OutOfMemoryError: + res = DeviceArray.empty(num_segments, np.uint8) + except Exception as error: + if not is_out_of_memory_error(error): + raise pytest.skip("Insufficient memory to run the large number of segments test") - assert res.size == num_segments + assert res.nbytes == num_segments * np.dtype(np.uint8).itemsize def my_add(a: np.uint8, b: np.uint8) -> np.uint8: return (a + b) % np.uint8(7) @@ -246,10 +271,12 @@ def _plus(a, b): num_segments = (2**15 + 2**3) * 2**16 try: - res = cp.full(num_segments, fill_value=-1, dtype=cp.int16) - except cp.cuda.memory.OutOfMemoryError: + res = DeviceArray.empty(num_segments, np.int16) + except Exception as error: + if not is_out_of_memory_error(error): + raise pytest.skip("Insufficient memory to run the large number of segments test") - assert res.size == num_segments + assert res.nbytes == num_segments * np.dtype(np.int16).itemsize h_init = np.zeros(tuple(), dtype=np.int16) @@ -280,10 +307,13 @@ def test_segmented_reduce_well_known_plus(monkeypatch): h_init = np.array([0], dtype=dtype) # Create segmented data: [1, 2, 3] | [4, 5] | [6, 7, 8, 9] - d_input = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=dtype) - d_starts = cp.array([0, 3, 5], dtype=np.int32) - d_ends = cp.array([3, 5, 9], dtype=np.int32) - d_output = cp.empty(3, dtype=dtype) + h_input = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=dtype) + h_starts = np.array([0, 3, 5], dtype=np.int32) + h_ends = np.array([3, 5, 9], dtype=np.int32) + d_input = DeviceArray.from_numpy(h_input) + d_starts = DeviceArray.from_numpy(h_starts) + d_ends = DeviceArray.from_numpy(h_ends) + d_output = DeviceArray.empty(3, dtype) cuda.compute.segmented_reduce( d_in=d_input, @@ -296,7 +326,7 @@ def test_segmented_reduce_well_known_plus(monkeypatch): ) expected = np.array([6, 9, 30]) - np.testing.assert_equal(d_output.get(), expected) + np.testing.assert_equal(d_output.copy_to_host(), expected) def test_segmented_reduce_well_known_maximum(monkeypatch): @@ -310,10 +340,13 @@ def test_segmented_reduce_well_known_maximum(monkeypatch): h_init = np.array([-100], dtype=dtype) # Create segmented data: [1, 9, 3] | [4, 2] | [6, 7, 1, 8] - d_input = cp.array([1, 9, 3, 4, 2, 6, 7, 1, 8], dtype=dtype) - d_starts = cp.array([0, 3, 5], dtype=np.int32) - d_ends = cp.array([3, 5, 9], dtype=np.int32) - d_output = cp.empty(3, dtype=dtype) + h_input = np.array([1, 9, 3, 4, 2, 6, 7, 1, 8], dtype=dtype) + h_starts = np.array([0, 3, 5], dtype=np.int32) + h_ends = np.array([3, 5, 9], dtype=np.int32) + d_input = DeviceArray.from_numpy(h_input) + d_starts = DeviceArray.from_numpy(h_starts) + d_ends = DeviceArray.from_numpy(h_ends) + d_output = DeviceArray.empty(3, dtype) cuda.compute.segmented_reduce( d_in=d_input, @@ -326,7 +359,7 @@ def test_segmented_reduce_well_known_maximum(monkeypatch): ) expected = np.array([9, 4, 8]) # max of each segment - np.testing.assert_equal(d_output.get(), expected) + np.testing.assert_equal(d_output.copy_to_host(), expected) def test_segmented_reduce_bool_maximum(monkeypatch): @@ -339,10 +372,13 @@ def test_segmented_reduce_bool_maximum(monkeypatch): h_init = np.array([False], dtype=np.bool_) # Create segmented data: [False, True] | [False, False] | [True] - d_input = cp.array([False, True, False, False, True], dtype=np.bool_) - d_starts = cp.array([0, 2, 4], dtype=np.int32) - d_ends = cp.array([2, 4, 5], dtype=np.int32) - d_output = cp.empty(3, dtype=np.bool_) + h_input = np.array([False, True, False, False, True], dtype=np.bool_) + h_starts = np.array([0, 2, 4], dtype=np.int32) + h_ends = np.array([2, 4, 5], dtype=np.int32) + d_input = DeviceArray.from_numpy(h_input) + d_starts = DeviceArray.from_numpy(h_starts) + d_ends = DeviceArray.from_numpy(h_ends) + d_output = DeviceArray.empty(3, np.bool_) cuda.compute.segmented_reduce( d_in=d_input, @@ -355,7 +391,7 @@ def test_segmented_reduce_bool_maximum(monkeypatch): ) expected = np.array([True, False, True], dtype=np.bool_) - np.testing.assert_equal(d_output.get(), expected) + np.testing.assert_equal(d_output.copy_to_host(), expected) def test_segmented_reduce_transform_output_iterator(floating_array, monkeypatch): @@ -370,13 +406,15 @@ def test_segmented_reduce_transform_output_iterator(floating_array, monkeypatch) h_init = np.array([0], dtype=dtype) # Use the floating_array fixture which provides random floating-point data of size 1000 - d_input = floating_array + d_input = DeviceArray.from_numpy(floating_array) # Create 2 segments of roughly equal size - segment_size = d_input.size // 2 - d_output = cp.empty(2, dtype=dtype) - start_offsets = cp.array([0, segment_size], dtype=np.int32) - end_offsets = cp.array([segment_size, d_input.size], dtype=np.int32) + segment_size = floating_array.size // 2 + d_output = DeviceArray.empty(2, dtype) + start_offsets = DeviceArray.from_numpy(np.array([0, segment_size], dtype=np.int32)) + end_offsets = DeviceArray.from_numpy( + np.array([segment_size, floating_array.size], dtype=np.int32) + ) def sqrt(x: dtype) -> dtype: return x**0.5 @@ -393,15 +431,15 @@ def sqrt(x: dtype) -> dtype: h_init=h_init, ) - expected = cp.sqrt( - cp.array( + expected = np.sqrt( + np.array( [ - cp.sum(d_input[0:segment_size]), - cp.sum(d_input[segment_size : d_input.size]), + np.sum(floating_array[:segment_size]), + np.sum(floating_array[segment_size:]), ] ) ) - np.testing.assert_allclose(d_output.get(), expected.get(), atol=1e-6) + np.testing.assert_allclose(d_output.copy_to_host(), expected, atol=1e-6) def test_device_segmented_reduce_for_rowwise_sum(monkeypatch): @@ -416,7 +454,7 @@ def add_op(a, b): return a + b n_rows, n_cols = 67, 12345 - rng = cp.random.default_rng() + rng = np.random.default_rng() mat = rng.integers(low=-31, high=32, dtype=np.int32, size=(n_rows, n_cols)) def make_scaler(step): @@ -431,9 +469,9 @@ def scale(row_id): end_offsets = start_offsets + 1 - d_input = mat + d_input = DeviceArray.from_numpy(mat) h_init = np.zeros(tuple(), dtype=np.int32) - d_output = cp.empty(n_rows, dtype=d_input.dtype) + d_output = DeviceArray.empty(n_rows, mat.dtype) cuda.compute.segmented_reduce( d_in=d_input, @@ -445,8 +483,8 @@ def scale(row_id): h_init=h_init, ) - expected = cp.sum(mat, axis=-1) - assert cp.all(d_output == expected) + expected = np.sum(mat, axis=-1) + np.testing.assert_array_equal(d_output.copy_to_host(), expected) def test_segmented_reduce_with_lambda(monkeypatch): @@ -461,10 +499,13 @@ def test_segmented_reduce_with_lambda(monkeypatch): h_init = np.array([0], dtype=dtype) # Create segmented data: [1, 2, 3] | [4, 5] | [6, 7, 8, 9] - d_input = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=dtype) - d_starts = cp.array([0, 3, 5], dtype=np.int32) - d_ends = cp.array([3, 5, 9], dtype=np.int32) - d_output = cp.empty(3, dtype=dtype) + h_input = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=dtype) + h_starts = np.array([0, 3, 5], dtype=np.int32) + h_ends = np.array([3, 5, 9], dtype=np.int32) + d_input = DeviceArray.from_numpy(h_input) + d_starts = DeviceArray.from_numpy(h_starts) + d_ends = DeviceArray.from_numpy(h_ends) + d_output = DeviceArray.empty(3, dtype) # Use a lambda function directly as the reducer cuda.compute.segmented_reduce( @@ -478,7 +519,7 @@ def test_segmented_reduce_with_lambda(monkeypatch): ) expected = np.array([6, 9, 30]) # sum of each segment - np.testing.assert_equal(d_output.get(), expected) + np.testing.assert_equal(d_output.copy_to_host(), expected) @pytest.mark.parametrize( @@ -501,21 +542,24 @@ def test_segmented_reduce_max_segment_size(max_seg_size, monkeypatch): False, ) dtype = np.int32 - rng = cp.random + rng = np.random.default_rng() num_segments = 1024 h_init = np.zeros(1, dtype=dtype) # Non-uniform segment sizes in [1, max_seg_size] - sizes = rng.randint(1, max_seg_size + 1, size=num_segments, dtype=np.int64) - offsets = cp.zeros(num_segments + 1, dtype=np.int64) - offsets[1:] = cp.cumsum(sizes) + sizes = rng.integers(1, max_seg_size + 1, size=num_segments, dtype=np.int64) + offsets = np.zeros(num_segments + 1, dtype=np.int64) + offsets[1:] = np.cumsum(sizes) - total = int(offsets[-1].item()) - d_input = rng.randint(0, 100, size=total, dtype=dtype) - d_output = cp.empty(num_segments, dtype=dtype) + total = int(offsets[-1]) + h_input = rng.integers(0, 100, size=total, dtype=dtype) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(num_segments, dtype) - d_starts = offsets[:-1] - d_ends = offsets[1:] + h_starts = offsets[:-1] + h_ends = offsets[1:] + d_starts = DeviceArray.from_numpy(h_starts) + d_ends = DeviceArray.from_numpy(h_ends) cuda.compute.segmented_reduce( d_in=d_input, @@ -528,8 +572,8 @@ def test_segmented_reduce_max_segment_size(max_seg_size, monkeypatch): max_segment_size=max_seg_size, ) - expected = cp.empty(num_segments, dtype=dtype) + expected = np.empty(num_segments, dtype=dtype) for i in range(num_segments): - expected[i] = cp.sum(d_input[int(d_starts[i].item()) : int(d_ends[i].item())]) + expected[i] = np.sum(h_input[h_starts[i] : h_ends[i]]) - np.testing.assert_array_equal(d_output.get(), expected.get()) + np.testing.assert_array_equal(d_output.copy_to_host(), expected) diff --git a/python/cuda_cccl/tests/compute/test_segmented_sort.py b/python/cuda_cccl/tests/compute/test_segmented_sort.py index 2bdd03abb42..5a874e5bfd6 100644 --- a/python/cuda_cccl/tests/compute/test_segmented_sort.py +++ b/python/cuda_cccl/tests/compute/test_segmented_sort.py @@ -4,10 +4,9 @@ from typing import Tuple -import cupy as cp -import numba import numpy as np import pytest +from _utils.device_array import DeviceArray import cuda.compute @@ -118,8 +117,10 @@ def test_segmented_sort_keys(dtype, num_segments, segment_size, monkeypatch): h_in_keys = random_array(num_items, dtype, max_value=50) start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys)) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_out_keys = DeviceArray.empty(h_in_keys.shape, h_in_keys.dtype) + d_start_offsets = DeviceArray.from_numpy(start_offsets) + d_end_offsets = DeviceArray.from_numpy(end_offsets) cuda.compute.segmented_sort( d_in_keys=d_in_keys, @@ -128,8 +129,8 @@ def test_segmented_sort_keys(dtype, num_segments, segment_size, monkeypatch): d_out_values=None, num_items=num_items, num_segments=num_segments, - start_offsets_in=cp.asarray(start_offsets), - end_offsets_in=cp.asarray(end_offsets), + start_offsets_in=d_start_offsets, + end_offsets_in=d_end_offsets, order=order, ) @@ -153,10 +154,12 @@ def test_segmented_sort_pairs(dtype, num_segments, segment_size): start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_vals = numba.cuda.to_device(h_in_vals) - d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys)) - d_out_vals = numba.cuda.to_device(np.empty_like(h_in_vals)) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_in_vals = DeviceArray.from_numpy(h_in_vals) + d_out_keys = DeviceArray.empty(h_in_keys.shape, h_in_keys.dtype) + d_out_vals = DeviceArray.empty(h_in_vals.shape, h_in_vals.dtype) + d_start_offsets = DeviceArray.from_numpy(start_offsets) + d_end_offsets = DeviceArray.from_numpy(end_offsets) cuda.compute.segmented_sort( d_in_keys=d_in_keys, @@ -165,8 +168,8 @@ def test_segmented_sort_pairs(dtype, num_segments, segment_size): d_out_values=d_out_vals, num_items=num_items, num_segments=num_segments, - start_offsets_in=cp.asarray(start_offsets), - end_offsets_in=cp.asarray(end_offsets), + start_offsets_in=d_start_offsets, + end_offsets_in=d_end_offsets, order=order, ) @@ -189,8 +192,10 @@ def test_segmented_sort_keys_double_buffer(dtype, num_segments, segment_size): h_in_keys = random_array(num_items, dtype, max_value=20) start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_tmp_keys = numba.cuda.to_device(np.empty_like(h_in_keys)) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_tmp_keys = DeviceArray.empty(h_in_keys.shape, h_in_keys.dtype) + d_start_offsets = DeviceArray.from_numpy(start_offsets) + d_end_offsets = DeviceArray.from_numpy(end_offsets) keys_db = cuda.compute.DoubleBuffer(d_in_keys, d_tmp_keys) cuda.compute.segmented_sort( @@ -200,8 +205,8 @@ def test_segmented_sort_keys_double_buffer(dtype, num_segments, segment_size): d_out_values=None, num_items=num_items, num_segments=num_segments, - start_offsets_in=cp.asarray(start_offsets), - end_offsets_in=cp.asarray(end_offsets), + start_offsets_in=d_start_offsets, + end_offsets_in=d_end_offsets, order=order, ) @@ -224,10 +229,12 @@ def test_segmented_sort_pairs_double_buffer(dtype, num_segments, segment_size): start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_vals = numba.cuda.to_device(h_in_vals) - d_tmp_keys = numba.cuda.to_device(np.empty_like(h_in_keys)) - d_tmp_vals = numba.cuda.to_device(np.empty_like(h_in_vals)) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_in_vals = DeviceArray.from_numpy(h_in_vals) + d_tmp_keys = DeviceArray.empty(h_in_keys.shape, h_in_keys.dtype) + d_tmp_vals = DeviceArray.empty(h_in_vals.shape, h_in_vals.dtype) + d_start_offsets = DeviceArray.from_numpy(start_offsets) + d_end_offsets = DeviceArray.from_numpy(end_offsets) keys_db = cuda.compute.DoubleBuffer(d_in_keys, d_tmp_keys) vals_db = cuda.compute.DoubleBuffer(d_in_vals, d_tmp_vals) @@ -239,8 +246,8 @@ def test_segmented_sort_pairs_double_buffer(dtype, num_segments, segment_size): d_out_values=None, num_items=num_items, num_segments=num_segments, - start_offsets_in=cp.asarray(start_offsets), - end_offsets_in=cp.asarray(end_offsets), + start_offsets_in=d_start_offsets, + end_offsets_in=d_end_offsets, order=order, ) @@ -297,10 +304,12 @@ def test_segmented_sort_variable_segment_sizes(num_segments): h_in_keys = random_array(num_items, np.int32, max_value=100) h_in_vals = random_array(num_items, np.float32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_vals = numba.cuda.to_device(h_in_vals) - d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys)) - d_out_vals = numba.cuda.to_device(np.empty_like(h_in_vals)) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_in_vals = DeviceArray.from_numpy(h_in_vals) + d_out_keys = DeviceArray.empty(h_in_keys.shape, h_in_keys.dtype) + d_out_vals = DeviceArray.empty(h_in_vals.shape, h_in_vals.dtype) + d_start_offsets = DeviceArray.from_numpy(start_offsets) + d_end_offsets = DeviceArray.from_numpy(end_offsets) cuda.compute.segmented_sort( d_in_keys=d_in_keys, @@ -309,8 +318,8 @@ def test_segmented_sort_variable_segment_sizes(num_segments): d_out_values=d_out_vals, num_items=num_items, num_segments=num_segments, - start_offsets_in=cp.asarray(start_offsets), - end_offsets_in=cp.asarray(end_offsets), + start_offsets_in=d_start_offsets, + end_offsets_in=d_end_offsets, order=order, ) diff --git a/python/cuda_cccl/tests/compute/test_select.py b/python/cuda_cccl/tests/compute/test_select.py index f2483860a4a..d55adeba83b 100644 --- a/python/cuda_cccl/tests/compute/test_select.py +++ b/python/cuda_cccl/tests/compute/test_select.py @@ -2,9 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import cupy as cp import numpy as np import pytest +from _utils.device_array import DeviceArray import cuda.compute from cuda.compute import CacheModifiedInputIterator, ZipIterator, gpu_struct @@ -59,6 +59,10 @@ def _host_select(h_in: np.ndarray, cond): return selected, np.int64(selected.size) +def _read_count(array: DeviceArray) -> int: + return int(array.copy_to_host()[0]) + + @pytest.mark.parametrize("dtype,num_items", select_params) def test_select_basic(dtype, num_items): h_in = random_array(num_items, dtype, max_value=100) @@ -66,10 +70,9 @@ def test_select_basic(dtype, num_items): def even_op(x): return x % 2 == 0 - d_in = cp.empty(num_items, dtype=dtype) - d_in.set(h_in) - d_out = cp.empty_like(d_in) - d_num_selected = cp.empty(2, dtype=np.uint64) + d_in = DeviceArray.from_numpy(h_in) + d_out = DeviceArray.empty(h_in.shape, h_in.dtype) + d_num_selected = DeviceArray.empty(2, np.uint64) cuda.compute.select( d_in=d_in, @@ -79,8 +82,8 @@ def even_op(x): num_items=num_items, ) - num_selected = int(d_num_selected[0].get()) - got = d_out.get()[:num_selected] + num_selected = _read_count(d_num_selected) + got = d_out.copy_to_host()[:num_selected] expected, expected_count = _host_select(h_in, even_op) @@ -95,9 +98,9 @@ def test_select_greater_than(dtype, num_items): def greater_than_42(x): return x > 42 - d_in = cp.asarray(h_in) - d_out = cp.empty_like(d_in) - d_num_selected = cp.empty(2, dtype=np.uint64) + d_in = DeviceArray.from_numpy(h_in) + d_out = DeviceArray.empty(h_in.shape, h_in.dtype) + d_num_selected = DeviceArray.empty(2, np.uint64) cuda.compute.select( d_in=d_in, @@ -107,8 +110,8 @@ def greater_than_42(x): num_items=num_items, ) - num_selected = int(d_num_selected[0].get()) - got = d_out.get()[:num_selected] + num_selected = _read_count(d_num_selected) + got = d_out.copy_to_host()[:num_selected] expected, expected_count = _host_select(h_in, greater_than_42) @@ -124,9 +127,9 @@ def test_select_all_pass(dtype): def always_true(x): return True - d_in = cp.asarray(h_in) - d_out = cp.empty_like(d_in) - d_num_selected = cp.empty(2, dtype=np.uint64) + d_in = DeviceArray.from_numpy(h_in) + d_out = DeviceArray.empty(h_in.shape, h_in.dtype) + d_num_selected = DeviceArray.empty(2, np.uint64) cuda.compute.select( d_in=d_in, @@ -136,8 +139,8 @@ def always_true(x): num_items=num_items, ) - num_selected = int(d_num_selected[0].get()) - got = d_out.get()[:num_selected] + num_selected = _read_count(d_num_selected) + got = d_out.copy_to_host()[:num_selected] assert num_selected == num_items assert np.array_equal(got, h_in) @@ -151,9 +154,9 @@ def test_select_none_pass(monkeypatch, dtype): def always_false(x): return False - d_in = cp.asarray(h_in) - d_out = cp.empty_like(d_in) - d_num_selected = cp.empty(2, dtype=np.int32) + d_in = DeviceArray.from_numpy(h_in) + d_out = DeviceArray.empty(h_in.shape, h_in.dtype) + d_num_selected = DeviceArray.empty(2, np.int32) cuda.compute.select( d_in=d_in, @@ -163,7 +166,7 @@ def always_false(x): num_items=num_items, ) - num_selected = int(d_num_selected[0].get()) + num_selected = _read_count(d_num_selected) assert num_selected == 0 @@ -176,9 +179,9 @@ def test_select_empty(): def even_op(x): return x % 2 == 0 - d_in = cp.asarray(h_in) - d_out = cp.empty_like(d_in) - d_num_selected = cp.empty(2, dtype=np.uint64) + d_in = DeviceArray.from_numpy(h_in) + d_out = DeviceArray.empty(h_in.shape, h_in.dtype) + d_num_selected = DeviceArray.empty(2, np.uint64) cuda.compute.select( d_in=d_in, @@ -188,7 +191,7 @@ def even_op(x): num_items=num_items, ) - num_selected = int(d_num_selected[0].get()) + num_selected = _read_count(d_num_selected) assert num_selected == 0 @@ -201,10 +204,10 @@ def test_select_with_iterator(dtype): def less_than_50(x): return x < 50 - d_in = cp.asarray(h_in) + d_in = DeviceArray.from_numpy(h_in) d_in_iter = CacheModifiedInputIterator(d_in, modifier="stream") - d_out = cp.empty_like(d_in) - d_num_selected = cp.empty(2, dtype=np.uint64) + d_out = DeviceArray.empty(h_in.shape, h_in.dtype) + d_num_selected = DeviceArray.empty(2, np.uint64) cuda.compute.select( d_in=d_in_iter, @@ -214,8 +217,8 @@ def less_than_50(x): num_items=num_items, ) - num_selected = int(d_num_selected[0].get()) - got = d_out.get()[:num_selected] + num_selected = _read_count(d_num_selected) + got = d_out.copy_to_host()[:num_selected] expected, expected_count = _host_select(h_in, less_than_50) @@ -231,9 +234,9 @@ def test_select_object_api(dtype): def divisible_by_3(x): return x % 3 == 0 - d_in = cp.asarray(h_in) - d_out = cp.empty_like(d_in) - d_num_selected = cp.empty(2, dtype=np.uint64) + d_in = DeviceArray.from_numpy(h_in) + d_out = DeviceArray.empty(h_in.shape, h_in.dtype) + d_num_selected = DeviceArray.empty(2, np.uint64) # Create select object selector = cuda.compute.make_select( @@ -254,7 +257,7 @@ def divisible_by_3(x): ) # Allocate temp storage - d_temp_storage = cp.empty(temp_storage_bytes, dtype=np.uint8) + d_temp_storage = DeviceArray.empty(temp_storage_bytes, np.uint8) # Execute select selector( @@ -266,8 +269,8 @@ def divisible_by_3(x): num_items=num_items, ) - num_selected = int(d_num_selected[0].get()) - got = d_out.get()[:num_selected] + num_selected = _read_count(d_num_selected) + got = d_out.copy_to_host()[:num_selected] expected, expected_count = _host_select(h_in, divisible_by_3) @@ -283,12 +286,12 @@ def test_select_reuse_object(dtype): def positive_op(x): return x > 0 - d_out = cp.empty(num_items, dtype=dtype) - d_num_selected = cp.empty(2, dtype=np.uint64) + d_out = DeviceArray.empty(num_items, dtype) + d_num_selected = DeviceArray.empty(2, np.uint64) # Create select object with initial input h_in1 = random_array(num_items, dtype, max_value=100) - 50 - d_in1 = cp.asarray(h_in1) + d_in1 = DeviceArray.from_numpy(h_in1) selector = cuda.compute.make_select( d_in=d_in1, d_out=d_out, @@ -305,7 +308,7 @@ def positive_op(x): cond=positive_op, num_items=num_items, ) - d_temp_storage = cp.empty(temp_storage_bytes, dtype=np.uint8) + d_temp_storage = DeviceArray.empty(temp_storage_bytes, np.uint8) selector( temp_storage=d_temp_storage, d_in=d_in1, @@ -315,8 +318,8 @@ def positive_op(x): num_items=num_items, ) - num_selected1 = int(d_num_selected[0].get()) - got1 = d_out.get()[:num_selected1] + num_selected1 = _read_count(d_num_selected) + got1 = d_out.copy_to_host()[:num_selected1] expected1, expected_count1 = _host_select(h_in1, positive_op) assert num_selected1 == expected_count1 @@ -324,7 +327,7 @@ def positive_op(x): # Reuse with different input h_in2 = random_array(num_items, dtype, max_value=100) - 50 - d_in2 = cp.asarray(h_in2) + d_in2 = DeviceArray.from_numpy(h_in2) selector( temp_storage=d_temp_storage, @@ -335,8 +338,8 @@ def positive_op(x): num_items=num_items, ) - num_selected2 = int(d_num_selected[0].get()) - got2 = d_out.get()[:num_selected2] + num_selected2 = _read_count(d_num_selected) + got2 = d_out.copy_to_host()[:num_selected2] expected2, expected_count2 = _host_select(h_in2, positive_op) assert num_selected2 == expected_count2 @@ -363,10 +366,9 @@ class Point: def in_first_quadrant(p: Point) -> np.uint8: return (p.x > 50) and (p.y > 50) - d_in = cp.empty(num_items, dtype=Point.dtype) - d_in.set(h_in) - d_out = cp.empty_like(d_in) - d_num_selected = cp.empty(2, dtype=np.uint64) + d_in = DeviceArray.from_numpy(h_in) + d_out = DeviceArray.empty(h_in.shape, h_in.dtype) + d_num_selected = DeviceArray.empty(2, np.uint64) cuda.compute.select( d_in=d_in, @@ -376,8 +378,8 @@ def in_first_quadrant(p: Point) -> np.uint8: num_items=num_items, ) - num_selected = int(d_num_selected[0].get()) - got = d_out.get()[:num_selected] + num_selected = _read_count(d_num_selected) + got = d_out.copy_to_host()[:num_selected] # Host reference def host_in_first_quadrant(p): @@ -405,19 +407,19 @@ def condition(pair): return (pair[0] + pair[1]) < 70 # Device arrays - d_in1 = cp.asarray(h_in1) - d_in2 = cp.asarray(h_in2) + d_in1 = DeviceArray.from_numpy(h_in1) + d_in2 = DeviceArray.from_numpy(h_in2) # Create zip iterator for input zip_in = ZipIterator(d_in1, d_in2) # Allocate output arrays - d_out1 = cp.empty_like(d_in1) - d_out2 = cp.empty_like(d_in2) + d_out1 = DeviceArray.empty(h_in1.shape, h_in1.dtype) + d_out2 = DeviceArray.empty(h_in2.shape, h_in2.dtype) # Create zip iterator for output zip_out = ZipIterator(d_out1, d_out2) - d_num_selected = cp.empty(1, dtype=np.int32) + d_num_selected = DeviceArray.empty(1, np.int32) cuda.compute.select( d_in=zip_in, @@ -427,11 +429,11 @@ def condition(pair): num_items=num_items, ) - num_selected = int(d_num_selected[0].get()) + num_selected = _read_count(d_num_selected) # Get results - got1 = d_out1.get()[:num_selected] - got2 = d_out2.get()[:num_selected] + got1 = d_out1.copy_to_host()[:num_selected] + got2 = d_out2.copy_to_host()[:num_selected] # Verify results: all elements should satisfy the condition for i in range(num_selected): @@ -451,15 +453,17 @@ def test_select_stateful_threshold(): # Create device state containing threshold value threshold_value = 50 - threshold_state = cp.array([threshold_value], dtype=np.int32) + threshold_state = DeviceArray.from_numpy( + np.array([threshold_value], dtype=np.int32) + ) # Define condition that references state as closure def threshold_select(x): return x > threshold_state[0] - d_in = cp.asarray(h_in) - d_out = cp.empty_like(d_in) - d_num_selected = cp.empty(2, dtype=np.uint64) + d_in = DeviceArray.from_numpy(h_in) + d_out = DeviceArray.empty(h_in.shape, h_in.dtype) + d_num_selected = DeviceArray.empty(2, np.uint64) cuda.compute.select( d_in=d_in, @@ -470,8 +474,8 @@ def threshold_select(x): ) # Check selected output - num_selected = int(d_num_selected[0].get()) - got = d_out.get()[:num_selected] + num_selected = _read_count(d_num_selected) + got = d_out.copy_to_host()[:num_selected] # Verify all output values are > threshold assert np.all(got > threshold_value) @@ -494,7 +498,7 @@ def test_select_stateful_atomic(): h_in = random_array(num_items, np.int32, max_value=100) # Create device state for counting rejected items - reject_counter = cp.zeros(1, dtype=np.int32) + reject_counter = DeviceArray.from_numpy(np.zeros(1, dtype=np.int32)) # Define condition that references state as closure def count_rejects(x): @@ -504,9 +508,9 @@ def count_rejects(x): numba_cuda.atomic.add(reject_counter, 0, 1) return False - d_in = cp.asarray(h_in) - d_out = cp.empty_like(d_in) - d_num_selected = cp.empty(2, dtype=np.uint64) + d_in = DeviceArray.from_numpy(h_in) + d_out = DeviceArray.empty(h_in.shape, h_in.dtype) + d_num_selected = DeviceArray.empty(2, np.uint64) cuda.compute.select( d_in=d_in, @@ -517,8 +521,8 @@ def count_rejects(x): ) # Check selected output - num_selected = int(d_num_selected[0].get()) - got = d_out.get()[:num_selected] + num_selected = _read_count(d_num_selected) + got = d_out.copy_to_host()[:num_selected] # Verify all output values are > 50 assert np.all(got > 50) @@ -533,7 +537,7 @@ def count_rejects(x): assert np.array_equal(got, expected_selected) # Verify state contains count of rejected items - rejected_count = int(reject_counter[0].get()) + rejected_count = _read_count(reject_counter) expected_rejected = len(h_in[h_in <= 50]) assert rejected_count == expected_rejected, ( f"Expected {expected_rejected} rejections, got {rejected_count}" @@ -544,11 +548,12 @@ def test_select_with_side_effect_counting_rejects(): """Select with side effect that counts rejected items""" from numba import cuda as numba_cuda - d_in = cp.arange(100, dtype=np.int32) - d_out = cp.empty_like(d_in) - d_num_selected = cp.empty(1, dtype=np.uint64) + h_in = np.arange(100, dtype=np.int32) + d_in = DeviceArray.from_numpy(h_in) + d_out = DeviceArray.empty(h_in.shape, h_in.dtype) + d_num_selected = DeviceArray.empty(1, np.uint64) - reject_count = cp.zeros(1, dtype=np.int32) + reject_count = DeviceArray.from_numpy(np.zeros(1, dtype=np.int32)) # Define condition that references state as closure def count_rejects(x): @@ -563,11 +568,11 @@ def count_rejects(x): d_out=d_out, d_num_selected_out=d_num_selected, cond=count_rejects, - num_items=len(d_in), + num_items=h_in.size, ) - num_selected = int(d_num_selected.get()[0]) - num_rejected = int(reject_count.get()[0]) + num_selected = _read_count(d_num_selected) + num_rejected = _read_count(reject_count) assert num_selected == 50 # Values 50-99 assert num_rejected == 50 # Values 0-49 @@ -578,9 +583,9 @@ def test_select_with_lambda(): num_items = 100 h_in = np.arange(num_items, dtype=np.int32) - d_in = cp.asarray(h_in) - d_out = cp.empty_like(d_in) - d_num_selected = cp.empty(2, dtype=np.uint64) + d_in = DeviceArray.from_numpy(h_in) + d_out = DeviceArray.empty(h_in.shape, h_in.dtype) + d_num_selected = DeviceArray.empty(2, np.uint64) # Use a lambda function directly as the predicate cuda.compute.select( @@ -591,23 +596,26 @@ def test_select_with_lambda(): num_items=num_items, ) - num_selected = int(d_num_selected.get()[0]) + num_selected = _read_count(d_num_selected) expected_selected = [x for x in h_in if x % 2 == 0] assert num_selected == len(expected_selected) - np.testing.assert_array_equal(d_out.get()[:num_selected], expected_selected) + np.testing.assert_array_equal( + d_out.copy_to_host()[:num_selected], expected_selected + ) def test_select_stateful_state_updates(): """Test that select correctly updates state between calls with different thresholds.""" num_items = 20 - d_in = cp.arange(num_items, dtype=np.int32) - d_out = cp.empty_like(d_in) - d_count = cp.zeros(2, dtype=np.uint64) + h_in = np.arange(num_items, dtype=np.int32) + d_in = DeviceArray.from_numpy(h_in) + d_out = DeviceArray.empty(h_in.shape, h_in.dtype) + d_count = DeviceArray.from_numpy(np.zeros(2, dtype=np.uint64)) # Create two different thresholds - threshold_5 = cp.array([5], dtype=np.int32) - threshold_15 = cp.array([15], dtype=np.int32) + threshold_5 = DeviceArray.from_numpy(np.array([5], dtype=np.int32)) + threshold_15 = DeviceArray.from_numpy(np.array([15], dtype=np.int32)) # Call 1: Select items > 5 (should get 14 items: 6-19) def select_gt_5(x): @@ -620,16 +628,16 @@ def select_gt_5(x): cond=select_gt_5, num_items=num_items, ) - count1 = int(d_count[0].get()) + count1 = _read_count(d_count) assert count1 == 14 expected_1 = list(range(6, 20)) - np.testing.assert_array_equal(d_out.get()[:count1], expected_1) + np.testing.assert_array_equal(d_out.copy_to_host()[:count1], expected_1) # Call 2: Select items > 15 (should get 4 items: 16-19) def select_gt_15(x): return x > threshold_15[0] - d_count.fill(0) + d_count.copy_from_host(np.zeros(2, dtype=np.uint64)) cuda.compute.select( d_in=d_in, d_out=d_out, @@ -637,13 +645,13 @@ def select_gt_15(x): cond=select_gt_15, num_items=num_items, ) - count2 = int(d_count[0].get()) + count2 = _read_count(d_count) assert count2 == 4 expected_2 = list(range(16, 20)) - np.testing.assert_array_equal(d_out.get()[:count2], expected_2) + np.testing.assert_array_equal(d_out.copy_to_host()[:count2], expected_2) # Call 3: Back to first threshold (test cache reuse with updated state) - d_count.fill(0) + d_count.copy_from_host(np.zeros(2, dtype=np.uint64)) cuda.compute.select( d_in=d_in, d_out=d_out, @@ -651,9 +659,9 @@ def select_gt_15(x): cond=select_gt_5, num_items=num_items, ) - count3 = int(d_count[0].get()) + count3 = _read_count(d_count) assert count3 == 14 - np.testing.assert_array_equal(d_out.get()[:count3], expected_1) + np.testing.assert_array_equal(d_out.copy_to_host()[:count3], expected_1) def test_select_stateful_same_bytecode_different_state(): @@ -665,9 +673,10 @@ def test_select_stateful_same_bytecode_different_state(): the same bytecode but different captured arrays would reuse stale state. """ num_items = 20 - d_in = cp.arange(num_items, dtype=np.int32) - d_out = cp.empty_like(d_in) - d_count = cp.zeros(2, dtype=np.uint64) + h_in = np.arange(num_items, dtype=np.int32) + d_in = DeviceArray.from_numpy(h_in) + d_out = DeviceArray.empty(h_in.shape, h_in.dtype) + d_count = DeviceArray.from_numpy(np.zeros(2, dtype=np.uint64)) # Factory that creates functions with identical bytecode def make_selector(threshold_array): @@ -676,8 +685,8 @@ def selector(x): return selector - threshold_5 = cp.array([5], dtype=np.int32) - threshold_15 = cp.array([15], dtype=np.int32) + threshold_5 = DeviceArray.from_numpy(np.array([5], dtype=np.int32)) + threshold_15 = DeviceArray.from_numpy(np.array([15], dtype=np.int32)) select_5 = make_selector(threshold_5) select_15 = make_selector(threshold_15) @@ -690,11 +699,11 @@ def selector(x): cond=select_5, num_items=num_items, ) - count1 = int(d_count[0].get()) + count1 = _read_count(d_count) assert count1 == 14 # Call 2: threshold > 15 (different state, same bytecode) - d_count.fill(0) + d_count.copy_from_host(np.zeros(2, dtype=np.uint64)) cuda.compute.select( d_in=d_in, d_out=d_out, @@ -702,7 +711,7 @@ def selector(x): cond=select_15, num_items=num_items, ) - count2 = int(d_count[0].get()) + count2 = _read_count(d_count) assert count2 == 4 # If this fails, cache collision bug is present @@ -712,19 +721,15 @@ def test_stateful_caching_same_dtype_different_values(): After transformation, values are runtime parameters, so they should use the same compiled code. """ - import cupy as cp - import numpy as np - - import cuda.compute - num_items = 100 - d_in = cp.arange(num_items, dtype=np.int32) - d_out = cp.empty_like(d_in) - d_count = cp.zeros(2, dtype=np.uint64) + h_in = np.arange(num_items, dtype=np.int32) + d_in = DeviceArray.from_numpy(h_in) + d_out = DeviceArray.empty(h_in.shape, h_in.dtype) + d_count = DeviceArray.from_numpy(np.zeros(2, dtype=np.uint64)) # Two thresholds with SAME dtype, SAME size, DIFFERENT values - threshold_30 = cp.array([30], dtype=np.int32) - threshold_70 = cp.array([70], dtype=np.int32) + threshold_30 = DeviceArray.from_numpy(np.array([30], dtype=np.int32)) + threshold_70 = DeviceArray.from_numpy(np.array([70], dtype=np.int32)) # Test with threshold_30 def select_gt_30(x): @@ -737,14 +742,14 @@ def select_gt_30(x): cond=select_gt_30, num_items=num_items, ) - count_30 = int(d_count[0].get()) + count_30 = _read_count(d_count) # Test with threshold_70 def select_gt_70(x): return x > threshold_70[0] - d_out.fill(0) - d_count.fill(0) + d_out.copy_from_host(np.zeros_like(h_in)) + d_count.copy_from_host(np.zeros(2, dtype=np.uint64)) cuda.compute.select( d_in=d_in, d_out=d_out, @@ -752,7 +757,7 @@ def select_gt_70(x): cond=select_gt_70, num_items=num_items, ) - count_70 = int(d_count[0].get()) + count_70 = _read_count(d_count) # Verify correct results (not cache collision) assert count_30 == 69 # Values 31-99 diff --git a/python/cuda_cccl/tests/compute/test_shuffle_iterator.py b/python/cuda_cccl/tests/compute/test_shuffle_iterator.py index 5c82fe1ad38..2d442abce02 100644 --- a/python/cuda_cccl/tests/compute/test_shuffle_iterator.py +++ b/python/cuda_cccl/tests/compute/test_shuffle_iterator.py @@ -2,9 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import cupy as cp import numpy as np import pytest +from _utils.device_array import DeviceArray import cuda.compute from cuda.compute.iterators import ( @@ -19,12 +19,12 @@ def test_shuffle_iterator_bijectivity(): shuffle_it = ShuffleIterator(num_items, seed) - d_output = cp.empty(num_items, dtype=np.int64) + d_output = DeviceArray.empty(num_items, np.int64) cuda.compute.unary_transform( d_in=shuffle_it, d_out=d_output, op=lambda x: x, num_items=num_items ) - result = d_output.get() + result = d_output.copy_to_host() assert len(set(result)) == num_items assert set(result) == set(range(num_items)) @@ -37,8 +37,8 @@ def test_shuffle_iterator_determinism(): shuffle_it1 = ShuffleIterator(num_items, seed) shuffle_it2 = ShuffleIterator(num_items, seed) - d_output1 = cp.empty(num_items, dtype=np.int64) - d_output2 = cp.empty(num_items, dtype=np.int64) + d_output1 = DeviceArray.empty(num_items, np.int64) + d_output2 = DeviceArray.empty(num_items, np.int64) cuda.compute.unary_transform( d_in=shuffle_it1, d_out=d_output1, op=lambda x: x, num_items=num_items @@ -47,7 +47,7 @@ def test_shuffle_iterator_determinism(): d_in=shuffle_it2, d_out=d_output2, op=lambda x: x, num_items=num_items ) - cp.testing.assert_array_equal(d_output1, d_output2) + np.testing.assert_array_equal(d_output1.copy_to_host(), d_output2.copy_to_host()) @pytest.mark.parametrize("num_items", [1, 2, 7, 16, 17, 100, 1000, 1023, 1024, 1025]) @@ -56,12 +56,12 @@ def test_shuffle_iterator_various_sizes(num_items): shuffle_it = ShuffleIterator(num_items, seed) - d_output = cp.empty(num_items, dtype=np.int64) + d_output = DeviceArray.empty(num_items, np.int64) cuda.compute.unary_transform( d_in=shuffle_it, d_out=d_output, op=lambda x: x, num_items=num_items ) - result = d_output.get() + result = d_output.copy_to_host() assert len(set(result)) == num_items assert set(result) == set(range(num_items)) @@ -71,20 +71,21 @@ def test_shuffle_iterator_with_permutation_iterator(): num_items = 10 seed = 42 - d_values = cp.asarray([10, 20, 30, 40, 50, 60, 70, 80, 90, 100], dtype=np.int32) + h_values = np.asarray([10, 20, 30, 40, 50, 60, 70, 80, 90, 100], dtype=np.int32) + d_values = DeviceArray.from_numpy(h_values) shuffle_it = ShuffleIterator(num_items, seed) perm_it = PermutationIterator(d_values, shuffle_it) - d_output = cp.empty(num_items, dtype=np.int32) + d_output = DeviceArray.empty(num_items, np.int32) cuda.compute.unary_transform( d_in=perm_it, d_out=d_output, op=lambda x: x, num_items=num_items ) - result = d_output.get() + result = d_output.copy_to_host() - assert result.sum() == d_values.sum() - assert sorted(result) == sorted(d_values.get()) + assert result.sum() == h_values.sum() + assert sorted(result) == sorted(h_values) def test_shuffle_iterator_invalid_num_items(): diff --git a/python/cuda_cccl/tests/compute/test_three_way_partition.py b/python/cuda_cccl/tests/compute/test_three_way_partition.py index 3fb5b102275..51958c6e1f1 100644 --- a/python/cuda_cccl/tests/compute/test_three_way_partition.py +++ b/python/cuda_cccl/tests/compute/test_three_way_partition.py @@ -2,9 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import cupy as cp import numpy as np import pytest +from _utils.device_array import DeviceArray import cuda.compute from cuda.compute import CacheModifiedInputIterator, gpu_struct @@ -85,11 +85,11 @@ def less_than_op(x): def greater_equal_op(x): return x >= 42 - d_in = cp.asarray(h_in) - d_first = cp.empty_like(d_in) - d_second = cp.empty_like(d_in) - d_unselected = cp.empty_like(d_in) - d_num_selected = cp.empty(2, dtype=np.int32) + d_in = DeviceArray.from_numpy(h_in) + d_first = DeviceArray.empty(h_in.shape, h_in.dtype) + d_second = DeviceArray.empty(h_in.shape, h_in.dtype) + d_unselected = DeviceArray.empty(h_in.shape, h_in.dtype) + d_num_selected = DeviceArray.empty(2, np.int32) cuda.compute.three_way_partition( d_in=d_in, d_first_part_out=d_first, @@ -101,10 +101,10 @@ def greater_equal_op(x): num_items=num_items, ) - num_selected = d_num_selected.get() - got_first = d_first.get()[: int(num_selected[0])] - got_second = d_second.get()[: int(num_selected[1])] - got_unselected = d_unselected.get()[ + num_selected = d_num_selected.copy_to_host() + got_first = d_first.copy_to_host()[: int(num_selected[0])] + got_second = d_second.copy_to_host()[: int(num_selected[1])] + got_unselected = d_unselected.copy_to_host()[ : int(num_items) - int(num_selected[0]) - int(num_selected[1]) ] @@ -120,11 +120,11 @@ def greater_equal_op(x): def test_three_way_partition_empty(): dtype = np.int32 - d_in = cp.empty(0, dtype=dtype) - d_first = cp.empty(0, dtype=dtype) - d_second = cp.empty(0, dtype=dtype) - d_unselected = cp.empty(0, dtype=dtype) - d_num_selected = cp.zeros(2, dtype=np.int64) + d_in = DeviceArray.empty(0, dtype) + d_first = DeviceArray.empty(0, dtype) + d_second = DeviceArray.empty(0, dtype) + d_unselected = DeviceArray.empty(0, dtype) + d_num_selected = DeviceArray.from_numpy(np.zeros(2, dtype=np.int64)) def less_than_op(x): return x < 42 @@ -143,7 +143,7 @@ def greater_equal_op(x): num_items=0, ) - np.testing.assert_array_equal(d_num_selected.get(), np.array([0, 0])) + np.testing.assert_array_equal(d_num_selected.copy_to_host(), np.array([0, 0])) def test_three_way_partition_with_iterators(): @@ -161,13 +161,13 @@ def greater_equal_op(x): _host_three_way_partition(h_in, less_than_op, greater_equal_op) ) - d_in = cp.asarray(h_in) + d_in = DeviceArray.from_numpy(h_in) in_it = CacheModifiedInputIterator(d_in, modifier="stream") - d_first = cp.empty_like(d_in) - d_second = cp.empty_like(d_in) - d_unselected = cp.empty_like(d_in) - d_num_selected = cp.empty(2, dtype=np.uint32) + d_first = DeviceArray.empty(h_in.shape, h_in.dtype) + d_second = DeviceArray.empty(h_in.shape, h_in.dtype) + d_unselected = DeviceArray.empty(h_in.shape, h_in.dtype) + d_num_selected = DeviceArray.empty(2, np.uint32) cuda.compute.three_way_partition( d_in=in_it, @@ -180,10 +180,10 @@ def greater_equal_op(x): num_items=num_items, ) - num_selected = d_num_selected.get() - got_first = d_first.get()[: int(num_selected[0])] - got_second = d_second.get()[: int(num_selected[1])] - got_unselected = d_unselected.get()[ + num_selected = d_num_selected.copy_to_host() + got_first = d_first.copy_to_host()[: int(num_selected[0])] + got_second = d_second.copy_to_host()[: int(num_selected[1])] + got_unselected = d_unselected.copy_to_host()[ : int(num_items) - int(num_selected[0]) - int(num_selected[1]) ] @@ -224,12 +224,11 @@ def greater_equal_op(x: pair_type): expected_second = h_in[remaining_mask][expected_second_mask] expected_unselected = h_in[remaining_mask][~expected_second_mask] - h_in_i32 = h_in.view(np.int32).reshape(num_items, 4) - d_in = cp.asarray(h_in_i32).view(pair_type.dtype).reshape(num_items) - d_first = cp.empty_like(d_in) - d_second = cp.empty_like(d_in) - d_unselected = cp.empty_like(d_in) - d_num_selected = cp.empty(2, dtype=np.uint64) + d_in = DeviceArray.from_numpy(h_in) + d_first = DeviceArray.empty(h_in.shape, h_in.dtype) + d_second = DeviceArray.empty(h_in.shape, h_in.dtype) + d_unselected = DeviceArray.empty(h_in.shape, h_in.dtype) + d_num_selected = DeviceArray.empty(2, np.uint64) cuda.compute.three_way_partition( d_in=d_in, @@ -242,10 +241,10 @@ def greater_equal_op(x: pair_type): num_items=num_items, ) - num_selected = d_num_selected.get() - got_first = d_first.get()[: int(num_selected[0])] - got_second = d_second.get()[: int(num_selected[1])] - got_unselected = d_unselected.get()[ + num_selected = d_num_selected.copy_to_host() + got_first = d_first.copy_to_host()[: int(num_selected[0])] + got_second = d_second.copy_to_host()[: int(num_selected[1])] + got_unselected = d_unselected.copy_to_host()[ : int(num_items) - int(num_selected[0]) - int(num_selected[1]) ] @@ -269,13 +268,11 @@ def greater_equal_op(x): _host_three_way_partition(h_in, less_than_op, greater_equal_op) ) - cp_stream = cp.cuda.ExternalStream(cuda_stream.ptr) - with cp_stream: - d_in = cp.asarray(h_in) - d_first = cp.empty_like(d_in) - d_second = cp.empty_like(d_in) - d_unselected = cp.empty_like(d_in) - d_num_selected = cp.empty(2, dtype=np.int64) + d_in = DeviceArray.from_numpy(h_in, stream=cuda_stream) + d_first = DeviceArray.empty(h_in.shape, h_in.dtype, stream=cuda_stream) + d_second = DeviceArray.empty(h_in.shape, h_in.dtype, stream=cuda_stream) + d_unselected = DeviceArray.empty(h_in.shape, h_in.dtype, stream=cuda_stream) + d_num_selected = DeviceArray.empty(2, np.int64, stream=cuda_stream) cuda.compute.three_way_partition( d_in=d_in, @@ -289,13 +286,12 @@ def greater_equal_op(x): stream=cuda_stream, ) - with cp_stream: - num_selected = d_num_selected.get() - got_first = d_first.get()[: int(num_selected[0])] - got_second = d_second.get()[: int(num_selected[1])] - got_unselected = d_unselected.get()[ - : int(num_items) - int(num_selected[0]) - int(num_selected[1]) - ] + num_selected = d_num_selected.copy_to_host(stream=cuda_stream) + got_first = d_first.copy_to_host(stream=cuda_stream)[: int(num_selected[0])] + got_second = d_second.copy_to_host(stream=cuda_stream)[: int(num_selected[1])] + got_unselected = d_unselected.copy_to_host(stream=cuda_stream)[ + : int(num_items) - int(num_selected[0]) - int(num_selected[1]) + ] np.testing.assert_array_equal(got_first, expected_first) np.testing.assert_array_equal(got_second, expected_second) @@ -313,11 +309,11 @@ def less_than_op(x): def greater_equal_op(x): return x == 102 - d_in = cp.asarray(h_in) - d_first = cp.empty_like(d_in) - d_second = cp.empty_like(d_in) - d_unselected = cp.empty_like(d_in) - d_num_selected = cp.empty(2, dtype=np.int64) + d_in = DeviceArray.from_numpy(h_in) + d_first = DeviceArray.empty(h_in.shape, h_in.dtype) + d_second = DeviceArray.empty(h_in.shape, h_in.dtype) + d_unselected = DeviceArray.empty(h_in.shape, h_in.dtype) + d_num_selected = DeviceArray.empty(2, np.int64) cuda.compute.three_way_partition( d_in=d_in, @@ -330,12 +326,12 @@ def greater_equal_op(x): num_items=num_items, ) - num_selected = d_num_selected.get() + num_selected = d_num_selected.copy_to_host() assert int(num_selected[0]) == 0 and int(num_selected[1]) == 0 - got_first = d_first.get()[: int(num_selected[0])] - got_second = d_second.get()[: int(num_selected[1])] - got_unselected = d_unselected.get()[:num_items] + got_first = d_first.copy_to_host()[: int(num_selected[0])] + got_second = d_second.copy_to_host()[: int(num_selected[1])] + got_unselected = d_unselected.copy_to_host()[:num_items] np.testing.assert_array_equal(got_first, np.empty(0, dtype=dtype)) np.testing.assert_array_equal(got_second, np.empty(0, dtype=dtype)) @@ -350,11 +346,11 @@ def test_three_way_partition_same_predicate(): def always_true(x): return True - d_in = cp.asarray(h_in) - d_first = cp.empty_like(d_in) - d_second = cp.empty_like(d_in) - d_unselected = cp.empty_like(d_in) - d_num_selected = cp.empty(2, dtype=np.int64) + d_in = DeviceArray.from_numpy(h_in) + d_first = DeviceArray.empty(h_in.shape, h_in.dtype) + d_second = DeviceArray.empty(h_in.shape, h_in.dtype) + d_unselected = DeviceArray.empty(h_in.shape, h_in.dtype) + d_num_selected = DeviceArray.empty(2, np.int64) cuda.compute.three_way_partition( d_in=d_in, @@ -367,7 +363,7 @@ def always_true(x): num_items=num_items, ) - num_selected = d_num_selected.get() + num_selected = d_num_selected.copy_to_host() assert int(num_selected[0]) == num_items assert int(num_selected[1]) == 0 @@ -383,11 +379,11 @@ def less_than_op(x): def greater_equal_op(x): return x == 42 - d_in = cp.asarray(h_in) - d_first = cp.empty_like(d_in) - d_second = cp.empty_like(d_in) - d_unselected = cp.empty_like(d_in) - d_num_selected = cp.empty(2, dtype=np.int64) + d_in = DeviceArray.from_numpy(h_in) + d_first = DeviceArray.empty(h_in.shape, h_in.dtype) + d_second = DeviceArray.empty(h_in.shape, h_in.dtype) + d_unselected = DeviceArray.empty(h_in.shape, h_in.dtype) + d_num_selected = DeviceArray.empty(2, np.int64) cuda.compute.three_way_partition( d_in=d_in, @@ -400,12 +396,12 @@ def greater_equal_op(x): num_items=num_items, ) - num_selected = d_num_selected.get() + num_selected = d_num_selected.copy_to_host() assert int(num_selected[0]) == num_items and int(num_selected[1]) == 0 - got_first = d_first.get()[: int(num_selected[0])] - got_second = d_second.get()[: int(num_selected[1])] - got_unselected = d_unselected.get()[ + got_first = d_first.copy_to_host()[: int(num_selected[0])] + got_second = d_second.copy_to_host()[: int(num_selected[1])] + got_unselected = d_unselected.copy_to_host()[ : int(num_items) - int(num_selected[0]) - int(num_selected[1]) ] diff --git a/python/cuda_cccl/tests/compute/test_transform.py b/python/cuda_cccl/tests/compute/test_transform.py index c7c3ca2818a..1b17bbc9212 100644 --- a/python/cuda_cccl/tests/compute/test_transform.py +++ b/python/cuda_cccl/tests/compute/test_transform.py @@ -2,9 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import cupy as cp import numpy as np import pytest +from _utils.device_array import DeviceArray import cuda.compute from cuda.compute import ( @@ -46,13 +46,14 @@ def test_unary_transform(input_array): def op(a): return a + 1 - d_in = input_array - d_out = cp.empty_like(d_in) + h_in = input_array + d_in = DeviceArray.from_numpy(h_in) + d_out = DeviceArray.empty(h_in.shape, h_in.dtype) - unary_transform_device(d_in, d_out, len(d_in), op) + unary_transform_device(d_in, d_out, h_in.size, op) - got = d_out.get() - expected = unary_transform_host(d_in.get(), op) + got = d_out.copy_to_host() + expected = unary_transform_host(h_in, op) np.testing.assert_allclose(expected, got, rtol=1e-5) @@ -64,14 +65,16 @@ def test_binary_transform(input_array): def op(a, b): return a + b - d_in1 = input_array - d_in2 = input_array - d_out = cp.empty_like(d_in1) + h_in1 = input_array + h_in2 = input_array + d_in1 = DeviceArray.from_numpy(h_in1) + d_in2 = DeviceArray.from_numpy(h_in2) + d_out = DeviceArray.empty(h_in1.shape, h_in1.dtype) - binary_transform_device(d_in1, d_in2, d_out, len(d_in1), op) + binary_transform_device(d_in1, d_in2, d_out, h_in1.size, op) - got = d_out.get() - expected = binary_transform_host(d_in1.get(), d_in2.get(), op) + got = d_out.copy_to_host() + expected = binary_transform_host(h_in1, h_in2, op) np.testing.assert_allclose(expected, got, rtol=1e-5) @@ -92,20 +95,12 @@ def op(a): h_in = np.empty(num_values, dtype=MyStruct.dtype) h_in["x"] = np.arange(num_values) h_in["y"] = 1 - d_in = cp.empty_like(h_in) + d_in = DeviceArray.from_numpy(h_in) + d_out = DeviceArray.empty(h_in.shape, h_in.dtype) - cp.cuda.runtime.memcpy( - d_in.data.ptr, - h_in.__array_interface__["data"][0], - h_in.nbytes, - cp.cuda.runtime.memcpyHostToDevice, - ) - - d_out = cp.empty_like(d_in) - - cuda.compute.unary_transform(d_in=d_in, d_out=d_out, op=op, num_items=len(d_in)) + cuda.compute.unary_transform(d_in=d_in, d_out=d_out, op=op, num_items=h_in.size) - got = d_out.get() + got = d_out.copy_to_host() np.testing.assert_allclose(got["x"], np.arange(num_values) * 2) np.testing.assert_allclose(got["y"], np.ones(num_values) + 10) @@ -132,29 +127,15 @@ def op(a, b): h_in2["x"] = np.random.randint(0, num_values, num_values, dtype="int16") h_in2["y"] = np.random.randint(0, num_values, num_values, dtype="uint64") - d_in1 = cp.empty_like(h_in1) - d_in2 = cp.empty_like(h_in2) - - cp.cuda.runtime.memcpy( - d_in1.data.ptr, - h_in1.__array_interface__["data"][0], - h_in1.nbytes, - cp.cuda.runtime.memcpyHostToDevice, - ) - cp.cuda.runtime.memcpy( - d_in2.data.ptr, - h_in2.__array_interface__["data"][0], - h_in2.nbytes, - cp.cuda.runtime.memcpyHostToDevice, - ) - - d_out = cp.empty_like(d_in1) + d_in1 = DeviceArray.from_numpy(h_in1) + d_in2 = DeviceArray.from_numpy(h_in2) + d_out = DeviceArray.empty(h_in1.shape, h_in1.dtype) cuda.compute.binary_transform( - d_in1=d_in1, d_in2=d_in2, d_out=d_out, op=op, num_items=len(d_in1) + d_in1=d_in1, d_in2=d_in2, d_out=d_out, op=op, num_items=h_in1.size ) - got = d_out.get() + got = d_out.copy_to_host() np.testing.assert_allclose(got["x"], h_in1["x"] + h_in2["x"]) np.testing.assert_allclose(got["y"], h_in1["y"] + h_in2["y"]) @@ -167,11 +148,11 @@ def op(a): d_in = CountingIterator(np.int32(0)) num_items = 1024 - d_out = cp.empty(num_items, dtype=np.int32) + d_out = DeviceArray.empty(num_items, np.int32) unary_transform_device(d_in, d_out, num_items, op) - got = d_out.get() + got = d_out.copy_to_host() expected = np.arange(1, num_items + 1, dtype=np.int32) np.testing.assert_allclose(expected, got) @@ -185,11 +166,11 @@ def op(a, b): d_in2 = CountingIterator(np.int32(1)) num_items = 1024 - d_out = cp.empty(num_items, dtype=np.int32) + d_out = DeviceArray.empty(num_items, np.int32) binary_transform_device(d_in1, d_in2, d_out, num_items, op) - got = d_out.get() + got = d_out.copy_to_host() expected = np.arange(1, 2 * num_items + 1, step=2, dtype=np.int32) np.testing.assert_allclose(expected, got) @@ -199,18 +180,15 @@ def test_unary_transform_with_stream(cuda_stream): def op(a): return a + 1 - cp_stream = cp.cuda.ExternalStream(cuda_stream.ptr) - n = 10 - - with cp_stream: - d_in = cp.arange(n, dtype=np.int32) - d_out = cp.empty_like(d_in) + h_in = np.arange(n, dtype=np.int32) + d_in = DeviceArray.from_numpy(h_in, stream=cuda_stream) + d_out = DeviceArray.empty(h_in.shape, h_in.dtype, stream=cuda_stream) unary_transform_device(d_in, d_out, n, op, stream=cuda_stream) - got = d_out.get() - expected = unary_transform_host(d_in.get(), op) + got = d_out.copy_to_host(stream=cuda_stream) + expected = unary_transform_host(h_in, op) np.testing.assert_allclose(expected, got, rtol=1e-5) @@ -219,19 +197,17 @@ def test_binary_transform_with_stream(cuda_stream): def op(a, b): return a + b - cp_stream = cp.cuda.ExternalStream(cuda_stream.ptr) - n = 10 - - with cp_stream: - d_in1 = cp.arange(n, dtype=np.int32) - d_in2 = cp.arange(n, dtype=np.int32) - d_out = cp.empty_like(d_in1) + h_in1 = np.arange(n, dtype=np.int32) + h_in2 = np.arange(n, dtype=np.int32) + d_in1 = DeviceArray.from_numpy(h_in1, stream=cuda_stream) + d_in2 = DeviceArray.from_numpy(h_in2, stream=cuda_stream) + d_out = DeviceArray.empty(h_in1.shape, h_in1.dtype, stream=cuda_stream) binary_transform_device(d_in1, d_in2, d_out, n, op, stream=cuda_stream) - got = d_out.get() - expected = binary_transform_host(d_in1.get(), d_in2.get(), op) + got = d_out.copy_to_host(stream=cuda_stream) + expected = binary_transform_host(h_in1, h_in2, op) np.testing.assert_allclose(expected, got, rtol=1e-5) @@ -244,11 +220,11 @@ def op(a, b): d_in2 = CountingIterator(np.int32(1)) num_items = 1024 - d_out = cp.empty(num_items, dtype=np.int32) + d_out = DeviceArray.empty(num_items, np.int32) binary_transform_device(d_in1, d_in2, d_out, num_items, op) - got = d_out.get() + got = d_out.copy_to_host() expected = np.arange(1, 2 * num_items + 1, step=2, dtype=np.int32) np.testing.assert_allclose(expected, got) @@ -260,7 +236,7 @@ def op2(a): return a + 1 unary_transform_device(d_in2, d_out, num_items, op2) - got = d_out.get() + got = d_out.copy_to_host() expected = np.arange(1, num_items + 1, dtype=np.int32) + 1 np.testing.assert_allclose(expected, got) @@ -269,41 +245,45 @@ def op2(a): def test_unary_transform_well_known_negate(): """Test unary transform with well-known NEGATE operation.""" dtype = np.int32 - d_input = cp.array([1, -2, 3, -4, 5], dtype=dtype) - d_output = cp.empty_like(d_input, dtype=dtype) + h_input = np.array([1, -2, 3, -4, 5], dtype=dtype) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(h_input.shape, dtype) # Run unary transform with well-known NEGATE operation cuda.compute.unary_transform( - d_in=d_input, d_out=d_output, op=OpKind.NEGATE, num_items=len(d_input) + d_in=d_input, d_out=d_output, op=OpKind.NEGATE, num_items=h_input.size ) # Check the result is correct expected = np.array([-1, 2, -3, 4, -5]) - np.testing.assert_equal(d_output.get(), expected) + np.testing.assert_equal(d_output.copy_to_host(), expected) def test_unary_transform_well_known_identity(): """Test unary transform with well-known IDENTITY operation.""" dtype = np.int32 - d_input = cp.array([1, 2, 3, 4, 5], dtype=dtype) - d_output = cp.empty_like(d_input, dtype=dtype) + h_input = np.array([1, 2, 3, 4, 5], dtype=dtype) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(h_input.shape, dtype) # Run unary transform with well-known IDENTITY operation cuda.compute.unary_transform( - d_in=d_input, d_out=d_output, op=OpKind.IDENTITY, num_items=len(d_input) + d_in=d_input, d_out=d_output, op=OpKind.IDENTITY, num_items=h_input.size ) # Check the result is correct expected = np.array([1, 2, 3, 4, 5]) - np.testing.assert_equal(d_output.get(), expected) + np.testing.assert_equal(d_output.copy_to_host(), expected) @pytest.mark.parametrize("dtype", [np.int32, np.float16]) def test_binary_transform_well_known_plus(dtype): """Test binary transform with well-known PLUS operation.""" - d_input1 = cp.array([1, 2, 3, 4, 5], dtype=dtype) - d_input2 = cp.array([10, 20, 30, 40, 50], dtype=dtype) - d_output = cp.empty_like(d_input1, dtype=dtype) + h_input1 = np.array([1, 2, 3, 4, 5], dtype=dtype) + h_input2 = np.array([10, 20, 30, 40, 50], dtype=dtype) + d_input1 = DeviceArray.from_numpy(h_input1) + d_input2 = DeviceArray.from_numpy(h_input2) + d_output = DeviceArray.empty(h_input1.shape, dtype) # Run binary transform with well-known PLUS operation cuda.compute.binary_transform( @@ -311,20 +291,22 @@ def test_binary_transform_well_known_plus(dtype): d_in2=d_input2, d_out=d_output, op=OpKind.PLUS, - num_items=len(d_input1), + num_items=h_input1.size, ) # Check the result is correct expected = np.array([11, 22, 33, 44, 55]) - np.testing.assert_equal(d_output.get(), expected) + np.testing.assert_equal(d_output.copy_to_host(), expected) def test_binary_transform_well_known_multiplies(): """Test binary transform with well-known MULTIPLIES operation.""" dtype = np.int32 - d_input1 = cp.array([1, 2, 3, 4, 5], dtype=dtype) - d_input2 = cp.array([2, 3, 4, 5, 6], dtype=dtype) - d_output = cp.empty_like(d_input1, dtype=dtype) + h_input1 = np.array([1, 2, 3, 4, 5], dtype=dtype) + h_input2 = np.array([2, 3, 4, 5, 6], dtype=dtype) + d_input1 = DeviceArray.from_numpy(h_input1) + d_input2 = DeviceArray.from_numpy(h_input2) + d_output = DeviceArray.empty(h_input1.shape, dtype) # Run binary transform with well-known MULTIPLIES operation cuda.compute.binary_transform( @@ -332,12 +314,12 @@ def test_binary_transform_well_known_multiplies(): d_in2=d_input2, d_out=d_output, op=OpKind.MULTIPLIES, - num_items=len(d_input1), + num_items=h_input1.size, ) # Check the result is correct expected = np.array([2, 6, 12, 20, 30]) - np.testing.assert_equal(d_output.get(), expected) + np.testing.assert_equal(d_output.copy_to_host(), expected) def test_unary_transform_struct_type_with_annotations(): @@ -355,16 +337,14 @@ def scale_op(p: Point) -> Point: h_in["x"] = np.random.rand(num_items).astype(np.float32) h_in["y"] = np.random.rand(num_items).astype(np.float32) - d_in = cp.empty_like(h_in) - d_in.set(h_in) - - d_out = cp.empty_like(d_in) + d_in = DeviceArray.from_numpy(h_in) + d_out = DeviceArray.empty(h_in.shape, h_in.dtype) cuda.compute.unary_transform( d_in=d_in, d_out=d_out, op=scale_op, num_items=num_items ) - result = d_out.get() + result = d_out.copy_to_host() np.testing.assert_allclose(result["x"], h_in["x"] * 2.0, rtol=1e-5) np.testing.assert_allclose(result["y"], h_in["y"] * 3.0, rtol=1e-5) @@ -389,19 +369,15 @@ def add_vectors(v1: Vec2D, v2: Vec2D) -> Vec2D: h_in2["x"] = np.random.randint(-100, 100, num_items, dtype=np.int32) h_in2["y"] = np.random.randint(-100, 100, num_items, dtype=np.int32) - d_in1 = cp.empty_like(h_in1) - d_in1.set(h_in1) - - d_in2 = cp.empty_like(h_in2) - d_in2.set(h_in2) - - d_out = cp.empty_like(d_in1) + d_in1 = DeviceArray.from_numpy(h_in1) + d_in2 = DeviceArray.from_numpy(h_in2) + d_out = DeviceArray.empty(h_in1.shape, h_in1.dtype) cuda.compute.binary_transform( d_in1=d_in1, d_in2=d_in2, d_out=d_out, op=add_vectors, num_items=num_items ) - result = d_out.get() + result = d_out.copy_to_host() np.testing.assert_equal(result["x"], h_in1["x"] + h_in2["x"]) np.testing.assert_equal(result["y"], h_in1["y"] + h_in2["y"]) @@ -411,10 +387,11 @@ def test_unary_transform_stateful_counting(): """Test unary_transform with state that counts even numbers.""" from numba import cuda as numba_cuda - d_in = cp.arange(100, dtype=np.int32) - d_out = cp.empty_like(d_in) + h_in = np.arange(100, dtype=np.int32) + d_in = DeviceArray.from_numpy(h_in) + d_out = DeviceArray.empty(h_in.shape, h_in.dtype) - even_count = cp.zeros(1, dtype=np.int32) + even_count = DeviceArray.from_numpy(np.zeros(1, dtype=np.int32)) # Define op that references state as closure def count_evens(x): @@ -423,25 +400,26 @@ def count_evens(x): return x * 2 cuda.compute.unary_transform( - d_in=d_in, d_out=d_out, op=count_evens, num_items=len(d_in) + d_in=d_in, d_out=d_out, op=count_evens, num_items=h_in.size ) - expected_output = cp.arange(100, dtype=np.int32) * 2 - np.testing.assert_array_equal(d_out.get(), expected_output.get()) + expected_output = h_in * 2 + np.testing.assert_array_equal(d_out.copy_to_host(), expected_output) - num_evens = int(even_count.get()[0]) + num_evens = int(even_count.copy_to_host()[0]) assert num_evens == 50 # 0, 2, 4, ..., 98 def test_unary_transform_stateful_state_updates(): """Test that stateful transform correctly updates state between calls.""" num_items = 20 - d_in = cp.arange(num_items, dtype=np.int32) - d_out = cp.empty_like(d_in) + h_in = np.arange(num_items, dtype=np.int32) + d_in = DeviceArray.from_numpy(h_in) + d_out = DeviceArray.empty(h_in.shape, h_in.dtype) # Create two different thresholds - threshold_10 = cp.array([10], dtype=np.int32) - threshold_15 = cp.array([15], dtype=np.int32) + threshold_10 = DeviceArray.from_numpy(np.array([10], dtype=np.int32)) + threshold_15 = DeviceArray.from_numpy(np.array([15], dtype=np.int32)) # Call 1: x + 10 def add_threshold_10(x): @@ -450,41 +428,42 @@ def add_threshold_10(x): cuda.compute.unary_transform( d_in=d_in, d_out=d_out, op=add_threshold_10, num_items=num_items ) - result_1 = d_out.get() - expected_1 = d_in.get() + 10 + result_1 = d_out.copy_to_host() + expected_1 = h_in + 10 np.testing.assert_array_equal(result_1, expected_1) # Call 2: x + 15 (different state) def add_threshold_15(x): return x + threshold_15[0] - d_out.fill(0) + d_out.copy_from_host(np.zeros_like(h_in)) cuda.compute.unary_transform( d_in=d_in, d_out=d_out, op=add_threshold_15, num_items=num_items ) - result_2 = d_out.get() - expected_2 = d_in.get() + 15 + result_2 = d_out.copy_to_host() + expected_2 = h_in + 15 np.testing.assert_array_equal(result_2, expected_2) # Call 3: Back to first threshold (test cache reuse with updated state) - d_out.fill(0) + d_out.copy_from_host(np.zeros_like(h_in)) cuda.compute.unary_transform( d_in=d_in, d_out=d_out, op=add_threshold_10, num_items=num_items ) - result_3 = d_out.get() - expected_3 = d_in.get() + 10 + result_3 = d_out.copy_to_host() + expected_3 = h_in + 10 np.testing.assert_array_equal(result_3, expected_3) def test_unary_transform_stateful_multiple_arrays(): """Test stateful transform with multiple captured arrays.""" num_items = 10 - d_in = cp.arange(num_items, dtype=np.int32) - d_out = cp.empty_like(d_in) + h_in = np.arange(num_items, dtype=np.int32) + d_in = DeviceArray.from_numpy(h_in) + d_out = DeviceArray.empty(h_in.shape, h_in.dtype) # Multiple state arrays - offset = cp.array([5], dtype=np.int32) - multiplier = cp.array([2], dtype=np.int32) + offset = DeviceArray.from_numpy(np.array([5], dtype=np.int32)) + multiplier = DeviceArray.from_numpy(np.array([2], dtype=np.int32)) def transform_with_multiple_state(x): return (x + offset[0]) * multiplier[0] @@ -492,23 +471,23 @@ def transform_with_multiple_state(x): cuda.compute.unary_transform( d_in=d_in, d_out=d_out, op=transform_with_multiple_state, num_items=num_items ) - result = d_out.get() - expected = (d_in.get() + 5) * 2 + result = d_out.copy_to_host() + expected = (h_in + 5) * 2 np.testing.assert_array_equal(result, expected) # Update state and verify it works with new values - offset = cp.array([10], dtype=np.int32) - multiplier = cp.array([3], dtype=np.int32) + offset = DeviceArray.from_numpy(np.array([10], dtype=np.int32)) + multiplier = DeviceArray.from_numpy(np.array([3], dtype=np.int32)) def transform_with_updated_state(x): return (x + offset[0]) * multiplier[0] - d_out.fill(0) + d_out.copy_from_host(np.zeros_like(h_in)) cuda.compute.unary_transform( d_in=d_in, d_out=d_out, op=transform_with_updated_state, num_items=num_items ) - result = d_out.get() - expected = (d_in.get() + 10) * 3 + result = d_out.copy_to_host() + expected = (h_in + 10) * 3 np.testing.assert_array_equal(result, expected) @@ -527,14 +506,18 @@ def func(x): return func - d_in = cp.array([0, 1, 2], dtype=np.int32) - d_out = cp.empty_like(d_in) + h_in = np.array([0, 1, 2], dtype=np.int32) + d_in = DeviceArray.from_numpy(h_in) + d_out = DeviceArray.empty(h_in.shape, h_in.dtype) # First call with offset 10 cuda.compute.unary_transform( - d_in=d_in, d_out=d_out, op=make_adder(cp.array([10])), num_items=len(d_in) + d_in=d_in, + d_out=d_out, + op=make_adder(DeviceArray.from_numpy(np.array([10], dtype=np.int64))), + num_items=h_in.size, ) - np.testing.assert_array_equal(d_out.get(), np.array([10, 11, 12])) + np.testing.assert_array_equal(d_out.copy_to_host(), np.array([10, 11, 12])) # Multiple calls with different offsets to test state re-detection for i in range(5): @@ -542,12 +525,12 @@ def func(x): cuda.compute.unary_transform( d_in=d_in, d_out=d_out, - op=make_adder(cp.array([offset])), - num_items=len(d_in), + op=make_adder(DeviceArray.from_numpy(np.array([offset], dtype=np.int64))), + num_items=h_in.size, ) expected = np.array([offset, offset + 1, offset + 2]) np.testing.assert_array_equal( - d_out.get(), + d_out.copy_to_host(), expected, err_msg=f"Failed at iteration {i} with offset {offset}", ) @@ -555,23 +538,26 @@ def func(x): def test_unary_transform_with_lambda(): """Test unary_transform with a lambda function.""" - d_in = cp.array([1, 2, 3, 4, 5], dtype=np.int32) - d_out = cp.empty_like(d_in) + h_in = np.array([1, 2, 3, 4, 5], dtype=np.int32) + d_in = DeviceArray.from_numpy(h_in) + d_out = DeviceArray.empty(h_in.shape, h_in.dtype) # Use a lambda function directly cuda.compute.unary_transform( - d_in=d_in, d_out=d_out, op=lambda x: x * 2, num_items=len(d_in) + d_in=d_in, d_out=d_out, op=lambda x: x * 2, num_items=h_in.size ) expected = np.array([2, 4, 6, 8, 10], dtype=np.int32) - np.testing.assert_array_equal(d_out.get(), expected) + np.testing.assert_array_equal(d_out.copy_to_host(), expected) def test_binary_transform_with_lambda(): """Test binary_transform with a lambda function.""" - d_in1 = cp.array([1, 2, 3, 4, 5], dtype=np.int32) - d_in2 = cp.array([10, 20, 30, 40, 50], dtype=np.int32) - d_out = cp.empty_like(d_in1) + h_in1 = np.array([1, 2, 3, 4, 5], dtype=np.int32) + h_in2 = np.array([10, 20, 30, 40, 50], dtype=np.int32) + d_in1 = DeviceArray.from_numpy(h_in1) + d_in2 = DeviceArray.from_numpy(h_in2) + d_out = DeviceArray.empty(h_in1.shape, h_in1.dtype) # Use a lambda function directly cuda.compute.binary_transform( @@ -579,28 +565,30 @@ def test_binary_transform_with_lambda(): d_in2=d_in2, d_out=d_out, op=lambda a, b: a + b, - num_items=len(d_in1), + num_items=h_in1.size, ) expected = np.array([11, 22, 33, 44, 55], dtype=np.int32) - np.testing.assert_array_equal(d_out.get(), expected) + np.testing.assert_array_equal(d_out.copy_to_host(), expected) def test_binary_transform_bool_equal_to(): - d_input1 = cp.array([True, False, True, False], dtype=np.bool_) - d_input2 = cp.array([True, True, False, False], dtype=np.bool_) - d_output = cp.empty_like(d_input1) + h_input1 = np.array([True, False, True, False], dtype=np.bool_) + h_input2 = np.array([True, True, False, False], dtype=np.bool_) + d_input1 = DeviceArray.from_numpy(h_input1) + d_input2 = DeviceArray.from_numpy(h_input2) + d_output = DeviceArray.empty(h_input1.shape, h_input1.dtype) cuda.compute.binary_transform( d_in1=d_input1, d_in2=d_input2, d_out=d_output, op=OpKind.EQUAL_TO, - num_items=len(d_input1), + num_items=h_input1.size, ) expected = np.array([True, False, False, True], dtype=np.bool_) - np.testing.assert_array_equal(d_output.get(), expected) + np.testing.assert_array_equal(d_output.copy_to_host(), expected) def test_stateful_transform_same_bytecode_different_sizes(): @@ -615,16 +603,19 @@ def op(x): return op - d_in = cp.asarray([1, 2, 3]) - d_out = cp.empty_like(d_in, dtype=bool) - op1 = make_op(cp.empty(1)) # len(arr) == 1 - op2 = make_op(cp.empty(2)) # len(arr) == 2 + h_in = np.asarray([1, 2, 3]) + d_in = DeviceArray.from_numpy(h_in) + d_out = DeviceArray.empty(h_in.shape, bool) + op1 = make_op(DeviceArray.empty(1, np.float64)) # len(arr) == 1 + op2 = make_op(DeviceArray.empty(2, np.float64)) # len(arr) == 2 - cuda.compute.unary_transform(d_in=d_in, d_out=d_out, op=op1, num_items=len(d_in)) - np.testing.assert_array_equal(np.asarray([False, True, True]), d_out.get()) + cuda.compute.unary_transform(d_in=d_in, d_out=d_out, op=op1, num_items=h_in.size) + np.testing.assert_array_equal(np.asarray([False, True, True]), d_out.copy_to_host()) - cuda.compute.unary_transform(d_in=d_in, d_out=d_out, op=op2, num_items=len(d_in)) - np.testing.assert_array_equal(np.asarray([False, False, True]), d_out.get()) + cuda.compute.unary_transform(d_in=d_in, d_out=d_out, op=op2, num_items=h_in.size) + np.testing.assert_array_equal( + np.asarray([False, False, True]), d_out.copy_to_host() + ) def test_transform_caching_with_global_np_ufunc(): @@ -632,8 +623,9 @@ def test_transform_caching_with_global_np_ufunc(): # ops referenced dotted globals like `np.` those # ops would all hash to the same value. - d_in = cp.asarray([1.0, 2.0, 3.0]) - d_out = cp.empty_like(d_in) + h_in = np.asarray([1.0, 2.0, 3.0]) + d_in = DeviceArray.from_numpy(h_in) + d_out = DeviceArray.empty(h_in.shape, h_in.dtype) def make_op(): sin = np.sin @@ -643,12 +635,10 @@ def op(x): return op - d_out = cp.empty_like(d_in) - cuda.compute.unary_transform( - d_in=d_in, d_out=d_out, op=make_op(), num_items=len(d_in) + d_in=d_in, d_out=d_out, op=make_op(), num_items=h_in.size ) - cp.testing.assert_allclose(d_out, cp.sin(d_in)) + np.testing.assert_allclose(d_out.copy_to_host(), np.sin(h_in)) def make_op(): cos = np.cos @@ -659,8 +649,6 @@ def op(x): return op cuda.compute.unary_transform( - d_in=d_in, d_out=d_out, op=make_op(), num_items=len(d_in) + d_in=d_in, d_out=d_out, op=make_op(), num_items=h_in.size ) - cp.testing.assert_allclose(d_out, cp.cos(d_in)) - - d_in = cp.asarray([1.0, 2.0, 3.0]) + np.testing.assert_allclose(d_out.copy_to_host(), np.cos(h_in)) diff --git a/python/cuda_cccl/tests/compute/test_unique_by_key.py b/python/cuda_cccl/tests/compute/test_unique_by_key.py index 83a4a17db67..668b6fb0a0f 100644 --- a/python/cuda_cccl/tests/compute/test_unique_by_key.py +++ b/python/cuda_cccl/tests/compute/test_unique_by_key.py @@ -3,10 +3,9 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import cupy as cp -import numba.cuda import numpy as np import pytest +from _utils.device_array import DeviceArray, get_compute_capability import cuda.compute from cuda.compute import ( @@ -125,7 +124,7 @@ def compare_op(lhs, rhs): @pytest.mark.parametrize("dtype, num_items, op", unique_by_key_params) def test_unique_by_key(dtype, num_items, op, monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = get_compute_capability() # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -143,11 +142,13 @@ def test_unique_by_key(dtype, num_items, op, monkeypatch): h_out_items = np.empty(num_items, dtype=np.float32) h_out_num_selected = np.empty(1, np.int32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_items = numba.cuda.to_device(h_in_items) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_items = numba.cuda.to_device(h_out_items) - d_out_num_selected = numba.cuda.to_device(h_out_num_selected) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_in_items = DeviceArray.from_numpy(h_in_items) + d_out_keys = DeviceArray.empty(h_out_keys.shape, h_out_keys.dtype) + d_out_items = DeviceArray.empty(h_out_items.shape, h_out_items.dtype) + d_out_num_selected = DeviceArray.empty( + h_out_num_selected.shape, h_out_num_selected.dtype + ) unique_by_key_device( d_in_keys, @@ -172,7 +173,7 @@ def test_unique_by_key(dtype, num_items, op, monkeypatch): @pytest.mark.parametrize("dtype, num_items, op", unique_by_key_params) def test_unique_by_key_iterators(dtype, num_items, op, monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = get_compute_capability() # Skip sass verification for CC 9.0+, due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -190,11 +191,13 @@ def test_unique_by_key_iterators(dtype, num_items, op, monkeypatch): h_out_items = np.empty(num_items, dtype=np.float32) h_out_num_selected = np.empty(1, np.int64) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_items = numba.cuda.to_device(h_in_items) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_items = numba.cuda.to_device(h_out_items) - d_out_num_selected = numba.cuda.to_device(h_out_num_selected) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_in_items = DeviceArray.from_numpy(h_in_items) + d_out_keys = DeviceArray.empty(h_out_keys.shape, h_out_keys.dtype) + d_out_items = DeviceArray.empty(h_out_items.shape, h_out_items.dtype) + d_out_num_selected = DeviceArray.empty( + h_out_num_selected.shape, h_out_num_selected.dtype + ) i_in_keys = CacheModifiedInputIterator(d_in_keys, modifier="stream") i_in_items = CacheModifiedInputIterator(d_in_items, modifier="stream") @@ -226,9 +229,11 @@ def test_unique_by_key_keys_only(): h_out_keys = np.empty(num_items, dtype=np.int32) h_out_num_selected = np.empty(1, np.int32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_num_selected = numba.cuda.to_device(h_out_num_selected) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_out_keys = DeviceArray.empty(h_out_keys.shape, h_out_keys.dtype) + d_out_num_selected = DeviceArray.empty( + h_out_num_selected.shape, h_out_num_selected.dtype + ) unique_by_key_device( d_in_keys, @@ -267,11 +272,13 @@ def compare_complex(lhs, rhs): h_out_items = np.empty(num_items, dtype=np.float32) h_out_num_selected = np.empty(1, np.int32) - d_in_keys = numba.cuda.to_device(h_in_keys) - d_in_items = numba.cuda.to_device(h_in_items) - d_out_keys = numba.cuda.to_device(h_out_keys) - d_out_items = numba.cuda.to_device(h_out_items) - d_out_num_selected = numba.cuda.to_device(h_out_num_selected) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_in_items = DeviceArray.from_numpy(h_in_items) + d_out_keys = DeviceArray.empty(h_out_keys.shape, h_out_keys.dtype) + d_out_items = DeviceArray.empty(h_out_items.shape, h_out_items.dtype) + d_out_num_selected = DeviceArray.empty( + h_out_num_selected.shape, h_out_num_selected.dtype + ) unique_by_key_device( d_in_keys, @@ -328,14 +335,13 @@ def struct_compare_op(lhs, rhs): h_in_items["a"] = a_items h_in_items["b"] = b_items - d_in_keys = cp.empty_like(h_in_keys) - d_in_items = cp.empty_like(h_in_items) - d_in_keys.set(h_in_keys) - d_in_items.set(h_in_items) - - d_out_keys = cp.empty_like(d_in_keys) - d_out_items = cp.empty_like(d_in_items) - d_out_num_selected = cp.empty_like(h_out_num_selected) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_in_items = DeviceArray.from_numpy(h_in_items) + d_out_keys = DeviceArray.empty(h_in_keys.shape, h_in_keys.dtype) + d_out_items = DeviceArray.empty(h_in_items.shape, h_in_items.dtype) + d_out_num_selected = DeviceArray.empty( + h_out_num_selected.shape, h_out_num_selected.dtype + ) unique_by_key_device( d_in_keys, @@ -347,10 +353,10 @@ def struct_compare_op(lhs, rhs): num_items, ) - h_out_num_selected = d_out_num_selected.get() + h_out_num_selected = d_out_num_selected.copy_to_host() num_selected = h_out_num_selected[0] - h_out_keys = d_out_keys.get()[:num_selected] - h_out_items = d_out_items.get()[:num_selected] + h_out_keys = d_out_keys.copy_to_host()[:num_selected] + h_out_items = d_out_items.copy_to_host()[:num_selected] expected_keys, expected_items = unique_by_key_host( h_in_keys, @@ -363,7 +369,7 @@ def struct_compare_op(lhs, rhs): def test_unique_by_key_with_stream(cuda_stream, monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = get_compute_capability() # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -375,7 +381,6 @@ def test_unique_by_key_with_stream(cuda_stream, monkeypatch): False, ) - cp_stream = cp.cuda.ExternalStream(cuda_stream.ptr) num_items = 10000 h_in_keys = random_array(num_items, np.int32, max_value=20) @@ -384,13 +389,19 @@ def test_unique_by_key_with_stream(cuda_stream, monkeypatch): h_out_items = np.empty(num_items, dtype=np.float32) h_out_num_selected = np.empty(1, np.int32) - with cp_stream: - h_in_keys = random_array(num_items, np.int32) - d_in_keys = cp.asarray(h_in_keys) - d_in_items = cp.asarray(h_in_items) - d_out_keys = cp.empty_like(h_out_keys) - d_out_items = cp.empty_like(h_out_items) - d_out_num_selected = cp.empty_like(h_out_num_selected) + d_in_keys = DeviceArray.from_numpy(h_in_keys, stream=cuda_stream) + d_in_items = DeviceArray.from_numpy(h_in_items, stream=cuda_stream) + d_out_keys = DeviceArray.empty( + h_out_keys.shape, h_out_keys.dtype, stream=cuda_stream + ) + d_out_items = DeviceArray.empty( + h_out_items.shape, h_out_items.dtype, stream=cuda_stream + ) + d_out_num_selected = DeviceArray.empty( + h_out_num_selected.shape, + h_out_num_selected.dtype, + stream=cuda_stream, + ) unique_by_key_device( d_in_keys, @@ -403,9 +414,9 @@ def test_unique_by_key_with_stream(cuda_stream, monkeypatch): stream=cuda_stream, ) - h_out_keys = d_out_keys.get() - h_out_items = d_out_items.get() - h_out_num_selected = d_out_num_selected.get() + h_out_keys = d_out_keys.copy_to_host(stream=cuda_stream) + h_out_items = d_out_items.copy_to_host(stream=cuda_stream) + h_out_num_selected = d_out_num_selected.copy_to_host(stream=cuda_stream) num_selected = h_out_num_selected[0] h_out_keys = h_out_keys[:num_selected] @@ -418,7 +429,7 @@ def test_unique_by_key_with_stream(cuda_stream, monkeypatch): def test_unique_by_key_well_known_equal_to(monkeypatch): - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = get_compute_capability() # Skip sass verification for CC 9.0+ due to a bug in NVRTC. # TODO: add NVRTC version check, ref nvbug 5243118 if cc_major >= 9: @@ -433,11 +444,13 @@ def test_unique_by_key_well_known_equal_to(monkeypatch): dtype = np.int32 # Create input keys and values: keys=[1,1,1,2,2,3] values=[10,20,30,40,50,60] - d_in_keys = cp.array([1, 1, 1, 2, 2, 3], dtype=dtype) - d_in_values = cp.array([10, 20, 30, 40, 50, 60], dtype=dtype) - d_out_keys = cp.empty_like(d_in_keys) - d_out_values = cp.empty_like(d_in_values) - d_num_selected = cp.empty(1, dtype=dtype) + h_in_keys = np.array([1, 1, 1, 2, 2, 3], dtype=dtype) + h_in_values = np.array([10, 20, 30, 40, 50, 60], dtype=dtype) + d_in_keys = DeviceArray.from_numpy(h_in_keys) + d_in_values = DeviceArray.from_numpy(h_in_values) + d_out_keys = DeviceArray.empty(h_in_keys.shape, h_in_keys.dtype) + d_out_values = DeviceArray.empty(h_in_values.shape, h_in_values.dtype) + d_num_selected = DeviceArray.empty(1, dtype) # Run unique by key with well-known EQUAL_TO operation cuda.compute.unique_by_key( @@ -447,13 +460,13 @@ def test_unique_by_key_well_known_equal_to(monkeypatch): d_out_items=d_out_values, d_out_num_selected=d_num_selected, op=OpKind.EQUAL_TO, - num_items=len(d_in_keys), + num_items=h_in_keys.size, ) # Check the result is correct - assert d_num_selected.get()[0] == 3 # three unique keys + assert d_num_selected.copy_to_host()[0] == 3 # three unique keys expected_keys = [1, 2, 3] expected_values = [10, 40, 60] # first occurrence of each key - np.testing.assert_equal(d_out_keys.get()[:3], expected_keys) - np.testing.assert_equal(d_out_values.get()[:3], expected_values) + np.testing.assert_equal(d_out_keys.copy_to_host()[:3], expected_keys) + np.testing.assert_equal(d_out_values.copy_to_host()[:3], expected_values) diff --git a/python/cuda_cccl/tests/compute/test_zip_iterator.py b/python/cuda_cccl/tests/compute/test_zip_iterator.py index 85630bb4f5f..973e5a3c9c9 100644 --- a/python/cuda_cccl/tests/compute/test_zip_iterator.py +++ b/python/cuda_cccl/tests/compute/test_zip_iterator.py @@ -1,9 +1,9 @@ # Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import cupy as cp import numpy as np import pytest +from _utils.device_array import DeviceArray, get_compute_capability import cuda.compute from cuda.compute import ( @@ -24,24 +24,26 @@ class Pair: def sum_pairs(p1, p2): return Pair(p1[0] + p2[0], p1[1] + p2[1]) - d_input1 = cp.arange(num_items, dtype=np.int64) - d_input2 = cp.arange(num_items, dtype=np.float32) + h_input1 = np.arange(num_items, dtype=np.int64) + h_input2 = np.arange(num_items, dtype=np.float32) + d_input1 = DeviceArray.from_numpy(h_input1) + d_input2 = DeviceArray.from_numpy(h_input2) zip_it = ZipIterator(d_input1, d_input2) - d_output = cp.empty(1, dtype=Pair.dtype) + d_output = DeviceArray.empty(1, Pair.dtype) h_init = Pair(0, 0.0) cuda.compute.reduce_into( d_in=zip_it, d_out=d_output, num_items=num_items, op=sum_pairs, h_init=h_init ) - expected_first = d_input1.sum().get() - expected_second = d_input2.sum().get() + expected_first = h_input1.sum() + expected_second = h_input2.sum() - result = d_output.get()[0] - cp.testing.assert_array_equal(result["first"], expected_first) - cp.testing.assert_allclose(result["second"], expected_second, rtol=1e-6) + result = d_output.copy_to_host()[0] + np.testing.assert_array_equal(result["first"], expected_first) + np.testing.assert_allclose(result["second"], expected_second, rtol=1e-6) @pytest.mark.parametrize("num_items", [10, 1_000, 100_000]) @@ -53,23 +55,24 @@ def max_by_value(p1, p2): return p1 if p1[1] > p2[1] else p2 counting_it = CountingIterator(np.int32(0)) - arr = cp.arange(num_items, dtype=np.int32) + h_arr = np.arange(num_items, dtype=np.int32) + d_arr = DeviceArray.from_numpy(h_arr) - zip_it = ZipIterator(counting_it, arr) + zip_it = ZipIterator(counting_it, d_arr) dtype = np.dtype([("index", np.int32), ("value", np.int32)], align=True) h_init = np.asarray([(-1, -1)], dtype=dtype) - d_output = cp.empty(1, dtype=dtype) + d_output = DeviceArray.empty(1, dtype) cuda.compute.reduce_into( d_in=zip_it, d_out=d_output, num_items=num_items, op=max_by_value, h_init=h_init ) - result = d_output.get()[0] + result = d_output.copy_to_host()[0] - expected_index = cp.argmax(arr).get() - expected_value = arr[expected_index].get() + expected_index = np.argmax(h_arr) + expected_value = h_arr[expected_index] assert result["index"] == expected_index assert result["value"] == expected_value @@ -86,28 +89,27 @@ def max_by_value(p1, p2): return p1 if p1[1] > p2[1] else p2 counting_it = CountingIterator(np.int32(0)) - arr = cp.arange(num_items, dtype=np.int32) + h_arr = np.arange(num_items, dtype=np.int32) + d_arr = DeviceArray.from_numpy(h_arr) def double_op(x): return x * 2 - transform_it = TransformIterator(arr, double_op) + transform_it = TransformIterator(d_arr, double_op) zip_it = ZipIterator(counting_it, transform_it) - d_output = cp.empty(1, dtype=IndexValuePair.dtype) - - result = d_output.get()[0] + d_output = DeviceArray.empty(1, IndexValuePair.dtype) h_init = IndexValuePair(-1, -1) cuda.compute.reduce_into( d_in=zip_it, d_out=d_output, num_items=num_items, op=max_by_value, h_init=h_init ) - result = d_output.get()[0] + result = d_output.copy_to_host()[0] - expected_index = cp.argmax(arr).get() - expected_value = arr[expected_index].get() * 2 + expected_index = np.argmax(h_arr) + expected_value = h_arr[expected_index] * 2 assert result["index"] == expected_index assert result["value"] == expected_value @@ -126,28 +128,30 @@ class Triple: def sum_triples(t1, t2): return Triple(t1[0] + t2[0], t1[1] + t2[1], t1[2] + t2[2]) - d_input1 = cp.arange(num_items, dtype=np.int64) - d_input2 = cp.arange(num_items, dtype=np.float32) + h_input1 = np.arange(num_items, dtype=np.int64) + h_input2 = np.arange(num_items, dtype=np.float32) + d_input1 = DeviceArray.from_numpy(h_input1) + d_input2 = DeviceArray.from_numpy(h_input2) counting_it = CountingIterator(np.int64(10)) zip_it = ZipIterator(d_input1, d_input2, counting_it) - d_output = cp.empty(1, dtype=Triple.dtype) + d_output = DeviceArray.empty(1, Triple.dtype) h_init = Triple(0, 0.0, 0) cuda.compute.reduce_into( d_in=zip_it, d_out=d_output, num_items=num_items, op=sum_triples, h_init=h_init ) - result = d_output.get()[0] + result = d_output.copy_to_host()[0] - expected_first = d_input1.sum().get() - expected_second = d_input2.sum().get() - expected_third = cp.arange(10, 10 + num_items).sum().get() + expected_first = h_input1.sum() + expected_second = h_input2.sum() + expected_third = np.arange(10, 10 + num_items).sum() - cp.testing.assert_array_equal(result["first"], expected_first) - cp.testing.assert_allclose(result["second"], expected_second, rtol=1e-6) - cp.testing.assert_array_equal(result["third"], expected_third) + np.testing.assert_array_equal(result["first"], expected_first) + np.testing.assert_allclose(result["second"], expected_second, rtol=1e-6) + np.testing.assert_array_equal(result["third"], expected_third) @pytest.mark.parametrize("num_items", [10, 1_000, 100_000]) @@ -161,20 +165,21 @@ class Single: def sum_singles(s1, s2): return Single(s1[0] + s2[0]) - d_input = cp.arange(num_items, dtype=np.int64) + h_input = np.arange(num_items, dtype=np.int64) + d_input = DeviceArray.from_numpy(h_input) zip_it = ZipIterator(d_input) - d_output = cp.empty(1, dtype=Single.dtype) + d_output = DeviceArray.empty(1, Single.dtype) h_init = Single(0) cuda.compute.reduce_into( d_in=zip_it, d_out=d_output, num_items=num_items, op=sum_singles, h_init=h_init ) - result = d_output.get()[0] + result = d_output.copy_to_host()[0] - expected_value = d_input.sum().get() + expected_value = h_input.sum() assert result["value"] == expected_value @@ -189,14 +194,16 @@ def binary_transform(pair1, pair2): return TransformedPair(pair1[0] + pair2[0], pair1[1] * pair2[1]) counting_it1 = CountingIterator(np.int32(0)) - arr1 = cp.arange(num_items, dtype=np.int32) - zip_it1 = ZipIterator(counting_it1, arr1) + h_arr1 = np.arange(num_items, dtype=np.int32) + d_arr1 = DeviceArray.from_numpy(h_arr1) + zip_it1 = ZipIterator(counting_it1, d_arr1) counting_it2 = CountingIterator(np.int32(0)) - arr2 = cp.arange(num_items, dtype=np.int32) - zip_it2 = ZipIterator(counting_it2, arr2) + h_arr2 = np.arange(num_items, dtype=np.int32) + d_arr2 = DeviceArray.from_numpy(h_arr2) + zip_it2 = ZipIterator(counting_it2, d_arr2) - d_output = cp.empty(num_items, dtype=TransformedPair.dtype) + d_output = DeviceArray.empty(num_items, TransformedPair.dtype) cuda.compute.binary_transform( d_in1=zip_it1, @@ -206,10 +213,10 @@ def binary_transform(pair1, pair2): num_items=num_items, ) - result = d_output.get() + result = d_output.copy_to_host() - expected_sum_indices = (arr1 + arr2).get() - expected_product_values = (arr1 * arr2).get() + expected_sum_indices = h_arr1 + h_arr2 + expected_product_values = h_arr1 * h_arr2 for i, result_item in enumerate(result): assert result_item["sum_indices"] == expected_sum_indices[i] @@ -231,13 +238,15 @@ def min_pairs(p1, p2): return Pair(min(p1[0], p2[0]), min(p1[1], p2[1])) # Create two randomized arrays to make min operations interesting - arr1 = cp.random.randint(0, 1000, num_items, dtype=np.int64) - arr2 = cp.random.randint(0, 1000, num_items, dtype=np.int64) + h_arr1 = np.random.randint(0, 1000, num_items, dtype=np.int64) + h_arr2 = np.random.randint(0, 1000, num_items, dtype=np.int64) + d_arr1 = DeviceArray.from_numpy(h_arr1) + d_arr2 = DeviceArray.from_numpy(h_arr2) - zip_it = ZipIterator(arr1, arr2) + zip_it = ZipIterator(d_arr1, d_arr2) - d_output = cp.empty(num_items, dtype=Pair.dtype) - h_init = Pair(cp.iinfo(np.int64).max, cp.iinfo(np.int64).max) + d_output = DeviceArray.empty(num_items, Pair.dtype) + h_init = Pair(np.iinfo(np.int64).max, np.iinfo(np.int64).max) cuda.compute.inclusive_scan( d_in=zip_it, @@ -247,11 +256,11 @@ def min_pairs(p1, p2): num_items=num_items, ) - result = d_output.get() + result = d_output.copy_to_host() # Verify the scan operation produces running minimums for both arrays - expected_first_running_mins = np.minimum.accumulate(arr1.get()) - expected_second_running_mins = np.minimum.accumulate(arr2.get()) + expected_first_running_mins = np.minimum.accumulate(h_arr1) + expected_second_running_mins = np.minimum.accumulate(h_arr2) for i, result_item in enumerate(result): assert result_item["first_min"] == expected_first_running_mins[i] @@ -261,10 +270,8 @@ def min_pairs(p1, p2): @pytest.mark.parametrize("num_items", [10, 1000]) def test_output_zip_iterator_with_scan(monkeypatch, num_items): """Test ZipIterator as output iterator with scan operations.""" - import numba.cuda - # Skip SASS check for CC 8.0+ due to LDL/STL CI failure. - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = get_compute_capability() if cc_major >= 8: monkeypatch.setattr( cuda.compute._cccl_interop, @@ -272,13 +279,15 @@ def test_output_zip_iterator_with_scan(monkeypatch, num_items): False, ) - d_in1 = cp.random.randint(0, 1000, num_items, dtype=np.int64) - d_in2 = cp.random.randint(0, 1000, num_items, dtype=np.int64) + h_in1 = np.random.randint(0, 1000, num_items, dtype=np.int64) + h_in2 = np.random.randint(0, 1000, num_items, dtype=np.int64) + d_in1 = DeviceArray.from_numpy(h_in1) + d_in2 = DeviceArray.from_numpy(h_in2) zip_it = ZipIterator(d_in1, d_in2) - d_out1 = cp.empty_like(d_in1) - d_out2 = cp.empty_like(d_in2) + d_out1 = DeviceArray.empty(h_in1.shape, h_in1.dtype) + d_out2 = DeviceArray.empty(h_in2.shape, h_in2.dtype) zip_out_it = ZipIterator(d_out1, d_out2) @@ -293,20 +302,18 @@ def add_pairs(p1, p2): num_items=num_items, ) - in1 = d_in1.get() - in2 = d_in2.get() - expected_out1 = np.empty_like(in1) - expected_out2 = np.empty_like(in2) + expected_out1 = np.empty_like(h_in1) + expected_out2 = np.empty_like(h_in2) # First element is just the input - expected_out1[0] = in1[0] - expected_out2[0] = in2[0] + expected_out1[0] = h_in1[0] + expected_out2[0] = h_in2[0] for i in range(1, num_items): - expected_out1[i] = expected_out1[i - 1] + in1[i] - expected_out2[i] = expected_out2[i - 1] + in2[i] + expected_out1[i] = expected_out1[i - 1] + h_in1[i] + expected_out2[i] = expected_out2[i - 1] + h_in2[i] - np.testing.assert_array_equal(d_out1.get(), expected_out1) - np.testing.assert_array_equal(d_out2.get(), expected_out2) + np.testing.assert_array_equal(d_out1.copy_to_host(), expected_out1) + np.testing.assert_array_equal(d_out2.copy_to_host(), expected_out2) def test_nested_zip_iterators(): @@ -330,9 +337,12 @@ def sum_nested_zips(v1, v2): num_items = 100 # Create three input arrays - d_input_a = cp.arange(num_items, dtype=np.int32) - d_input_b = cp.arange(num_items, dtype=np.int64) * 2 - d_input_c = cp.arange(num_items, dtype=np.float32) * 3.0 + h_input_a = np.arange(num_items, dtype=np.int32) + h_input_b = np.arange(num_items, dtype=np.int64) * 2 + h_input_c = np.arange(num_items, dtype=np.float32) * 3.0 + d_input_a = DeviceArray.from_numpy(h_input_a) + d_input_b = DeviceArray.from_numpy(h_input_b) + d_input_c = DeviceArray.from_numpy(h_input_c) # Create an inner zip iterator combining a and b inner_zip = ZipIterator(d_input_a, d_input_b) @@ -341,7 +351,7 @@ def sum_nested_zips(v1, v2): outer_zip = ZipIterator(inner_zip, d_input_c) # Perform reduction - d_output = cp.empty(1, dtype=OuterTriple.dtype) + d_output = DeviceArray.empty(1, OuterTriple.dtype) h_init = OuterTriple(InnerPair(0, 0), 0.0) cuda.compute.reduce_into( @@ -352,12 +362,12 @@ def sum_nested_zips(v1, v2): h_init=h_init, ) - result = d_output.get()[0] + result = d_output.copy_to_host()[0] # Calculate expected values - expected_first = d_input_a.sum().get() - expected_second = d_input_b.sum().get() - expected_third = d_input_c.sum().get() + expected_first = h_input_a.sum() + expected_second = h_input_b.sum() + expected_third = h_input_c.sum() assert result["inner"]["first"] == expected_first, ( f"Expected inner.first={expected_first}, got {result['inner']['first']}" @@ -386,14 +396,17 @@ def sum_nested_zips(v1, v2): num_items = 100 - d_input_a = cp.arange(num_items, dtype=np.int32) - d_input_b = cp.arange(num_items, dtype=np.float32) - d_input_c = cp.arange(num_items, dtype=np.int64) + h_input_a = np.arange(num_items, dtype=np.int32) + h_input_b = np.arange(num_items, dtype=np.float32) + h_input_c = np.arange(num_items, dtype=np.int64) + d_input_a = DeviceArray.from_numpy(h_input_a) + d_input_b = DeviceArray.from_numpy(h_input_b) + d_input_c = DeviceArray.from_numpy(h_input_c) inner_zip = ZipIterator(d_input_a, d_input_b) outer_zip = ZipIterator(inner_zip, d_input_c) - d_output = cp.empty(1, dtype=OuterPair.dtype) + d_output = DeviceArray.empty(1, OuterPair.dtype) h_init = OuterPair(InnerPair(0, 0.0), 0) cuda.compute.reduce_into( @@ -404,13 +417,13 @@ def sum_nested_zips(v1, v2): h_init=h_init, ) - result = d_output.get()[0] + result = d_output.copy_to_host()[0] # outer_zip produces: {value_0: {value_0: int32, value_1: float32}, value_1: int64} # which maps to our OuterPair: {inner: {a: int32, b: float32}, c: int64} - expected_a = d_input_a.sum().get() # int32 - expected_b = d_input_b.sum().get() # float32 - expected_c = d_input_c.sum().get() # int64 + expected_a = h_input_a.sum() # int32 + expected_b = h_input_b.sum() # float32 + expected_c = h_input_c.sum() # int64 assert result["inner"]["a"] == expected_a assert np.isclose(result["inner"]["b"], expected_b) @@ -426,9 +439,7 @@ def sum_nested_zips(v1, v2): ], ) def test_nested_output_zip_iterator_with_scan(monkeypatch, num_items, dtype_map): - import numba.cuda - - cc_major, _ = numba.cuda.get_current_device().compute_capability + cc_major, _ = get_compute_capability() if cc_major >= 8: monkeypatch.setattr( cuda.compute._cccl_interop, @@ -446,15 +457,13 @@ def test_nested_output_zip_iterator_with_scan(monkeypatch, num_items, dtype_map) h_in2[i]["x"] = float(i * 10) h_in2[i]["y"] = float(i * 20) - d_in1 = cp.empty(num_items, dtype=Vec2.dtype) - d_in2 = cp.empty(num_items, dtype=Vec2.dtype) - d_in1.set(h_in1) - d_in2.set(h_in2) + d_in1 = DeviceArray.from_numpy(h_in1) + d_in2 = DeviceArray.from_numpy(h_in2) zip_it = ZipIterator(d_in1, d_in2) - d_out1 = cp.empty_like(d_in1) - d_out2 = cp.empty_like(d_in2) + d_out1 = DeviceArray.empty(h_in1.shape, h_in1.dtype) + d_out2 = DeviceArray.empty(h_in2.shape, h_in2.dtype) zip_out_it = ZipIterator(d_out1, d_out2) @@ -471,25 +480,23 @@ def add_vec2_pairs(v1, v2): num_items=num_items, ) - in1 = d_in1.get() - in2 = d_in2.get() - expected_out1 = np.empty_like(in1) - expected_out2 = np.empty_like(in2) + expected_out1 = np.empty_like(h_in1) + expected_out2 = np.empty_like(h_in2) - expected_out1[0] = in1[0] - expected_out2[0] = in2[0] + expected_out1[0] = h_in1[0] + expected_out2[0] = h_in2[0] for i in range(1, num_items): - expected_out1[i]["x"] = expected_out1[i - 1]["x"] + in1[i]["x"] - expected_out1[i]["y"] = expected_out1[i - 1]["y"] + in1[i]["y"] - expected_out2[i]["x"] = expected_out2[i - 1]["x"] + in2[i]["x"] - expected_out2[i]["y"] = expected_out2[i - 1]["y"] + in2[i]["y"] + expected_out1[i]["x"] = expected_out1[i - 1]["x"] + h_in1[i]["x"] + expected_out1[i]["y"] = expected_out1[i - 1]["y"] + h_in1[i]["y"] + expected_out2[i]["x"] = expected_out2[i - 1]["x"] + h_in2[i]["x"] + expected_out2[i]["y"] = expected_out2[i - 1]["y"] + h_in2[i]["y"] - np.testing.assert_array_equal(d_out1.get(), expected_out1) - np.testing.assert_array_equal(d_out2.get(), expected_out2) + np.testing.assert_array_equal(d_out1.copy_to_host(), expected_out1) + np.testing.assert_array_equal(d_out2.copy_to_host(), expected_out2) def test_zip_iterator_of_transform_iterator_kind(): - arr = cp.arange(10, dtype=np.int64) + d_arr = DeviceArray.from_numpy(np.arange(10, dtype=np.int64)) def f(x): return x @@ -497,8 +504,8 @@ def f(x): def g(x): return x + 1 - it1 = ZipIterator(TransformIterator(arr, f)) - it2 = ZipIterator(TransformIterator(arr, g)) + it1 = ZipIterator(TransformIterator(d_arr, f)) + it2 = ZipIterator(TransformIterator(d_arr, g)) assert it1.kind != it2.kind @@ -522,8 +529,10 @@ def test_caching_zip_iterator(): # Create multiple instances with same structure iterators = [] for i in range(5): - arr = cp.arange(i * 10, (i + 1) * 10, dtype=np.float32) - z = ZipIterator(arr) + d_arr = DeviceArray.from_numpy( + np.arange(i * 10, (i + 1) * 10, dtype=np.float32) + ) + z = ZipIterator(d_arr) # Trigger compilation by accessing LTOIR z.get_advance_op() z.get_input_deref_op() @@ -542,12 +551,12 @@ def test_caching_zip_iterator(): # Test 4: Arrays with different dtypes should not share cache compile_cpp_op_code.cache_clear() - z_int32 = ZipIterator(cp.arange(10, dtype=np.int32)) + z_int32 = ZipIterator(DeviceArray.from_numpy(np.arange(10, dtype=np.int32))) z_int32.get_advance_op() z_int32.get_input_deref_op() misses_after_first = compile_cpp_op_code.cache_info().misses - z_int64 = ZipIterator(cp.arange(10, dtype=np.int64)) + z_int64 = ZipIterator(DeviceArray.from_numpy(np.arange(10, dtype=np.int64))) z_int64.get_advance_op() z_int64.get_input_deref_op() misses_after_second = compile_cpp_op_code.cache_info().misses @@ -647,8 +656,10 @@ class Pair: num_items = 100 offset = 10 - d_input1 = cp.arange(num_items, dtype=np.int32) - d_input2 = cp.arange(num_items, dtype=np.int32) * 2 + h_input1 = np.arange(num_items, dtype=np.int32) + h_input2 = np.arange(num_items, dtype=np.int32) * 2 + d_input1 = DeviceArray.from_numpy(h_input1) + d_input2 = DeviceArray.from_numpy(h_input2) # Create base zip iterator zip_it = ZipIterator(d_input1, d_input2) @@ -661,7 +672,7 @@ def sum_pairs(p1, p2): return Pair(p1[0] + p2[0], p1[1] + p2[1]) h_init = Pair(0, 0) - d_output = cp.empty(1, dtype=Pair.dtype) + d_output = DeviceArray.empty(1, Pair.dtype) remaining_items = num_items - offset cuda.compute.reduce_into( @@ -672,11 +683,11 @@ def sum_pairs(p1, p2): h_init=h_init, ) - result = d_output.get()[0] + result = d_output.copy_to_host()[0] # Expected values should be sum from offset onwards - expected_first = d_input1[offset:].sum().get() - expected_second = d_input2[offset:].sum().get() + expected_first = h_input1[offset:].sum() + expected_second = h_input2[offset:].sum() assert result["first"] == expected_first assert result["second"] == expected_second @@ -699,9 +710,12 @@ def sum_nested_zips(v1, v2): offset = 15 # Create three input arrays - d_input_a = cp.arange(num_items, dtype=np.int32) - d_input_b = cp.arange(num_items, dtype=np.int64) * 2 - d_input_c = cp.arange(num_items, dtype=np.float32) * 3.0 + h_input_a = np.arange(num_items, dtype=np.int32) + h_input_b = np.arange(num_items, dtype=np.int64) * 2 + h_input_c = np.arange(num_items, dtype=np.float32) * 3.0 + d_input_a = DeviceArray.from_numpy(h_input_a) + d_input_b = DeviceArray.from_numpy(h_input_b) + d_input_c = DeviceArray.from_numpy(h_input_c) # Create nested zip: ZipIterator(ZipIterator(a, b), c) inner_zip = ZipIterator(d_input_a, d_input_b) @@ -711,7 +725,7 @@ def sum_nested_zips(v1, v2): advanced_outer_zip = outer_zip + offset # Perform reduction from the advanced position - d_output = cp.empty(1, dtype=OuterTriple.dtype) + d_output = DeviceArray.empty(1, OuterTriple.dtype) h_init = OuterTriple(InnerPair(0, 0), 0.0) remaining_items = num_items - offset @@ -723,12 +737,12 @@ def sum_nested_zips(v1, v2): h_init=h_init, ) - result = d_output.get()[0] + result = d_output.copy_to_host()[0] # Calculate expected values from offset onwards - expected_first = d_input_a[offset:].sum().get() - expected_second = d_input_b[offset:].sum().get() - expected_third = d_input_c[offset:].sum().get() + expected_first = h_input_a[offset:].sum() + expected_second = h_input_b[offset:].sum() + expected_third = h_input_c[offset:].sum() assert result["inner"]["first"] == expected_first, ( f"Expected inner.first={expected_first}, got {result['inner']['first']}" diff --git a/python/cuda_cccl/tests/coop/_experimental/test_block_exchange.py b/python/cuda_cccl/tests/coop/_experimental/test_block_exchange.py index add2bc6d0c2..c81c4c1e6d9 100644 --- a/python/cuda_cccl/tests/coop/_experimental/test_block_exchange.py +++ b/python/cuda_cccl/tests/coop/_experimental/test_block_exchange.py @@ -14,6 +14,7 @@ import numba import numpy as np import pytest +from _utils.device_array import DeviceArray from helpers import ( NUMBA_TYPES_TO_NP, Complex, @@ -119,11 +120,10 @@ def kernel(input_arr, output_arr): total_items = num_threads * items_per_thread h_input = random_int(total_items, T_np) - d_input = cuda.to_device(h_input) - d_output = cuda.device_array(total_items, dtype=T_np) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(total_items, dtype=T_np) kernel[1, threads_per_block](d_input, d_output) - cuda.synchronize() output = d_output.copy_to_host() @@ -229,11 +229,10 @@ def kernel(input_arr, output_arr): h_input_imag = random_int(total_complex_items, T_complex_np_component) h_input_combined = np.concatenate((h_input_real, h_input_imag)) - d_input = cuda.to_device(h_input_combined) - d_output = cuda.device_array(2 * total_complex_items, dtype=T_complex_np_component) + d_input = DeviceArray.from_numpy(h_input_combined) + d_output = DeviceArray.empty(2 * total_complex_items, dtype=T_complex_np_component) kernel[1, threads_per_block](d_input, d_output) - cuda.synchronize() output_combined = d_output.copy_to_host() output_real = output_combined[:total_complex_items] diff --git a/python/cuda_cccl/tests/coop/_experimental/test_block_load.py b/python/cuda_cccl/tests/coop/_experimental/test_block_load.py index 2693d642395..617ff27210e 100644 --- a/python/cuda_cccl/tests/coop/_experimental/test_block_load.py +++ b/python/cuda_cccl/tests/coop/_experimental/test_block_load.py @@ -7,6 +7,7 @@ import numba import pytest +from _utils.device_array import DeviceArray from helpers import NUMBA_TYPES_TO_NP, random_int, row_major_tid from numba import cuda, types @@ -61,10 +62,9 @@ def kernel(d_input, d_output): dtype = NUMBA_TYPES_TO_NP[T] items_per_tile = num_threads_per_block * items_per_thread h_input = random_int(items_per_tile, dtype) - d_input = cuda.to_device(h_input) - d_output = cuda.device_array(items_per_tile, dtype=dtype) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(items_per_tile, dtype=dtype) kernel[1, threads_per_block](d_input, d_output) - cuda.synchronize() output = d_output.copy_to_host() reference = h_input diff --git a/python/cuda_cccl/tests/coop/_experimental/test_block_load_store_api.py b/python/cuda_cccl/tests/coop/_experimental/test_block_load_store_api.py index 99da797c97c..2bd3ecd1fc7 100644 --- a/python/cuda_cccl/tests/coop/_experimental/test_block_load_store_api.py +++ b/python/cuda_cccl/tests/coop/_experimental/test_block_load_store_api.py @@ -2,6 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +from _utils.device_array import DeviceArray + +# isort: split # example-begin imports import numba import numpy as np @@ -36,8 +39,8 @@ def kernel(input, output): h_input = np.random.randint( 0, 42, threads_per_block * items_per_thread, dtype=np.int32 ) - d_input = cuda.to_device(h_input) - d_output = cuda.device_array_like(d_input) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(h_input.shape, h_input.dtype) kernel[1, threads_per_block](d_input, d_output) h_output = d_output.copy_to_host() diff --git a/python/cuda_cccl/tests/coop/_experimental/test_block_merge_sort.py b/python/cuda_cccl/tests/coop/_experimental/test_block_merge_sort.py index fe339b1d27e..88943761c80 100644 --- a/python/cuda_cccl/tests/coop/_experimental/test_block_merge_sort.py +++ b/python/cuda_cccl/tests/coop/_experimental/test_block_merge_sort.py @@ -8,6 +8,7 @@ import numba import numpy as np import pytest +from _utils.device_array import DeviceArray from helpers import NUMBA_TYPES_TO_NP, random_int, row_major_tid from numba import cuda, types @@ -51,10 +52,9 @@ def kernel(input, output): dtype = NUMBA_TYPES_TO_NP[T] items_per_tile = num_threads_per_block * items_per_thread input = random_int(items_per_tile, dtype) - d_input = cuda.to_device(input) - d_output = cuda.device_array(items_per_tile, dtype=dtype) + d_input = DeviceArray.from_numpy(input) + d_output = DeviceArray.empty(items_per_tile, dtype=dtype) kernel[1, threads_per_block](d_input, d_output) - cuda.synchronize() output = d_output.copy_to_host() reference = sorted(input) @@ -103,10 +103,9 @@ def kernel(input, output): dtype = NUMBA_TYPES_TO_NP[T] items_per_tile = num_threads_per_block * items_per_thread input = random_int(items_per_tile, dtype) - d_input = cuda.to_device(input) - d_output = cuda.device_array(items_per_tile, dtype=dtype) + d_input = DeviceArray.from_numpy(input) + d_output = DeviceArray.empty(items_per_tile, dtype=dtype) kernel[1, threads_per_block](d_input, d_output) - cuda.synchronize() output = d_output.copy_to_host() reference = sorted(input, reverse=True) @@ -151,10 +150,9 @@ def kernel(input, output): items_per_tile = threads_per_block * items_per_thread input = np.random.random(items_per_tile) + 1j * np.random.random(items_per_tile) input = input.astype(dtype) - d_input = cuda.to_device(input) - d_output = cuda.device_array(items_per_tile, dtype=dtype) + d_input = DeviceArray.from_numpy(input) + d_output = DeviceArray.empty(items_per_tile, dtype=dtype) kernel[1, threads_per_block](d_input, d_output) - cuda.synchronize() output = d_output.copy_to_host() reference = sorted(input, reverse=True, key=lambda x: x.real) diff --git a/python/cuda_cccl/tests/coop/_experimental/test_block_merge_sort_api.py b/python/cuda_cccl/tests/coop/_experimental/test_block_merge_sort_api.py index 0bb68952d67..024df9604a6 100644 --- a/python/cuda_cccl/tests/coop/_experimental/test_block_merge_sort_api.py +++ b/python/cuda_cccl/tests/coop/_experimental/test_block_merge_sort_api.py @@ -4,6 +4,7 @@ import numba import numpy as np +from _utils.device_array import DeviceArray from numba import cuda import cuda.coop._experimental as coop @@ -47,7 +48,7 @@ def kernel(keys): tile_size = threads_per_block * items_per_thread h_keys = np.arange(0, tile_size, dtype=np.int32) - d_keys = cuda.to_device(h_keys) + d_keys = DeviceArray.from_numpy(h_keys) kernel[1, threads_per_block](d_keys) h_keys = d_keys.copy_to_host() for i in range(tile_size): diff --git a/python/cuda_cccl/tests/coop/_experimental/test_block_radix_sort.py b/python/cuda_cccl/tests/coop/_experimental/test_block_radix_sort.py index 072e443662d..961b1c259c6 100644 --- a/python/cuda_cccl/tests/coop/_experimental/test_block_radix_sort.py +++ b/python/cuda_cccl/tests/coop/_experimental/test_block_radix_sort.py @@ -7,6 +7,7 @@ import numba import pytest +from _utils.device_array import DeviceArray from helpers import NUMBA_TYPES_TO_NP, random_int, row_major_tid from numba import cuda, types @@ -47,10 +48,9 @@ def kernel(input, output): dtype = NUMBA_TYPES_TO_NP[T] items_per_tile = num_threads_per_block * items_per_thread input = random_int(items_per_tile, dtype) - d_input = cuda.to_device(input) - d_output = cuda.device_array(items_per_tile, dtype=dtype) + d_input = DeviceArray.from_numpy(input) + d_output = DeviceArray.empty(items_per_tile, dtype=dtype) kernel[1, threads_per_block](d_input, d_output) - cuda.synchronize() output = d_output.copy_to_host() reference = sorted(input, reverse=True) @@ -92,10 +92,9 @@ def kernel(input, output): dtype = NUMBA_TYPES_TO_NP[T] input = random_int(items_per_tile, dtype) - d_input = cuda.to_device(input) - d_output = cuda.device_array(items_per_tile, dtype=dtype) + d_input = DeviceArray.from_numpy(input) + d_output = DeviceArray.empty(items_per_tile, dtype=dtype) kernel[1, threads_per_block](d_input, d_output) - cuda.synchronize() output = d_output.copy_to_host() reference = sorted(input) @@ -133,10 +132,9 @@ def kernel(input, output): dtype = NUMBA_TYPES_TO_NP[T] input = random_int(items_per_tile, dtype) - d_input = cuda.to_device(input) - d_output = cuda.device_array(items_per_tile, dtype=dtype) + d_input = DeviceArray.from_numpy(input) + d_output = DeviceArray.empty(items_per_tile, dtype=dtype) kernel[1, threads_per_block](d_input, d_output) - cuda.synchronize() output = d_output.copy_to_host() reference = sorted(input) @@ -189,15 +187,14 @@ def kernel(int_input, int_output, double_input, double_output): double_output[tid * items_per_thread + i] = double_thread_data[i] int_input = random_int(items_per_tile, "int32") - d_int_input = cuda.to_device(int_input) - d_int_output = cuda.device_array(items_per_tile, dtype="int32") + d_int_input = DeviceArray.from_numpy(int_input) + d_int_output = DeviceArray.empty(items_per_tile, dtype="int32") double_input = random_int(items_per_tile, "float64") - d_double_input = cuda.to_device(double_input) - d_double_output = cuda.device_array(items_per_tile, dtype="float64") + d_double_input = DeviceArray.from_numpy(double_input) + d_double_output = DeviceArray.empty(items_per_tile, dtype="float64") kernel[1, threads_per_block]( d_int_input, d_int_output, d_double_input, d_double_output ) - cuda.synchronize() int_output = d_int_output.copy_to_host() int_reference = sorted(int_input) diff --git a/python/cuda_cccl/tests/coop/_experimental/test_block_radix_sort_api.py b/python/cuda_cccl/tests/coop/_experimental/test_block_radix_sort_api.py index 5747cd8e161..9f7c36b722d 100644 --- a/python/cuda_cccl/tests/coop/_experimental/test_block_radix_sort_api.py +++ b/python/cuda_cccl/tests/coop/_experimental/test_block_radix_sort_api.py @@ -4,6 +4,7 @@ import numba import numpy as np +from _utils.device_array import DeviceArray from numba import cuda import cuda.coop._experimental as coop @@ -43,7 +44,7 @@ def kernel(keys): tile_size = threads_per_block * items_per_thread h_keys = np.arange(tile_size - 1, -1, -1, dtype=np.int32) - d_keys = cuda.to_device(h_keys) + d_keys = DeviceArray.from_numpy(h_keys) kernel[1, threads_per_block](d_keys) h_keys = d_keys.copy_to_host() for i in range(tile_size): @@ -80,7 +81,7 @@ def kernel(keys): tile_size = threads_per_block * items_per_thread h_keys = np.arange(0, tile_size, dtype=np.int32) - d_keys = cuda.to_device(h_keys) + d_keys = DeviceArray.from_numpy(h_keys) kernel[1, threads_per_block](d_keys) h_keys = d_keys.copy_to_host() for i in range(tile_size): diff --git a/python/cuda_cccl/tests/coop/_experimental/test_block_reduce.py b/python/cuda_cccl/tests/coop/_experimental/test_block_reduce.py index f777b1acb83..874ff498f55 100644 --- a/python/cuda_cccl/tests/coop/_experimental/test_block_reduce.py +++ b/python/cuda_cccl/tests/coop/_experimental/test_block_reduce.py @@ -8,6 +8,7 @@ import numba import numpy as np import pytest +from _utils.device_array import DeviceArray from helpers import ( NUMBA_TYPES_TO_NP, Complex, @@ -65,10 +66,9 @@ def kernel(input, output): output[1] = block_output.imag h_input = random_int(2 * num_threads_per_block, "int32") - d_input = cuda.to_device(h_input) - d_output = cuda.device_array(2, dtype="int32") + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(2, dtype="int32") kernel[1, threads_per_block](d_input, d_output) - cuda.synchronize() h_output = d_output.copy_to_host() h_expected = ( np.sum(h_input[:num_threads_per_block]), @@ -129,10 +129,9 @@ def kernel(input, output): output[1] = block_output.imag h_input = random_int(2 * num_threads_per_block, "int32") - d_input = cuda.to_device(h_input) - d_output = cuda.device_array(2, dtype="int32") + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(2, dtype="int32") kernel[1, threads_per_block](d_input, d_output) - cuda.synchronize() h_output = d_output.copy_to_host() h_expected = ( np.sum(h_input[:num_threads_per_block]), @@ -182,10 +181,9 @@ def kernel(input, output): dtype = NUMBA_TYPES_TO_NP[T] h_input = random_int(num_threads_per_block, dtype) - d_input = cuda.to_device(h_input) - d_output = cuda.device_array(1, dtype=dtype) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, dtype=dtype) kernel[1, threads_per_block](d_input, d_output) - cuda.synchronize() h_output = d_output.copy_to_host() h_expected = np.min(h_input) @@ -234,10 +232,9 @@ def kernel(input, output): dtype = NUMBA_TYPES_TO_NP[T] h_input = random_int(num_threads_per_block, dtype) h_input[-1] = 0 - d_input = cuda.to_device(h_input) - d_output = cuda.device_array(1, dtype=dtype) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, dtype=dtype) kernel[1, threads_per_block](d_input, d_output) - cuda.synchronize() h_output = d_output.copy_to_host() h_expected = np.min(h_input[: num_threads_per_block // 2]) @@ -296,10 +293,9 @@ def kernel(input, output): dtype = NUMBA_TYPES_TO_NP[T] h_input = random_int(items_per_thread * num_threads_per_block, dtype) - d_input = cuda.to_device(h_input) - d_output = cuda.device_array(1, dtype=dtype) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, dtype=dtype) kernel[1, threads_per_block](d_input, d_output) - cuda.synchronize() h_output = d_output.copy_to_host() h_expected = np.min(h_input) @@ -357,10 +353,9 @@ def kernel(input, output): dtype = NUMBA_TYPES_TO_NP[T] h_input = random_int(items_per_thread * num_threads_per_block, dtype) - d_input = cuda.to_device(h_input) - d_output = cuda.device_array(1, dtype=dtype) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, dtype=dtype) kernel[1, threads_per_block](d_input, d_output) - cuda.synchronize() h_output = d_output.copy_to_host() h_expected = np.min(h_input) @@ -403,10 +398,9 @@ def kernel(input, output): dtype = NUMBA_TYPES_TO_NP[T] h_input = random_int(num_threads_per_block, dtype) - d_input = cuda.to_device(h_input) - d_output = cuda.device_array(1, dtype=dtype) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, dtype=dtype) kernel[1, threads_per_block](d_input, d_output) - cuda.synchronize() h_output = d_output.copy_to_host() h_expected = np.sum(h_input) @@ -452,10 +446,9 @@ def kernel(input, output): dtype = NUMBA_TYPES_TO_NP[T] h_input = random_int(num_threads_per_block, dtype) h_input[-1] = 0 - d_input = cuda.to_device(h_input) - d_output = cuda.device_array(1, dtype=dtype) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, dtype=dtype) kernel[1, threads_per_block](d_input, d_output) - cuda.synchronize() h_output = d_output.copy_to_host() h_expected = np.sum(h_input[: num_threads_per_block // 2]) @@ -510,10 +503,9 @@ def kernel(input, output): dtype = NUMBA_TYPES_TO_NP[T] h_input = random_int(items_per_thread * num_threads_per_block, dtype) - d_input = cuda.to_device(h_input) - d_output = cuda.device_array(1, dtype=dtype) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, dtype=dtype) kernel[1, threads_per_block](d_input, d_output) - cuda.synchronize() h_output = d_output.copy_to_host() h_expected = np.sum(h_input) @@ -565,10 +557,9 @@ def kernel(input, output): dtype = NUMBA_TYPES_TO_NP[T] h_input = random_int(items_per_thread * num_threads_per_block, dtype) - d_input = cuda.to_device(h_input) - d_output = cuda.device_array(1, dtype=dtype) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, dtype=dtype) kernel[1, threads_per_block](d_input, d_output) - cuda.synchronize() h_output = d_output.copy_to_host() h_expected = np.sum(h_input) diff --git a/python/cuda_cccl/tests/coop/_experimental/test_block_reduce_api.py b/python/cuda_cccl/tests/coop/_experimental/test_block_reduce_api.py index 3c795ac1722..349b8ea61b1 100644 --- a/python/cuda_cccl/tests/coop/_experimental/test_block_reduce_api.py +++ b/python/cuda_cccl/tests/coop/_experimental/test_block_reduce_api.py @@ -2,6 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +from _utils.device_array import DeviceArray + +# isort: split # example-begin imports import numba import numpy as np @@ -32,8 +35,8 @@ def kernel(input, output): # example-end reduce h_input = np.random.randint(0, 42, threads_per_block, dtype=np.int32) - d_input = cuda.to_device(h_input) - d_output = cuda.device_array(1, dtype=np.int32) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, dtype=np.int32) kernel[1, threads_per_block](d_input, d_output) h_output = d_output.copy_to_host() h_expected = np.max(h_input) @@ -56,8 +59,8 @@ def kernel(input, output): # example-end sum h_input = np.ones(threads_per_block, dtype=np.int32) - d_input = cuda.to_device(h_input) - d_output = cuda.device_array(1, dtype=np.int32) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, dtype=np.int32) kernel[1, threads_per_block](d_input, d_output) h_output = d_output.copy_to_host() diff --git a/python/cuda_cccl/tests/coop/_experimental/test_block_scan.py b/python/cuda_cccl/tests/coop/_experimental/test_block_scan.py index 1b0ea5ac3ed..e700301b8ae 100644 --- a/python/cuda_cccl/tests/coop/_experimental/test_block_scan.py +++ b/python/cuda_cccl/tests/coop/_experimental/test_block_scan.py @@ -17,6 +17,7 @@ import numba import numpy as np import pytest +from _utils.device_array import DeviceArray from helpers import ( NUMBA_TYPES_TO_NP, Complex, @@ -152,11 +153,10 @@ def kernel(input_arr, output_arr): dtype_np = NUMBA_TYPES_TO_NP[T] items_per_tile = num_threads * items_per_thread h_input = random_int(items_per_tile, dtype_np) - d_input = cuda.to_device(h_input) - d_output = cuda.device_array(items_per_tile, dtype=dtype_np) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(items_per_tile, dtype=dtype_np) kernel[1, threads_per_block](d_input, d_output) - cuda.synchronize() output = d_output.copy_to_host() if mode == "inclusive": @@ -248,11 +248,10 @@ def kernel(input_arr, output_arr): tile_offset += tile_items h_input = np.arange(num_elements, dtype=np.int32) - d_input = cuda.to_device(h_input) - d_output = cuda.to_device(np.zeros(num_elements, dtype=np.int32)) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.from_numpy(np.zeros(num_elements, dtype=np.int32)) kernel[num_segments, threads_per_block](d_input, d_output) - cuda.synchronize() h_output = d_output.copy_to_host() ref = np.zeros_like(h_input) @@ -438,10 +437,9 @@ def kernel(input_arr, output_arr): # Account for a Complex type containing two int32 values. total_items = num_threads * items_per_thread * 2 h_input = random_int(total_items, "int32") - d_input = cuda.to_device(h_input) - d_output = cuda.device_array(total_items, dtype=np.int32) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(total_items, dtype=np.int32) kernel[1, threads_per_block](d_input, d_output) - cuda.synchronize() h_output = d_output.copy_to_host() real_vals = h_input[:num_elements] @@ -538,11 +536,10 @@ def kernel(input_arr, output_arr): dtype_np = NUMBA_TYPES_TO_NP[T] total_items = num_threads * items_per_thread h_input = random_int(total_items, dtype_np) - d_input = cuda.to_device(h_input) - d_output = cuda.device_array(total_items, dtype=dtype_np) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(total_items, dtype=dtype_np) kernel[1, threads_per_block](d_input, d_output) - cuda.synchronize() output = d_output.copy_to_host() @@ -709,11 +706,10 @@ def kernel(input_arr, output_arr): dtype_np = NUMBA_TYPES_TO_NP[T] total_items = num_threads * items_per_thread h_input = random_int(total_items, dtype_np) - d_input = cuda.to_device(h_input) - d_output = cuda.device_array(total_items, dtype=dtype_np) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(total_items, dtype=dtype_np) kernel[1, threads_per_block](d_input, d_output) - cuda.synchronize() output = d_output.copy_to_host() @@ -822,12 +818,11 @@ def kernel(input_arr, output_arr): else: h_input = random_int(total_items, dtype_np) - d_input = cuda.to_device(h_input) - d_output = cuda.device_array(total_items, dtype=dtype_np) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(total_items, dtype=dtype_np) k = kernel[1, threads_per_block] k(d_input, d_output) - cuda.synchronize() output = d_output.copy_to_host() diff --git a/python/cuda_cccl/tests/coop/_experimental/test_block_scan_api.py b/python/cuda_cccl/tests/coop/_experimental/test_block_scan_api.py index 8b40e57d7f7..b0b90bea662 100644 --- a/python/cuda_cccl/tests/coop/_experimental/test_block_scan_api.py +++ b/python/cuda_cccl/tests/coop/_experimental/test_block_scan_api.py @@ -4,6 +4,7 @@ import numba import numpy as np +from _utils.device_array import DeviceArray from numba import cuda import cuda.coop._experimental as coop @@ -43,7 +44,7 @@ def kernel(data): tile_size = threads_per_block * items_per_thread h_keys = np.ones(tile_size, dtype=np.int32) - d_keys = cuda.to_device(h_keys) + d_keys = DeviceArray.from_numpy(h_keys) kernel[1, threads_per_block](d_keys) h_keys = d_keys.copy_to_host() for i in range(tile_size): @@ -77,7 +78,7 @@ def kernel(data): tile_size = threads_per_block h_keys = np.ones(tile_size, dtype=np.int32) - d_keys = cuda.to_device(h_keys) + d_keys = DeviceArray.from_numpy(h_keys) kernel[1, threads_per_block](d_keys) h_keys = d_keys.copy_to_host() for i in range(tile_size): diff --git a/python/cuda_cccl/tests/coop/_experimental/test_block_store.py b/python/cuda_cccl/tests/coop/_experimental/test_block_store.py index 1f6a7aeb9cf..2616d3f3aa9 100644 --- a/python/cuda_cccl/tests/coop/_experimental/test_block_store.py +++ b/python/cuda_cccl/tests/coop/_experimental/test_block_store.py @@ -7,6 +7,7 @@ import numba import pytest +from _utils.device_array import DeviceArray from helpers import NUMBA_TYPES_TO_NP, random_int, row_major_tid from numba import cuda, types @@ -63,10 +64,9 @@ def kernel(d_input, d_output): dtype = NUMBA_TYPES_TO_NP[T] items_per_tile = num_threads_per_block * items_per_thread h_input = random_int(items_per_tile, dtype) - d_input = cuda.to_device(h_input) - d_output = cuda.device_array(items_per_tile, dtype=dtype) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(items_per_tile, dtype=dtype) kernel[1, threads_per_block](d_input, d_output) - cuda.synchronize() output = d_output.copy_to_host() reference = h_input diff --git a/python/cuda_cccl/tests/coop/_experimental/test_warp_merge_sort.py b/python/cuda_cccl/tests/coop/_experimental/test_warp_merge_sort.py index ec9a88fa634..08abf8260e1 100644 --- a/python/cuda_cccl/tests/coop/_experimental/test_warp_merge_sort.py +++ b/python/cuda_cccl/tests/coop/_experimental/test_warp_merge_sort.py @@ -4,6 +4,7 @@ import numba import pytest +from _utils.device_array import DeviceArray from helpers import NUMBA_TYPES_TO_NP, random_int from numba import cuda, types @@ -35,10 +36,9 @@ def kernel(input, output): dtype = NUMBA_TYPES_TO_NP[T] items_per_tile = 32 * items_per_thread input = random_int(items_per_tile, dtype) - d_input = cuda.to_device(input) - d_output = cuda.device_array(items_per_tile, dtype=dtype) + d_input = DeviceArray.from_numpy(input) + d_output = DeviceArray.empty(items_per_tile, dtype=dtype) kernel[1, 32](d_input, d_output) - cuda.synchronize() output = d_output.copy_to_host() reference = sorted(input) @@ -79,10 +79,9 @@ def kernel(input, output): dtype = NUMBA_TYPES_TO_NP[T] h_input = random_int(items_per_tile, dtype) - d_input = cuda.to_device(h_input) - d_output = cuda.device_array(items_per_tile, dtype=dtype) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(items_per_tile, dtype=dtype) kernel[1, threads_per_block](d_input, d_output) - cuda.synchronize() output = d_output.copy_to_host() for wid in range(threads_per_block // warp_threads): diff --git a/python/cuda_cccl/tests/coop/_experimental/test_warp_merge_sort_api.py b/python/cuda_cccl/tests/coop/_experimental/test_warp_merge_sort_api.py index 4ee9b55b267..c7c088fdb59 100644 --- a/python/cuda_cccl/tests/coop/_experimental/test_warp_merge_sort_api.py +++ b/python/cuda_cccl/tests/coop/_experimental/test_warp_merge_sort_api.py @@ -4,6 +4,7 @@ import numba import numpy as np +from _utils.device_array import DeviceArray from numba import cuda import cuda.coop._experimental as coop @@ -42,7 +43,7 @@ def kernel(keys): tile_size = 32 * items_per_thread h_keys = np.arange(0, tile_size, dtype=np.int32) - d_keys = cuda.to_device(h_keys) + d_keys = DeviceArray.from_numpy(h_keys) kernel[1, 32](d_keys) h_keys = d_keys.copy_to_host() for i in range(tile_size): diff --git a/python/cuda_cccl/tests/coop/_experimental/test_warp_reduce.py b/python/cuda_cccl/tests/coop/_experimental/test_warp_reduce.py index 2aa53b651f2..28401a74900 100644 --- a/python/cuda_cccl/tests/coop/_experimental/test_warp_reduce.py +++ b/python/cuda_cccl/tests/coop/_experimental/test_warp_reduce.py @@ -5,6 +5,7 @@ import numba import numpy as np import pytest +from _utils.device_array import DeviceArray from helpers import NUMBA_TYPES_TO_NP, random_int from numba import cuda, types @@ -31,10 +32,9 @@ def kernel(input, output): dtype = NUMBA_TYPES_TO_NP[T] h_input = random_int(32, dtype) - d_input = cuda.to_device(h_input) - d_output = cuda.device_array(1, dtype=dtype) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, dtype=dtype) kernel[1, 32](d_input, d_output) - cuda.synchronize() h_output = d_output.copy_to_host() h_expected = np.min(h_input) @@ -62,10 +62,9 @@ def kernel(input, output): dtype = NUMBA_TYPES_TO_NP[T] h_input = random_int(32, dtype) - d_input = cuda.to_device(h_input) - d_output = cuda.device_array(1, dtype=dtype) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, dtype=dtype) kernel[1, 32](d_input, d_output) - cuda.synchronize() h_output = d_output.copy_to_host() h_expected = np.sum(h_input) diff --git a/python/cuda_cccl/tests/coop/_experimental/test_warp_reduce_api.py b/python/cuda_cccl/tests/coop/_experimental/test_warp_reduce_api.py index 4e79f22b913..3ade32a0203 100644 --- a/python/cuda_cccl/tests/coop/_experimental/test_warp_reduce_api.py +++ b/python/cuda_cccl/tests/coop/_experimental/test_warp_reduce_api.py @@ -4,6 +4,7 @@ import numba import numpy as np +from _utils.device_array import DeviceArray from numba import cuda import cuda.coop._experimental as coop @@ -30,8 +31,8 @@ def kernel(input, output): # example-end reduce h_input = np.random.randint(0, 42, 32, dtype=np.int32) - d_input = cuda.to_device(h_input) - d_output = cuda.device_array(1, dtype=np.int32) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, dtype=np.int32) kernel[1, 32](d_input, d_output) h_output = d_output.copy_to_host() h_expected = np.max(h_input) @@ -53,8 +54,8 @@ def kernel(input, output): # example-end sum h_input = np.ones(32, dtype=np.int32) - d_input = cuda.to_device(h_input) - d_output = cuda.device_array(1, dtype=np.int32) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(1, dtype=np.int32) kernel[1, 32](d_input, d_output) h_output = d_output.copy_to_host() diff --git a/python/cuda_cccl/tests/coop/_experimental/test_warp_scan.py b/python/cuda_cccl/tests/coop/_experimental/test_warp_scan.py index a82809de342..650baf446c8 100644 --- a/python/cuda_cccl/tests/coop/_experimental/test_warp_scan.py +++ b/python/cuda_cccl/tests/coop/_experimental/test_warp_scan.py @@ -5,6 +5,7 @@ import numba import numpy as np import pytest +from _utils.device_array import DeviceArray from helpers import NUMBA_TYPES_TO_NP, random_int from numba import cuda, types @@ -26,10 +27,9 @@ def kernel(input, output): dtype = NUMBA_TYPES_TO_NP[T] h_input = random_int(32, dtype) - d_input = cuda.to_device(h_input) - d_output = cuda.device_array(32, dtype=dtype) + d_input = DeviceArray.from_numpy(h_input) + d_output = DeviceArray.empty(32, dtype=dtype) kernel[1, 32](d_input, d_output) - cuda.synchronize() output = d_output.copy_to_host() reference = np.cumsum(h_input) - h_input diff --git a/python/cuda_cccl/tests/coop/_experimental/test_warp_scan_api.py b/python/cuda_cccl/tests/coop/_experimental/test_warp_scan_api.py index aa5a2497d70..82ccd22dee3 100644 --- a/python/cuda_cccl/tests/coop/_experimental/test_warp_scan_api.py +++ b/python/cuda_cccl/tests/coop/_experimental/test_warp_scan_api.py @@ -4,6 +4,7 @@ import numba import numpy as np +from _utils.device_array import DeviceArray from numba import cuda import cuda.coop._experimental as coop @@ -29,7 +30,7 @@ def kernel(data): tile_size = 32 h_keys = np.ones(tile_size, dtype=np.int32) - d_keys = cuda.to_device(h_keys) + d_keys = DeviceArray.from_numpy(h_keys) kernel[1, 32](d_keys) h_keys = d_keys.copy_to_host() for i in range(tile_size): diff --git a/python/cuda_cccl/tests/test_examples.py b/python/cuda_cccl/tests/test_examples.py index a78165905c5..5c6759130e5 100644 --- a/python/cuda_cccl/tests/test_examples.py +++ b/python/cuda_cccl/tests/test_examples.py @@ -11,11 +11,14 @@ """ import importlib +import importlib.util import inspect import sys import traceback from pathlib import Path +import pytest + def discover_examples(): """Automatically discover all example files and their functions.""" @@ -153,6 +156,11 @@ def test_func(): globals()[test_name] = make_test_func(module_name, display_name) globals()[test_name].__name__ = test_name globals()[test_name].__doc__ = f"Test {display_name} examples" + if module_name.startswith("compute.examples."): + globals()[test_name] = pytest.mark.skipif( + importlib.util.find_spec("cupy") is None, + reason="cuda.compute examples require the optional CuPy dependency", + )(globals()[test_name]) # Create test functions for pytest