diff --git a/ci/test_cuda_cccl_examples_python.sh b/ci/test_cuda_cccl_examples_python.sh
index 68181aca622..87a73e63dc9 100755
--- a/ci/test_cuda_cccl_examples_python.sh
+++ b/ci/test_cuda_cccl_examples_python.sh
@@ -23,7 +23,7 @@ fi
 
 # Install cuda_cccl
 CUDA_CCCL_WHEEL_PATH="$(ls /home/coder/cccl/wheelhouse/cuda_cccl-*.whl)"
-python -m pip install "${CUDA_CCCL_WHEEL_PATH}[test-cu${cuda_major_version}]"
+python -m pip install "${CUDA_CCCL_WHEEL_PATH}[test-with-examples-cu${cuda_major_version}]"
 
 # Run tests for parallel module
 cd "/home/coder/cccl/python/cuda_cccl/tests/"
diff --git a/ci/test_cuda_compute_minimal_python.sh b/ci/test_cuda_compute_minimal_python.sh
index a43bbac7bfa..03392736012 100755
--- a/ci/test_cuda_compute_minimal_python.sh
+++ b/ci/test_cuda_compute_minimal_python.sh
@@ -30,7 +30,7 @@ fi
 # full cu* extras because those pull in numba/numba-cuda.
 CUDA_CCCL_WHEEL_PATH="$(ls "${wheelhouse_dir}"/cuda_cccl-*.whl)"
 python -m pip install "${CUDA_CCCL_WHEEL_PATH}[minimal-cu${cuda_major_version}]"
-python -m pip install pytest pytest-xdist "cupy-cuda${cuda_major_version}x"
+python -m pip install pytest pytest-xdist
 
 cd "${repo_root}/python/cuda_cccl/tests/"
 python -m pytest -n 6 -v compute/test_no_numba.py
diff --git a/ci/windows/test_cuda_cccl_examples_python.ps1 b/ci/windows/test_cuda_cccl_examples_python.ps1
index 0c108328822..16e01b4443c 100644
--- a/ci/windows/test_cuda_cccl_examples_python.ps1
+++ b/ci/windows/test_cuda_cccl_examples_python.ps1
@@ -18,7 +18,7 @@ $repoRoot = Get-RepoRoot
 
 ${wheelPath} = Get-CudaCcclWheel
 & $python -m pip install -U pip pytest pytest-xdist
-& $python -m pip install "${wheelPath}[test-cu$cudaMajor]"
+& $python -m pip install "${wheelPath}[test-with-examples-cu$cudaMajor]"
 
 Push-Location (Join-Path $repoRoot "python/cuda_cccl/tests")
 try {
diff --git a/docs/python/setup.rst b/docs/python/setup.rst
index 15c9510ce5e..b44c5ea2064 100644
--- a/docs/python/setup.rst
+++ b/docs/python/setup.rst
@@ -72,6 +72,10 @@ For development or to access the latest features:
    cd cccl/python/cuda_cccl
    pip install -e .[test-cu13]  # or .[test-cu12], .[test-sysctk13], .[test-sysctk12]
 
+The standard test extras do not install CuPy. To also run the CuPy-based
+``cuda.compute`` examples, use the corresponding ``test-with-examples-*`` extra,
+for example ``pip install -e .[test-with-examples-cu13]``.
+
 
 Development Setup
 ~~~~~~~~~~~~~~~~~~
diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml
index 98e0e6533c0..1acf7b8dcc5 100644
--- a/python/cuda_cccl/pyproject.toml
+++ b/python/cuda_cccl/pyproject.toml
@@ -83,11 +83,22 @@ test-cu12 = [
   "cuda-cccl[cu12]",
   "pytest",
   "pytest-xdist",
-  "cupy-cuda12x",
 ]
-test-cu13 = ["cuda-cccl[cu13]", "pytest", "pytest-xdist", "cupy-cuda13x"]
-test-sysctk12 = ["cuda-cccl[sysctk12]", "pytest", "pytest-xdist", "cupy-cuda12x"]
-test-sysctk13 = ["cuda-cccl[sysctk13]", "pytest", "pytest-xdist", "cupy-cuda13x"]
+test-cu13 = ["cuda-cccl[cu13]", "pytest", "pytest-xdist"]
+test-sysctk12 = [
+  "cuda-cccl[sysctk12]",
+  "pytest",
+  "pytest-xdist",
+]
+test-sysctk13 = [
+  "cuda-cccl[sysctk13]",
+  "pytest",
+  "pytest-xdist",
+]
+test-with-examples-cu12 = ["cuda-cccl[test-cu12]", "cupy-cuda12x"]
+test-with-examples-cu13 = ["cuda-cccl[test-cu13]", "cupy-cuda13x"]
+test-with-examples-sysctk12 = ["cuda-cccl[test-sysctk12]", "cupy-cuda12x"]
+test-with-examples-sysctk13 = ["cuda-cccl[test-sysctk13]", "cupy-cuda13x"]
 bench-cu12 = [
   "cuda-cccl[cu12]",
   "cuda-bench[cu12]",
@@ -173,6 +184,8 @@ known-first-party = [
 ]
 
 [tool.pytest.ini_options]
+# Make shared helpers in tests/_utils importable when pytest runs outside tests/.
+pythonpath = ["tests"]
 markers = [
   "no_verify_sass: skip SASS verification check",
   "large: tests requiring large device memory allocations",
diff --git a/python/cuda_cccl/tests/_utils/__init__.py b/python/cuda_cccl/tests/_utils/__init__.py
new file mode 100644
index 00000000000..b221129eb3a
--- /dev/null
+++ b/python/cuda_cccl/tests/_utils/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+"""Shared test utilities for cuda-cccl."""
diff --git a/python/cuda_cccl/tests/_utils/device_array.py b/python/cuda_cccl/tests/_utils/device_array.py
new file mode 100644
index 00000000000..b2496173a28
--- /dev/null
+++ b/python/cuda_cccl/tests/_utils/device_array.py
@@ -0,0 +1,252 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from __future__ import annotations
+
+import math
+import operator
+from collections.abc import Iterable
+
+import numpy as np
+from numpy.typing import DTypeLike
+
+from cuda.core import Buffer, Device, Stream
+
+
+def get_compute_capability() -> tuple[int, int]:
+    return Device().compute_capability  # read from a default-constructed Device
+
+
+def _normalize_shape(shape: int | Iterable[int]) -> tuple[int, ...]:
+    """Convert an int or iterable of ints to a tuple; reject negative extents."""
+    try:
+        dims = (operator.index(shape),)  # type: ignore[arg-type]
+    except TypeError:
+        dims = tuple(operator.index(extent) for extent in shape)  # type: ignore[union-attr]
+    for extent in dims:
+        if extent < 0:
+            raise ValueError("negative dimensions are not allowed")
+    return dims
+
+
+def _contiguous_strides(
+    shape: tuple[int, ...], itemsize: int, order: str
+) -> tuple[int, ...]:
+    """Byte strides of a dense array; all zeros if any extent is zero.
+
+    ``order == "C"`` varies the last axis fastest, anything else the first.
+    """
+    rank = len(shape)
+    if 0 in shape:
+        return (0,) * rank
+
+    # Visit the axes from fastest- to slowest-varying, growing the step.
+    axes = range(rank - 1, -1, -1) if order == "C" else range(rank)
+    strides = [0] * rank
+    step = itemsize
+    for axis in axes:
+        strides[axis] = step
+        step *= shape[axis]
+    return tuple(strides)
+
+
+def _resolve_device_and_stream(
+    device: Device | None, stream: Stream | None
+) -> tuple[Device, Stream]:
+    """Fill in a missing device or stream and make the device current."""
+    if device is None:
+        device = Device() if stream is None else stream.device
+    if stream is not None and device.device_id != stream.device.device_id:
+        raise ValueError("device and stream must refer to the same device")
+    # The device is made current before its default stream is looked up.
+    device.set_current()
+    return device, (stream if stream is not None else device.default_stream)
+
+
+class DeviceArray:
+    """Minimal device array for cuda-cccl tests, backed by a ``cuda.core`` Buffer.
+
+    Only allocation, NumPy transfers, array metadata, and the CUDA Array
+    Interface are provided. Tests are expected to perform array operations and
+    initialization on NumPy host arrays instead.
+    """
+
+    def __init__(
+        self,
+        buffer: Buffer,
+        device: Device,
+        stream: Stream,
+        shape: tuple[int, ...],
+        dtype: np.dtype,
+        strides: tuple[int, ...],
+        order: str,
+    ) -> None:
+        """Wrap an existing ``buffer``; nothing is allocated or copied here."""
+        self._buffer, self._device, self._stream = buffer, device, stream
+        self._shape, self._dtype, self._strides = shape, dtype, strides
+        self._order = order
+
+    @classmethod
+    def empty(
+        cls,
+        shape: int | Iterable[int],
+        dtype: DTypeLike,
+        *,
+        order: str = "C",
+        device: Device | None = None,
+        stream: Stream | None = None,
+    ) -> DeviceArray:
+        """Allocate an uninitialized device array.
+
+        ``shape`` is an int or an iterable of ints; ``order`` is "C" or "F"
+        (case-insensitive). The allocation stream is synchronized on return.
+        """
+        dims = _normalize_shape(shape)
+        np_dtype = np.dtype(dtype)
+        layout = order.upper()
+        if layout not in ("C", "F"):
+            raise ValueError("order must be either 'C' or 'F'")
+        if np_dtype.itemsize == 0:
+            raise ValueError("zero-sized dtypes are not supported")
+
+        device, stream = _resolve_device_and_stream(device, stream)
+        strides = _contiguous_strides(dims, np_dtype.itemsize, layout)
+        buffer = device.allocate(math.prod(dims) * np_dtype.itemsize, stream=stream)
+        result = cls(buffer, device, stream, dims, np_dtype, strides, layout)
+
+        # The allocation is stream ordered, so wait for it here; the caller may
+        # go on to use the array from a different stream.
+        stream.sync()
+        return result
+
+    @classmethod
+    def from_numpy(
+        cls,
+        array: np.ndarray,
+        *,
+        device: Device | None = None,
+        stream: Stream | None = None,
+    ) -> DeviceArray:
+        """Allocate a device array and initialize it from a NumPy array.
+
+        C- or F-contiguous input keeps its layout; anything else is first copied
+        to a C-contiguous host array. The stream is synchronized on return.
+        """
+        host_array = np.asarray(array)
+        if host_array.dtype.itemsize == 0:
+            raise ValueError("zero-sized dtypes are not supported")
+
+        flags = host_array.flags
+        if not (flags.c_contiguous or flags.f_contiguous):
+            host_array = np.ascontiguousarray(host_array)
+        order = "C" if host_array.flags.c_contiguous else "F"
+
+        device, stream = _resolve_device_and_stream(device, stream)
+        buffer = device.allocate(host_array.nbytes, stream=stream)
+        shape, dtype, strides = host_array.shape, host_array.dtype, host_array.strides
+        result = cls(buffer, device, stream, shape, dtype, strides, order)
+        result._copy_from_host_array(host_array, stream)
+        stream.sync()
+        return result
+
+    @property
+    def nbytes(self) -> int:
+        """Size of the backing buffer in bytes."""
+        return self._buffer.size
+
+    @property
+    def dtype(self) -> np.dtype:
+        """NumPy dtype of the elements."""
+        return self._dtype
+
+    def __len__(self) -> int:
+        """Extent of the first axis; raises ``TypeError`` for a 0-d array."""
+        if self._shape:
+            return self._shape[0]
+        raise TypeError("len() of unsized object")
+
+    @property
+    def __cuda_array_interface__(self) -> dict[str, object]:
+        """CUDA Array Interface (version 3) description of this array.
+
+        Zero-byte arrays report a null pointer; C-contiguous ones set strides to None.
+        """
+        pointer = int(self._buffer.handle) if self.nbytes else 0
+        strides = None if self._is_c_contiguous() else self._strides
+        interface: dict[str, object] = {
+            "data": (pointer, False),
+            "shape": self._shape,
+            "strides": strides,
+            "typestr": self._dtype.str,
+            "version": 3,
+        }
+        if self._dtype.fields is not None:
+            interface["descr"] = self._dtype.descr
+        return interface
+
+    def _is_c_contiguous(self) -> bool:
+        """Whether the layout can be described as C-contiguous."""
+        if self._order == "C" or self.nbytes == 0:
+            return True
+        # With at most one axis longer than 1, C and F layouts coincide.
+        return sum(extent > 1 for extent in self._shape) <= 1
+
+    @staticmethod
+    def _host_buffer(array: np.ndarray) -> Buffer:
+        """Wrap the memory of ``array`` in a non-owning ``Buffer``."""
+        # Buffer.from_handle does not own the host memory; `owner` ties the
+        # NumPy allocation to this temporary Buffer. The caller also retains the
+        # array and synchronizes the copy stream before returning.
+        host_ptr = int(array.ctypes.data)
+        return Buffer.from_handle(ptr=host_ptr, size=array.nbytes, owner=array)
+
+    def _copy_stream(self, stream: Stream | None) -> Stream:
+        """Return the stream to copy on, falling back to the allocation stream."""
+        if stream is not None:
+            if stream.device.device_id != self._device.device_id:
+                raise ValueError("copy stream must belong to the array's device")
+            return stream
+        # The allocation stream need not be the last stream to have used the
+        # array; with that stream unknown, synchronize the whole device first.
+        self._device.sync()
+        return self._stream
+
+    def _copy_from_host_array(self, array: np.ndarray, stream: Stream) -> None:
+        """Copy ``array`` into the device buffer on ``stream`` without syncing."""
+        self._buffer.copy_from(self._host_buffer(array), stream=stream)
+
+    def copy_from_host(
+        self, array: np.ndarray, *, stream: Stream | None = None
+    ) -> None:
+        """Replace the array's contents from a shape- and dtype-matched NumPy array.
+
+        Raises on a shape (``ValueError``) or dtype (``TypeError``) mismatch.
+        """
+        host_array = np.asarray(array)
+        if host_array.shape != self._shape:
+            raise ValueError(
+                f"source shape {host_array.shape} does not match {self._shape}"
+            )
+        if host_array.dtype != self._dtype:
+            raise TypeError(
+                f"source dtype {host_array.dtype} does not match {self._dtype}"
+            )
+
+        # Lay the host data out in the same order as the device array.
+        to_layout = np.asfortranarray if self._order == "F" else np.ascontiguousarray
+        host_array = to_layout(host_array)
+
+        self._device.set_current()
+        stream = self._copy_stream(stream)
+        self._copy_from_host_array(host_array, stream)
+        stream.sync()
+
+    def copy_to_host(self, *, stream: Stream | None = None) -> np.ndarray:
+        """Return an owning NumPy copy of the array."""
+        self._device.set_current()
+        copy_stream = self._copy_stream(stream)
+        host_array = np.empty(self._shape, dtype=self._dtype, order=self._order)
+        self._buffer.copy_to(self._host_buffer(host_array), stream=copy_stream)
+        copy_stream.sync()
+        return host_array
diff --git a/python/cuda_cccl/tests/compute/conftest.py b/python/cuda_cccl/tests/compute/conftest.py
index 1fa66c48360..ef72dc0fc54 100644
--- a/python/cuda_cccl/tests/compute/conftest.py
+++ b/python/cuda_cccl/tests/compute/conftest.py
@@ -1,9 +1,11 @@
 import builtins
+from collections.abc import Generator
 
-import cupy as cp
 import numpy as np
 import pytest
 
+from cuda.core import Device, Stream
+
 check_ldl_stl_in_sass = False
 
 
@@ -37,15 +39,15 @@ def input_array(request):
             low_inclusive, high_exclusive = 0, 8
         else:
             low_inclusive, high_exclusive = -5, 6
-        array = cp.random.randint(
+        array = np.random.randint(
             low=low_inclusive, high=high_exclusive, size=sample_size, dtype=dtype
         )
     elif np.issubdtype(dtype, np.floating):
         # For floating-point types, use np.random.random and cast to the required dtype
-        array = cp.random.random(sample_size).astype(dtype)
+        array = np.random.random(sample_size).astype(dtype)
     elif np.issubdtype(dtype, np.complexfloating):
         # For complex types, generate random real and imaginary parts
-        packed = cp.random.random(2 * sample_size)
+        packed = np.random.random(2 * sample_size)
         real_part = packed[:sample_size]
         imag_part = packed[sample_size:]
         array = (real_part + 1j * imag_part).astype(dtype)
@@ -65,29 +67,19 @@ def floating_array(request):
     sample_size = 1000
 
     # Generate random floating-point values
-    array = cp.random.random(sample_size).astype(dtype)
+    array = np.random.random(sample_size).astype(dtype)
     return array
 
 
-class Stream:
-    """
-    Simple cupy stream wrapper that implements the __cuda_stream__ protocol.
-    """
-
-    def __init__(self, cp_stream):
-        self.cp_stream = cp_stream
-
-    def __cuda_stream__(self):
-        return (0, self.cp_stream.ptr)
-
-    @property
-    def ptr(self):
-        return self.cp_stream.ptr
-
-
 @pytest.fixture(scope="function")
-def cuda_stream() -> Stream:
-    return Stream(cp.cuda.Stream())
+def cuda_stream() -> Generator[Stream, None, None]:
+    device = Device()
+    device.set_current()  # make the device current before creating a stream on it
+    stream = device.create_stream()
+    try:
+        yield stream  # the test body runs while the fixture is suspended here
+    finally:
+        stream.close()  # teardown: always close the per-test stream
 
 
 @pytest.fixture(scope="function", autouse=True)
diff --git a/python/cuda_cccl/tests/compute/test_binary_search.py b/python/cuda_cccl/tests/compute/test_binary_search.py
index 585a998c44c..ac38d92fc1d 100644
--- a/python/cuda_cccl/tests/compute/test_binary_search.py
+++ b/python/cuda_cccl/tests/compute/test_binary_search.py
@@ -1,9 +1,9 @@
 # Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-import cupy as cp
 import numpy as np
 import pytest
+from _utils.device_array import DeviceArray
 
 import cuda.compute
 from cuda.compute import OpKind
@@ -51,21 +51,21 @@ def test_binary_search_explicit_opkind_less(search, side):
     h_data = np.array([1, 3, 3, 7, 9], dtype=np.int32)
     h_values = np.array([0, 3, 4, 10], dtype=np.int32)
 
-    d_data = cp.asarray(h_data)
-    d_values = cp.asarray(h_values)
-    d_out = cp.empty(len(h_values), dtype=np.uintp)
+    d_data = DeviceArray.from_numpy(h_data)
+    d_values = DeviceArray.from_numpy(h_values)
+    d_out = DeviceArray.empty(len(h_values), np.uintp)
 
     search(
         d_data=d_data,
-        num_items=len(d_data),
+        num_items=len(h_data),
         d_values=d_values,
-        num_values=len(d_values),
+        num_values=len(h_values),
         d_out=d_out,
         comp=OpKind.LESS,
     )
 
     expected = np.searchsorted(h_data, h_values, side=side).astype(np.uintp)
-    np.testing.assert_array_equal(d_out.get(), expected)
+    np.testing.assert_array_equal(d_out.copy_to_host(), expected)
 
 
 @pytest.mark.parametrize(
@@ -82,21 +82,21 @@ def test_binary_search_custom_comparator(search, side):
     def greater(lhs, rhs):
         return lhs > rhs
 
-    d_data = cp.asarray(h_data)
-    d_values = cp.asarray(h_values)
-    d_out = cp.empty(len(h_values), dtype=np.uintp)
+    d_data = DeviceArray.from_numpy(h_data)
+    d_values = DeviceArray.from_numpy(h_values)
+    d_out = DeviceArray.empty(len(h_values), np.uintp)
 
     search(
         d_data=d_data,
-        num_items=len(d_data),
+        num_items=len(h_data),
         d_values=d_values,
-        num_values=len(d_values),
+        num_values=len(h_values),
         d_out=d_out,
         comp=greater,
     )
 
     expected = np.searchsorted(-h_data, -h_values, side=side).astype(np.uintp)
-    np.testing.assert_array_equal(d_out.get(), expected)
+    np.testing.assert_array_equal(d_out.copy_to_host(), expected)
 
 
 @pytest.mark.parametrize("dtype", DTYPE_LIST)
@@ -107,9 +107,9 @@ def test_lower_bound_basic(dtype, num_items, num_values):
     h_data = random_sorted_array(num_items, dtype)
     h_values = random_sorted_array(num_values, dtype)
 
-    d_data = cp.asarray(h_data)
-    d_values = cp.asarray(h_values)
-    d_out = cp.empty(num_values, dtype=np.uintp)
+    d_data = DeviceArray.from_numpy(h_data)
+    d_values = DeviceArray.from_numpy(h_values)
+    d_out = DeviceArray.empty(num_values, np.uintp)
 
     cuda.compute.lower_bound(
         d_data=d_data,
@@ -120,7 +120,7 @@ def test_lower_bound_basic(dtype, num_items, num_values):
     )
 
     expected = np.searchsorted(h_data, h_values, side="left").astype(np.uintp)
-    got = cp.asnumpy(d_out)
+    got = d_out.copy_to_host()
     assert np.array_equal(got, expected)
 
 
@@ -132,9 +132,9 @@ def test_upper_bound_basic(dtype, num_items, num_values):
     h_data = random_sorted_array(num_items, dtype)
     h_values = random_sorted_array(num_values, dtype)
 
-    d_data = cp.asarray(h_data)
-    d_values = cp.asarray(h_values)
-    d_out = cp.empty(num_values, dtype=np.uintp)
+    d_data = DeviceArray.from_numpy(h_data)
+    d_values = DeviceArray.from_numpy(h_values)
+    d_out = DeviceArray.empty(num_values, np.uintp)
 
     cuda.compute.upper_bound(
         d_data=d_data,
@@ -145,7 +145,7 @@ def test_upper_bound_basic(dtype, num_items, num_values):
     )
 
     expected = np.searchsorted(h_data, h_values, side="right").astype(np.uintp)
-    got = cp.asnumpy(d_out)
+    got = d_out.copy_to_host()
     assert np.array_equal(got, expected)
 
 
@@ -164,9 +164,9 @@ def test_binary_search_with_duplicates(dtype):
         else rng.random(128, dtype=dtype)
     )
 
-    d_data = cp.asarray(h_data)
-    d_values = cp.asarray(h_values)
-    d_out = cp.empty(len(h_values), dtype=np.uintp)
+    d_data = DeviceArray.from_numpy(h_data)
+    d_values = DeviceArray.from_numpy(h_values)
+    d_out = DeviceArray.empty(len(h_values), np.uintp)
 
     cuda.compute.lower_bound(
         d_data=d_data,
@@ -176,7 +176,7 @@ def test_binary_search_with_duplicates(dtype):
         d_out=d_out,
     )
     expected = np.searchsorted(h_data, h_values, side="left").astype(np.uintp)
-    got = cp.asnumpy(d_out)
+    got = d_out.copy_to_host()
     assert np.array_equal(got, expected)
 
     cuda.compute.upper_bound(
@@ -187,39 +187,43 @@ def test_binary_search_with_duplicates(dtype):
         d_out=d_out,
     )
     expected = np.searchsorted(h_data, h_values, side="right").astype(np.uintp)
-    got = cp.asnumpy(d_out)
+    got = d_out.copy_to_host()
     assert np.array_equal(got, expected)
 
 
 def test_binary_search_requires_unsigned_output():
     """Output must be unsigned integer dtype for indices."""
-    d_data = cp.asarray(np.array([1, 2, 3, 4], dtype=np.int32))
-    d_values = cp.asarray(np.array([2, 3], dtype=np.int32))
-    d_out = cp.empty(len(d_values), dtype=np.int32)  # signed, should fail
+    h_data = np.array([1, 2, 3, 4], dtype=np.int32)
+    h_values = np.array([2, 3], dtype=np.int32)
+    d_data = DeviceArray.from_numpy(h_data)
+    d_values = DeviceArray.from_numpy(h_values)
+    d_out = DeviceArray.empty(len(h_values), np.int32)  # signed, should fail
 
     with pytest.raises(TypeError, match="unsigned integer"):
         cuda.compute.lower_bound(
             d_data=d_data,
-            num_items=len(d_data),
+            num_items=len(h_data),
             d_values=d_values,
-            num_values=len(d_values),
+            num_values=len(h_values),
             d_out=d_out,
         )
 
 
 def test_binary_search_requires_pointer_sized_output():
     """Output must be pointer-sized (np.uintp) to hold any valid index."""
-    d_data = cp.asarray(np.array([1, 2, 3, 4], dtype=np.int32))
-    d_values = cp.asarray(np.array([2, 3], dtype=np.int32))
-    d_out = cp.empty(
-        len(d_values), dtype=np.uint32
+    h_data = np.array([1, 2, 3, 4], dtype=np.int32)
+    h_values = np.array([2, 3], dtype=np.int32)
+    d_data = DeviceArray.from_numpy(h_data)
+    d_values = DeviceArray.from_numpy(h_values)
+    d_out = DeviceArray.empty(
+        len(h_values), np.uint32
     )  # unsigned but not pointer-sized (on 64-bit)
 
     with pytest.raises(ValueError, match="pointer-sized"):
         cuda.compute.lower_bound(
             d_data=d_data,
-            num_items=len(d_data),
+            num_items=len(h_data),
             d_values=d_values,
-            num_values=len(d_values),
+            num_values=len(h_values),
             d_out=d_out,
         )
diff --git a/python/cuda_cccl/tests/compute/test_deferred_annotations.py b/python/cuda_cccl/tests/compute/test_deferred_annotations.py
index c30aeda4068..757c5d6e71f 100644
--- a/python/cuda_cccl/tests/compute/test_deferred_annotations.py
+++ b/python/cuda_cccl/tests/compute/test_deferred_annotations.py
@@ -5,8 +5,8 @@
 
 from __future__ import annotations
 
-import cupy as cp
 import numpy as np
+from _utils.device_array import DeviceArray
 
 from cuda.compute import OpKind, TransformIterator, gpu_struct, reduce_into
 
@@ -25,18 +25,19 @@ def test_transform_iterator_future_annotations():
     def add_one(x: "np.int32") -> "np.int32":
         return x + np.int32(1)
 
-    d_in = cp.arange(8, dtype=np.int32)
-    d_out = cp.empty(1, dtype=np.int32)
+    h_in = np.arange(8, dtype=np.int32)
+    d_in = DeviceArray.from_numpy(h_in)
+    d_out = DeviceArray.empty(1, np.int32)
     h_init = np.array([0], dtype=np.int32)
 
     transform_it = TransformIterator(d_in, add_one)
     reduce_into(
         d_in=transform_it,
         d_out=d_out,
-        num_items=d_in.size,
+        num_items=h_in.size,
         op=OpKind.PLUS,
         h_init=h_init,
     )
 
-    expected = int(cp.sum(d_in + 1).get())
-    assert int(d_out.get()[0]) == expected
+    expected = int(np.sum(h_in + 1))
+    assert int(d_out.copy_to_host()[0]) == expected
diff --git a/python/cuda_cccl/tests/compute/test_histogram.py b/python/cuda_cccl/tests/compute/test_histogram.py
index a002b0f9131..cea8a2519cf 100644
--- a/python/cuda_cccl/tests/compute/test_histogram.py
+++ b/python/cuda_cccl/tests/compute/test_histogram.py
@@ -4,9 +4,9 @@
 
 import math
 
-import cupy as cp
 import numpy as np
 import pytest
+from _utils.device_array import DeviceArray
 
 import cuda.compute
 from cuda.compute import (
@@ -106,9 +106,9 @@ def test_device_histogram_basic_use(dtype, num_samples):
     upper_level = dtype(max_level)
 
     h_samples = random_int_array(num_samples, dtype)
-    d_samples = cp.asarray(h_samples)
+    d_samples = DeviceArray.from_numpy(h_samples)
 
-    d_histogram = cp.zeros(num_levels - 1, dtype=np.int32)
+    d_histogram = DeviceArray.from_numpy(np.zeros(num_levels - 1, dtype=np.int32))
 
     cuda.compute.histogram_even(
         d_samples=d_samples,
@@ -122,7 +122,7 @@ def test_device_histogram_basic_use(dtype, num_samples):
     h_expected = compute_reference_histogram(
         h_samples, num_levels, lower_level, upper_level
     )
-    h_result = cp.asnumpy(d_histogram)
+    h_result = d_histogram.copy_to_host()
 
     np.testing.assert_array_equal(h_result, h_expected)
 
@@ -138,7 +138,7 @@ def test_device_histogram_sample_iterator():
 
     counting_it = CountingIterator(np.int32(0))
 
-    d_histogram = cp.zeros(num_levels - 1, dtype=np.int32)
+    d_histogram = DeviceArray.from_numpy(np.zeros(num_levels - 1, dtype=np.int32))
 
     # Set up levels so that values 0 to adjusted_total_samples-1 are evenly distributed
     lower_level = np.int32(0.0)
@@ -155,20 +155,20 @@ def test_device_histogram_sample_iterator():
 
     # Each bin should have exactly samples_per_bin elements
     h_expected = np.full(num_bins, samples_per_bin, dtype=np.int32)
-    h_result = cp.asnumpy(d_histogram)
+    h_result = d_histogram.copy_to_host()
 
     np.testing.assert_array_equal(h_result, h_expected)
 
 
 def test_device_histogram_single_sample():
     h_samples = np.array([5.0], dtype=np.float32)
-    d_samples = cp.asarray(h_samples)
+    d_samples = DeviceArray.from_numpy(h_samples)
 
     num_levels = 5
     lower_level = np.float32(0.0)
     upper_level = np.float32(10.0)
 
-    d_histogram = cp.zeros(num_levels - 1, dtype=np.int32)
+    d_histogram = DeviceArray.from_numpy(np.zeros(num_levels - 1, dtype=np.int32))
 
     cuda.compute.histogram_even(
         d_samples=d_samples,
@@ -181,20 +181,20 @@ def test_device_histogram_single_sample():
 
     # Sample 5.0 should go into bin 2 (bins: [0,2.5), [2.5,5), [5,7.5), [7.5,10))
     h_expected = np.array([0, 0, 1, 0], dtype=np.int32)
-    h_result = cp.asnumpy(d_histogram)
+    h_result = d_histogram.copy_to_host()
 
     np.testing.assert_array_equal(h_result, h_expected)
 
 
 def test_device_histogram_out_of_range():
     h_samples = np.array([-1.0, 0.5, 5.5, 10.5, 15.0], dtype=np.float32)
-    d_samples = cp.asarray(h_samples)
+    d_samples = DeviceArray.from_numpy(h_samples)
 
     num_levels = 3  # 2 bins: [0,5), [5,10)
     lower_level = np.float32(0.0)
     upper_level = np.float32(10.0)
 
-    d_histogram = cp.zeros(num_levels - 1, dtype=np.int32)
+    d_histogram = DeviceArray.from_numpy(np.zeros(num_levels - 1, dtype=np.int32))
 
     cuda.compute.histogram_even(
         d_samples=d_samples,
@@ -208,26 +208,22 @@ def test_device_histogram_out_of_range():
     # Only 0.5 (bin 0) and 5.5 (bin 1) should be counted
     # -1.0, 10.5, and 15.0 are out of range
     h_expected = np.array([1, 1], dtype=np.int32)
-    h_result = cp.asnumpy(d_histogram)
+    h_result = d_histogram.copy_to_host()
 
     np.testing.assert_array_equal(h_result, h_expected)
 
 
 def test_device_histogram_with_stream(cuda_stream):
-    cp_stream = cp.cuda.ExternalStream(cuda_stream.ptr)
-
     h_samples = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], dtype=np.float32)
-    d_samples = cp.asarray(h_samples)
+    d_samples = DeviceArray.from_numpy(h_samples, stream=cuda_stream)
 
     num_levels = 5  # 4 bins: [0,2), [2,4), [4,6), [6,8)
     lower_level = np.float32(0.0)
     upper_level = np.float32(8.0)
 
-    d_histogram = cp.zeros(num_levels - 1, dtype=np.int32)
-
-    with cp_stream:
-        d_samples = cp.asarray(h_samples)
-        d_histogram = cp.zeros(num_levels - 1, dtype=np.int32)
+    d_histogram = DeviceArray.from_numpy(
+        np.zeros(num_levels - 1, dtype=np.int32), stream=cuda_stream
+    )
 
     cuda.compute.histogram_even(
         d_samples=d_samples,
@@ -239,8 +235,7 @@ def test_device_histogram_with_stream(cuda_stream):
         stream=cuda_stream,
     )
 
-    with cp_stream:
-        h_result = cp.asnumpy(d_histogram)
+    h_result = d_histogram.copy_to_host(stream=cuda_stream)
 
     # Expected: bin 0: [1.0, 2.0), bin 1: [2.0, 4.0), bin 2: [4.0, 6.0), bin 3: [6.0, 8.0)
     # Values: 1.0->bin0, 2.0->bin1, 3.0->bin1, 4.0->bin2, 5.0->bin2, 6.0->bin3, 7.0->bin3, 8.0->out_of_range
@@ -258,7 +253,7 @@ def test_device_histogram_with_constant_iterator():
     lower_level = np.float32(0.0)
     upper_level = np.float32(8.0)
 
-    d_histogram = cp.zeros(num_levels - 1, dtype=np.int32)
+    d_histogram = DeviceArray.from_numpy(np.zeros(num_levels - 1, dtype=np.int32))
 
     cuda.compute.histogram_even(
         d_samples=constant_it,
@@ -269,7 +264,7 @@ def test_device_histogram_with_constant_iterator():
         num_samples=num_samples,
     )
 
-    h_result = cp.asnumpy(d_histogram)
+    h_result = d_histogram.copy_to_host()
 
     # Expected: All 10 samples have value 3.0, which falls in bin 1 [2,4)
     h_expected = np.array([0, 10, 0, 0], dtype=np.int32)
@@ -278,16 +273,13 @@ def test_device_histogram_with_constant_iterator():
 
 
 def test_histogram_even():
-    import cupy as cp
-    import numpy as np
-
     num_samples = 10
     h_samples = np.array(
         [2.2, 6.1, 7.1, 2.9, 3.5, 0.3, 2.9, 2.1, 6.1, 999.5], dtype="float32"
     )
-    d_samples = cp.asarray(h_samples)
+    d_samples = DeviceArray.from_numpy(h_samples)
     num_levels = 7
-    d_histogram = cp.empty(num_levels - 1, dtype="int32")
+    d_histogram = DeviceArray.empty(num_levels - 1, np.int32)
     lower_level = np.float32(0)
     upper_level = np.float32(12)
 
@@ -302,7 +294,7 @@ def test_histogram_even():
     )
 
     # Check the result is correct
-    h_actual_histogram = cp.asnumpy(d_histogram)
+    h_actual_histogram = d_histogram.copy_to_host()
     # Calculate expected histogram using numpy
     h_expected_histogram, _ = np.histogram(
         h_samples, bins=num_levels - 1, range=(lower_level, upper_level)
@@ -319,8 +311,6 @@ def test_histogram_cache_bug_crosses_256_bin_threshold():
     # in invalid shared memory accesses, because a different shared
     # memory strategy is used for num_bins > 256.
     num_samples = 128
-    d_samples = cp.empty(num_samples, dtype=np.int32)
-    d_histogram = cp.empty(2048, dtype=np.int32)
     h_num_output_levels = np.array([0], dtype=np.int32)
     h_lower_level = np.array([0], dtype=np.int32)
     h_upper_level = np.array([0], dtype=np.int32)
@@ -331,12 +321,13 @@ def test_histogram_cache_bug_crosses_256_bin_threshold():
     h_lower_level[0] = 0
     h_upper_level[0] = num_bins_1
 
-    d_samples[:] = cp.random.randint(0, num_bins_1, size=num_samples, dtype=np.int32)
-    d_histogram[:num_bins_1] = 0
+    h_samples = np.random.randint(0, num_bins_1, size=num_samples, dtype=np.int32)
+    d_samples = DeviceArray.from_numpy(h_samples)
+    d_histogram = DeviceArray.from_numpy(np.zeros(num_bins_1, dtype=np.int32))
 
     hist = cuda.compute.make_histogram_even(
         d_samples=d_samples,
-        d_histogram=d_histogram[:num_bins_1],
+        d_histogram=d_histogram,
         h_num_output_levels=h_num_output_levels,
         h_lower_level=h_lower_level,
         h_upper_level=h_upper_level,
@@ -345,36 +336,36 @@ def test_histogram_cache_bug_crosses_256_bin_threshold():
     temp_bytes = hist(
         temp_storage=None,
         d_samples=d_samples,
-        d_histogram=d_histogram[:num_bins_1],
+        d_histogram=d_histogram,
         h_num_output_levels=h_num_output_levels,
         h_lower_level=h_lower_level,
         h_upper_level=h_upper_level,
         num_samples=num_samples,
     )
-    temp_storage = cp.empty(temp_bytes, dtype=np.uint8)
+    temp_storage = DeviceArray.empty(temp_bytes, np.uint8)
     hist(
         temp_storage=temp_storage,
         d_samples=d_samples,
-        d_histogram=d_histogram[:num_bins_1],
+        d_histogram=d_histogram,
         h_num_output_levels=h_num_output_levels,
         h_lower_level=h_lower_level,
         h_upper_level=h_upper_level,
         num_samples=num_samples,
     )
-    cp.cuda.Device().synchronize()
-    assert int(d_histogram[:num_bins_1].sum()) == num_samples
+    assert int(d_histogram.copy_to_host().sum()) == num_samples
 
     num_bins_2 = 2048
     h_num_output_levels[0] = num_bins_2 + 1
     h_lower_level[0] = 0
     h_upper_level[0] = num_bins_2
 
-    d_samples[:] = cp.random.randint(0, num_bins_2, size=num_samples, dtype=np.int32)
-    d_histogram[:num_bins_2] = 0
+    h_samples = np.random.randint(0, num_bins_2, size=num_samples, dtype=np.int32)
+    d_samples = DeviceArray.from_numpy(h_samples)
+    d_histogram = DeviceArray.from_numpy(np.zeros(num_bins_2, dtype=np.int32))
 
     hist2 = cuda.compute.make_histogram_even(
         d_samples=d_samples,
-        d_histogram=d_histogram[:num_bins_2],
+        d_histogram=d_histogram,
         h_num_output_levels=h_num_output_levels,
         h_lower_level=h_lower_level,
         h_upper_level=h_upper_level,
@@ -384,24 +375,23 @@ def test_histogram_cache_bug_crosses_256_bin_threshold():
     temp_bytes2 = hist2(
         temp_storage=None,
         d_samples=d_samples,
-        d_histogram=d_histogram[:num_bins_2],
+        d_histogram=d_histogram,
         h_num_output_levels=h_num_output_levels,
         h_lower_level=h_lower_level,
         h_upper_level=h_upper_level,
         num_samples=num_samples,
     )
-    temp_storage2 = cp.empty(temp_bytes2, dtype=np.uint8)
+    temp_storage2 = DeviceArray.empty(temp_bytes2, np.uint8)
     hist2(
         temp_storage=temp_storage2,
         d_samples=d_samples,
-        d_histogram=d_histogram[:num_bins_2],
+        d_histogram=d_histogram,
         h_num_output_levels=h_num_output_levels,
         h_lower_level=h_lower_level,
         h_upper_level=h_upper_level,
         num_samples=num_samples,
     )
-    cp.cuda.Device().synchronize()
-    assert int(d_histogram[:num_bins_2].sum()) == num_samples
+    assert int(d_histogram.copy_to_host().sum()) == num_samples
 
 
 def test_histogram_cache_reuses_artifact_when_bounds_change():
@@ -409,8 +399,8 @@ def test_histogram_cache_reuses_artifact_when_bounds_change():
 
     num_samples = 8
     num_levels = 5
-    d_samples = cp.asarray(np.arange(num_samples, dtype=np.float32))
-    d_histogram = cp.empty(num_levels - 1, dtype=np.int32)
+    d_samples = DeviceArray.from_numpy(np.arange(num_samples, dtype=np.float32))
+    d_histogram = DeviceArray.empty(num_levels - 1, np.int32)
     h_num_output_levels = np.array([num_levels], dtype=np.int32)
 
     h_lower_level_1 = np.array([0], dtype=np.float32)
@@ -436,8 +426,8 @@ def test_histogram_cache_reuses_artifact_when_bounds_change():
     )
     assert hist1 is hist2
 
-    d_samples = cp.asarray(np.arange(10, 18, dtype=np.float32))
-    d_histogram.fill(0)
+    d_samples = DeviceArray.from_numpy(np.arange(10, 18, dtype=np.float32))
+    d_histogram.copy_from_host(np.zeros(num_levels - 1, dtype=np.int32))
     temp_bytes = hist2(
         temp_storage=None,
         d_samples=d_samples,
@@ -447,7 +437,7 @@ def test_histogram_cache_reuses_artifact_when_bounds_change():
         h_upper_level=h_upper_level_2,
         num_samples=num_samples,
     )
-    temp_storage = cp.empty(temp_bytes, dtype=np.uint8)
+    temp_storage = DeviceArray.empty(temp_bytes, np.uint8)
     hist2(
         temp_storage=temp_storage,
         d_samples=d_samples,
@@ -457,10 +447,8 @@ def test_histogram_cache_reuses_artifact_when_bounds_change():
         h_upper_level=h_upper_level_2,
         num_samples=num_samples,
     )
-    cp.cuda.Device().synchronize()
-
     np.testing.assert_array_equal(
-        cp.asnumpy(d_histogram), np.array([2, 2, 2, 2], dtype=np.int32)
+        d_histogram.copy_to_host(), np.array([2, 2, 2, 2], dtype=np.int32)
     )
 
 
@@ -468,13 +456,13 @@ def test_histogram_cache_reuses_artifact_for_same_offset_width():
     cuda.compute.clear_all_caches()
 
     num_levels = 5
-    d_histogram = cp.empty(num_levels - 1, dtype=np.int32)
+    d_histogram = DeviceArray.empty(num_levels - 1, np.int32)
     h_num_output_levels = np.array([num_levels], dtype=np.int32)
     h_lower_level = np.array([0], dtype=np.float32)
     h_upper_level = np.array([12], dtype=np.float32)
 
     hist1 = cuda.compute.make_histogram_even(
-        d_samples=cp.asarray(np.arange(8, dtype=np.float32)),
+        d_samples=DeviceArray.from_numpy(np.arange(8, dtype=np.float32)),
         d_histogram=d_histogram,
         h_num_output_levels=h_num_output_levels,
         h_lower_level=h_lower_level,
@@ -482,7 +470,7 @@ def test_histogram_cache_reuses_artifact_for_same_offset_width():
         num_samples=8,
     )
     hist2 = cuda.compute.make_histogram_even(
-        d_samples=cp.asarray(np.arange(12, dtype=np.float32)),
+        d_samples=DeviceArray.from_numpy(np.arange(12, dtype=np.float32)),
         d_histogram=d_histogram,
         h_num_output_levels=h_num_output_levels,
         h_lower_level=h_lower_level,
@@ -497,7 +485,7 @@ def test_histogram_cache_reuses_artifact_for_same_offset_width():
         np.iinfo(np.int32).max / np.dtype(np.float32).itemsize
     )
     hist3 = cuda.compute.make_histogram_even(
-        d_samples=cp.asarray(np.arange(12, dtype=np.float32)),
+        d_samples=DeviceArray.from_numpy(np.arange(12, dtype=np.float32)),
         d_histogram=d_histogram,
         h_num_output_levels=h_num_output_levels,
         h_lower_level=h_lower_level,
@@ -506,8 +494,8 @@ def test_histogram_cache_reuses_artifact_for_same_offset_width():
     )
     assert hist3 is not hist1
 
-    d_samples = cp.asarray(np.arange(12, dtype=np.float32))
-    d_histogram.fill(0)
+    d_samples = DeviceArray.from_numpy(np.arange(12, dtype=np.float32))
+    d_histogram.copy_from_host(np.zeros(num_levels - 1, dtype=np.int32))
     temp_bytes = hist2(
         temp_storage=None,
         d_samples=d_samples,
@@ -517,7 +505,7 @@ def test_histogram_cache_reuses_artifact_for_same_offset_width():
         h_upper_level=h_upper_level,
         num_samples=12,
     )
-    temp_storage = cp.empty(temp_bytes, dtype=np.uint8)
+    temp_storage = DeviceArray.empty(temp_bytes, np.uint8)
     hist2(
         temp_storage=temp_storage,
         d_samples=d_samples,
@@ -527,17 +515,15 @@ def test_histogram_cache_reuses_artifact_for_same_offset_width():
         h_upper_level=h_upper_level,
         num_samples=12,
     )
-    cp.cuda.Device().synchronize()
-
     np.testing.assert_array_equal(
-        cp.asnumpy(d_histogram), np.array([3, 3, 3, 3], dtype=np.int32)
+        d_histogram.copy_to_host(), np.array([3, 3, 3, 3], dtype=np.int32)
     )
 
 
 def test_make_histogram_even_rejects_mismatched_bound_dtypes():
     num_samples = 8
-    d_samples = cp.asarray(np.arange(num_samples, dtype=np.int32))
-    d_histogram = cp.empty(4, dtype=np.int32)
+    d_samples = DeviceArray.from_numpy(np.arange(num_samples, dtype=np.int32))
+    d_histogram = DeviceArray.empty(4, np.int32)
 
     with pytest.raises(TypeError, match="must have the same dtype"):
         cuda.compute.make_histogram_even(
diff --git a/python/cuda_cccl/tests/compute/test_iterators.py b/python/cuda_cccl/tests/compute/test_iterators.py
index 49e2ab39d15..c5b57e0581d 100644
--- a/python/cuda_cccl/tests/compute/test_iterators.py
+++ b/python/cuda_cccl/tests/compute/test_iterators.py
@@ -3,10 +3,9 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-import cupy as cp
-import numba.cuda
 import numpy as np
 import pytest
+from _utils.device_array import DeviceArray
 
 import cuda.compute
 from cuda.compute import OpKind
@@ -43,9 +42,9 @@ def test_counting_iterator_equality():
 
 
 def test_cache_modified_input_iterator_equality():
-    ary1 = cp.asarray([0, 1, 2], dtype="int32")
-    ary2 = cp.asarray([3, 4, 5], dtype="int32")
-    ary3 = cp.asarray([0, 1, 2], dtype="int64")
+    ary1 = DeviceArray.from_numpy(np.asarray([0, 1, 2], dtype="int32"))
+    ary2 = DeviceArray.from_numpy(np.asarray([3, 4, 5], dtype="int32"))
+    ary3 = DeviceArray.from_numpy(np.asarray([0, 1, 2], dtype="int64"))
 
     it1 = CacheModifiedInputIterator(ary1, "stream")
     it2 = CacheModifiedInputIterator(ary1, "stream")
@@ -76,8 +75,8 @@ def op3(x):
     # op3 has a different name than op1, so should have a different kind
     assert it1.kind != it3.kind
 
-    ary1 = cp.asarray([0, 1, 2])
-    ary2 = cp.asarray([3, 4, 5])
+    ary1 = DeviceArray.from_numpy(np.asarray([0, 1, 2]))
+    ary2 = DeviceArray.from_numpy(np.asarray([3, 4, 5]))
     it4 = TransformIterator(ary1, op1)
     it5 = TransformIterator(ary1, op1)
     it6 = TransformIterator(ary1, op2)
@@ -91,46 +90,10 @@ def op3(x):
     assert it4.kind != it7.kind
 
 
-@pytest.fixture(
-    params=[
-        # Each tuple is (shape, layout, array_type)
-        ((5,), "C", "cupy"),
-        ((5,), "F", "cupy"),
-        ((5,), "C", "numba"),
-        ((5,), "F", "numba"),
-        ((4, 3), "C", "cupy"),
-        ((4, 3), "F", "cupy"),
-        ((4, 3), "C", "numba"),
-        ((4, 3), "F", "numba"),
-        ((3, 4, 2), "C", "cupy"),
-        ((3, 4, 2), "F", "cupy"),
-        ((3, 4, 2), "C", "numba"),
-        ((3, 4, 2), "F", "numba"),
-    ],
-    ids=lambda param: f"{param[2]}_{param[1]}_{len(param[0])}D",
-)
-def reverse_iterator_array(request):
-    shape, layout, array_type = request.param
-
-    # Create base numpy array
-    base_array = np.arange(np.prod(shape))
-    base_array[-1] = -999
-    base_array = base_array.reshape(shape)
-    if layout == "F":
-        base_array = np.asfortranarray(base_array)
-
-    if array_type == "cupy":
-        array = cp.array(base_array)
-    else:
-        array = numba.cuda.to_device(base_array)
-
-    return array
-
-
 def test_reverse_input_iterator_equality():
-    ary1 = cp.asarray([0, 1, 2], dtype="int32")
-    ary2 = cp.asarray([3, 4, 5], dtype="int32")
-    ary3 = cp.asarray([0, 1, 2], dtype="int64")
+    ary1 = DeviceArray.from_numpy(np.asarray([0, 1, 2], dtype="int32"))
+    ary2 = DeviceArray.from_numpy(np.asarray([3, 4, 5], dtype="int32"))
+    ary3 = DeviceArray.from_numpy(np.asarray([0, 1, 2], dtype="int64"))
 
     it1 = ReverseIterator(ary1)
     it2 = ReverseIterator(ary1)
@@ -142,9 +105,9 @@ def test_reverse_input_iterator_equality():
 
 
 def test_reverse_output_iterator_equality():
-    ary1 = cp.asarray([0, 1, 2], dtype="int32")
-    ary2 = cp.asarray([3, 4, 5], dtype="int32")
-    ary3 = cp.asarray([0, 1, 2], dtype="int64")
+    ary1 = DeviceArray.from_numpy(np.asarray([0, 1, 2], dtype="int32"))
+    ary2 = DeviceArray.from_numpy(np.asarray([3, 4, 5], dtype="int32"))
+    ary3 = DeviceArray.from_numpy(np.asarray([0, 1, 2], dtype="int64"))
 
     it1 = ReverseIterator(ary1)
     it2 = ReverseIterator(ary1)
@@ -205,7 +168,7 @@ def test_transform_iterator_with_lambda():
         CountingIterator(np.int32(first_item)), lambda x: x * 2
     )
     h_init = np.array([0], dtype=np.int32)
-    d_output = cp.empty(1, dtype=np.int32)
+    d_output = DeviceArray.empty(1, np.int32)
 
     # Perform reduction on the transformed iterator
     cuda.compute.reduce_into(
@@ -218,7 +181,7 @@ def test_transform_iterator_with_lambda():
 
     # Expected: sum of (10*2, 11*2, ..., 109*2) = 2 * sum(10..109)
     expected = 2 * sum(range(first_item, first_item + num_items))
-    assert d_output.get()[0] == expected
+    assert d_output.copy_to_host()[0] == expected
 
 
 def test_transform_iterator_with_zip_iterator():
@@ -226,8 +189,10 @@ def test_transform_iterator_with_zip_iterator():
     from cuda.compute.iterators import ZipIterator
 
     # Create a ZipIterator with two int32 arrays
-    d_a = cp.arange(10, dtype=np.int32)
-    d_b = cp.arange(100, 110, dtype=np.int32)
+    h_a = np.arange(10, dtype=np.int32)
+    h_b = np.arange(100, 110, dtype=np.int32)
+    d_a = DeviceArray.from_numpy(h_a)
+    d_b = DeviceArray.from_numpy(h_b)
 
     zip_it = ZipIterator(d_a, d_b)
 
@@ -242,17 +207,17 @@ def sum_fields(pair):
 
     # Use it in a reduction
     h_init = np.array([0], dtype=np.int32)
-    d_output = cp.empty(1, dtype=np.int32)
+    d_output = DeviceArray.empty(1, np.int32)
 
     cuda.compute.reduce_into(
         d_in=transform_it,
         d_out=d_output,
-        num_items=len(d_a),
+        num_items=len(h_a),
         op=OpKind.PLUS,
         h_init=h_init,
     )
 
-    result = d_output.get()[0]
-    expected = (d_a + d_b).sum().get()
+    result = d_output.copy_to_host()[0]
+    expected = (h_a + h_b).sum()
 
     assert result == expected, f"Expected {expected}, got {result}"
diff --git a/python/cuda_cccl/tests/compute/test_merge_sort.py b/python/cuda_cccl/tests/compute/test_merge_sort.py
index 33d7d15de58..0d929ae6f14 100644
--- a/python/cuda_cccl/tests/compute/test_merge_sort.py
+++ b/python/cuda_cccl/tests/compute/test_merge_sort.py
@@ -4,10 +4,9 @@
 
 from typing import List
 
-import cupy as cp
-import numba.cuda
 import numpy as np
 import pytest
+from _utils.device_array import DeviceArray
 
 import cuda.compute
 from cuda.compute import (
@@ -82,7 +81,7 @@ def compare_op(lhs, rhs):
 def test_merge_sort_keys(dtype, num_items, op):
     h_in_keys = random_array(num_items, dtype)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
 
     merge_sort_device(d_in_keys, None, d_in_keys, None, op, num_items)
 
@@ -102,8 +101,8 @@ def test_merge_sort_pairs(dtype, num_items, op, monkeypatch):
     h_in_keys = random_array(num_items, dtype)
     h_in_items = random_array(num_items, np.float32)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_items = numba.cuda.to_device(h_in_items)
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
+    d_in_items = DeviceArray.from_numpy(h_in_items)
 
     merge_sort_device(d_in_keys, d_in_items, d_in_keys, d_in_items, op, num_items)
 
@@ -123,8 +122,8 @@ def test_merge_sort_keys_copy(dtype, num_items, op):
     h_in_keys = random_array(num_items, dtype)
     h_out_keys = np.empty(num_items, dtype=dtype)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
+    d_out_keys = DeviceArray.empty(h_out_keys.shape, h_out_keys.dtype)
 
     merge_sort_device(d_in_keys, None, d_out_keys, None, op, num_items)
 
@@ -146,10 +145,10 @@ def test_merge_sort_pairs_copy(dtype, num_items, op, monkeypatch):
     h_out_keys = np.empty(num_items, dtype=dtype)
     h_out_items = np.empty(num_items, dtype=np.float32)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_items = numba.cuda.to_device(h_in_items)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
-    d_out_items = numba.cuda.to_device(h_out_items)
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
+    d_in_items = DeviceArray.from_numpy(h_in_items)
+    d_out_keys = DeviceArray.empty(h_out_keys.shape, h_out_keys.dtype)
+    d_out_items = DeviceArray.empty(h_out_items.shape, h_out_items.dtype)
 
     merge_sort_device(d_in_keys, d_in_items, d_out_keys, d_out_items, op, num_items)
 
@@ -195,17 +194,15 @@ def struct_compare_op(lhs, rhs):
     h_in_items["a"] = a_items
     h_in_items["b"] = b_items
 
-    d_in_keys = cp.empty_like(h_in_keys)
-    d_in_items = cp.empty_like(h_in_items)
-    d_in_keys.set(h_in_keys)
-    d_in_items.set(h_in_items)
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
+    d_in_items = DeviceArray.from_numpy(h_in_items)
 
     merge_sort_device(
         d_in_keys, d_in_items, d_in_keys, d_in_items, struct_compare_op, num_items
     )
 
-    h_out_keys = d_in_keys.get()
-    h_out_items = d_in_items.get()
+    h_out_keys = d_in_keys.copy_to_host()
+    h_out_items = d_in_items.copy_to_host()
 
     argsort = np.argsort(h_in_keys, stable=True)
     h_in_keys = np.array(h_in_keys)[argsort]
@@ -225,7 +222,7 @@ def compare_complex(lhs, rhs):
     imaginary = random_array(num_items, np.int64, max_value)
 
     h_in_keys = real + 1j * imaginary
-    d_in_keys = numba.cuda.to_device(h_in_keys)
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
 
     merge_sort_device(d_in_keys, None, d_in_keys, None, compare_complex, num_items)
 
@@ -240,8 +237,8 @@ def test_merge_sort_keys_copy_iterator_input(dtype, num_items, op):
     h_in_keys = random_array(num_items, dtype)
     h_out_keys = np.empty(num_items, dtype=dtype)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
+    d_out_keys = DeviceArray.empty(h_out_keys.shape, h_out_keys.dtype)
 
     i_input = CacheModifiedInputIterator(d_in_keys, modifier="stream")
 
@@ -265,10 +262,10 @@ def test_merge_sort_pairs_copy_iterator_input(dtype, num_items, op, monkeypatch)
     h_out_keys = np.empty(num_items, dtype=dtype)
     h_out_items = np.empty(num_items, dtype=np.float32)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_items = numba.cuda.to_device(h_in_items)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
-    d_out_items = numba.cuda.to_device(h_out_items)
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
+    d_in_items = DeviceArray.from_numpy(h_in_items)
+    d_out_keys = DeviceArray.empty(h_out_keys.shape, h_out_keys.dtype)
+    d_out_items = DeviceArray.empty(h_out_items.shape, h_out_items.dtype)
 
     i_input_keys = CacheModifiedInputIterator(d_in_keys, modifier="stream")
     i_input_items = CacheModifiedInputIterator(d_in_items, modifier="stream")
@@ -289,19 +286,17 @@ def test_merge_sort_pairs_copy_iterator_input(dtype, num_items, op, monkeypatch)
 
 
 def test_merge_sort_with_stream(cuda_stream):
-    cp_stream = cp.cuda.ExternalStream(cuda_stream.ptr)
     num_items = 10000
 
-    with cp_stream:
-        h_in_keys = random_array(num_items, np.int32)
-        d_in_keys = cp.asarray(h_in_keys)
-        d_out_keys = cp.empty_like(d_in_keys)
+    h_in_keys = random_array(num_items, np.int32)
+    d_in_keys = DeviceArray.from_numpy(h_in_keys, stream=cuda_stream)
+    d_out_keys = DeviceArray.empty(h_in_keys.shape, h_in_keys.dtype, stream=cuda_stream)
 
     merge_sort_device(
         d_in_keys, None, d_out_keys, None, compare_op, num_items, stream=cuda_stream
     )
 
-    got = d_out_keys.get()
+    got = d_out_keys.copy_to_host(stream=cuda_stream)
     h_in_keys.sort()
 
     np.testing.assert_array_equal(got, h_in_keys)
@@ -310,39 +305,41 @@ def test_merge_sort_with_stream(cuda_stream):
 def test_merge_sort_well_known_less():
     dtype = np.int32
 
-    d_in_keys = cp.array([5, 2, 8, 1, 9, 3], dtype=dtype)
-    d_out_keys = cp.empty_like(d_in_keys)
+    h_in_keys = np.array([5, 2, 8, 1, 9, 3], dtype=dtype)
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
+    d_out_keys = DeviceArray.empty(h_in_keys.shape, h_in_keys.dtype)
 
     cuda.compute.merge_sort(
         d_in_keys=d_in_keys,
         d_in_values=None,
         d_out_keys=d_out_keys,
         d_out_values=None,
-        num_items=len(d_in_keys),
+        num_items=len(h_in_keys),
         op=OpKind.LESS,
     )
 
     expected = np.array([1, 2, 3, 5, 8, 9])
-    np.testing.assert_equal(d_out_keys.get(), expected)
+    np.testing.assert_equal(d_out_keys.copy_to_host(), expected)
 
 
 def test_merge_sort_well_known_greater():
     dtype = np.int32
 
-    d_in_keys = cp.array([5, 2, 8, 1, 9, 3], dtype=dtype)
-    d_out_keys = cp.empty_like(d_in_keys)
+    h_in_keys = np.array([5, 2, 8, 1, 9, 3], dtype=dtype)
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
+    d_out_keys = DeviceArray.empty(h_in_keys.shape, h_in_keys.dtype)
 
     cuda.compute.merge_sort(
         d_in_keys=d_in_keys,
         d_in_values=None,
         d_out_keys=d_out_keys,
         d_out_values=None,
-        num_items=len(d_in_keys),
+        num_items=len(h_in_keys),
         op=OpKind.GREATER,
     )
 
     expected = np.array([9, 8, 5, 3, 2, 1])
-    np.testing.assert_equal(d_out_keys.get(), expected)
+    np.testing.assert_equal(d_out_keys.copy_to_host(), expected)
 
 
 def test_merge_sort_large_temp_storage_not_negative():
@@ -353,8 +350,8 @@ def test_merge_sort_large_temp_storage_not_negative():
     """
     num_items = 2**28
     dtype = np.int64
-    d_in_keys = cp.zeros(num_items, dtype=dtype)
-    d_out_keys = cp.empty(num_items, dtype=dtype)
+    d_in_keys = DeviceArray.empty(num_items, dtype)
+    d_out_keys = DeviceArray.empty(num_items, dtype)
 
     sorter = cuda.compute.make_merge_sort(
         d_in_keys=d_in_keys,
@@ -380,21 +377,23 @@ def test_merge_sort_large_temp_storage_not_negative():
 def test_merge_sort_with_values_well_known():
     dtype = np.int32
 
-    d_in_keys = cp.array([3, 1, 4, 2], dtype=dtype)
-    d_in_values = cp.array([30, 10, 40, 20], dtype=dtype)
-    d_out_keys = cp.empty_like(d_in_keys)
-    d_out_values = cp.empty_like(d_in_values)
+    h_in_keys = np.array([3, 1, 4, 2], dtype=dtype)
+    h_in_values = np.array([30, 10, 40, 20], dtype=dtype)
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
+    d_in_values = DeviceArray.from_numpy(h_in_values)
+    d_out_keys = DeviceArray.empty(h_in_keys.shape, h_in_keys.dtype)
+    d_out_values = DeviceArray.empty(h_in_values.shape, h_in_values.dtype)
 
     cuda.compute.merge_sort(
         d_in_keys=d_in_keys,
         d_in_values=d_in_values,
         d_out_keys=d_out_keys,
         d_out_values=d_out_values,
-        num_items=len(d_in_keys),
+        num_items=len(h_in_keys),
         op=OpKind.LESS,
     )
 
     expected_keys = np.array([1, 2, 3, 4])
     expected_values = np.array([10, 20, 30, 40])
-    np.testing.assert_equal(d_out_keys.get(), expected_keys)
-    np.testing.assert_equal(d_out_values.get(), expected_values)
+    np.testing.assert_equal(d_out_keys.copy_to_host(), expected_keys)
+    np.testing.assert_equal(d_out_values.copy_to_host(), expected_values)
diff --git a/python/cuda_cccl/tests/compute/test_nested_struct.py b/python/cuda_cccl/tests/compute/test_nested_struct.py
index 9cbfa40e393..d9aa986bf9c 100644
--- a/python/cuda_cccl/tests/compute/test_nested_struct.py
+++ b/python/cuda_cccl/tests/compute/test_nested_struct.py
@@ -1,8 +1,8 @@
 # Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-import cupy as cp
 import numpy as np
+from _utils.device_array import DeviceArray
 
 import cuda.compute
 from cuda.compute import ZipIterator, gpu_struct
@@ -25,13 +25,8 @@ def sum_nested(s1, s2):
         h_data[i]["inner"]["a"] = i * 2
         h_data[i]["inner"]["b"] = float(i * 3)
 
-    itemsize = h_data.dtype.itemsize
-    d_input = cp.empty(num_items * itemsize, dtype=np.uint8)
-    d_input.set(h_data.view(np.uint8))
-    d_input = d_input.view(Outer.dtype)
-
-    d_output = cp.empty(itemsize, dtype=np.uint8)
-    d_output = d_output.view(Outer.dtype)
+    d_input = DeviceArray.from_numpy(h_data)
+    d_output = DeviceArray.empty(1, Outer.dtype)
 
     h_init = Outer(0, Inner(0, 0.0))
 
@@ -39,7 +34,7 @@ def sum_nested(s1, s2):
         d_in=d_input, d_out=d_output, num_items=num_items, op=sum_nested, h_init=h_init
     )
 
-    result = d_output.view(np.uint8).get().view(Outer.dtype)[0]
+    result = d_output.copy_to_host()[0]
 
     expected_x = sum(range(num_items))
     expected_a = sum(i * 2 for i in range(num_items))
@@ -71,13 +66,8 @@ def sum_nested(s1, s2):
         h_data[i]["inner"]["a"] = i * 2
         h_data[i]["inner"]["b"] = float(i * 3)
 
-    itemsize = h_data.dtype.itemsize
-    d_input = cp.empty(num_items * itemsize, dtype=np.uint8)
-    d_input.set(h_data.view(np.uint8))
-    d_input = d_input.view(Outer.dtype)
-
-    d_output = cp.empty(itemsize, dtype=np.uint8)
-    d_output = d_output.view(Outer.dtype)
+    d_input = DeviceArray.from_numpy(h_data)
+    d_output = DeviceArray.empty(1, Outer.dtype)
 
     h_init = Outer(0, Inner(0, 0.0))
 
@@ -85,7 +75,7 @@ def sum_nested(s1, s2):
         d_in=d_input, d_out=d_output, num_items=num_items, op=sum_nested, h_init=h_init
     )
 
-    result = d_output.view(np.uint8).get().view(Outer.dtype)[0]
+    result = d_output.copy_to_host()[0]
 
     expected_x = sum(range(num_items))
     expected_a = sum(i * 2 for i in range(num_items))
@@ -113,28 +103,25 @@ def sum_pixels(p1, p2):
 
     num_items = 100
 
-    d_points = cp.empty(num_items, dtype=Point.dtype)
-    d_colors = cp.empty(num_items, dtype=Color.dtype)
-
     h_points = np.array([(i, i * 2) for i in range(num_items)], dtype=Point.dtype)
     h_colors = np.array(
         [(i % 256, (i * 2) % 256, (i * 3) % 256) for i in range(num_items)],
         dtype=Color.dtype,
     )
 
-    d_points.set(h_points)
-    d_colors.set(h_colors)
+    d_points = DeviceArray.from_numpy(h_points)
+    d_colors = DeviceArray.from_numpy(h_colors)
 
     zip_it = ZipIterator(d_points, d_colors)
 
-    d_output = cp.empty(1, dtype=Pixel.dtype)
+    d_output = DeviceArray.empty(1, Pixel.dtype)
     h_init = Pixel(Point(0, 0), Color(0, 0, 0))
 
     cuda.compute.reduce_into(
         d_in=zip_it, d_out=d_output, num_items=num_items, op=sum_pixels, h_init=h_init
     )
 
-    result = d_output.get()[0]
+    result = d_output.copy_to_host()[0]
 
     expected_x = sum(i for i in range(num_items))
     expected_y = sum(i * 2 for i in range(num_items))
@@ -224,13 +211,8 @@ def sum_nested(s1, s2):
         h_data[i]["inner"]["a"] = i * 2
         h_data[i]["inner"]["b"] = float(i * 3)
 
-    itemsize = h_data.dtype.itemsize
-    d_input = cp.empty(num_items * itemsize, dtype=np.uint8)
-    d_input.set(h_data.view(np.uint8))
-    d_input = d_input.view(Outer.dtype)
-
-    d_output = cp.empty(itemsize, dtype=np.uint8)
-    d_output = d_output.view(Outer.dtype)
+    d_input = DeviceArray.from_numpy(h_data)
+    d_output = DeviceArray.empty(1, Outer.dtype)
 
     # Use dictionary initialization for the init value
     h_init = Outer({"x": 0, "inner": {"a": 0, "b": 0.0}})
@@ -239,7 +221,7 @@ def sum_nested(s1, s2):
         d_in=d_input, d_out=d_output, num_items=num_items, op=sum_nested, h_init=h_init
     )
 
-    result = d_output.view(np.uint8).get().view(Outer.dtype)[0]
+    result = d_output.copy_to_host()[0]
 
     expected_x = sum(range(num_items))
     expected_a = sum(i * 2 for i in range(num_items))
@@ -267,13 +249,8 @@ def sum_nested_with_tuples(s1, s2):
         h_data[i]["inner"]["a"] = i * 2
         h_data[i]["inner"]["b"] = float(i * 3)
 
-    itemsize = h_data.dtype.itemsize
-    d_input = cp.empty(num_items * itemsize, dtype=np.uint8)
-    d_input.set(h_data.view(np.uint8))
-    d_input = d_input.view(Outer.dtype)
-
-    d_output = cp.empty(itemsize, dtype=np.uint8)
-    d_output = d_output.view(Outer.dtype)
+    d_input = DeviceArray.from_numpy(h_data)
+    d_output = DeviceArray.empty(1, Outer.dtype)
 
     h_init = Outer(0, Inner(0, 0.0))
 
@@ -285,7 +262,7 @@ def sum_nested_with_tuples(s1, s2):
         h_init=h_init,
     )
 
-    result = d_output.view(np.uint8).get().view(Outer.dtype)[0]
+    result = d_output.copy_to_host()[0]
 
     expected_x = sum(range(num_items))
     expected_a = sum(i * 2 for i in range(num_items))
@@ -320,13 +297,8 @@ def sum_deeply_nested(v1, v2):
         h_data[i]["middle"]["data"] = float(i * 2.5)
         h_data[i]["middle"]["nested"]["value"] = i * 3
 
-    itemsize = h_data.dtype.itemsize
-    d_input = cp.empty(num_items * itemsize, dtype=np.uint8)
-    d_input.set(h_data.view(np.uint8))
-    d_input = d_input.view(Level3.dtype)
-
-    d_output = cp.empty(itemsize, dtype=np.uint8)
-    d_output = d_output.view(Level3.dtype)
+    d_input = DeviceArray.from_numpy(h_data)
+    d_output = DeviceArray.empty(1, Level3.dtype)
 
     h_init = Level3(0, Level2(0.0, Level1(0)))
 
@@ -338,7 +310,7 @@ def sum_deeply_nested(v1, v2):
         h_init=h_init,
     )
 
-    result = d_output.view(np.uint8).get().view(Level3.dtype)[0]
+    result = d_output.copy_to_host()[0]
 
     expected_id = sum(i * 10 for i in range(num_items))
     expected_data = sum(float(i * 2.5) for i in range(num_items))
@@ -373,13 +345,8 @@ def sum_mixed(s1, s2):
         h_data[i]["inner2"]["c"] = float(i * 4)
         h_data[i]["inner2"]["d"] = float(i * 5)
 
-    itemsize = h_data.dtype.itemsize
-    d_input = cp.empty(num_items * itemsize, dtype=np.uint8)
-    d_input.set(h_data.view(np.uint8))
-    d_input = d_input.view(Outer.dtype)
-
-    d_output = cp.empty(itemsize, dtype=np.uint8)
-    d_output = d_output.view(Outer.dtype)
+    d_input = DeviceArray.from_numpy(h_data)
+    d_output = DeviceArray.empty(1, Outer.dtype)
 
     h_init = Outer(0, Inner1(0, 0), Inner2(0.0, 0.0))
 
@@ -387,7 +354,7 @@ def sum_mixed(s1, s2):
         d_in=d_input, d_out=d_output, num_items=num_items, op=sum_mixed, h_init=h_init
     )
 
-    result = d_output.view(np.uint8).get().view(Outer.dtype)[0]
+    result = d_output.copy_to_host()[0]
 
     expected_x = sum(range(num_items))
     expected_a = sum(i * 2 for i in range(num_items))
@@ -421,21 +388,18 @@ def sum_pixels_with_tuples(p1, p2):
 
     num_items = 100
 
-    d_points = cp.empty(num_items, dtype=Point.dtype)
-    d_colors = cp.empty(num_items, dtype=Color.dtype)
-
     h_points = np.array([(i, i * 2) for i in range(num_items)], dtype=Point.dtype)
     h_colors = np.array(
         [(i % 256, (i * 2) % 256, (i * 3) % 256) for i in range(num_items)],
         dtype=Color.dtype,
     )
 
-    d_points.set(h_points)
-    d_colors.set(h_colors)
+    d_points = DeviceArray.from_numpy(h_points)
+    d_colors = DeviceArray.from_numpy(h_colors)
 
     zip_it = ZipIterator(d_points, d_colors)
 
-    d_output = cp.empty(1, dtype=Pixel.dtype)
+    d_output = DeviceArray.empty(1, Pixel.dtype)
     h_init = Pixel(Point(0, 0), Color(0, 0, 0))
 
     cuda.compute.reduce_into(
@@ -446,7 +410,7 @@ def sum_pixels_with_tuples(p1, p2):
         h_init=h_init,
     )
 
-    result = d_output.get()[0]
+    result = d_output.copy_to_host()[0]
 
     expected_x = sum(i for i in range(num_items))
     expected_y = sum(i * 2 for i in range(num_items))
@@ -478,13 +442,8 @@ def sum_all_tuples(s1, s2):
         h_data[i]["field1"]["a"] = i
         h_data[i]["field2"]["b"] = float(i * 2)
 
-    itemsize = h_data.dtype.itemsize
-    d_input = cp.empty(num_items * itemsize, dtype=np.uint8)
-    d_input.set(h_data.view(np.uint8))
-    d_input = d_input.view(Outer.dtype)
-
-    d_output = cp.empty(itemsize, dtype=np.uint8)
-    d_output = d_output.view(Outer.dtype)
+    d_input = DeviceArray.from_numpy(h_data)
+    d_output = DeviceArray.empty(1, Outer.dtype)
 
     h_init = Outer(Inner1(0), Inner2(0.0))
 
@@ -496,7 +455,7 @@ def sum_all_tuples(s1, s2):
         h_init=h_init,
     )
 
-    result = d_output.view(np.uint8).get().view(Outer.dtype)[0]
+    result = d_output.copy_to_host()[0]
 
     expected_a = sum(range(num_items))
     expected_b = sum(float(i * 2) for i in range(num_items))
diff --git a/python/cuda_cccl/tests/compute/test_no_numba.py b/python/cuda_cccl/tests/compute/test_no_numba.py
index 6fb1ef0e811..264cf117aa0 100644
--- a/python/cuda_cccl/tests/compute/test_no_numba.py
+++ b/python/cuda_cccl/tests/compute/test_no_numba.py
@@ -2,9 +2,9 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-import cupy as cp
 import numpy as np
 import pytest
+from _utils.device_array import DeviceArray
 
 import cuda.compute
 from cuda.compute import (
@@ -112,24 +112,25 @@ def test_import_numba_raises():
 
 def test_reduce_well_known_plus():
     h_input = np.arange(1, 14, dtype=np.int32)
-    d_input = cp.asarray(h_input)
-    d_output = cp.empty(1, dtype=np.int32)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, np.int32)
     h_init = np.array([5], dtype=np.int32)
 
     cuda.compute.reduce_into(
         d_in=d_input,
         d_out=d_output,
-        num_items=d_input.size,
+        num_items=h_input.size,
         op=OpKind.PLUS,
         h_init=h_init,
     )
 
-    assert d_output.get()[0] == np.sum(h_input, initial=h_init[0])
+    assert d_output.copy_to_host()[0] == np.sum(h_input, initial=h_init[0])
 
 
 def test_exclusive_scan_well_known_plus():
-    d_input = cp.asarray([2, 4, 6, 8, 10, 12], dtype=np.uint16)
-    d_output = cp.empty_like(d_input)
+    h_input = np.asarray([2, 4, 6, 8, 10, 12], dtype=np.uint16)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(h_input.shape, h_input.dtype)
     h_init = np.array([1], dtype=np.uint16)
 
     cuda.compute.exclusive_scan(
@@ -137,41 +138,46 @@ def test_exclusive_scan_well_known_plus():
         d_out=d_output,
         op=OpKind.PLUS,
         init_value=h_init,
-        num_items=d_input.size,
+        num_items=h_input.size,
     )
 
     expected = np.asarray([1, 3, 7, 13, 21, 31], dtype=np.uint16)
-    np.testing.assert_array_equal(d_output.get(), expected)
+    np.testing.assert_array_equal(d_output.copy_to_host(), expected)
 
 
 def test_binary_transform_well_known_plus():
-    d_lhs = cp.asarray([1.5, 2.5, 3.5, 4.5], dtype=np.float32)
-    d_rhs = cp.asarray([10.0, 20.0, 30.0, 40.0], dtype=np.float32)
-    d_output = cp.empty_like(d_lhs)
+    h_lhs = np.asarray([1.5, 2.5, 3.5, 4.5], dtype=np.float32)
+    h_rhs = np.asarray([10.0, 20.0, 30.0, 40.0], dtype=np.float32)
+    d_lhs = DeviceArray.from_numpy(h_lhs)
+    d_rhs = DeviceArray.from_numpy(h_rhs)
+    d_output = DeviceArray.empty(h_lhs.shape, h_lhs.dtype)
 
     cuda.compute.binary_transform(
         d_in1=d_lhs,
         d_in2=d_rhs,
         d_out=d_output,
         op=OpKind.PLUS,
-        num_items=d_lhs.size,
+        num_items=h_lhs.size,
     )
 
-    np.testing.assert_allclose(d_output.get(), d_lhs.get() + d_rhs.get())
+    np.testing.assert_allclose(d_output.copy_to_host(), h_lhs + h_rhs)
 
 
 def test_unary_transform_well_known_negate():
-    d_input = cp.asarray([-4, -2, 0, 2, 4], dtype=np.int8)
-    d_output = cp.empty_like(d_input)
+    h_input = np.asarray([-4, -2, 0, 2, 4], dtype=np.int8)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(h_input.shape, h_input.dtype)
 
     cuda.compute.unary_transform(
         d_in=d_input,
         d_out=d_output,
         op=OpKind.NEGATE,
-        num_items=d_input.size,
+        num_items=h_input.size,
     )
 
-    np.testing.assert_array_equal(d_output.get(), np.asarray([4, 2, 0, -2, -4]))
+    np.testing.assert_array_equal(
+        d_output.copy_to_host(), np.asarray([4, 2, 0, -2, -4])
+    )
 
 
 @pytest.mark.parametrize(
@@ -184,28 +190,31 @@ def test_unary_transform_well_known_negate():
 def test_binary_search_explicit_opkind_less(search, side):
     h_data = np.asarray([1, 3, 3, 7, 9, 11], dtype=np.int64)
     h_values = np.asarray([0, 3, 4, 10, 12], dtype=np.int64)
-    d_out = cp.empty(h_values.size, dtype=np.uintp)
+    d_out = DeviceArray.empty(h_values.shape, np.uintp)
 
     search(
-        d_data=cp.asarray(h_data),
+        d_data=DeviceArray.from_numpy(h_data),
         num_items=h_data.size,
-        d_values=cp.asarray(h_values),
+        d_values=DeviceArray.from_numpy(h_values),
         num_values=h_values.size,
         d_out=d_out,
         comp=OpKind.LESS,
     )
 
     expected = np.searchsorted(h_data, h_values, side=side).astype(np.uintp)
-    np.testing.assert_array_equal(d_out.get(), expected)
+    np.testing.assert_array_equal(d_out.copy_to_host(), expected)
 
 
 def test_segmented_reduce_well_known_plus(monkeypatch):
     monkeypatch.setattr(cuda.compute._cccl_interop, "_check_sass", False)
 
-    d_input = cp.asarray([1, 2, 3, 4, 5, 6, 7, 8], dtype=np.uint32)
-    d_starts = cp.asarray([0, 3, 5], dtype=np.int32)
-    d_ends = cp.asarray([3, 5, 8], dtype=np.int32)
-    d_output = cp.empty(3, dtype=np.uint32)
+    h_input = np.asarray([1, 2, 3, 4, 5, 6, 7, 8], dtype=np.uint32)
+    h_starts = np.asarray([0, 3, 5], dtype=np.int32)
+    h_ends = np.asarray([3, 5, 8], dtype=np.int32)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_starts = DeviceArray.from_numpy(h_starts)
+    d_ends = DeviceArray.from_numpy(h_ends)
+    d_output = DeviceArray.empty(3, np.uint32)
     h_init = np.array([0], dtype=np.uint32)
 
     cuda.compute.segmented_reduce(
@@ -218,64 +227,65 @@ def test_segmented_reduce_well_known_plus(monkeypatch):
         h_init=h_init,
     )
 
-    np.testing.assert_array_equal(d_output.get(), np.asarray([6, 9, 21]))
+    np.testing.assert_array_equal(d_output.copy_to_host(), np.asarray([6, 9, 21]))
 
 
 def test_merge_sort_well_known_less():
-    d_input = cp.asarray([3.5, -1.0, 2.25, 2.0, 7.0], dtype=np.float64)
-    d_output = cp.empty_like(d_input)
+    h_input = np.asarray([3.5, -1.0, 2.25, 2.0, 7.0], dtype=np.float64)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(h_input.shape, h_input.dtype)
 
     cuda.compute.merge_sort(
         d_in_keys=d_input,
         d_in_values=None,
         d_out_keys=d_output,
         d_out_values=None,
-        num_items=d_input.size,
+        num_items=h_input.size,
         op=OpKind.LESS,
     )
 
-    np.testing.assert_array_equal(d_output.get(), np.sort(d_input.get()))
+    np.testing.assert_array_equal(d_output.copy_to_host(), np.sort(h_input))
 
 
 def test_radix_sort_key_value_pairs():
     h_keys = np.asarray([4, -2, 7, 1, -2, 0], dtype=np.int16)
     h_values = np.asarray([40, 20, 70, 10, 21, 0], dtype=np.uint8)
-    d_out_keys = cp.empty_like(cp.asarray(h_keys))
-    d_out_values = cp.empty_like(cp.asarray(h_values))
+    d_out_keys = DeviceArray.empty(h_keys.shape, h_keys.dtype)
+    d_out_values = DeviceArray.empty(h_values.shape, h_values.dtype)
 
     cuda.compute.radix_sort(
-        d_in_keys=cp.asarray(h_keys),
+        d_in_keys=DeviceArray.from_numpy(h_keys),
         d_out_keys=d_out_keys,
-        d_in_values=cp.asarray(h_values),
+        d_in_values=DeviceArray.from_numpy(h_values),
         d_out_values=d_out_values,
         num_items=h_keys.size,
         order=SortOrder.ASCENDING,
     )
 
     order = np.argsort(h_keys, stable=True)
-    np.testing.assert_array_equal(d_out_keys.get(), h_keys[order])
-    np.testing.assert_array_equal(d_out_values.get(), h_values[order])
+    np.testing.assert_array_equal(d_out_keys.copy_to_host(), h_keys[order])
+    np.testing.assert_array_equal(d_out_values.copy_to_host(), h_values[order])
 
 
 def test_segmented_sort_keys():
     h_keys = np.asarray([3, 1, 2, 9, 7, 8, 6, 5], dtype=np.uint64)
     h_offsets = np.asarray([0, 3, 6, 8], dtype=np.int64)
-    d_output = cp.empty_like(cp.asarray(h_keys))
+    d_output = DeviceArray.empty(h_keys.shape, h_keys.dtype)
 
     cuda.compute.segmented_sort(
-        d_in_keys=cp.asarray(h_keys),
+        d_in_keys=DeviceArray.from_numpy(h_keys),
         d_out_keys=d_output,
         d_in_values=None,
         d_out_values=None,
         num_items=h_keys.size,
         num_segments=h_offsets.size - 1,
-        start_offsets_in=cp.asarray(h_offsets[:-1]),
-        end_offsets_in=cp.asarray(h_offsets[1:]),
+        start_offsets_in=DeviceArray.from_numpy(h_offsets[:-1]),
+        end_offsets_in=DeviceArray.from_numpy(h_offsets[1:]),
         order=SortOrder.ASCENDING,
     )
 
     expected = np.asarray([1, 2, 3, 7, 8, 9, 5, 6], dtype=np.uint64)
-    np.testing.assert_array_equal(d_output.get(), expected)
+    np.testing.assert_array_equal(d_output.copy_to_host(), expected)
 
 
 def test_unique_by_key_well_known_equal_to(monkeypatch):
@@ -283,11 +293,13 @@ def test_unique_by_key_well_known_equal_to(monkeypatch):
     if cc_major >= 9:
         monkeypatch.setattr(cuda.compute._cccl_interop, "_check_sass", False)
 
-    d_keys = cp.asarray([1, 1, 2, 2, 2, 3, 4, 4], dtype=np.int16)
-    d_values = cp.asarray([10, 11, 20, 21, 22, 30, 40, 41], dtype=np.int8)
-    d_out_keys = cp.empty_like(d_keys)
-    d_out_values = cp.empty_like(d_values)
-    d_num_selected = cp.empty(1, dtype=np.int64)
+    h_keys = np.asarray([1, 1, 2, 2, 2, 3, 4, 4], dtype=np.int16)
+    h_values = np.asarray([10, 11, 20, 21, 22, 30, 40, 41], dtype=np.int8)
+    d_keys = DeviceArray.from_numpy(h_keys)
+    d_values = DeviceArray.from_numpy(h_values)
+    d_out_keys = DeviceArray.empty(h_keys.shape, h_keys.dtype)
+    d_out_values = DeviceArray.empty(h_values.shape, h_values.dtype)
+    d_num_selected = DeviceArray.empty(1, np.int64)
 
     cuda.compute.unique_by_key(
         d_in_keys=d_keys,
@@ -296,20 +308,24 @@ def test_unique_by_key_well_known_equal_to(monkeypatch):
         d_out_items=d_out_values,
         d_out_num_selected=d_num_selected,
         op=OpKind.EQUAL_TO,
-        num_items=d_keys.size,
+        num_items=h_keys.size,
     )
 
-    num_selected = int(d_num_selected.get()[0])
-    np.testing.assert_array_equal(d_out_keys.get()[:num_selected], [1, 2, 3, 4])
-    np.testing.assert_array_equal(d_out_values.get()[:num_selected], [10, 20, 30, 40])
+    num_selected = int(d_num_selected.copy_to_host()[0])
+    np.testing.assert_array_equal(
+        d_out_keys.copy_to_host()[:num_selected], [1, 2, 3, 4]
+    )
+    np.testing.assert_array_equal(
+        d_out_values.copy_to_host()[:num_selected], [10, 20, 30, 40]
+    )
 
 
 def test_histogram_even_small_range():
     h_samples = np.asarray([0.5, 1.5, 2.5, 2.75, 3.0, 3.5], dtype=np.float32)
-    d_histogram = cp.empty(4, dtype=np.int32)
+    d_histogram = DeviceArray.empty(4, np.int32)
 
     cuda.compute.histogram_even(
-        d_samples=cp.asarray(h_samples),
+        d_samples=DeviceArray.from_numpy(h_samples),
         d_histogram=d_histogram,
         num_output_levels=5,
         lower_level=np.float32(0.0),
@@ -318,35 +334,35 @@ def test_histogram_even_small_range():
     )
 
     expected, _ = np.histogram(h_samples, bins=4, range=(0.0, 4.0))
-    np.testing.assert_array_equal(d_histogram.get(), expected.astype(np.int32))
+    np.testing.assert_array_equal(d_histogram.copy_to_host(), expected.astype(np.int32))
 
 
 def test_select_raw_op():
     h_input = np.arange(12, dtype=np.int32)
-    d_output = cp.empty_like(cp.asarray(h_input))
-    d_num_selected = cp.empty(1, dtype=np.uint64)
+    d_output = DeviceArray.empty(h_input.shape, h_input.dtype)
+    d_num_selected = DeviceArray.empty(1, np.uint64)
 
     cuda.compute.select(
-        d_in=cp.asarray(h_input),
+        d_in=DeviceArray.from_numpy(h_input),
         d_out=d_output,
         d_num_selected_out=d_num_selected,
         cond=_raw_even_i32_op(),
         num_items=h_input.size,
     )
 
-    num_selected = int(d_num_selected.get()[0])
-    np.testing.assert_array_equal(d_output.get()[:num_selected], h_input[::2])
+    num_selected = int(d_num_selected.copy_to_host()[0])
+    np.testing.assert_array_equal(d_output.copy_to_host()[:num_selected], h_input[::2])
 
 
 def test_three_way_partition_raw_op():
     h_input = np.arange(12, dtype=np.int32)
-    d_first = cp.empty_like(cp.asarray(h_input))
-    d_second = cp.empty_like(cp.asarray(h_input))
-    d_unselected = cp.empty_like(cp.asarray(h_input))
-    d_num_selected = cp.empty(2, dtype=np.uint64)
+    d_first = DeviceArray.empty(h_input.shape, h_input.dtype)
+    d_second = DeviceArray.empty(h_input.shape, h_input.dtype)
+    d_unselected = DeviceArray.empty(h_input.shape, h_input.dtype)
+    d_num_selected = DeviceArray.empty(2, np.uint64)
 
     cuda.compute.three_way_partition(
-        d_in=cp.asarray(h_input),
+        d_in=DeviceArray.from_numpy(h_input),
         d_first_part_out=d_first,
         d_second_part_out=d_second,
         d_unselected_out=d_unselected,
@@ -356,51 +372,57 @@ def test_three_way_partition_raw_op():
         num_items=h_input.size,
     )
 
-    selected = d_num_selected.get()
+    selected = d_num_selected.copy_to_host()
     first_count = int(selected[0])
     second_count = int(selected[1])
     unselected_count = h_input.size - first_count - second_count
 
-    np.testing.assert_array_equal(d_first.get()[:first_count], h_input[:4])
-    np.testing.assert_array_equal(d_second.get()[:second_count], h_input[4:8])
-    np.testing.assert_array_equal(d_unselected.get()[:unselected_count], h_input[8:])
+    np.testing.assert_array_equal(d_first.copy_to_host()[:first_count], h_input[:4])
+    np.testing.assert_array_equal(d_second.copy_to_host()[:second_count], h_input[4:8])
+    np.testing.assert_array_equal(
+        d_unselected.copy_to_host()[:unselected_count], h_input[8:]
+    )
 
 
 def test_raw_op_reduce():
     h_input = np.asarray([10, 20, 30, 40], dtype=np.int64)
-    d_output = cp.empty(1, dtype=np.int64)
+    d_output = DeviceArray.empty(1, np.int64)
 
     cuda.compute.reduce_into(
-        d_in=cp.asarray(h_input),
+        d_in=DeviceArray.from_numpy(h_input),
         d_out=d_output,
         num_items=h_input.size,
         op=_raw_plus_i64_op(),
         h_init=np.array([5], dtype=np.int64),
     )
 
-    assert d_output.get()[0] == 105
+    assert d_output.copy_to_host()[0] == 105
 
 
 def test_stream_argument(cuda_stream):
-    d_lhs = cp.asarray([2, 4, 6, 8, 10], dtype=np.int32)
-    d_rhs = cp.asarray([1, 3, 5, 7, 9], dtype=np.int32)
-    d_output = cp.empty_like(d_lhs)
+    h_lhs = np.asarray([2, 4, 6, 8, 10], dtype=np.int32)
+    h_rhs = np.asarray([1, 3, 5, 7, 9], dtype=np.int32)
+    d_lhs = DeviceArray.from_numpy(h_lhs, stream=cuda_stream)
+    d_rhs = DeviceArray.from_numpy(h_rhs, stream=cuda_stream)
+    d_output = DeviceArray.empty(h_lhs.shape, h_lhs.dtype, stream=cuda_stream)
 
     cuda.compute.binary_transform(
         d_in1=d_lhs,
         d_in2=d_rhs,
         d_out=d_output,
         op=OpKind.PLUS,
-        num_items=d_lhs.size,
+        num_items=h_lhs.size,
         stream=cuda_stream,
     )
 
-    cp.cuda.Device().synchronize()
-    np.testing.assert_array_equal(d_output.get(), np.asarray([3, 7, 11, 15, 19]))
+    np.testing.assert_array_equal(
+        d_output.copy_to_host(stream=cuda_stream),
+        np.asarray([3, 7, 11, 15, 19]),
+    )
 
 
 def test_counting_iterator_reduce():
-    d_output = cp.empty(1, dtype=np.int32)
+    d_output = DeviceArray.empty(1, np.int32)
 
     cuda.compute.reduce_into(
         d_in=CountingIterator(np.int32(3)),
@@ -410,11 +432,11 @@ def test_counting_iterator_reduce():
         h_init=np.array([0], dtype=np.int32),
     )
 
-    assert d_output.get()[0] == 52
+    assert d_output.copy_to_host()[0] == 52
 
 
 def test_constant_iterator_reduce():
-    d_output = cp.empty(1, dtype=np.float32)
+    d_output = DeviceArray.empty(1, np.float32)
 
     cuda.compute.reduce_into(
         d_in=ConstantIterator(np.float32(1.5)),
@@ -424,73 +446,82 @@ def test_constant_iterator_reduce():
         h_init=np.array([0], dtype=np.float32),
     )
 
-    np.testing.assert_allclose(d_output.get()[0], np.float32(12.0))
+    np.testing.assert_allclose(d_output.copy_to_host()[0], np.float32(12.0))
 
 
 def test_cache_modified_input_iterator_reduce():
-    d_input = cp.asarray([2, 4, 6, 8, 10], dtype=np.uint16)
-    d_output = cp.empty(1, dtype=np.uint16)
+    h_input = np.asarray([2, 4, 6, 8, 10], dtype=np.uint16)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, np.uint16)
     iterator = CacheModifiedInputIterator(d_input, modifier="stream")
 
     cuda.compute.reduce_into(
         d_in=iterator,
         d_out=d_output,
-        num_items=d_input.size,
+        num_items=h_input.size,
         op=OpKind.PLUS,
         h_init=np.array([0], dtype=np.uint16),
     )
 
-    assert d_output.get()[0] == 30
+    assert d_output.copy_to_host()[0] == 30
 
 
 def test_reverse_input_iterator_scan():
-    d_input = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32)
-    d_output = cp.empty_like(d_input)
+    h_input = np.asarray([1, 2, 3, 4, 5], dtype=np.int32)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(h_input.shape, h_input.dtype)
 
     cuda.compute.inclusive_scan(
         d_in=ReverseIterator(d_input),
         d_out=d_output,
         op=OpKind.PLUS,
         init_value=np.array([0], dtype=np.int32),
-        num_items=d_input.size,
+        num_items=h_input.size,
     )
 
-    np.testing.assert_array_equal(d_output.get(), np.asarray([5, 9, 12, 14, 15]))
+    np.testing.assert_array_equal(
+        d_output.copy_to_host(), np.asarray([5, 9, 12, 14, 15])
+    )
 
 
 def test_reverse_output_iterator_scan():
-    d_input = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32)
-    d_output = cp.empty_like(d_input)
+    h_input = np.asarray([1, 2, 3, 4, 5], dtype=np.int32)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(h_input.shape, h_input.dtype)
 
     cuda.compute.inclusive_scan(
         d_in=d_input,
         d_out=ReverseIterator(d_output),
         op=OpKind.PLUS,
         init_value=np.array([0], dtype=np.int32),
-        num_items=d_input.size,
+        num_items=h_input.size,
     )
 
-    np.testing.assert_array_equal(d_output.get(), np.asarray([15, 10, 6, 3, 1]))
+    np.testing.assert_array_equal(
+        d_output.copy_to_host(), np.asarray([15, 10, 6, 3, 1])
+    )
 
 
 def test_permutation_iterator_reduce():
-    d_values = cp.asarray([10, 20, 30, 40, 50, 60], dtype=np.int64)
-    d_indices = cp.asarray([4, 2, 5, 1], dtype=np.int32)
-    d_output = cp.empty(1, dtype=np.int64)
+    h_values = np.asarray([10, 20, 30, 40, 50, 60], dtype=np.int64)
+    h_indices = np.asarray([4, 2, 5, 1], dtype=np.int32)
+    d_values = DeviceArray.from_numpy(h_values)
+    d_indices = DeviceArray.from_numpy(h_indices)
+    d_output = DeviceArray.empty(1, np.int64)
 
     cuda.compute.reduce_into(
         d_in=PermutationIterator(d_values, d_indices),
         d_out=d_output,
-        num_items=d_indices.size,
+        num_items=h_indices.size,
         op=OpKind.PLUS,
         h_init=np.array([0], dtype=np.int64),
     )
 
-    assert d_output.get()[0] == 160
+    assert d_output.copy_to_host()[0] == 160
 
 
 def test_transform_iterator_reduce():
-    d_output = cp.empty(1, dtype=np.int32)
+    d_output = DeviceArray.empty(1, np.int32)
     iterator = TransformIterator(
         CountingIterator(np.int32(1)), _raw_square_i32_op(), value_type=cccl_int32
     )
@@ -503,12 +534,13 @@ def test_transform_iterator_reduce():
         h_init=np.array([0], dtype=np.int32),
     )
 
-    assert d_output.get()[0] == 91
+    assert d_output.copy_to_host()[0] == 91
 
 
 def test_transform_output_iterator_reduce():
-    d_input = cp.asarray([1, 2, 3, 4], dtype=np.int16)
-    d_output = cp.empty(1, dtype=np.int16)
+    h_input = np.asarray([1, 2, 3, 4], dtype=np.int16)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, np.int16)
     output_iterator = TransformOutputIterator(
         d_output, _raw_negate_i16_op(), output_value_type=cccl_int16
     )
@@ -516,32 +548,34 @@ def test_transform_output_iterator_reduce():
     cuda.compute.reduce_into(
         d_in=d_input,
         d_out=output_iterator,
-        num_items=d_input.size,
+        num_items=h_input.size,
         op=OpKind.PLUS,
         h_init=np.array([0], dtype=np.int16),
     )
 
-    assert d_output.get()[0] == -10
+    assert d_output.copy_to_host()[0] == -10
 
 
 def test_zip_iterator_transform():
-    d_lhs = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32)
-    d_rhs = cp.asarray([10, 20, 30, 40, 50], dtype=np.int32)
-    d_output = cp.empty_like(d_lhs)
+    h_lhs = np.asarray([1, 2, 3, 4, 5], dtype=np.int32)
+    h_rhs = np.asarray([10, 20, 30, 40, 50], dtype=np.int32)
+    d_lhs = DeviceArray.from_numpy(h_lhs)
+    d_rhs = DeviceArray.from_numpy(h_rhs)
+    d_output = DeviceArray.empty(h_lhs.shape, h_lhs.dtype)
 
     cuda.compute.unary_transform(
         d_in=ZipIterator(d_lhs, d_rhs),
         d_out=d_output,
         op=_raw_zip_sum_i32_op(),
-        num_items=d_lhs.size,
+        num_items=h_lhs.size,
     )
 
-    np.testing.assert_array_equal(d_output.get(), d_lhs.get() + d_rhs.get())
+    np.testing.assert_array_equal(d_output.copy_to_host(), h_lhs + h_rhs)
 
 
 def test_shuffle_iterator_transform():
     num_items = 17
-    d_output = cp.empty(num_items, dtype=np.int64)
+    d_output = DeviceArray.empty(num_items, np.int64)
 
     cuda.compute.unary_transform(
         d_in=ShuffleIterator(num_items, seed=123),
@@ -550,19 +584,23 @@ def test_shuffle_iterator_transform():
         num_items=num_items,
     )
 
-    result = d_output.get()
+    result = d_output.copy_to_host()
     assert sorted(result.tolist()) == list(range(num_items))
 
 
 def test_discard_iterator_transform():
-    d_input = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32)
-    d_reference = cp.full_like(d_input, -1)
+    h_input = np.asarray([1, 2, 3, 4, 5], dtype=np.int32)
+    h_reference = np.full_like(h_input, -1)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_reference = DeviceArray.from_numpy(h_reference)
 
     cuda.compute.unary_transform(
         d_in=d_input,
         d_out=DiscardIterator(d_reference),
         op=OpKind.IDENTITY,
-        num_items=d_input.size,
+        num_items=h_input.size,
     )
 
-    np.testing.assert_array_equal(d_reference.get(), np.full(5, -1, dtype=np.int32))
+    np.testing.assert_array_equal(
+        d_reference.copy_to_host(), np.full(5, -1, dtype=np.int32)
+    )
diff --git a/python/cuda_cccl/tests/compute/test_permutation_iterator.py b/python/cuda_cccl/tests/compute/test_permutation_iterator.py
index 74d74a7b4e6..862f058f56b 100644
--- a/python/cuda_cccl/tests/compute/test_permutation_iterator.py
+++ b/python/cuda_cccl/tests/compute/test_permutation_iterator.py
@@ -1,8 +1,8 @@
 # Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-import cupy as cp
 import numpy as np
+from _utils.device_array import DeviceArray
 
 import cuda.compute
 from cuda.compute.iterators import (
@@ -13,65 +13,68 @@
 
 
 def test_permutation_iterator_equality():
-    values1 = cp.asarray([10, 20, 30, 40, 50], dtype="int32")
-    values2 = cp.asarray([100, 200, 300], dtype="int32")
-    values3 = cp.asarray([10, 20, 30, 40, 50], dtype="int64")
+    d_values1 = DeviceArray.from_numpy(np.asarray([10, 20, 30, 40, 50], dtype="int32"))
+    d_values2 = DeviceArray.from_numpy(np.asarray([100, 200, 300], dtype="int32"))
+    d_values3 = DeviceArray.from_numpy(np.asarray([10, 20, 30, 40, 50], dtype="int64"))
 
-    indices1 = cp.asarray([0, 2, 1], dtype="int32")
-    indices2 = cp.asarray([1, 0, 2], dtype="int32")
-    indices3 = cp.asarray([0, 2, 1], dtype="int64")
+    d_indices1 = DeviceArray.from_numpy(np.asarray([0, 2, 1], dtype="int32"))
+    d_indices2 = DeviceArray.from_numpy(np.asarray([1, 0, 2], dtype="int32"))
+    d_indices3 = DeviceArray.from_numpy(np.asarray([0, 2, 1], dtype="int64"))
 
     # Same value and index types should have same kind
-    it1 = PermutationIterator(values1, indices1)
-    it2 = PermutationIterator(values1, indices2)
-    it3 = PermutationIterator(values2, indices1)
+    it1 = PermutationIterator(d_values1, d_indices1)
+    it2 = PermutationIterator(d_values1, d_indices2)
+    it3 = PermutationIterator(d_values2, d_indices1)
 
     assert it1.kind == it2.kind == it3.kind
 
     # Different value type should have different kind
-    it4 = PermutationIterator(values3, indices1)
+    it4 = PermutationIterator(d_values3, d_indices1)
     assert it1.kind != it4.kind
 
     # Different index type should have different kind
-    it5 = PermutationIterator(values1, indices3)
+    it5 = PermutationIterator(d_values1, d_indices3)
     assert it1.kind != it5.kind
 
 
 def test_permutation_iterator_with_array_values():
-    values = cp.asarray([10, 20, 30, 40, 50], dtype="int32")
-    indices = cp.asarray([2, 0, 4, 1], dtype="int32")
-    perm_it = PermutationIterator(values, indices)
+    h_values = np.asarray([10, 20, 30, 40, 50], dtype="int32")
+    h_indices = np.asarray([2, 0, 4, 1], dtype="int32")
+    d_values = DeviceArray.from_numpy(h_values)
+    d_indices = DeviceArray.from_numpy(h_indices)
+    perm_it = PermutationIterator(d_values, d_indices)
 
     h_init = np.array([0], dtype="int32")
-    d_output = cp.empty(1, dtype="int32")
+    d_output = DeviceArray.empty(1, np.int32)
     cuda.compute.reduce_into(
         d_in=perm_it,
         d_out=d_output,
-        num_items=len(indices),
+        num_items=len(h_indices),
         op=cuda.compute.OpKind.PLUS,
         h_init=h_init,
     )
-    assert d_output[0] == values[indices].sum()
+    assert d_output.copy_to_host()[0] == h_values[h_indices].sum()
 
 
 def test_permutation_iterator_with_iterator_values():
     values_it = CountingIterator(np.int32(10))
-    indices = cp.asarray([2, 0, 4, 1], dtype="int32")
-    perm_it = PermutationIterator(values_it, indices)
+    h_indices = np.asarray([2, 0, 4, 1], dtype="int32")
+    d_indices = DeviceArray.from_numpy(h_indices)
+    perm_it = PermutationIterator(values_it, d_indices)
 
     h_init = np.array([0], dtype="int32")
-    d_output = cp.empty(1, dtype="int32")
+    d_output = DeviceArray.empty(1, np.int32)
 
     cuda.compute.reduce_into(
         d_in=perm_it,
         d_out=d_output,
-        num_items=len(indices),
+        num_items=len(h_indices),
         op=cuda.compute.OpKind.PLUS,
         h_init=h_init,
     )
 
-    expected = cp.arange(10, 20)[indices].sum()
-    assert d_output[0] == expected
+    expected = np.arange(10, 20)[h_indices].sum()
+    assert d_output.copy_to_host()[0] == expected
 
 
 def test_permutation_iterator_of_zip_iterator():
@@ -80,29 +83,32 @@ class Pair:
         value_0: np.int32
         value_1: np.int32
 
-    d_values1 = cp.asarray([10, 20, 30, 40, 50], dtype="int32")
-    d_values2 = cp.asarray([1, 2, 3, 4, 5], dtype="int32")
+    h_values1 = np.asarray([10, 20, 30, 40, 50], dtype="int32")
+    h_values2 = np.asarray([1, 2, 3, 4, 5], dtype="int32")
+    d_values1 = DeviceArray.from_numpy(h_values1)
+    d_values2 = DeviceArray.from_numpy(h_values2)
     zip_it = ZipIterator(d_values1, d_values2)
-    indices = cp.asarray([2, 0, 4], dtype="int32")
-    perm_it = PermutationIterator(zip_it, indices)
+    h_indices = np.asarray([2, 0, 4], dtype="int32")
+    d_indices = DeviceArray.from_numpy(h_indices)
+    perm_it = PermutationIterator(zip_it, d_indices)
 
     def sum_both_fields(a, b):
         return Pair(a.value_0 + b.value_0, a.value_1 + b.value_1)
 
     h_init = Pair(0, 0)
-    d_output = cp.empty(1, dtype=Pair.dtype)
+    d_output = DeviceArray.empty(1, Pair.dtype)
 
     cuda.compute.reduce_into(
         d_in=perm_it,
         d_out=d_output,
-        num_items=len(indices),
+        num_items=len(h_indices),
         op=sum_both_fields,
         h_init=h_init,
     )
 
-    result = d_output.get()[0]
-    assert result["value_0"] == d_values1[indices].sum()
-    assert result["value_1"] == d_values2[indices].sum()
+    result = d_output.copy_to_host()[0]
+    assert result["value_0"] == h_values1[h_indices].sum()
+    assert result["value_1"] == h_values2[h_indices].sum()
 
 
 def test_zip_iterator_of_permutation_iterators():
@@ -111,12 +117,16 @@ class Pair:
         value_0: np.int32
         value_1: np.int32
 
-    d_values1 = cp.asarray([10, 20, 30, 40, 50], dtype="int32")
-    d_values2 = cp.asarray([100, 200, 300, 400, 500], dtype="int32")
-    indices1 = cp.asarray([4, 1, 3, 0], dtype="int32")
-    indices2 = cp.asarray([2, 4, 0, 1], dtype="int32")
-    perm_it1 = PermutationIterator(d_values1, indices1)
-    perm_it2 = PermutationIterator(d_values2, indices2)
+    h_values1 = np.asarray([10, 20, 30, 40, 50], dtype="int32")
+    h_values2 = np.asarray([100, 200, 300, 400, 500], dtype="int32")
+    h_indices1 = np.asarray([4, 1, 3, 0], dtype="int32")
+    h_indices2 = np.asarray([2, 4, 0, 1], dtype="int32")
+    d_values1 = DeviceArray.from_numpy(h_values1)
+    d_values2 = DeviceArray.from_numpy(h_values2)
+    d_indices1 = DeviceArray.from_numpy(h_indices1)
+    d_indices2 = DeviceArray.from_numpy(h_indices2)
+    perm_it1 = PermutationIterator(d_values1, d_indices1)
+    perm_it2 = PermutationIterator(d_values2, d_indices2)
 
     zip_it = ZipIterator(perm_it1, perm_it2)
 
@@ -124,9 +134,9 @@ def sum_both_fields(a, b):
         return Pair(a.value_0 + b.value_0, a.value_1 + b.value_1)
 
     h_init = Pair(0, 0)
-    d_output = cp.empty(1, dtype=Pair.dtype)
+    d_output = DeviceArray.empty(1, Pair.dtype)
 
-    num_items = len(indices1)
+    num_items = len(h_indices1)
     cuda.compute.reduce_into(
         d_in=zip_it,
         d_out=d_output,
@@ -135,26 +145,28 @@ def sum_both_fields(a, b):
         h_init=h_init,
     )
 
-    result = d_output.get()[0]
-    assert result["value_0"] == d_values1[indices1].sum()
-    assert result["value_1"] == d_values2[indices2].sum()
+    result = d_output.copy_to_host()[0]
+    assert result["value_0"] == h_values1[h_indices1].sum()
+    assert result["value_1"] == h_values2[h_indices2].sum()
 
 
 def test_unary_transform_of_permutation_iterator():
-    values = cp.asarray([10, 20, 30, 40, 50], dtype="int32")
-    indices = cp.asarray([2, 0, 4, 1], dtype="int32")
-    perm_it = PermutationIterator(values, indices)
+    h_values = np.asarray([10, 20, 30, 40, 50], dtype="int32")
+    h_indices = np.asarray([2, 0, 4, 1], dtype="int32")
+    d_values = DeviceArray.from_numpy(h_values)
+    d_indices = DeviceArray.from_numpy(h_indices)
+    perm_it = PermutationIterator(d_values, d_indices)
 
     def op(a):
         return a + 1
 
-    d_out = cp.empty_like(values, shape=(len(indices),))
+    d_out = DeviceArray.empty(len(h_indices), h_values.dtype)
     cuda.compute.unary_transform(
-        d_in=perm_it, d_out=d_out, op=op, num_items=len(indices)
+        d_in=perm_it, d_out=d_out, op=op, num_items=len(h_indices)
     )
 
-    expected = values[indices] + 1
-    assert cp.all(d_out == expected)
+    expected = h_values[h_indices] + 1
+    np.testing.assert_array_equal(d_out.copy_to_host(), expected)
 
 
 def test_caching_permutation_iterator():
@@ -163,22 +175,26 @@ def test_caching_permutation_iterator():
 
     # Test 1: Same structure → same kind
     it1 = PermutationIterator(
-        cp.arange(10, dtype=np.int32), cp.arange(10, dtype=np.int32)
+        DeviceArray.from_numpy(np.arange(10, dtype=np.int32)),
+        DeviceArray.from_numpy(np.arange(10, dtype=np.int32)),
     )
     it2 = PermutationIterator(
-        cp.arange(20, dtype=np.int32), cp.arange(5, dtype=np.int32)
+        DeviceArray.from_numpy(np.arange(20, dtype=np.int32)),
+        DeviceArray.from_numpy(np.arange(5, dtype=np.int32)),
     )
     assert it1.kind == it2.kind, "Same structure should have same kind"
 
     # Test 2: Different index type → different kind
     it3 = PermutationIterator(
-        cp.arange(10, dtype=np.int32), cp.arange(10, dtype=np.int64)
+        DeviceArray.from_numpy(np.arange(10, dtype=np.int32)),
+        DeviceArray.from_numpy(np.arange(10, dtype=np.int64)),
     )
     assert it1.kind != it3.kind, "Different index type should have different kind"
 
     # Test 3: Different value type → different kind
     it4 = PermutationIterator(
-        cp.arange(10, dtype=np.int64), cp.arange(10, dtype=np.int32)
+        DeviceArray.from_numpy(np.arange(10, dtype=np.int64)),
+        DeviceArray.from_numpy(np.arange(10, dtype=np.int32)),
     )
     assert it1.kind != it4.kind, "Different value type should have different kind"
 
@@ -189,8 +205,8 @@ def test_caching_permutation_iterator():
     iterators = []
     for i in range(3):
         it = PermutationIterator(
-            cp.arange(i * 10, (i + 1) * 10, dtype=np.float32),
-            cp.arange(5, dtype=np.int32),
+            DeviceArray.from_numpy(np.arange(i * 10, (i + 1) * 10, dtype=np.float32)),
+            DeviceArray.from_numpy(np.arange(5, dtype=np.int32)),
         )
         # Trigger compilation by accessing Op objects
         it.get_advance_op()
@@ -207,7 +223,8 @@ def test_caching_permutation_iterator():
 def test_permutation_iterator_advance():
     """Test PermutationIterator.__add__ only advances indices, not values."""
     # Create values array [10, 20, 30, 40, 50, 60, 70]
-    values = cp.asarray([10, 20, 30, 40, 50, 60, 70], dtype="int32")
+    h_values = np.asarray([10, 20, 30, 40, 50, 60, 70], dtype="int32")
+    d_values = DeviceArray.from_numpy(h_values)
 
     # Create indices array [2, 0, 4, 1, 3, 5]
     # indices[0] = 2 -> values[2] = 30
@@ -216,9 +233,10 @@ def test_permutation_iterator_advance():
     # indices[3] = 1 -> values[1] = 20
     # indices[4] = 3 -> values[3] = 40
     # indices[5] = 5 -> values[5] = 60
-    indices = cp.asarray([2, 0, 4, 1, 3, 5], dtype="int32")
+    h_indices = np.asarray([2, 0, 4, 1, 3, 5], dtype="int32")
+    d_indices = DeviceArray.from_numpy(h_indices)
 
-    perm_it = PermutationIterator(values, indices)
+    perm_it = PermutationIterator(d_values, d_indices)
 
     # Advance by 2 positions (should skip first 2 indices)
     offset = 2
@@ -228,9 +246,9 @@ def test_permutation_iterator_advance():
     # Should process indices[2:] = [4, 1, 3, 5]
     # Which accesses values[4, 1, 3, 5] = [50, 20, 40, 60]
     h_init = np.array([0], dtype="int32")
-    d_output = cp.empty(1, dtype="int32")
+    d_output = DeviceArray.empty(1, np.int32)
 
-    remaining_items = len(indices) - offset
+    remaining_items = len(h_indices) - offset
     cuda.compute.reduce_into(
         d_in=advanced_perm_it,
         d_out=d_output,
@@ -240,7 +258,6 @@ def test_permutation_iterator_advance():
     )
 
     # Expected: values[indices[2:]] = values[[4, 1, 3, 5]] = [50, 20, 40, 60]
-    expected = values[indices[offset:]].sum().get()
-    assert d_output[0].get() == expected, (
-        f"Expected {expected}, got {d_output[0].get()}"
-    )
+    expected = h_values[h_indices[offset:]].sum()
+    result = d_output.copy_to_host()[0]
+    assert result == expected, f"Expected {expected}, got {result}"
diff --git a/python/cuda_cccl/tests/compute/test_radix_sort.py b/python/cuda_cccl/tests/compute/test_radix_sort.py
index 62b8e7dcb44..61e407c556b 100644
--- a/python/cuda_cccl/tests/compute/test_radix_sort.py
+++ b/python/cuda_cccl/tests/compute/test_radix_sort.py
@@ -5,10 +5,9 @@
 import itertools
 from typing import Tuple
 
-import cupy as cp
-import numba
 import numpy as np
 import pytest
+from _utils.device_array import DeviceArray, get_compute_capability
 
 import cuda.compute
 from cuda.compute import (
@@ -148,7 +147,7 @@ def host_sort(h_in_keys, h_in_values, order, begin_bit=None, end_bit=None) -> Tu
     DTYPE_SIZE,
 )
 def test_radix_sort_keys(dtype, num_items, monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = get_compute_capability()
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -164,8 +163,8 @@ def test_radix_sort_keys(dtype, num_items, monkeypatch):
     h_in_keys = random_array(num_items, dtype, max_value=20)
     h_out_keys = np.empty(num_items, dtype=dtype)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
+    d_out_keys = DeviceArray.empty(h_out_keys.shape, h_out_keys.dtype)
 
     radix_sort_device(d_in_keys, d_out_keys, None, None, order, num_items)
 
@@ -195,10 +194,10 @@ def test_radix_sort_pairs(dtype, num_items, monkeypatch):
     h_out_keys = np.empty(num_items, dtype=dtype)
     h_out_values = np.empty(num_items, dtype=np.float32)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_values = numba.cuda.to_device(h_in_values)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
-    d_out_values = numba.cuda.to_device(h_out_values)
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
+    d_in_values = DeviceArray.from_numpy(h_in_values)
+    d_out_keys = DeviceArray.empty(h_out_keys.shape, h_out_keys.dtype)
+    d_out_values = DeviceArray.empty(h_out_values.shape, h_out_values.dtype)
 
     radix_sort_device(
         d_in_keys, d_out_keys, d_in_values, d_out_values, order, num_items
@@ -218,7 +217,7 @@ def test_radix_sort_pairs(dtype, num_items, monkeypatch):
     DTYPE_SIZE,
 )
 def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = get_compute_capability()
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -234,8 +233,8 @@ def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch):
     h_in_keys = random_array(num_items, dtype, max_value=20)
     h_out_keys = np.empty(num_items, dtype=dtype)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
+    d_out_keys = DeviceArray.empty(h_out_keys.shape, h_out_keys.dtype)
 
     keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys)
 
@@ -253,7 +252,7 @@ def test_radix_sort_keys_double_buffer(dtype, num_items, monkeypatch):
     DTYPE_SIZE,
 )
 def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = get_compute_capability()
     # NOTE: int16 failures seen only with NVRTC 13.1:
     if cc_major >= 9 or np.isdtype(dtype, (np.int16, np.uint32)):
         import cuda.compute._cccl_interop
@@ -270,10 +269,10 @@ def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch):
     h_out_keys = np.empty(num_items, dtype=dtype)
     h_out_values = np.empty(num_items, dtype=np.float32)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_values = numba.cuda.to_device(h_in_values)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
-    d_out_values = numba.cuda.to_device(h_out_values)
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
+    d_in_values = DeviceArray.from_numpy(h_in_values)
+    d_out_keys = DeviceArray.empty(h_out_keys.shape, h_out_keys.dtype)
+    d_out_values = DeviceArray.empty(h_out_values.shape, h_out_values.dtype)
 
     keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys)
     values_double_buffer = DoubleBuffer(d_in_values, d_out_values)
@@ -304,7 +303,7 @@ def test_radix_sort_pairs_double_buffer(dtype, num_items, monkeypatch):
     DTYPE_SIZE_BIT_WINDOW,
 )
 def test_radix_sort_pairs_bit_window(dtype, num_items, monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = get_compute_capability()
     # NOTE: int16 failures seen only with NVRTC 13.1:
     if cc_major >= 9 or np.isdtype(dtype, (np.int16, np.uint32)):
         import cuda.compute._cccl_interop
@@ -329,10 +328,10 @@ def test_radix_sort_pairs_bit_window(dtype, num_items, monkeypatch):
         h_out_keys = np.empty(num_items, dtype=dtype)
         h_out_values = np.empty(num_items, dtype=np.float32)
 
-        d_in_keys = numba.cuda.to_device(h_in_keys)
-        d_in_values = numba.cuda.to_device(h_in_values)
-        d_out_keys = numba.cuda.to_device(h_out_keys)
-        d_out_values = numba.cuda.to_device(h_out_values)
+        d_in_keys = DeviceArray.from_numpy(h_in_keys)
+        d_in_values = DeviceArray.from_numpy(h_in_values)
+        d_out_keys = DeviceArray.empty(h_out_keys.shape, h_out_keys.dtype)
+        d_out_values = DeviceArray.empty(h_out_values.shape, h_out_values.dtype)
 
         radix_sort_device(
             d_in_keys,
@@ -384,10 +383,10 @@ def test_radix_sort_pairs_double_buffer_bit_window(dtype, num_items, monkeypatch
         h_out_keys = np.empty(num_items, dtype=dtype)
         h_out_values = np.empty(num_items, dtype=np.float32)
 
-        d_in_keys = numba.cuda.to_device(h_in_keys)
-        d_in_values = numba.cuda.to_device(h_in_values)
-        d_out_keys = numba.cuda.to_device(h_out_keys)
-        d_out_values = numba.cuda.to_device(h_out_values)
+        d_in_keys = DeviceArray.from_numpy(h_in_keys)
+        d_in_values = DeviceArray.from_numpy(h_in_values)
+        d_out_keys = DeviceArray.empty(h_out_keys.shape, h_out_keys.dtype)
+        d_out_values = DeviceArray.empty(h_out_values.shape, h_out_values.dtype)
 
         keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys)
         values_double_buffer = DoubleBuffer(d_in_values, d_out_values)
@@ -434,8 +433,8 @@ def test_radix_sort_large_num_items(dtype, monkeypatch):
 
     h_in_keys = np.arange(num_items - 1, -1, -1, dtype=dtype)
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_out_keys = cp.empty(num_items, dtype=dtype)
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
+    d_out_keys = DeviceArray.empty(num_items, dtype)
 
     cuda.compute.radix_sort(
         d_in_keys=d_in_keys,
@@ -446,31 +445,37 @@ def test_radix_sort_large_num_items(dtype, monkeypatch):
         order=SortOrder.ASCENDING,
     )
 
-    h_out_keys = d_out_keys.get()
+    h_out_keys = d_out_keys.copy_to_host()
     h_expected, _ = host_sort(h_in_keys, None, SortOrder.ASCENDING)
 
     np.testing.assert_array_equal(h_out_keys, h_expected)
 
 
 def test_radix_sort_with_stream(cuda_stream):
-    cp_stream = cp.cuda.ExternalStream(cuda_stream.ptr)
     num_items = 10000
 
-    with cp_stream:
-        h_in_keys = random_array(num_items, np.int32)
-        d_in_keys = cp.asarray(h_in_keys)
-        d_out_keys = cp.empty_like(d_in_keys)
+    h_in_keys = random_array(num_items, np.int32)
+    d_in_keys = DeviceArray.from_numpy(h_in_keys, stream=cuda_stream)
+    d_out_keys = DeviceArray.empty(h_in_keys.shape, h_in_keys.dtype, stream=cuda_stream)
 
-    radix_sort_device(d_in_keys, d_out_keys, None, None, SortOrder.ASCENDING, num_items)
+    radix_sort_device(
+        d_in_keys,
+        d_out_keys,
+        None,
+        None,
+        SortOrder.ASCENDING,
+        num_items,
+        stream=cuda_stream,
+    )
 
-    got = d_out_keys.get()
+    got = d_out_keys.copy_to_host(stream=cuda_stream)
     h_in_keys.sort()
 
     np.testing.assert_array_equal(got, h_in_keys)
 
 
 def test_radix_sort(monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = get_compute_capability()
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -482,19 +487,16 @@ def test_radix_sort(monkeypatch):
             False,
         )
 
-    import cupy as cp
-    import numpy as np
-
     h_in_keys = np.array([-5, 0, 2, -3, 2, 4, 0, -1, 2, 8], dtype="int32")
     h_in_values = np.array(
         [-3.2, 2.2, 1.9, 4.0, -3.9, 2.7, 0, 8.3 - 1, 2.9, 5.4], dtype="float32"
     )
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_in_values = cp.asarray(h_in_values)
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
+    d_in_values = DeviceArray.from_numpy(h_in_values)
 
-    d_out_keys = cp.empty_like(d_in_keys)
-    d_out_values = cp.empty_like(d_in_values)
+    d_out_keys = DeviceArray.empty(h_in_keys.shape, h_in_keys.dtype)
+    d_out_values = DeviceArray.empty(h_in_values.shape, h_in_values.dtype)
 
     # Call single-phase API directly with num_items parameter
     cuda.compute.radix_sort(
@@ -502,13 +504,13 @@ def test_radix_sort(monkeypatch):
         d_out_keys=d_out_keys,
         d_in_values=d_in_values,
         d_out_values=d_out_values,
-        num_items=d_in_keys.size,
+        num_items=h_in_keys.size,
         order=SortOrder.ASCENDING,
     )
 
     # Check the result is correct
-    h_out_keys = cp.asnumpy(d_out_keys)
-    h_out_items = cp.asnumpy(d_out_values)
+    h_out_keys = d_out_keys.copy_to_host()
+    h_out_items = d_out_values.copy_to_host()
 
     argsort = np.argsort(h_in_keys, stable=True)
     h_in_keys = np.array(h_in_keys)[argsort]
@@ -519,7 +521,7 @@ def test_radix_sort(monkeypatch):
 
 
 def test_radix_sort_double_buffer(monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = get_compute_capability()
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -531,19 +533,16 @@ def test_radix_sort_double_buffer(monkeypatch):
             False,
         )
 
-    import cupy as cp
-    import numpy as np
-
     h_in_keys = np.array([-5, 0, 2, -3, 2, 4, 0, -1, 2, 8], dtype="int32")
     h_in_values = np.array(
         [-3.2, 2.2, 1.9, 4.0, -3.9, 2.7, 0, 8.3 - 1, 2.9, 5.4], dtype="float32"
     )
 
-    d_in_keys = cp.asarray(h_in_keys)
-    d_in_values = cp.asarray(h_in_values)
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
+    d_in_values = DeviceArray.from_numpy(h_in_values)
 
-    d_out_keys = cp.empty_like(d_in_keys)
-    d_out_values = cp.empty_like(d_in_values)
+    d_out_keys = DeviceArray.empty(h_in_keys.shape, h_in_keys.dtype)
+    d_out_values = DeviceArray.empty(h_in_values.shape, h_in_values.dtype)
 
     keys_double_buffer = DoubleBuffer(d_in_keys, d_out_keys)
     values_double_buffer = DoubleBuffer(d_in_values, d_out_values)
@@ -554,13 +553,13 @@ def test_radix_sort_double_buffer(monkeypatch):
         d_out_keys=None,
         d_in_values=values_double_buffer,
         d_out_values=None,
-        num_items=d_in_keys.size,
+        num_items=h_in_keys.size,
         order=SortOrder.ASCENDING,
     )
 
     # Check the result is correct
-    h_out_keys = cp.asnumpy(keys_double_buffer.current())
-    h_out_values = cp.asnumpy(values_double_buffer.current())
+    h_out_keys = keys_double_buffer.current().copy_to_host()
+    h_out_values = values_double_buffer.current().copy_to_host()
 
     argsort = np.argsort(h_in_keys, stable=True)
     h_in_keys = np.array(h_in_keys)[argsort]
diff --git a/python/cuda_cccl/tests/compute/test_raw_op.py b/python/cuda_cccl/tests/compute/test_raw_op.py
index 9ad88fe6182..b8544658a01 100644
--- a/python/cuda_cccl/tests/compute/test_raw_op.py
+++ b/python/cuda_cccl/tests/compute/test_raw_op.py
@@ -6,15 +6,15 @@
 import re
 import struct
 
-import cupy as cp
 import numpy as np
 import pytest
+from _utils.device_array import DeviceArray, get_compute_capability
 
 import cuda.compute
 from cuda.compute import types
 from cuda.compute._cpp_compile import _get_include_paths
 from cuda.compute.op import RawOp
-from cuda.core import Device, Program, ProgramOptions
+from cuda.core import Program, ProgramOptions
 
 # Mark all tests in this module as no_numba
 pytestmark = pytest.mark.no_numba
@@ -22,9 +22,7 @@
 
 def get_arch():
     """Get the SM architecture string for the current device."""
-    device = Device()
-    device.set_current()
-    cc_major, cc_minor = device.compute_capability
+    cc_major, cc_minor = get_compute_capability()
     return f"sm_{cc_major}{cc_minor}"
 
 
@@ -115,8 +113,8 @@ def test_cpp_op_basic_add():
     # Create test data
     num_items = 100
     h_input = np.arange(num_items, dtype=np.int32)
-    d_input = cp.array(h_input)
-    d_output = cp.empty(1, dtype=np.int32)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, np.int32)
 
     # Use the custom op with reduce_into
     h_init = np.array(0, dtype=np.int32)
@@ -125,7 +123,7 @@ def test_cpp_op_basic_add():
     )
 
     # Verify result
-    result = d_output.get()[0]
+    result = d_output.copy_to_host()[0]
     expected = np.sum(h_input)
     assert result == expected, f"Expected {expected}, got {result}"
 
@@ -148,8 +146,8 @@ def test_cpp_op_max():
     # Create test data
     num_items = 100
     h_input = np.random.randn(num_items).astype(np.float32)
-    d_input = cp.array(h_input)
-    d_output = cp.empty(1, dtype=np.float32)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, np.float32)
 
     # Use the custom op with reduce_into
     h_init = np.array(-np.inf, dtype=np.float32)
@@ -158,7 +156,7 @@ def test_cpp_op_max():
     )
 
     # Verify result
-    result = d_output.get()[0]
+    result = d_output.copy_to_host()[0]
     expected = np.max(h_input)
     assert np.isclose(result, expected), f"Expected {expected}, got {result}"
 
@@ -176,8 +174,8 @@ def test_cpp_op_multiply():
     # Create test data - use small numbers to avoid overflow
     num_items = 5
     h_input = np.array([1, 2, 3, 4, 5], dtype=np.int32)
-    d_input = cp.array(h_input)
-    d_output = cp.empty(1, dtype=np.int32)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, np.int32)
 
     # Use the custom op with reduce_into
     h_init = np.array(1, dtype=np.int32)
@@ -186,7 +184,7 @@ def test_cpp_op_multiply():
     )
 
     # Verify result
-    result = d_output.get()[0]
+    result = d_output.copy_to_host()[0]
     expected = np.prod(h_input)
     assert result == expected, f"Expected {expected}, got {result}"
 
@@ -207,8 +205,8 @@ def test_cpp_op_complex_logic():
     # Create test data with specific bit patterns
     num_items = 5
     h_input = np.array([1, 2, 4, 8, 16], dtype=np.int32)  # Powers of 2
-    d_input = cp.array(h_input)
-    d_output = cp.empty(1, dtype=np.int32)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, np.int32)
 
     # Use the custom op with reduce_into
     h_init = np.array(0, dtype=np.int32)
@@ -217,7 +215,7 @@ def test_cpp_op_complex_logic():
     )
 
     # Expected: 1 | 2 | 4 | 8 | 16 = 31 (all bits set)
-    result = d_output.get()[0]
+    result = d_output.copy_to_host()[0]
     expected = 31
     assert result == expected, f"Expected {expected}, got {result}"
 
@@ -235,8 +233,8 @@ def test_cpp_op_different_types():
     # Create test data
     num_items = 50
     h_input = np.random.randn(num_items).astype(np.float64)
-    d_input = cp.array(h_input)
-    d_output = cp.empty(1, dtype=np.float64)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, np.float64)
 
     # Use the custom op with reduce_into
     h_init = np.array(0.0, dtype=np.float64)
@@ -245,7 +243,7 @@ def test_cpp_op_different_types():
     )
 
     # Verify result
-    result = d_output.get()[0]
+    result = d_output.copy_to_host()[0]
     expected = np.sum(h_input)
     assert np.isclose(result, expected), f"Expected {expected}, got {result}"
 
@@ -264,8 +262,8 @@ def test_cpp_op_name_extraction():
     # Create test data
     num_items = 10
     h_input = np.arange(num_items, dtype=np.int32)
-    d_input = cp.array(h_input)
-    d_output = cp.empty(1, dtype=np.int32)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, np.int32)
 
     # Use the custom op with reduce_into
     h_init = np.array(0, dtype=np.int32)
@@ -274,7 +272,7 @@ def test_cpp_op_name_extraction():
     )
 
     # Verify result
-    result = d_output.get()[0]
+    result = d_output.copy_to_host()[0]
     expected = np.sum(h_input)
     assert result == expected, f"Expected {expected}, got {result}"
 
@@ -294,8 +292,8 @@ def test_cpp_op_min():
     # Create test data
     num_items = 100
     h_input = np.random.randint(-1000, 1000, num_items, dtype=np.int32)
-    d_input = cp.array(h_input)
-    d_output = cp.empty(1, dtype=np.int32)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, np.int32)
 
     # Use the custom op with reduce_into
     h_init = np.array(np.iinfo(np.int32).max, dtype=np.int32)
@@ -304,7 +302,7 @@ def test_cpp_op_min():
     )
 
     # Verify result
-    result = d_output.get()[0]
+    result = d_output.copy_to_host()[0]
     expected = np.min(h_input)
     assert result == expected, f"Expected {expected}, got {result}"
 
@@ -341,14 +339,8 @@ def test_cpp_op_with_struct():
         h_data[i]["x"] = i
         h_data[i]["y"] = i * 2
 
-    # Convert to device arrays using uint8 view
-    itemsize = h_data.dtype.itemsize
-    d_input = cp.empty(num_items * itemsize, dtype=np.uint8)
-    d_input.set(h_data.view(np.uint8))
-    d_input = d_input.view(Point.dtype)
-
-    d_output = cp.empty(itemsize, dtype=np.uint8)
-    d_output = d_output.view(Point.dtype)
+    d_input = DeviceArray.from_numpy(h_data)
+    d_output = DeviceArray.empty(1, Point.dtype)
 
     # Initial point (0, 0)
     h_init = Point(0, 0)
@@ -359,7 +351,7 @@ def test_cpp_op_with_struct():
     )
 
     # Verify result
-    result = d_output.view(np.uint8).get().view(Point.dtype)[0]
+    result = d_output.copy_to_host()[0]
     expected_x = sum(range(num_items))  # 0+1+2+...+9 = 45
     expected_y = sum(i * 2 for i in range(num_items))  # 0+2+4+...+18 = 90
 
@@ -383,13 +375,13 @@ def test_cpp_op_with_transform_iterator():
     # Create input data
     num_items = 10
     h_input = np.arange(num_items, dtype=np.int32)
-    d_input = cp.array(h_input)
+    d_input = DeviceArray.from_numpy(h_input)
 
     # Create transform iterator with RawOp
     transform_iter = TransformIterator(d_input, op, value_type=types.int32)
 
     # Use the transform iterator with reduce
-    d_output = cp.empty(1, dtype=np.int32)
+    d_output = DeviceArray.empty(1, np.int32)
     h_init = np.array(0, dtype=np.int32)
 
     # Sum the doubled values using built-in PLUS operator
@@ -402,7 +394,7 @@ def test_cpp_op_with_transform_iterator():
     )
 
     # Verify result: sum of (0*2, 1*2, 2*2, ..., 9*2) = 2 * sum(0..9) = 2 * 45 = 90
-    result = d_output.get()[0]
+    result = d_output.copy_to_host()[0]
     expected = 2 * np.sum(h_input)
     assert result == expected, f"Expected {expected}, got {result}"
 
@@ -410,7 +402,7 @@ def test_cpp_op_with_transform_iterator():
 def test_cpp_stateful_op_reduce_with_constant():
     """Test stateful RawOp with a simple stateful reduce."""
     # State: a single int32 constant value (10) on device
-    d_constant = cp.array([10], dtype=np.int32)
+    d_constant = DeviceArray.from_numpy(np.array([10], dtype=np.int32))
     constant_ptr = d_constant.__cuda_array_interface__["data"][0]
     state_data = struct.pack("P", constant_ptr)
     state_alignment = np.dtype(np.intp).alignment
@@ -435,8 +427,8 @@ def test_cpp_stateful_op_reduce_with_constant():
     # Create test data
     num_items = 5
     h_input = np.array([1, 2, 3, 4, 5], dtype=np.int32)
-    d_input = cp.array(h_input)
-    d_output = cp.empty(1, dtype=np.int32)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, np.int32)
 
     # Use the stateful op with reduce_into
     h_init = np.array(0, dtype=np.int32)
@@ -445,7 +437,7 @@ def test_cpp_stateful_op_reduce_with_constant():
     )
 
     # Get result
-    result = d_output.get()[0]
+    result = d_output.copy_to_host()[0]
     # Each reduction adds 10, so we expect input sum + some multiple of 10
     # The exact value depends on tree structure, but should be > sum(inputs)
     sum_inputs = np.sum(h_input)
@@ -455,7 +447,7 @@ def test_cpp_stateful_op_reduce_with_constant():
 def test_cpp_stateful_op_select_with_counter():
     """Test stateful RawOp with select_if that atomically updates a counter."""
     # Create a device counter initialized to 0
-    d_counter = cp.zeros(1, dtype=np.int32)
+    d_counter = DeviceArray.from_numpy(np.zeros(1, dtype=np.int32))
 
     # State: pointer to the counter
     counter_ptr = d_counter.__cuda_array_interface__["data"][0]
@@ -497,11 +489,11 @@ def test_cpp_stateful_op_select_with_counter():
     # Create test data: 0 to 19
     num_items = 20
     h_input = np.arange(num_items, dtype=np.int32)
-    d_input = cp.array(h_input)
+    d_input = DeviceArray.from_numpy(h_input)
 
     # Allocate output arrays
-    d_output = cp.empty(num_items, dtype=np.int32)
-    d_num_selected = cp.empty(1, dtype=np.int32)
+    d_output = DeviceArray.empty(num_items, np.int32)
+    d_num_selected = DeviceArray.empty(1, np.int32)
 
     # Run select
     cuda.compute.select(
@@ -513,8 +505,8 @@ def test_cpp_stateful_op_select_with_counter():
     )
 
     # Get results
-    num_selected = d_num_selected.get()[0]
-    counter_value = d_counter.get()[0]
+    num_selected = d_num_selected.copy_to_host()[0]
+    counter_value = d_counter.copy_to_host()[0]
 
     # Verify: should have selected 10 even numbers (0, 2, 4, ..., 18)
     expected_count = 10
@@ -526,7 +518,7 @@ def test_cpp_stateful_op_select_with_counter():
     )
 
     # Verify the selected values are correct
-    selected_values = d_output.get()[:num_selected]
+    selected_values = d_output.copy_to_host()[:num_selected]
     expected_selected = np.arange(0, 20, 2, dtype=np.int32)
     assert np.array_equal(selected_values, expected_selected), (
         "Selected values don't match"
diff --git a/python/cuda_cccl/tests/compute/test_reduce.py b/python/cuda_cccl/tests/compute/test_reduce.py
index df3f024f0ee..5af40f95ad7 100644
--- a/python/cuda_cccl/tests/compute/test_reduce.py
+++ b/python/cuda_cccl/tests/compute/test_reduce.py
@@ -5,11 +5,9 @@
 import functools
 import random
 
-import cupy as cp
-import numba.cuda
 import numpy as np
 import pytest
-from cupy.cuda import runtime
+from _utils.device_array import DeviceArray
 
 import cuda.compute
 from cuda.compute import (
@@ -73,13 +71,13 @@ def add_op(a, b):
 def test_device_reduce(dtype, num_items, op):
     init_value = 42
     h_init = np.array([init_value], dtype=dtype)
-    d_output = numba.cuda.device_array(1, dtype=dtype)
+    d_output = DeviceArray.empty(1, dtype)
 
     h_input = random_int(num_items, dtype)
-    d_input = numba.cuda.to_device(h_input)
+    d_input = DeviceArray.from_numpy(h_input)
 
     cuda.compute.reduce_into(
-        d_in=d_input, d_out=d_output, num_items=d_input.size, op=op, h_init=h_init
+        d_in=d_input, d_out=d_output, num_items=h_input.size, op=op, h_init=h_init
     )
     h_output = d_output.copy_to_host()
     assert h_output[0] == pytest.approx(
@@ -94,16 +92,16 @@ def test_device_reduce_with_lambda():
     num_items = 1024
 
     h_init = np.array([init_value], dtype=dtype)
-    d_output = numba.cuda.device_array(1, dtype=dtype)
+    d_output = DeviceArray.empty(1, dtype)
 
     h_input = random_int(num_items, dtype)
-    d_input = numba.cuda.to_device(h_input)
+    d_input = DeviceArray.from_numpy(h_input)
 
     # Use a lambda function directly as the reducer
     cuda.compute.reduce_into(
         d_in=d_input,
         d_out=d_output,
-        num_items=d_input.size,
+        num_items=h_input.size,
         op=lambda a, b: a + b,
         h_init=h_init,
     )
@@ -118,16 +116,16 @@ def test_device_reduce_with_lambda_variable():
     num_items = 1024
 
     h_init = np.array([init_value], dtype=dtype)
-    d_output = numba.cuda.device_array(1, dtype=dtype)
+    d_output = DeviceArray.empty(1, dtype)
 
     h_input = random_int(num_items, dtype)
-    d_input = numba.cuda.to_device(h_input)
+    d_input = DeviceArray.from_numpy(h_input)
 
     # Use a lambda function assigned to a variable as the reducer
     cuda.compute.reduce_into(
         d_in=d_input,
         d_out=d_output,
-        num_items=d_input.size,
+        num_items=h_input.size,
         op=add_op_lambda,
         h_init=h_init,
     )
@@ -137,13 +135,13 @@ def test_device_reduce_with_lambda_variable():
 
 def test_complex_device_reduce():
     h_init = np.array([40.0 + 2.0j], dtype=complex)
-    d_output = numba.cuda.device_array(1, dtype=complex)
+    d_output = DeviceArray.empty(1, complex)
 
     for num_items in [42, 420000]:
         real_imag = np.random.random((2, num_items))
         h_input = real_imag[0] + 1j * real_imag[1]
-        d_input = numba.cuda.to_device(h_input)
-        assert d_input.size == num_items
+        d_input = DeviceArray.from_numpy(h_input)
+        assert h_input.size == num_items
         cuda.compute.reduce_into(
             d_in=d_input, d_out=d_output, num_items=num_items, op=add_op, h_init=h_init
         )
@@ -162,11 +160,11 @@ def _test_device_sum_with_iterator(
 
     if use_numpy_array:
         h_input = np.array(l_varr, dtype_inp)
-        d_input = numba.cuda.to_device(h_input)
+        d_input = DeviceArray.from_numpy(h_input)
     else:
         d_input = i_input
 
-    d_output = numba.cuda.device_array(1, dtype_out)  # to store device sum
+    d_output = DeviceArray.empty(1, dtype_out)  # to store device sum
 
     h_init = np.array([start_sum_with], dtype_out)
 
@@ -216,7 +214,7 @@ def test_device_sum_cache_modified_input_it(
     l_varr = [rng.randrange(100) for _ in range(num_items)]
     dtype_inp = np.dtype(supported_value_type)
     dtype_out = dtype_inp
-    input_devarr = numba.cuda.to_device(np.array(l_varr, dtype=dtype_inp))
+    input_devarr = DeviceArray.from_numpy(np.array(l_varr, dtype=dtype_inp))
     i_input = CacheModifiedInputIterator(input_devarr, modifier="stream")
     _test_device_sum_with_iterator(
         l_varr, start_sum_with, i_input, dtype_inp, dtype_out, use_numpy_array
@@ -316,7 +314,7 @@ def test_device_sum_map_mul_map_mul_count_it(
         ("int32", "int64"),
     ],
 )
-def test_device_sum_map_mul2_cp_array_it(
+def test_device_sum_map_mul2_device_array_it(
     use_numpy_array, value_type_name_pair, num_items=3, start_sum_with=10
 ):
     vtn_out, vtn_inp = value_type_name_pair
@@ -324,7 +322,7 @@ def test_device_sum_map_mul2_cp_array_it(
     dtype_out = np.dtype(vtn_out)
     rng = random.Random(0)
     l_d_in = [rng.randrange(100) for _ in range(num_items)]
-    a_d_in = cp.array(l_d_in, dtype_inp)
+    a_d_in = DeviceArray.from_numpy(np.asarray(l_d_in, dtype=dtype_inp))
     i_input = TransformIterator(a_d_in, mul2)
     l_varr = [mul2(v) for v in l_d_in]
     _test_device_sum_with_iterator(
@@ -338,14 +336,14 @@ def sum_op(x, y):
 
     # inputs are device arrays
     reducer_1 = cuda.compute.make_reduce_into(
-        d_in=cp.zeros(3, dtype="int64"),
-        d_out=cp.zeros(1, dtype="int64"),
+        d_in=DeviceArray.empty(3, dtype="int64"),
+        d_out=DeviceArray.empty(1, dtype="int64"),
         op=sum_op,
         h_init=np.zeros(1, dtype="int64"),
     )
     reducer_2 = cuda.compute.make_reduce_into(
-        d_in=cp.zeros(3, dtype="int64"),
-        d_out=cp.zeros(1, dtype="int64"),
+        d_in=DeviceArray.empty(3, dtype="int64"),
+        d_out=DeviceArray.empty(1, dtype="int64"),
         op=sum_op,
         h_init=np.zeros(1, dtype="int64"),
     )
@@ -353,14 +351,14 @@ def sum_op(x, y):
 
     # inputs are device arrays of different dtype:
     reducer_1 = cuda.compute.make_reduce_into(
-        d_in=cp.zeros(3, dtype="int64"),
-        d_out=cp.zeros(1, dtype="int64"),
+        d_in=DeviceArray.empty(3, dtype="int64"),
+        d_out=DeviceArray.empty(1, dtype="int64"),
         op=sum_op,
         h_init=np.zeros(1, dtype="int64"),
     )
     reducer_2 = cuda.compute.make_reduce_into(
-        d_in=cp.zeros(3, dtype="int32"),
-        d_out=cp.zeros(1, dtype="int64"),
+        d_in=DeviceArray.empty(3, dtype="int32"),
+        d_out=DeviceArray.empty(1, dtype="int64"),
         op=sum_op,
         h_init=np.zeros(1, dtype="int64"),
     )
@@ -368,14 +366,14 @@ def sum_op(x, y):
 
     # outputs are of different dtype:
     reducer_1 = cuda.compute.make_reduce_into(
-        d_in=cp.zeros(3, dtype="int64"),
-        d_out=cp.zeros(1, dtype="int64"),
+        d_in=DeviceArray.empty(3, dtype="int64"),
+        d_out=DeviceArray.empty(1, dtype="int64"),
         op=sum_op,
         h_init=np.zeros(1, dtype="int64"),
     )
     reducer_2 = cuda.compute.make_reduce_into(
-        d_in=cp.zeros(3, dtype="int64"),
-        d_out=cp.zeros(1, dtype="int32"),
+        d_in=DeviceArray.empty(3, dtype="int64"),
+        d_out=DeviceArray.empty(1, dtype="int32"),
         op=sum_op,
         h_init=np.zeros(1, dtype="int64"),
     )
@@ -384,14 +382,14 @@ def sum_op(x, y):
     # inputs are of same dtype but different size
     # (should still use cached reducer):
     reducer_1 = cuda.compute.make_reduce_into(
-        d_in=cp.zeros(3, dtype="int64"),
-        d_out=cp.zeros(1, dtype="int64"),
+        d_in=DeviceArray.empty(3, dtype="int64"),
+        d_out=DeviceArray.empty(1, dtype="int64"),
         op=sum_op,
         h_init=np.zeros(1, dtype="int64"),
     )
     reducer_2 = cuda.compute.make_reduce_into(
-        d_in=cp.zeros(5, dtype="int64"),
-        d_out=cp.zeros(1, dtype="int64"),
+        d_in=DeviceArray.empty(5, dtype="int64"),
+        d_out=DeviceArray.empty(1, dtype="int64"),
         op=sum_op,
         h_init=np.zeros(1, dtype="int64"),
     )
@@ -401,13 +399,13 @@ def sum_op(x, y):
     # same value type:
     reducer_1 = cuda.compute.make_reduce_into(
         d_in=CountingIterator(np.int32(0)),
-        d_out=cp.zeros(1, dtype="int64"),
+        d_out=DeviceArray.empty(1, dtype="int64"),
         op=sum_op,
         h_init=np.zeros(1, dtype="int64"),
     )
     reducer_2 = cuda.compute.make_reduce_into(
         d_in=CountingIterator(np.int32(0)),
-        d_out=cp.zeros(1, dtype="int64"),
+        d_out=DeviceArray.empty(1, dtype="int64"),
         op=sum_op,
         h_init=np.zeros(1, dtype="int64"),
     )
@@ -416,13 +414,13 @@ def sum_op(x, y):
     # inputs are counting iterators of different value type:
     reducer_1 = cuda.compute.make_reduce_into(
         d_in=CountingIterator(np.int32(0)),
-        d_out=cp.zeros(1, dtype="int64"),
+        d_out=DeviceArray.empty(1, dtype="int64"),
         op=sum_op,
         h_init=np.zeros(1, dtype="int64"),
     )
     reducer_2 = cuda.compute.make_reduce_into(
         d_in=CountingIterator(np.int64(0)),
-        d_out=cp.zeros(1, dtype="int64"),
+        d_out=DeviceArray.empty(1, dtype="int64"),
         op=sum_op,
         h_init=np.zeros(1, dtype="int64"),
     )
@@ -440,13 +438,13 @@ def op3(x):
     # inputs are TransformIterators
     reducer_1 = cuda.compute.make_reduce_into(
         d_in=TransformIterator(CountingIterator(np.int32(0)), op1),
-        d_out=cp.zeros(1, dtype="int64"),
+        d_out=DeviceArray.empty(1, dtype="int64"),
         op=sum_op,
         h_init=np.zeros(1, dtype="int64"),
     )
     reducer_2 = cuda.compute.make_reduce_into(
         d_in=TransformIterator(CountingIterator(np.int32(0)), op1),
-        d_out=cp.zeros(1, dtype="int64"),
+        d_out=DeviceArray.empty(1, dtype="int64"),
         op=sum_op,
         h_init=np.zeros(1, dtype="int64"),
     )
@@ -456,13 +454,13 @@ def op3(x):
     # op:
     reducer_1 = cuda.compute.make_reduce_into(
         d_in=TransformIterator(CountingIterator(np.int32(0)), op1),
-        d_out=cp.zeros(1, dtype="int64"),
+        d_out=DeviceArray.empty(1, dtype="int64"),
         op=sum_op,
         h_init=np.zeros(1, dtype="int64"),
     )
     reducer_2 = cuda.compute.make_reduce_into(
         d_in=TransformIterator(CountingIterator(np.int32(0)), op2),
-        d_out=cp.zeros(1, dtype="int64"),
+        d_out=DeviceArray.empty(1, dtype="int64"),
         op=sum_op,
         h_init=np.zeros(1, dtype="int64"),
     )
@@ -472,13 +470,13 @@ def op3(x):
     # but different name:
     reducer_1 = cuda.compute.make_reduce_into(
         d_in=TransformIterator(CountingIterator(np.int32(0)), op1),
-        d_out=cp.zeros(1, dtype="int64"),
+        d_out=DeviceArray.empty(1, dtype="int64"),
         op=sum_op,
         h_init=np.zeros(1, dtype="int64"),
     )
     reducer_2 = cuda.compute.make_reduce_into(
         d_in=TransformIterator(CountingIterator(np.int32(0)), op3),
-        d_out=cp.zeros(1, dtype="int64"),
+        d_out=DeviceArray.empty(1, dtype="int64"),
         op=sum_op,
         h_init=np.zeros(1, dtype="int64"),
     )
@@ -487,13 +485,13 @@ def op3(x):
     # but different state:
     reducer_1 = cuda.compute.make_reduce_into(
         d_in=CountingIterator(np.int32(0)),
-        d_out=cp.zeros(1, dtype="int64"),
+        d_out=DeviceArray.empty(1, dtype="int64"),
         op=sum_op,
         h_init=np.zeros(1, dtype="int64"),
     )
     reducer_2 = cuda.compute.make_reduce_into(
         d_in=CountingIterator(np.int32(1)),
-        d_out=cp.zeros(1, dtype="int64"),
+        d_out=DeviceArray.empty(1, dtype="int64"),
         op=sum_op,
         h_init=np.zeros(1, dtype="int64"),
     )
@@ -502,17 +500,17 @@ def op3(x):
 
     # inputs are TransformIterators of same kind
     # but different state:
-    ary1 = cp.asarray([0, 1, 2], dtype="int64")
-    ary2 = cp.asarray([0, 1], dtype="int64")
+    ary1 = DeviceArray.from_numpy(np.asarray([0, 1, 2], dtype="int64"))
+    ary2 = DeviceArray.from_numpy(np.asarray([0, 1], dtype="int64"))
     reducer_1 = cuda.compute.make_reduce_into(
         d_in=TransformIterator(ary1, op1),
-        d_out=cp.zeros(1, dtype="int64"),
+        d_out=DeviceArray.empty(1, dtype="int64"),
         op=sum_op,
         h_init=np.zeros(1, dtype="int64"),
     )
     reducer_2 = cuda.compute.make_reduce_into(
         d_in=TransformIterator(ary2, op1),
-        d_out=cp.zeros(1, dtype="int64"),
+        d_out=DeviceArray.empty(1, dtype="int64"),
         op=sum_op,
         h_init=np.zeros(1, dtype="int64"),
     )
@@ -522,13 +520,13 @@ def op3(x):
     # but different state:
     reducer_1 = cuda.compute.make_reduce_into(
         d_in=TransformIterator(CountingIterator(np.int32(0)), op1),
-        d_out=cp.zeros(1, dtype="int64"),
+        d_out=DeviceArray.empty(1, dtype="int64"),
         op=sum_op,
         h_init=np.zeros(1, dtype="int64"),
     )
     reducer_2 = cuda.compute.make_reduce_into(
         d_in=TransformIterator(CountingIterator(np.int32(1)), op1),
-        d_out=cp.zeros(1, dtype="int64"),
+        d_out=DeviceArray.empty(1, dtype="int64"),
         op=sum_op,
         h_init=np.zeros(1, dtype="int64"),
     )
@@ -537,13 +535,13 @@ def op3(x):
     # inputs are TransformIterators with different kind:
     reducer_1 = cuda.compute.make_reduce_into(
         d_in=TransformIterator(CountingIterator(np.int32(0)), op1),
-        d_out=cp.zeros(1, dtype="int64"),
+        d_out=DeviceArray.empty(1, dtype="int64"),
         op=sum_op,
         h_init=np.zeros(1, dtype="int64"),
     )
     reducer_2 = cuda.compute.make_reduce_into(
         d_in=TransformIterator(CountingIterator(np.int64(0)), op1),
-        d_out=cp.zeros(1, dtype="int64"),
+        d_out=DeviceArray.empty(1, dtype="int64"),
         op=sum_op,
         h_init=np.zeros(1, dtype="int64"),
     )
@@ -553,27 +551,25 @@ def op3(x):
 @pytest.fixture(params=[True, False])
 def array_2d(request):
     f_contiguous = request.param
-    arr = cp.random.rand(5, 10)
-    if f_contiguous:
-        try:
-            return cp.asfortranarray(arr)
-        except ImportError:  # cublas unavailable
-            return arr
-    else:
-        return arr
+    array = np.random.rand(5, 10)
+    return np.asfortranarray(array) if f_contiguous else array
 
 
 def test_reduce_2d_array(array_2d):
     def binary_op(x, y):
         return x + y
 
-    d_out = cp.empty(1, dtype=array_2d.dtype)
+    d_in = DeviceArray.from_numpy(array_2d)
+    d_out = DeviceArray.empty(1, dtype=array_2d.dtype)
     h_init = np.asarray([0], dtype=array_2d.dtype)
-    d_in = array_2d
     cuda.compute.reduce_into(
-        d_in=d_in, d_out=d_out, num_items=d_in.size, op=binary_op, h_init=h_init
+        d_in=d_in,
+        d_out=d_out,
+        num_items=array_2d.size,
+        op=binary_op,
+        h_init=h_init,
     )
-    np.testing.assert_allclose(d_in.sum().get(), d_out.get())
+    np.testing.assert_allclose(array_2d.sum(), d_out.copy_to_host())
 
 
 def test_reduce_non_contiguous():
@@ -581,16 +577,28 @@ def binary_op(x, y):
         return x + y
 
     size = 10
-    d_out = cp.empty(1, dtype="int64")
+
+    class DeviceArrayView:
+        def __init__(self, base, host_view):
+            self._base = base
+            self.__cuda_array_interface__ = {
+                **base.__cuda_array_interface__,
+                "shape": host_view.shape,
+                "strides": host_view.strides,
+            }
+
+    d_out = DeviceArray.empty(1, dtype="int64")
     h_init = np.asarray([0], dtype="int64")
 
-    d_in = cp.zeros((size, 2))[:, 0]
+    h_base = np.zeros((size, 2))
+    d_in = DeviceArrayView(DeviceArray.from_numpy(h_base), h_base[:, 0])
     with pytest.raises(ValueError, match="Non-contiguous arrays are not supported."):
         _ = cuda.compute.make_reduce_into(
             d_in=d_in, d_out=d_out, op=binary_op, h_init=h_init
         )
 
-    d_in = cp.zeros(size)[::2]
+    h_base = np.zeros(size)
+    d_in = DeviceArrayView(DeviceArray.from_numpy(h_base), h_base[::2])
     with pytest.raises(ValueError, match="Non-contiguous arrays are not supported."):
         _ = cuda.compute.make_reduce_into(
             d_in=d_in, d_out=d_out, op=binary_op, h_init=h_init
@@ -604,21 +612,18 @@ def add_op(x, y):
     h_init = np.asarray([0], dtype=np.int32)
     h_in = random_int(5, np.int32)
 
-    cp_stream = cp.cuda.ExternalStream(cuda_stream.ptr)
-    with cp_stream:
-        d_in = cp.asarray(h_in)
-        d_out = cp.empty(1, dtype=np.int32)
+    d_in = DeviceArray.from_numpy(h_in, stream=cuda_stream)
+    d_out = DeviceArray.empty(1, np.int32, stream=cuda_stream)
 
     cuda.compute.reduce_into(
         d_in=d_in,
         d_out=d_out,
-        num_items=d_in.size,
+        num_items=h_in.size,
         op=add_op,
         h_init=h_init,
         stream=cuda_stream,
     )
-    with cp_stream:
-        cp.testing.assert_allclose(d_in.sum().get(), d_out.get())
+    np.testing.assert_allclose(h_in.sum(), d_out.copy_to_host(stream=cuda_stream))
 
 
 def test_reduce_invalid_stream():
@@ -646,9 +651,9 @@ def __cuda_stream__(self):
     def add_op(x, y):
         return x + y
 
-    d_out = cp.empty(1)
+    d_out = DeviceArray.empty(1, np.float64)
     h_init = np.empty(1)
-    d_in = cp.empty(1)
+    d_in = DeviceArray.empty(1, np.float64)
     reduce_into = cuda.compute.make_reduce_into(
         d_in=d_in, d_out=d_out, op=add_op, h_init=h_init
     )
@@ -661,7 +666,7 @@ def add_op(x, y):
             d_in=d_in,
             d_out=d_out,
             op=add_op,
-            num_items=d_in.size,
+            num_items=1,
             h_init=h_init,
             stream=Stream1(),
         )
@@ -674,7 +679,7 @@ def add_op(x, y):
             d_in=d_in,
             d_out=d_out,
             op=add_op,
-            num_items=d_in.size,
+            num_items=1,
             h_init=h_init,
             stream=Stream2(),
         )
@@ -685,7 +690,7 @@ def add_op(x, y):
             d_in=d_in,
             d_out=d_out,
             op=add_op,
-            num_items=d_in.size,
+            num_items=1,
             h_init=h_init,
             stream=Stream3(),
         )
@@ -694,55 +699,58 @@ def add_op(x, y):
 def test_device_reduce_well_known_plus():
     dtype = np.int32
     h_init = np.array([0], dtype=dtype)
-    d_input = cp.array([1, 2, 3, 4, 5], dtype=dtype)
-    d_output = cp.empty(1, dtype=dtype)
+    h_input = np.array([1, 2, 3, 4, 5], dtype=dtype)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, dtype=dtype)
 
     cuda.compute.reduce_into(
         d_in=d_input,
         d_out=d_output,
-        num_items=len(d_input),
+        num_items=len(h_input),
         op=OpKind.PLUS,
         h_init=h_init,
     )
 
     expected_output = 15
-    assert (d_output == expected_output).all()
+    assert d_output.copy_to_host()[0] == expected_output
 
 
 def test_device_reduce_well_known_minimum():
     dtype = np.int32
     h_init = np.array([100], dtype=dtype)
-    d_input = cp.array([8, 6, 7, 5, 3, 0, 9], dtype=dtype)
-    d_output = cp.empty(1, dtype=dtype)
+    h_input = np.array([8, 6, 7, 5, 3, 0, 9], dtype=dtype)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, dtype=dtype)
 
     cuda.compute.reduce_into(
         d_in=d_input,
         d_out=d_output,
-        num_items=len(d_input),
+        num_items=len(h_input),
         op=OpKind.MINIMUM,
         h_init=h_init,
     )
 
     expected_output = 0
-    assert (d_output == expected_output).all()
+    assert d_output.copy_to_host()[0] == expected_output
 
 
 def test_device_reduce_well_known_maximum():
     dtype = np.int32
     h_init = np.array([-100], dtype=dtype)
-    d_input = cp.array([8, 6, 7, 5, 3, 0, 9], dtype=dtype)
-    d_output = cp.empty(1, dtype=dtype)
+    h_input = np.array([8, 6, 7, 5, 3, 0, 9], dtype=dtype)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, dtype=dtype)
 
     cuda.compute.reduce_into(
         d_in=d_input,
         d_out=d_output,
-        num_items=len(d_input),
+        num_items=len(h_input),
         op=OpKind.MAXIMUM,
         h_init=h_init,
     )
 
     expected_output = 9
-    assert (d_output == expected_output).all()
+    assert d_output.copy_to_host()[0] == expected_output
 
 
 def test_cache_modified_input_iterator():
@@ -750,19 +758,18 @@ def add_op(a, b):
         return a + b
 
     values = [8, 6, 7, 5, 3, 0, 9]
-    d_input = cp.array(values, dtype=np.int32)
-    d_output = cp.empty(1, dtype=np.int32)
+    d_input = DeviceArray.from_numpy(np.asarray(values, dtype=np.int32))
 
     iterator = CacheModifiedInputIterator(d_input, modifier="stream")
     h_init = np.array([0], dtype=np.int32)
-    d_output = cp.empty(1, dtype=np.int32)
+    d_output = DeviceArray.empty(1, dtype=np.int32)
 
     cuda.compute.reduce_into(
         d_in=iterator, d_out=d_output, num_items=len(values), op=add_op, h_init=h_init
     )
 
     expected_output = functools.reduce(lambda a, b: a + b, values)
-    assert (d_output == expected_output).all()
+    assert d_output.copy_to_host()[0] == expected_output
 
 
 def test_constant_iterator():
@@ -774,14 +781,14 @@ def add_op(a, b):
 
     constant_it = ConstantIterator(np.int32(value))
     h_init = np.array([0], dtype=np.int32)
-    d_output = cp.empty(1, dtype=np.int32)
+    d_output = DeviceArray.empty(1, dtype=np.int32)
 
     cuda.compute.reduce_into(
         d_in=constant_it, d_out=d_output, num_items=num_items, op=add_op, h_init=h_init
     )
 
     expected_output = functools.reduce(lambda a, b: a + b, [value] * num_items)
-    assert (d_output == expected_output).all()
+    assert d_output.copy_to_host()[0] == expected_output
 
 
 def test_counting_iterator():
@@ -793,7 +800,7 @@ def add_op(a, b):
 
     first_it = CountingIterator(np.int32(first_item))  # Input sequence
     h_init = np.array([0], dtype=np.int32)  # Initial value for the reduction
-    d_output = cp.empty(1, dtype=np.int32)  # Storage for output
+    d_output = DeviceArray.empty(1, dtype=np.int32)  # Storage for output
 
     cuda.compute.reduce_into(
         d_in=first_it, d_out=d_output, num_items=num_items, op=add_op, h_init=h_init
@@ -802,7 +809,7 @@ def add_op(a, b):
     expected_output = functools.reduce(
         lambda a, b: a + b, range(first_item, first_item + num_items)
     )
-    assert (d_output == expected_output).all()
+    assert d_output.copy_to_host()[0] == expected_output
 
 
 def test_transform_iterator():
@@ -817,7 +824,7 @@ def square_op(a):
 
     transform_it = TransformIterator(CountingIterator(np.int32(first_item)), square_op)
     h_init = np.array([0], dtype=np.int32)
-    d_output = cp.empty(1, dtype=np.int32)
+    d_output = DeviceArray.empty(1, dtype=np.int32)
 
     cuda.compute.reduce_into(
         d_in=transform_it, d_out=d_output, num_items=num_items, op=add_op, h_init=h_init
@@ -826,7 +833,7 @@ def square_op(a):
     expected_output = functools.reduce(
         lambda a, b: a + b, [a**2 for a in range(first_item, first_item + num_items)]
     )
-    assert (d_output == expected_output).all()
+    assert d_output.copy_to_host()[0] == expected_output
 
 
 def test_reduce_struct_type():
@@ -839,19 +846,19 @@ class Pixel:
     def max_g_value(x, y):
         return x if x.g > y.g else y
 
-    d_rgb = cp.random.randint(0, 256, (10, 3), dtype=np.int32).view(Pixel.dtype)
-    d_out = cp.empty(1, Pixel.dtype)
+    h_rgb = np.random.randint(0, 256, (10, 3), dtype=np.int32).view(Pixel.dtype)
+    d_rgb = DeviceArray.from_numpy(h_rgb)
+    d_out = DeviceArray.empty(1, Pixel.dtype)
 
     h_init = Pixel(0, 0, 0)
 
     cuda.compute.reduce_into(
-        d_in=d_rgb, d_out=d_out, num_items=d_rgb.size, op=max_g_value, h_init=h_init
+        d_in=d_rgb, d_out=d_out, num_items=h_rgb.size, op=max_g_value, h_init=h_init
     )
 
-    h_rgb = d_rgb.get()
     expected = h_rgb[h_rgb.view("int32")[:, 1].argmax()]
 
-    np.testing.assert_equal(expected["g"], d_out.get()["g"])
+    np.testing.assert_equal(expected["g"], d_out.copy_to_host()["g"])
 
 
 @pytest.mark.no_verify_sass(reason="LDL/STL instructions emitted for this test.")
@@ -872,14 +879,15 @@ def transform_op(v):
 
     nelems = 4096
 
-    d_in = cp.random.randn(nelems)
+    h_in = np.random.randn(nelems)
+    d_in = DeviceArray.from_numpy(h_in)
     # input values must be transformed to MinMax structures
     # in-place to map computation to data-parallel reduction
     # algorithm that requires commutative binary operation
     # with both operands having the same type.
     tr_it = TransformIterator(d_in, transform_op)
 
-    d_out = cp.empty(tuple(), dtype=MinMax.dtype)
+    d_out = DeviceArray.empty(tuple(), dtype=MinMax.dtype)
 
     # initial value set with identity elements of
     # minimum and maximum operators
@@ -891,9 +899,9 @@ def transform_op(v):
     )
 
     # display values computed on the device
-    actual = d_out.get()
+    actual = d_out.copy_to_host()
 
-    h = np.abs(d_in.get())
+    h = np.abs(h_in)
     expected = np.asarray([(h.min(), h.max())], dtype=MinMax.dtype)
 
     assert actual == expected
@@ -905,8 +913,8 @@ def test_reduce_transform_output_iterator(floating_array):
     h_init = np.array([0], dtype=dtype)
 
     # Use the floating_array fixture which provides random floating-point data of size 1000
-    d_input = floating_array
-    d_output = cp.empty(1, dtype=dtype)
+    d_input = DeviceArray.from_numpy(floating_array)
+    d_output = DeviceArray.empty(1, dtype=dtype)
 
     def sqrt(x: dtype) -> dtype:
         return x**0.5
@@ -916,26 +924,26 @@ def sqrt(x: dtype) -> dtype:
     cuda.compute.reduce_into(
         d_in=d_input,
         d_out=d_out_it,
-        num_items=len(d_input),
+        num_items=len(floating_array),
         op=OpKind.PLUS,
         h_init=h_init,
     )
 
-    expected = cp.sqrt(cp.sum(d_input))
-    np.testing.assert_allclose(d_output.get(), expected.get(), atol=1e-6)
+    expected = np.sqrt(np.sum(floating_array))
+    np.testing.assert_allclose(d_output.copy_to_host(), expected, atol=1e-6)
 
 
 def test_reduce_with_not_guaranteed_determinism(floating_array):
     dtype = floating_array.dtype
     h_init = np.array([0], dtype=dtype)
 
-    d_input = floating_array
-    d_output = cp.empty(1, dtype=dtype)
+    d_input = DeviceArray.from_numpy(floating_array)
+    d_output = DeviceArray.empty(1, dtype=dtype)
 
     cuda.compute.reduce_into(
         d_in=d_input,
         d_out=d_output,
-        num_items=len(d_input),
+        num_items=len(floating_array),
         op=OpKind.PLUS,
         h_init=h_init,
         determinism=Determinism.NOT_GUARANTEED,
@@ -944,20 +952,21 @@ def test_reduce_with_not_guaranteed_determinism(floating_array):
 
 def test_reduce_bool():
     h_init = np.array([False])
-    d_input = cp.array([True, False, True])
-    d_output = cp.empty_like(d_input, shape=(1,))
+    h_input = np.array([True, False, True])
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, h_input.dtype)
 
     # Perform the reduction.
     cuda.compute.reduce_into(
         d_in=d_input,
         d_out=d_output,
-        num_items=len(d_input),
+        num_items=len(h_input),
         op=OpKind.MAXIMUM,
         h_init=h_init,
     )
 
     expected = True
-    assert d_output.get()[0] == expected
+    assert d_output.copy_to_host()[0] == expected
 
 
 def test_reduce_input_and_accumulator_type_mismatch():
@@ -969,29 +978,16 @@ class AccumulatorType:
     def op(foo1: AccumulatorType, foo2: AccumulatorType):
         return AccumulatorType(foo1.x + foo2.x, foo1.y + foo2.y)
 
-    def to_cupy_record(h_array):
-        # a helper function to copy a numpy array of record type
-        # into a cupy array. The cupy `asarray` function doesn't
-        # work for record types.
-        d_array = cp.empty(h_array.nbytes, dtype=np.uint8)
-        runtime.memcpy(
-            d_array.data.ptr,
-            h_array.ctypes.data,
-            h_array.nbytes,
-            runtime.memcpyHostToDevice,
-        )
-        return d_array.view(h_array.dtype).reshape(h_array.shape)
-
     # input data is {int32, int64}
     dtype = np.dtype([("x", np.int32), ("y", np.int64)], align=True)
     h_data = np.asarray([(1, 2), (3, 4), (5, 6)], dtype=dtype)
-    d_data = to_cupy_record(h_data)
+    d_data = DeviceArray.from_numpy(h_data)
 
     # output and h_init, both are AccumulatorType
-    d_out = cp.empty(1, AccumulatorType.dtype)
+    d_out = DeviceArray.empty(1, AccumulatorType.dtype)
     h_init = AccumulatorType(0, 0)  # Init is AccumulatorType
 
     with pytest.raises(TypeError, match="reduce_into dtype mismatch: input dtype"):
         cuda.compute.reduce_into(
-            d_in=d_data, d_out=d_out, op=op, num_items=d_data.size, h_init=h_init
+            d_in=d_data, d_out=d_out, op=op, num_items=h_data.size, h_init=h_init
         )
diff --git a/python/cuda_cccl/tests/compute/test_scan.py b/python/cuda_cccl/tests/compute/test_scan.py
index af6d941b11b..d2f2a8ad14a 100644
--- a/python/cuda_cccl/tests/compute/test_scan.py
+++ b/python/cuda_cccl/tests/compute/test_scan.py
@@ -3,10 +3,9 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 
-import cupy as cp
-import numba.cuda
 import numpy as np
 import pytest
+from _utils.device_array import DeviceArray, get_compute_capability
 
 import cuda.compute
 from cuda.compute import (
@@ -52,7 +51,7 @@ def scan_device(d_input, d_output, num_items, op, h_init, force_inclusive, strea
     [True, False],
 )
 def test_scan_array_input(force_inclusive, input_array, monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = get_compute_capability()
     # Skip sass verification if input is complex
     # as LDL/STL instructions are emitted for complex types.
     # Also skip for:
@@ -88,15 +87,16 @@ def op(a, b):
     is_short_dtype = dtype.itemsize < 16
     # for small range data types make input small to assure that
     # accumulation does not overflow
-    d_input = input_array[:31] if is_short_dtype else input_array
+    h_input = input_array[:31] if is_short_dtype else input_array
+    d_input = DeviceArray.from_numpy(h_input)
 
     h_init = np.array([42], dtype=dtype)
-    d_output = cp.empty_like(d_input)
+    d_output = DeviceArray.empty(h_input.shape, h_input.dtype)
 
-    scan_device(d_input, d_output, len(d_input), reduce_op, h_init, force_inclusive)
+    scan_device(d_input, d_output, h_input.size, reduce_op, h_init, force_inclusive)
 
-    got = d_output.get()
-    expected = scan_host(d_input.get(), op, h_init, force_inclusive)
+    got = d_output.copy_to_host()
+    expected = scan_host(h_input, op, h_init, force_inclusive)
 
     if np.isdtype(dtype, ("real floating", "complex floating")):
         real_dt = np.finfo(dtype).dtype
@@ -119,11 +119,11 @@ def op(a, b):
     num_items = 1024
     dtype = np.dtype("int32")
     h_init = np.array([42], dtype=dtype)
-    d_output = cp.empty(num_items, dtype=dtype)
+    d_output = DeviceArray.empty(num_items, dtype)
 
     scan_device(d_input, d_output, num_items, op, h_init, force_inclusive)
 
-    got = d_output.get()
+    got = d_output.copy_to_host()
     expected = scan_host(
         np.arange(1, num_items + 1, dtype=dtype), op, h_init, force_inclusive
     )
@@ -143,11 +143,11 @@ def op(a, b):
     d_input = ReverseIterator(CountingIterator(np.int32(num_items)))
     dtype = np.dtype("int32")
     h_init = np.array([0], dtype=dtype)
-    d_output = cp.empty(num_items, dtype=dtype)
+    d_output = DeviceArray.empty(num_items, dtype)
 
     scan_device(d_input, d_output, num_items, op, h_init, force_inclusive)
 
-    got = d_output.get()
+    got = d_output.copy_to_host()
     expected = scan_host(
         np.arange(num_items, 0, -1, dtype=dtype), op, h_init, force_inclusive
     )
@@ -169,19 +169,20 @@ class XY:
     def op(a, b):
         return XY(a.x + b.x, a.y + b.y)
 
-    d_input = cp.random.randint(0, 256, (10, 2), dtype=np.int32).view(XY.dtype)
-    d_output = cp.empty_like(d_input)
+    h_input = np.random.randint(0, 256, (10, 2), dtype=np.int32).view(XY.dtype)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(h_input.shape, h_input.dtype)
 
     h_init = XY(0, 0)
 
-    scan_device(d_input, d_output, len(d_input), op, h_init, force_inclusive)
+    scan_device(d_input, d_output, len(h_input), op, h_init, force_inclusive)
 
-    got = d_output.get()
+    got = d_output.copy_to_host()
     expected_x = scan_host(
-        d_input.get()["x"], lambda a, b: a + b, np.asarray([h_init.x]), force_inclusive
+        h_input["x"], lambda a, b: a + b, np.asarray([h_init.x]), force_inclusive
     )
     expected_y = scan_host(
-        d_input.get()["y"], lambda a, b: a + b, np.asarray([h_init.y]), force_inclusive
+        h_input["y"], lambda a, b: a + b, np.asarray([h_init.y]), force_inclusive
     )
 
     np.testing.assert_allclose(expected_x, got["x"], rtol=1e-5)
@@ -196,20 +197,18 @@ def test_scan_with_stream(force_inclusive, cuda_stream):
     def op(a, b):
         return a + b
 
-    cp_stream = cp.cuda.ExternalStream(cuda_stream.ptr)
-
-    with cp_stream:
-        d_input = cp.random.randint(0, 256, 1024, dtype=np.int32)
-        d_output = cp.empty_like(d_input)
+    h_input = np.random.randint(0, 256, 1024, dtype=np.int32)
+    d_input = DeviceArray.from_numpy(h_input, stream=cuda_stream)
+    d_output = DeviceArray.empty(h_input.shape, h_input.dtype, stream=cuda_stream)
 
     h_init = np.array([42], dtype=np.int32)
 
     scan_device(
-        d_input, d_output, len(d_input), op, h_init, force_inclusive, stream=cuda_stream
+        d_input, d_output, h_input.size, op, h_init, force_inclusive, stream=cuda_stream
     )
 
-    got = d_output.get()
-    expected = scan_host(d_input.get(), op, h_init, force_inclusive)
+    got = d_output.copy_to_host(stream=cuda_stream)
+    expected = scan_host(h_input, op, h_init, force_inclusive)
 
     np.testing.assert_allclose(expected, got, rtol=1e-5)
 
@@ -217,23 +216,24 @@ def op(a, b):
 def test_exclusive_scan_well_known_plus():
     dtype = np.int32
     h_init = np.array([0], dtype=dtype)
-    d_input = cp.array([1, 2, 3, 4, 5], dtype=dtype)
-    d_output = cp.empty_like(d_input, dtype=dtype)
+    h_input = np.array([1, 2, 3, 4, 5], dtype=dtype)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(h_input.shape, dtype)
 
     cuda.compute.exclusive_scan(
         d_in=d_input,
         d_out=d_output,
         op=OpKind.PLUS,
         init_value=h_init,
-        num_items=d_input.size,
+        num_items=h_input.size,
     )
 
     expected = np.array([0, 1, 3, 6, 10])
-    np.testing.assert_equal(d_output.get(), expected)
+    np.testing.assert_equal(d_output.copy_to_host(), expected)
 
 
 def test_inclusive_scan_well_known_plus(monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = get_compute_capability()
     # Skip SASS check for CC 9.0+, due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -247,19 +247,20 @@ def test_inclusive_scan_well_known_plus(monkeypatch):
 
     dtype = np.int32
     h_init = np.array([0], dtype=dtype)
-    d_input = cp.array([1, 2, 3, 4, 5], dtype=dtype)
-    d_output = cp.empty_like(d_input, dtype=dtype)
+    h_input = np.array([1, 2, 3, 4, 5], dtype=dtype)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(h_input.shape, dtype)
 
     cuda.compute.inclusive_scan(
         d_in=d_input,
         d_out=d_output,
         op=OpKind.PLUS,
         init_value=h_init,
-        num_items=d_input.size,
+        num_items=h_input.size,
     )
 
     expected = np.array([1, 3, 6, 10, 15])
-    np.testing.assert_equal(d_output.get(), expected)
+    np.testing.assert_equal(d_output.copy_to_host(), expected)
 
 
 @pytest.mark.xfail(
@@ -268,19 +269,20 @@ def test_inclusive_scan_well_known_plus(monkeypatch):
 def test_exclusive_scan_well_known_maximum():
     dtype = np.int32
     h_init = np.array([1], dtype=dtype)
-    d_input = cp.array([-5, 0, 2, -3, 2, 4, 0, -1, 2, 8], dtype=dtype)
-    d_output = cp.empty_like(d_input, dtype=dtype)
+    h_input = np.array([-5, 0, 2, -3, 2, 4, 0, -1, 2, 8], dtype=dtype)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(h_input.shape, dtype)
 
     cuda.compute.exclusive_scan(
         d_in=d_input,
         d_out=d_output,
         op=OpKind.MAXIMUM,
         init_value=h_init,
-        num_items=d_input.size,
+        num_items=h_input.size,
     )
 
     expected = np.array([1, 1, 1, 2, 2, 2, 4, 4, 4, 4])
-    np.testing.assert_equal(d_output.get(), expected)
+    np.testing.assert_equal(d_output.copy_to_host(), expected)
 
 
 def test_scan_transform_output_iterator(floating_array):
@@ -289,8 +291,9 @@ def test_scan_transform_output_iterator(floating_array):
     h_init = np.array([0], dtype=dtype)
 
     # Use the floating_array fixture which provides random floating-point data of size 1000
-    d_input = floating_array
-    d_output = cp.empty_like(d_input, dtype=dtype)
+    h_input = floating_array
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(h_input.shape, dtype)
 
     def square(x: dtype) -> dtype:
         return x * x
@@ -302,15 +305,17 @@ def square(x: dtype) -> dtype:
         d_out=d_out_it,
         op=OpKind.PLUS,
         init_value=h_init,
-        num_items=d_input.size,
+        num_items=h_input.size,
     )
 
-    expected = cp.cumsum(d_input) ** 2
+    expected = np.cumsum(h_input) ** 2
     # Use more lenient tolerance for float32 due to precision differences
     if dtype == np.float32:
-        np.testing.assert_allclose(d_output.get(), expected.get(), atol=1e-4, rtol=1e-4)
+        np.testing.assert_allclose(
+            d_output.copy_to_host(), expected, atol=1e-4, rtol=1e-4
+        )
     else:
-        np.testing.assert_allclose(d_output.get(), expected.get(), atol=1e-6)
+        np.testing.assert_allclose(d_output.copy_to_host(), expected, atol=1e-6)
 
 
 def test_exclusive_scan_max():
@@ -318,19 +323,20 @@ def max_op(a, b):
         return max(a, b)
 
     h_init = np.array([1], dtype="int32")
-    d_input = cp.array([-5, 0, 2, -3, 2, 4, 0, -1, 2, 8], dtype="int32")
-    d_output = cp.empty_like(d_input, dtype="int32")
+    h_input = np.array([-5, 0, 2, -3, 2, 4, 0, -1, 2, 8], dtype=np.int32)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(h_input.shape, h_input.dtype)
 
     cuda.compute.exclusive_scan(
         d_in=d_input,
         d_out=d_output,
         op=max_op,
         init_value=h_init,
-        num_items=d_input.size,
+        num_items=h_input.size,
     )
 
     expected = np.asarray([1, 1, 1, 2, 2, 2, 4, 4, 4, 4])
-    np.testing.assert_equal(d_output.get(), expected)
+    np.testing.assert_equal(d_output.copy_to_host(), expected)
 
 
 def test_inclusive_scan_add():
@@ -338,23 +344,24 @@ def add_op(a, b):
         return a + b
 
     h_init = np.array([0], dtype="int32")
-    d_input = cp.array([-5, 0, 2, -3, 2, 4, 0, -1, 2, 8], dtype="int32")
-    d_output = cp.empty_like(d_input, dtype="int32")
+    h_input = np.array([-5, 0, 2, -3, 2, 4, 0, -1, 2, 8], dtype=np.int32)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(h_input.shape, h_input.dtype)
 
     cuda.compute.inclusive_scan(
         d_in=d_input,
         d_out=d_output,
         op=add_op,
         init_value=h_init,
-        num_items=d_input.size,
+        num_items=h_input.size,
     )
 
     expected = np.asarray([-5, -5, -3, -6, -4, 0, 0, -1, 1, 9])
-    np.testing.assert_equal(d_output.get(), expected)
+    np.testing.assert_equal(d_output.copy_to_host(), expected)
 
 
 def test_reverse_input_iterator(monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = get_compute_capability()
     # Skip SASS check for CC 9.0+, due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -370,8 +377,9 @@ def add_op(a, b):
         return a + b
 
     h_init = np.array([0], dtype="int32")
-    d_input = cp.array([-5, 0, 2, -3, 2, 4, 0, -1, 2, 8], dtype="int32")
-    d_output = cp.empty_like(d_input, dtype="int32")
+    h_input = np.array([-5, 0, 2, -3, 2, 4, 0, -1, 2, 8], dtype=np.int32)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(h_input.shape, h_input.dtype)
     reverse_it = ReverseIterator(d_input)
 
     cuda.compute.inclusive_scan(
@@ -379,12 +387,12 @@ def add_op(a, b):
         d_out=d_output,
         op=add_op,
         init_value=h_init,
-        num_items=len(d_input),
+        num_items=h_input.size,
     )
 
     # Check the result is correct
     expected = np.asarray([8, 10, 9, 9, 13, 15, 12, 14, 14, 9])
-    np.testing.assert_equal(d_output.get(), expected)
+    np.testing.assert_equal(d_output.copy_to_host(), expected)
 
 
 @pytest.mark.no_verify_sass(reason="LDL/STL instructions emitted for this test.")
@@ -393,8 +401,9 @@ def add_op(a, b):
         return a + b
 
     h_init = np.array([0], dtype="int32")
-    d_input = cp.array([-5, 0, 2, -3, 2, 4, 0, -1, 2, 8], dtype="int32")
-    d_output = cp.empty_like(d_input, dtype="int32")
+    h_input = np.array([-5, 0, 2, -3, 2, 4, 0, -1, 2, 8], dtype=np.int32)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(h_input.shape, h_input.dtype)
     reverse_it = ReverseIterator(d_output)
 
     cuda.compute.inclusive_scan(
@@ -402,11 +411,11 @@ def add_op(a, b):
         d_out=reverse_it,
         op=add_op,
         init_value=h_init,
-        num_items=len(d_input),
+        num_items=h_input.size,
     )
 
     expected = np.asarray([9, 1, -1, 0, 0, -4, -6, -3, -5, -5])
-    np.testing.assert_equal(d_output.get(), expected)
+    np.testing.assert_equal(d_output.copy_to_host(), expected)
 
 
 @pytest.mark.parametrize(
@@ -417,16 +426,16 @@ def test_future_init_value(force_inclusive):
     num_items = 1024
     dtype = np.dtype("int32")
 
-    d_input = cp.random.randint(0, 256, num_items, dtype=dtype)
-    d_output = cp.empty_like(d_input)
-    init_value = cp.array([42], dtype=dtype)
+    h_input = np.random.randint(0, 256, num_items, dtype=dtype)
+    h_init = np.array([42], dtype=dtype)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(h_input.shape, h_input.dtype)
+    init_value = DeviceArray.from_numpy(h_init)
 
     scan_device(d_input, d_output, num_items, OpKind.PLUS, init_value, force_inclusive)
 
-    got = d_output.get()
-    expected = scan_host(
-        d_input.get(), lambda a, b: a + b, init_value.get(), force_inclusive
-    )
+    got = d_output.copy_to_host()
+    expected = scan_host(h_input, lambda a, b: a + b, h_init, force_inclusive)
     np.testing.assert_array_equal(expected, got)
 
 
@@ -436,7 +445,7 @@ def test_no_init_value(monkeypatch):
     dtype = np.dtype("int32")
 
     # Skip SASS check for CC 9.0 due to LDL/STL CI failure.
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = get_compute_capability()
     if cc_major >= 9:
         import cuda.compute._cccl_interop
 
@@ -446,13 +455,14 @@ def test_no_init_value(monkeypatch):
             False,
         )
 
-    d_input = cp.random.randint(0, 256, num_items, dtype=dtype)
-    d_output = cp.empty_like(d_input)
+    h_input = np.random.randint(0, 256, num_items, dtype=dtype)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(h_input.shape, h_input.dtype)
 
     scan_device(d_input, d_output, num_items, OpKind.PLUS, None, force_inclusive)
 
-    got = d_output.get()
-    expected = scan_host(d_input.get(), lambda a, b: a + b, [0], force_inclusive)
+    got = d_output.copy_to_host()
+    expected = scan_host(h_input, lambda a, b: a + b, [0], force_inclusive)
     np.testing.assert_array_equal(expected, got)
 
 
@@ -462,11 +472,11 @@ def test_no_init_value_iterator():
     dtype = np.dtype("float64")
 
     d_input = CountingIterator(np.float64(0))
-    d_output = cp.empty(num_items, dtype=dtype)
+    d_output = DeviceArray.empty(num_items, dtype)
 
     scan_device(d_input, d_output, num_items, OpKind.PLUS, None, force_inclusive)
 
-    got = d_output.get()
+    got = d_output.copy_to_host()
     expected = scan_host(
         np.arange(0, num_items, dtype=dtype), lambda a, b: a + b, [0], force_inclusive
     )
@@ -477,8 +487,9 @@ def test_no_init_value_iterator():
 def test_inclusive_scan_with_lambda():
     """Test inclusive_scan with a lambda function as the scan operator."""
     h_init = np.array([0], dtype=np.int32)
-    d_input = cp.array([1, 2, 3, 4, 5], dtype=np.int32)
-    d_output = cp.empty_like(d_input)
+    h_input = np.array([1, 2, 3, 4, 5], dtype=np.int32)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(h_input.shape, h_input.dtype)
 
     # Use a lambda function directly as the scan operator
     cuda.compute.inclusive_scan(
@@ -486,21 +497,22 @@ def test_inclusive_scan_with_lambda():
         d_out=d_output,
         op=lambda a, b: a + b,
         init_value=h_init,
-        num_items=len(d_input),
+        num_items=h_input.size,
     )
 
     expected = np.array([1, 3, 6, 10, 15], dtype=np.int32)
-    np.testing.assert_array_equal(d_output.get(), expected)
+    np.testing.assert_array_equal(d_output.copy_to_host(), expected)
 
 
 @pytest.mark.parametrize("force_inclusive", [True, False])
 def test_scan_bool_maximum(force_inclusive):
     h_init = np.array([False], dtype=np.bool_)
-    d_input = cp.array([False, True, False, True], dtype=np.bool_)
-    d_output = cp.empty_like(d_input)
+    h_input = np.array([False, True, False, True], dtype=np.bool_)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(h_input.shape, h_input.dtype)
 
     scan_device(
-        d_input, d_output, len(d_input), OpKind.MAXIMUM, h_init, force_inclusive
+        d_input, d_output, h_input.size, OpKind.MAXIMUM, h_init, force_inclusive
     )
 
     if force_inclusive:
@@ -508,4 +520,4 @@ def test_scan_bool_maximum(force_inclusive):
     else:
         expected = np.array([False, False, True, True], dtype=np.bool_)
 
-    np.testing.assert_array_equal(d_output.get(), expected)
+    np.testing.assert_array_equal(d_output.copy_to_host(), expected)
diff --git a/python/cuda_cccl/tests/compute/test_segmented_reduce.py b/python/cuda_cccl/tests/compute/test_segmented_reduce.py
index 950d860a14d..105e4ff895d 100644
--- a/python/cuda_cccl/tests/compute/test_segmented_reduce.py
+++ b/python/cuda_cccl/tests/compute/test_segmented_reduce.py
@@ -2,9 +2,9 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-import cupy as cp
 import numpy as np
 import pytest
+from _utils.device_array import DeviceArray
 
 import cuda.compute
 from cuda.compute import (
@@ -17,6 +17,19 @@
 )
 
 
+def is_out_of_memory_error(error):
+    """Return True if the error message contains a known out-of-memory marker."""
+    # cuda-core raises different exception types depending on the memory
+    # resource, so the error is classified by its message text instead.
+    lowered = str(error).lower()
+    oom_markers = (
+        "out of memory",
+        "out_of_memory",
+        "failed to allocate memory from pool",
+    )
+    return any(marker in lowered for marker in oom_markers)
+
+
 @pytest.fixture(params=["i4", "u4", "i8", "u8"])
 def offset_dtype(request):
     return np.dtype(request.param)
@@ -36,22 +49,24 @@ def binary_op(a, b):
 
     assert input_array.ndim == 1
     sz = input_array.size
-    rng = cp.random
+    rng = np.random.default_rng()
     n_segments = 16
-    h_offsets = cp.zeros(n_segments + 1, dtype="int64")
+    h_offsets = np.zeros(n_segments + 1, dtype="int64")
     h_offsets[1:] = rng.multinomial(sz, [1 / n_segments] * n_segments)
 
-    offsets = cp.cumsum(cp.asarray(h_offsets, dtype=offset_dtype), dtype=offset_dtype)
+    offsets = np.cumsum(np.asarray(h_offsets, dtype=offset_dtype), dtype=offset_dtype)
 
-    start_offsets = offsets[:-1]
-    end_offsets = offsets[1:]
+    h_start_offsets = offsets[:-1]
+    h_end_offsets = offsets[1:]
 
     assert offsets.dtype == np.dtype(offset_dtype)
-    assert cp.all(start_offsets <= end_offsets)
-    assert end_offsets[-1] == sz
+    assert np.all(h_start_offsets <= h_end_offsets)
+    assert h_end_offsets[-1] == sz
 
-    d_in = cp.asarray(input_array)
-    d_out = cp.empty(n_segments, dtype=d_in.dtype)
+    d_in = DeviceArray.from_numpy(input_array)
+    d_out = DeviceArray.empty(n_segments, input_array.dtype)
+    start_offsets = DeviceArray.from_numpy(h_start_offsets)
+    end_offsets = DeviceArray.from_numpy(h_end_offsets)
 
     h_init = np.zeros(tuple(), dtype=input_array.dtype)
 
@@ -71,11 +86,16 @@ def binary_op(a, b):
         h_init=h_init,
     )
 
-    d_expected = cp.empty_like(d_out)
+    expected = np.empty(n_segments, dtype=input_array.dtype)
     for i in range(n_segments):
-        d_expected[i] = cp.sum(d_in[start_offsets[i] : end_offsets[i]])
+        expected[i] = np.sum(input_array[h_start_offsets[i] : h_end_offsets[i]])
 
-    assert cp.all(d_out == d_expected)
+    result = d_out.copy_to_host()
+    if np.issubdtype(input_array.dtype, np.inexact):
+        tolerance = 4 * np.finfo(input_array.dtype).eps
+        np.testing.assert_allclose(result, expected, rtol=tolerance, atol=tolerance)
+    else:
+        np.testing.assert_array_equal(result, expected)
 
 
 def test_segmented_reduce_struct_type(monkeypatch):
@@ -85,8 +105,6 @@ def test_segmented_reduce_struct_type(monkeypatch):
         "_check_sass",
         False,
     )
-    import cupy as cp
-    import numpy as np
 
     @gpu_struct
     class Pixel:
@@ -102,13 +120,18 @@ def align_up(n, m):
 
     segment_size = 64
     n_pixels = align_up(4000, 64)
-    offsets = cp.arange(n_pixels + segment_size - 1, step=segment_size, dtype=np.int64)
-    start_offsets = offsets[:-1]
-    end_offsets = offsets[1:]
-    n_segments = start_offsets.size
-
-    d_rgb = cp.random.randint(0, 256, (n_pixels, 3), dtype=np.int32).view(Pixel.dtype)
-    d_out = cp.empty(n_segments, Pixel.dtype)
+    offsets = np.arange(n_pixels + segment_size - 1, step=segment_size, dtype=np.int64)
+    h_start_offsets = offsets[:-1]
+    h_end_offsets = offsets[1:]
+    n_segments = h_start_offsets.size
+
+    rng = np.random.default_rng()
+    h_rgb = rng.integers(0, 256, (n_pixels, 3), dtype=np.int32)
+    h_rgb = h_rgb.view(Pixel.dtype).reshape(n_pixels)
+    d_rgb = DeviceArray.from_numpy(h_rgb)
+    d_out = DeviceArray.empty(n_segments, Pixel.dtype)
+    start_offsets = DeviceArray.from_numpy(h_start_offsets)
+    end_offsets = DeviceArray.from_numpy(h_end_offsets)
 
     h_init = Pixel(0, 0, 0)
 
@@ -123,10 +146,10 @@ def align_up(n, m):
         h_init=h_init,
     )
 
-    h_rgb = np.reshape(d_rgb.get(), (n_segments, -1))
+    h_rgb = np.reshape(h_rgb, (n_segments, -1))
     expected = h_rgb[np.arange(h_rgb.shape[0]), h_rgb["g"].argmax(axis=-1)]
 
-    np.testing.assert_equal(expected["g"], d_out.get()["g"])
+    np.testing.assert_equal(expected["g"], d_out.copy_to_host()["g"])
 
 
 @pytest.mark.large
@@ -173,10 +196,12 @@ def scale(row_id):
 
     num_segments = (2**15 + 2**3) * 2**16
     try:
-        res = cp.full(num_segments, fill_value=127, dtype=cp.uint8)
-    except cp.cuda.memory.OutOfMemoryError:
+        res = DeviceArray.empty(num_segments, np.uint8)
+    except Exception as error:
+        if not is_out_of_memory_error(error):
+            raise
         pytest.skip("Insufficient memory to run the large number of segments test")
-    assert res.size == num_segments
+    assert res.nbytes == num_segments * np.dtype(np.uint8).itemsize
 
     def my_add(a: np.uint8, b: np.uint8) -> np.uint8:
         return (a + b) % np.uint8(7)
@@ -246,10 +271,12 @@ def _plus(a, b):
 
     num_segments = (2**15 + 2**3) * 2**16
     try:
-        res = cp.full(num_segments, fill_value=-1, dtype=cp.int16)
-    except cp.cuda.memory.OutOfMemoryError:
+        res = DeviceArray.empty(num_segments, np.int16)
+    except Exception as error:
+        if not is_out_of_memory_error(error):
+            raise
         pytest.skip("Insufficient memory to run the large number of segments test")
-    assert res.size == num_segments
+    assert res.nbytes == num_segments * np.dtype(np.int16).itemsize
 
     h_init = np.zeros(tuple(), dtype=np.int16)
 
@@ -280,10 +307,13 @@ def test_segmented_reduce_well_known_plus(monkeypatch):
     h_init = np.array([0], dtype=dtype)
 
     # Create segmented data: [1, 2, 3] | [4, 5] | [6, 7, 8, 9]
-    d_input = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=dtype)
-    d_starts = cp.array([0, 3, 5], dtype=np.int32)
-    d_ends = cp.array([3, 5, 9], dtype=np.int32)
-    d_output = cp.empty(3, dtype=dtype)
+    h_input = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=dtype)
+    h_starts = np.array([0, 3, 5], dtype=np.int32)
+    h_ends = np.array([3, 5, 9], dtype=np.int32)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_starts = DeviceArray.from_numpy(h_starts)
+    d_ends = DeviceArray.from_numpy(h_ends)
+    d_output = DeviceArray.empty(3, dtype)
 
     cuda.compute.segmented_reduce(
         d_in=d_input,
@@ -296,7 +326,7 @@ def test_segmented_reduce_well_known_plus(monkeypatch):
     )
 
     expected = np.array([6, 9, 30])
-    np.testing.assert_equal(d_output.get(), expected)
+    np.testing.assert_equal(d_output.copy_to_host(), expected)
 
 
 def test_segmented_reduce_well_known_maximum(monkeypatch):
@@ -310,10 +340,13 @@ def test_segmented_reduce_well_known_maximum(monkeypatch):
     h_init = np.array([-100], dtype=dtype)
 
     # Create segmented data: [1, 9, 3] | [4, 2] | [6, 7, 1, 8]
-    d_input = cp.array([1, 9, 3, 4, 2, 6, 7, 1, 8], dtype=dtype)
-    d_starts = cp.array([0, 3, 5], dtype=np.int32)
-    d_ends = cp.array([3, 5, 9], dtype=np.int32)
-    d_output = cp.empty(3, dtype=dtype)
+    h_input = np.array([1, 9, 3, 4, 2, 6, 7, 1, 8], dtype=dtype)
+    h_starts = np.array([0, 3, 5], dtype=np.int32)
+    h_ends = np.array([3, 5, 9], dtype=np.int32)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_starts = DeviceArray.from_numpy(h_starts)
+    d_ends = DeviceArray.from_numpy(h_ends)
+    d_output = DeviceArray.empty(3, dtype)
 
     cuda.compute.segmented_reduce(
         d_in=d_input,
@@ -326,7 +359,7 @@ def test_segmented_reduce_well_known_maximum(monkeypatch):
     )
 
     expected = np.array([9, 4, 8])  # max of each segment
-    np.testing.assert_equal(d_output.get(), expected)
+    np.testing.assert_equal(d_output.copy_to_host(), expected)
 
 
 def test_segmented_reduce_bool_maximum(monkeypatch):
@@ -339,10 +372,13 @@ def test_segmented_reduce_bool_maximum(monkeypatch):
     h_init = np.array([False], dtype=np.bool_)
 
     # Create segmented data: [False, True] | [False, False] | [True]
-    d_input = cp.array([False, True, False, False, True], dtype=np.bool_)
-    d_starts = cp.array([0, 2, 4], dtype=np.int32)
-    d_ends = cp.array([2, 4, 5], dtype=np.int32)
-    d_output = cp.empty(3, dtype=np.bool_)
+    h_input = np.array([False, True, False, False, True], dtype=np.bool_)
+    h_starts = np.array([0, 2, 4], dtype=np.int32)
+    h_ends = np.array([2, 4, 5], dtype=np.int32)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_starts = DeviceArray.from_numpy(h_starts)
+    d_ends = DeviceArray.from_numpy(h_ends)
+    d_output = DeviceArray.empty(3, np.bool_)
 
     cuda.compute.segmented_reduce(
         d_in=d_input,
@@ -355,7 +391,7 @@ def test_segmented_reduce_bool_maximum(monkeypatch):
     )
 
     expected = np.array([True, False, True], dtype=np.bool_)
-    np.testing.assert_equal(d_output.get(), expected)
+    np.testing.assert_equal(d_output.copy_to_host(), expected)
 
 
 def test_segmented_reduce_transform_output_iterator(floating_array, monkeypatch):
@@ -370,13 +406,15 @@ def test_segmented_reduce_transform_output_iterator(floating_array, monkeypatch)
     h_init = np.array([0], dtype=dtype)
 
     # Use the floating_array fixture which provides random floating-point data of size 1000
-    d_input = floating_array
+    d_input = DeviceArray.from_numpy(floating_array)
 
     # Create 2 segments of roughly equal size
-    segment_size = d_input.size // 2
-    d_output = cp.empty(2, dtype=dtype)
-    start_offsets = cp.array([0, segment_size], dtype=np.int32)
-    end_offsets = cp.array([segment_size, d_input.size], dtype=np.int32)
+    segment_size = floating_array.size // 2
+    d_output = DeviceArray.empty(2, dtype)
+    start_offsets = DeviceArray.from_numpy(np.array([0, segment_size], dtype=np.int32))
+    end_offsets = DeviceArray.from_numpy(
+        np.array([segment_size, floating_array.size], dtype=np.int32)
+    )
 
     def sqrt(x: dtype) -> dtype:
         return x**0.5
@@ -393,15 +431,15 @@ def sqrt(x: dtype) -> dtype:
         h_init=h_init,
     )
 
-    expected = cp.sqrt(
-        cp.array(
+    expected = np.sqrt(
+        np.array(
             [
-                cp.sum(d_input[0:segment_size]),
-                cp.sum(d_input[segment_size : d_input.size]),
+                np.sum(floating_array[:segment_size]),
+                np.sum(floating_array[segment_size:]),
             ]
         )
     )
-    np.testing.assert_allclose(d_output.get(), expected.get(), atol=1e-6)
+    np.testing.assert_allclose(d_output.copy_to_host(), expected, atol=1e-6)
 
 
 def test_device_segmented_reduce_for_rowwise_sum(monkeypatch):
@@ -416,7 +454,7 @@ def add_op(a, b):
         return a + b
 
     n_rows, n_cols = 67, 12345
-    rng = cp.random.default_rng()
+    rng = np.random.default_rng()
     mat = rng.integers(low=-31, high=32, dtype=np.int32, size=(n_rows, n_cols))
 
     def make_scaler(step):
@@ -431,9 +469,9 @@ def scale(row_id):
 
     end_offsets = start_offsets + 1
 
-    d_input = mat
+    d_input = DeviceArray.from_numpy(mat)
     h_init = np.zeros(tuple(), dtype=np.int32)
-    d_output = cp.empty(n_rows, dtype=d_input.dtype)
+    d_output = DeviceArray.empty(n_rows, mat.dtype)
 
     cuda.compute.segmented_reduce(
         d_in=d_input,
@@ -445,8 +483,8 @@ def scale(row_id):
         h_init=h_init,
     )
 
-    expected = cp.sum(mat, axis=-1)
-    assert cp.all(d_output == expected)
+    expected = np.sum(mat, axis=-1)
+    np.testing.assert_array_equal(d_output.copy_to_host(), expected)
 
 
 def test_segmented_reduce_with_lambda(monkeypatch):
@@ -461,10 +499,13 @@ def test_segmented_reduce_with_lambda(monkeypatch):
     h_init = np.array([0], dtype=dtype)
 
     # Create segmented data: [1, 2, 3] | [4, 5] | [6, 7, 8, 9]
-    d_input = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=dtype)
-    d_starts = cp.array([0, 3, 5], dtype=np.int32)
-    d_ends = cp.array([3, 5, 9], dtype=np.int32)
-    d_output = cp.empty(3, dtype=dtype)
+    h_input = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=dtype)
+    h_starts = np.array([0, 3, 5], dtype=np.int32)
+    h_ends = np.array([3, 5, 9], dtype=np.int32)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_starts = DeviceArray.from_numpy(h_starts)
+    d_ends = DeviceArray.from_numpy(h_ends)
+    d_output = DeviceArray.empty(3, dtype)
 
     # Use a lambda function directly as the reducer
     cuda.compute.segmented_reduce(
@@ -478,7 +519,7 @@ def test_segmented_reduce_with_lambda(monkeypatch):
     )
 
     expected = np.array([6, 9, 30])  # sum of each segment
-    np.testing.assert_equal(d_output.get(), expected)
+    np.testing.assert_equal(d_output.copy_to_host(), expected)
 
 
 @pytest.mark.parametrize(
@@ -501,21 +542,24 @@ def test_segmented_reduce_max_segment_size(max_seg_size, monkeypatch):
         False,
     )
     dtype = np.int32
-    rng = cp.random
+    rng = np.random.default_rng()
     num_segments = 1024
     h_init = np.zeros(1, dtype=dtype)
 
     # Non-uniform segment sizes in [1, max_seg_size]
-    sizes = rng.randint(1, max_seg_size + 1, size=num_segments, dtype=np.int64)
-    offsets = cp.zeros(num_segments + 1, dtype=np.int64)
-    offsets[1:] = cp.cumsum(sizes)
+    sizes = rng.integers(1, max_seg_size + 1, size=num_segments, dtype=np.int64)
+    offsets = np.zeros(num_segments + 1, dtype=np.int64)
+    offsets[1:] = np.cumsum(sizes)
 
-    total = int(offsets[-1].item())
-    d_input = rng.randint(0, 100, size=total, dtype=dtype)
-    d_output = cp.empty(num_segments, dtype=dtype)
+    total = int(offsets[-1])
+    h_input = rng.integers(0, 100, size=total, dtype=dtype)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(num_segments, dtype)
 
-    d_starts = offsets[:-1]
-    d_ends = offsets[1:]
+    h_starts = offsets[:-1]
+    h_ends = offsets[1:]
+    d_starts = DeviceArray.from_numpy(h_starts)
+    d_ends = DeviceArray.from_numpy(h_ends)
 
     cuda.compute.segmented_reduce(
         d_in=d_input,
@@ -528,8 +572,8 @@ def test_segmented_reduce_max_segment_size(max_seg_size, monkeypatch):
         max_segment_size=max_seg_size,
     )
 
-    expected = cp.empty(num_segments, dtype=dtype)
+    expected = np.empty(num_segments, dtype=dtype)
     for i in range(num_segments):
-        expected[i] = cp.sum(d_input[int(d_starts[i].item()) : int(d_ends[i].item())])
+        expected[i] = np.sum(h_input[h_starts[i] : h_ends[i]])
 
-    np.testing.assert_array_equal(d_output.get(), expected.get())
+    np.testing.assert_array_equal(d_output.copy_to_host(), expected)
diff --git a/python/cuda_cccl/tests/compute/test_segmented_sort.py b/python/cuda_cccl/tests/compute/test_segmented_sort.py
index 2bdd03abb42..5a874e5bfd6 100644
--- a/python/cuda_cccl/tests/compute/test_segmented_sort.py
+++ b/python/cuda_cccl/tests/compute/test_segmented_sort.py
@@ -4,10 +4,9 @@
 
 from typing import Tuple
 
-import cupy as cp
-import numba
 import numpy as np
 import pytest
+from _utils.device_array import DeviceArray
 
 import cuda.compute
 
@@ -118,8 +117,10 @@ def test_segmented_sort_keys(dtype, num_segments, segment_size, monkeypatch):
     h_in_keys = random_array(num_items, dtype, max_value=50)
     start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys))
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
+    d_out_keys = DeviceArray.empty(h_in_keys.shape, h_in_keys.dtype)
+    d_start_offsets = DeviceArray.from_numpy(start_offsets)
+    d_end_offsets = DeviceArray.from_numpy(end_offsets)
 
     cuda.compute.segmented_sort(
         d_in_keys=d_in_keys,
@@ -128,8 +129,8 @@ def test_segmented_sort_keys(dtype, num_segments, segment_size, monkeypatch):
         d_out_values=None,
         num_items=num_items,
         num_segments=num_segments,
-        start_offsets_in=cp.asarray(start_offsets),
-        end_offsets_in=cp.asarray(end_offsets),
+        start_offsets_in=d_start_offsets,
+        end_offsets_in=d_end_offsets,
         order=order,
     )
 
@@ -153,10 +154,12 @@ def test_segmented_sort_pairs(dtype, num_segments, segment_size):
 
     start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_vals = numba.cuda.to_device(h_in_vals)
-    d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys))
-    d_out_vals = numba.cuda.to_device(np.empty_like(h_in_vals))
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
+    d_in_vals = DeviceArray.from_numpy(h_in_vals)
+    d_out_keys = DeviceArray.empty(h_in_keys.shape, h_in_keys.dtype)
+    d_out_vals = DeviceArray.empty(h_in_vals.shape, h_in_vals.dtype)
+    d_start_offsets = DeviceArray.from_numpy(start_offsets)
+    d_end_offsets = DeviceArray.from_numpy(end_offsets)
 
     cuda.compute.segmented_sort(
         d_in_keys=d_in_keys,
@@ -165,8 +168,8 @@ def test_segmented_sort_pairs(dtype, num_segments, segment_size):
         d_out_values=d_out_vals,
         num_items=num_items,
         num_segments=num_segments,
-        start_offsets_in=cp.asarray(start_offsets),
-        end_offsets_in=cp.asarray(end_offsets),
+        start_offsets_in=d_start_offsets,
+        end_offsets_in=d_end_offsets,
         order=order,
     )
 
@@ -189,8 +192,10 @@ def test_segmented_sort_keys_double_buffer(dtype, num_segments, segment_size):
     h_in_keys = random_array(num_items, dtype, max_value=20)
     start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_tmp_keys = numba.cuda.to_device(np.empty_like(h_in_keys))
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
+    d_tmp_keys = DeviceArray.empty(h_in_keys.shape, h_in_keys.dtype)
+    d_start_offsets = DeviceArray.from_numpy(start_offsets)
+    d_end_offsets = DeviceArray.from_numpy(end_offsets)
     keys_db = cuda.compute.DoubleBuffer(d_in_keys, d_tmp_keys)
 
     cuda.compute.segmented_sort(
@@ -200,8 +205,8 @@ def test_segmented_sort_keys_double_buffer(dtype, num_segments, segment_size):
         d_out_values=None,
         num_items=num_items,
         num_segments=num_segments,
-        start_offsets_in=cp.asarray(start_offsets),
-        end_offsets_in=cp.asarray(end_offsets),
+        start_offsets_in=d_start_offsets,
+        end_offsets_in=d_end_offsets,
         order=order,
     )
 
@@ -224,10 +229,12 @@ def test_segmented_sort_pairs_double_buffer(dtype, num_segments, segment_size):
 
     start_offsets, end_offsets = make_uniform_segments(num_segments, segment_size)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_vals = numba.cuda.to_device(h_in_vals)
-    d_tmp_keys = numba.cuda.to_device(np.empty_like(h_in_keys))
-    d_tmp_vals = numba.cuda.to_device(np.empty_like(h_in_vals))
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
+    d_in_vals = DeviceArray.from_numpy(h_in_vals)
+    d_tmp_keys = DeviceArray.empty(h_in_keys.shape, h_in_keys.dtype)
+    d_tmp_vals = DeviceArray.empty(h_in_vals.shape, h_in_vals.dtype)
+    d_start_offsets = DeviceArray.from_numpy(start_offsets)
+    d_end_offsets = DeviceArray.from_numpy(end_offsets)
 
     keys_db = cuda.compute.DoubleBuffer(d_in_keys, d_tmp_keys)
     vals_db = cuda.compute.DoubleBuffer(d_in_vals, d_tmp_vals)
@@ -239,8 +246,8 @@ def test_segmented_sort_pairs_double_buffer(dtype, num_segments, segment_size):
         d_out_values=None,
         num_items=num_items,
         num_segments=num_segments,
-        start_offsets_in=cp.asarray(start_offsets),
-        end_offsets_in=cp.asarray(end_offsets),
+        start_offsets_in=d_start_offsets,
+        end_offsets_in=d_end_offsets,
         order=order,
     )
 
@@ -297,10 +304,12 @@ def test_segmented_sort_variable_segment_sizes(num_segments):
     h_in_keys = random_array(num_items, np.int32, max_value=100)
     h_in_vals = random_array(num_items, np.float32)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_vals = numba.cuda.to_device(h_in_vals)
-    d_out_keys = numba.cuda.to_device(np.empty_like(h_in_keys))
-    d_out_vals = numba.cuda.to_device(np.empty_like(h_in_vals))
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
+    d_in_vals = DeviceArray.from_numpy(h_in_vals)
+    d_out_keys = DeviceArray.empty(h_in_keys.shape, h_in_keys.dtype)
+    d_out_vals = DeviceArray.empty(h_in_vals.shape, h_in_vals.dtype)
+    d_start_offsets = DeviceArray.from_numpy(start_offsets)
+    d_end_offsets = DeviceArray.from_numpy(end_offsets)
 
     cuda.compute.segmented_sort(
         d_in_keys=d_in_keys,
@@ -309,8 +318,8 @@ def test_segmented_sort_variable_segment_sizes(num_segments):
         d_out_values=d_out_vals,
         num_items=num_items,
         num_segments=num_segments,
-        start_offsets_in=cp.asarray(start_offsets),
-        end_offsets_in=cp.asarray(end_offsets),
+        start_offsets_in=d_start_offsets,
+        end_offsets_in=d_end_offsets,
         order=order,
     )
 
diff --git a/python/cuda_cccl/tests/compute/test_select.py b/python/cuda_cccl/tests/compute/test_select.py
index f2483860a4a..d55adeba83b 100644
--- a/python/cuda_cccl/tests/compute/test_select.py
+++ b/python/cuda_cccl/tests/compute/test_select.py
@@ -2,9 +2,9 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-import cupy as cp
 import numpy as np
 import pytest
+from _utils.device_array import DeviceArray
 
 import cuda.compute
 from cuda.compute import CacheModifiedInputIterator, ZipIterator, gpu_struct
@@ -59,6 +59,10 @@ def _host_select(h_in: np.ndarray, cond):
     return selected, np.int64(selected.size)
 
 
+def _read_count(array: DeviceArray) -> int:
+    return int(array.copy_to_host()[0])  # slot 0 of the host copy, as a Python int
+
+
 @pytest.mark.parametrize("dtype,num_items", select_params)
 def test_select_basic(dtype, num_items):
     h_in = random_array(num_items, dtype, max_value=100)
@@ -66,10 +70,9 @@ def test_select_basic(dtype, num_items):
     def even_op(x):
         return x % 2 == 0
 
-    d_in = cp.empty(num_items, dtype=dtype)
-    d_in.set(h_in)
-    d_out = cp.empty_like(d_in)
-    d_num_selected = cp.empty(2, dtype=np.uint64)
+    d_in = DeviceArray.from_numpy(h_in)
+    d_out = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_num_selected = DeviceArray.empty(2, np.uint64)
 
     cuda.compute.select(
         d_in=d_in,
@@ -79,8 +82,8 @@ def even_op(x):
         num_items=num_items,
     )
 
-    num_selected = int(d_num_selected[0].get())
-    got = d_out.get()[:num_selected]
+    num_selected = _read_count(d_num_selected)
+    got = d_out.copy_to_host()[:num_selected]
 
     expected, expected_count = _host_select(h_in, even_op)
 
@@ -95,9 +98,9 @@ def test_select_greater_than(dtype, num_items):
     def greater_than_42(x):
         return x > 42
 
-    d_in = cp.asarray(h_in)
-    d_out = cp.empty_like(d_in)
-    d_num_selected = cp.empty(2, dtype=np.uint64)
+    d_in = DeviceArray.from_numpy(h_in)
+    d_out = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_num_selected = DeviceArray.empty(2, np.uint64)
 
     cuda.compute.select(
         d_in=d_in,
@@ -107,8 +110,8 @@ def greater_than_42(x):
         num_items=num_items,
     )
 
-    num_selected = int(d_num_selected[0].get())
-    got = d_out.get()[:num_selected]
+    num_selected = _read_count(d_num_selected)
+    got = d_out.copy_to_host()[:num_selected]
 
     expected, expected_count = _host_select(h_in, greater_than_42)
 
@@ -124,9 +127,9 @@ def test_select_all_pass(dtype):
     def always_true(x):
         return True
 
-    d_in = cp.asarray(h_in)
-    d_out = cp.empty_like(d_in)
-    d_num_selected = cp.empty(2, dtype=np.uint64)
+    d_in = DeviceArray.from_numpy(h_in)
+    d_out = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_num_selected = DeviceArray.empty(2, np.uint64)
 
     cuda.compute.select(
         d_in=d_in,
@@ -136,8 +139,8 @@ def always_true(x):
         num_items=num_items,
     )
 
-    num_selected = int(d_num_selected[0].get())
-    got = d_out.get()[:num_selected]
+    num_selected = _read_count(d_num_selected)
+    got = d_out.copy_to_host()[:num_selected]
 
     assert num_selected == num_items
     assert np.array_equal(got, h_in)
@@ -151,9 +154,9 @@ def test_select_none_pass(monkeypatch, dtype):
     def always_false(x):
         return False
 
-    d_in = cp.asarray(h_in)
-    d_out = cp.empty_like(d_in)
-    d_num_selected = cp.empty(2, dtype=np.int32)
+    d_in = DeviceArray.from_numpy(h_in)
+    d_out = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_num_selected = DeviceArray.empty(2, np.int32)
 
     cuda.compute.select(
         d_in=d_in,
@@ -163,7 +166,7 @@ def always_false(x):
         num_items=num_items,
     )
 
-    num_selected = int(d_num_selected[0].get())
+    num_selected = _read_count(d_num_selected)
 
     assert num_selected == 0
 
@@ -176,9 +179,9 @@ def test_select_empty():
     def even_op(x):
         return x % 2 == 0
 
-    d_in = cp.asarray(h_in)
-    d_out = cp.empty_like(d_in)
-    d_num_selected = cp.empty(2, dtype=np.uint64)
+    d_in = DeviceArray.from_numpy(h_in)
+    d_out = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_num_selected = DeviceArray.empty(2, np.uint64)
 
     cuda.compute.select(
         d_in=d_in,
@@ -188,7 +191,7 @@ def even_op(x):
         num_items=num_items,
     )
 
-    num_selected = int(d_num_selected[0].get())
+    num_selected = _read_count(d_num_selected)
 
     assert num_selected == 0
 
@@ -201,10 +204,10 @@ def test_select_with_iterator(dtype):
     def less_than_50(x):
         return x < 50
 
-    d_in = cp.asarray(h_in)
+    d_in = DeviceArray.from_numpy(h_in)
     d_in_iter = CacheModifiedInputIterator(d_in, modifier="stream")
-    d_out = cp.empty_like(d_in)
-    d_num_selected = cp.empty(2, dtype=np.uint64)
+    d_out = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_num_selected = DeviceArray.empty(2, np.uint64)
 
     cuda.compute.select(
         d_in=d_in_iter,
@@ -214,8 +217,8 @@ def less_than_50(x):
         num_items=num_items,
     )
 
-    num_selected = int(d_num_selected[0].get())
-    got = d_out.get()[:num_selected]
+    num_selected = _read_count(d_num_selected)
+    got = d_out.copy_to_host()[:num_selected]
 
     expected, expected_count = _host_select(h_in, less_than_50)
 
@@ -231,9 +234,9 @@ def test_select_object_api(dtype):
     def divisible_by_3(x):
         return x % 3 == 0
 
-    d_in = cp.asarray(h_in)
-    d_out = cp.empty_like(d_in)
-    d_num_selected = cp.empty(2, dtype=np.uint64)
+    d_in = DeviceArray.from_numpy(h_in)
+    d_out = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_num_selected = DeviceArray.empty(2, np.uint64)
 
     # Create select object
     selector = cuda.compute.make_select(
@@ -254,7 +257,7 @@ def divisible_by_3(x):
     )
 
     # Allocate temp storage
-    d_temp_storage = cp.empty(temp_storage_bytes, dtype=np.uint8)
+    d_temp_storage = DeviceArray.empty(temp_storage_bytes, np.uint8)
 
     # Execute select
     selector(
@@ -266,8 +269,8 @@ def divisible_by_3(x):
         num_items=num_items,
     )
 
-    num_selected = int(d_num_selected[0].get())
-    got = d_out.get()[:num_selected]
+    num_selected = _read_count(d_num_selected)
+    got = d_out.copy_to_host()[:num_selected]
 
     expected, expected_count = _host_select(h_in, divisible_by_3)
 
@@ -283,12 +286,12 @@ def test_select_reuse_object(dtype):
     def positive_op(x):
         return x > 0
 
-    d_out = cp.empty(num_items, dtype=dtype)
-    d_num_selected = cp.empty(2, dtype=np.uint64)
+    d_out = DeviceArray.empty(num_items, dtype)
+    d_num_selected = DeviceArray.empty(2, np.uint64)
 
     # Create select object with initial input
     h_in1 = random_array(num_items, dtype, max_value=100) - 50
-    d_in1 = cp.asarray(h_in1)
+    d_in1 = DeviceArray.from_numpy(h_in1)
     selector = cuda.compute.make_select(
         d_in=d_in1,
         d_out=d_out,
@@ -305,7 +308,7 @@ def positive_op(x):
         cond=positive_op,
         num_items=num_items,
     )
-    d_temp_storage = cp.empty(temp_storage_bytes, dtype=np.uint8)
+    d_temp_storage = DeviceArray.empty(temp_storage_bytes, np.uint8)
     selector(
         temp_storage=d_temp_storage,
         d_in=d_in1,
@@ -315,8 +318,8 @@ def positive_op(x):
         num_items=num_items,
     )
 
-    num_selected1 = int(d_num_selected[0].get())
-    got1 = d_out.get()[:num_selected1]
+    num_selected1 = _read_count(d_num_selected)
+    got1 = d_out.copy_to_host()[:num_selected1]
     expected1, expected_count1 = _host_select(h_in1, positive_op)
 
     assert num_selected1 == expected_count1
@@ -324,7 +327,7 @@ def positive_op(x):
 
     # Reuse with different input
     h_in2 = random_array(num_items, dtype, max_value=100) - 50
-    d_in2 = cp.asarray(h_in2)
+    d_in2 = DeviceArray.from_numpy(h_in2)
 
     selector(
         temp_storage=d_temp_storage,
@@ -335,8 +338,8 @@ def positive_op(x):
         num_items=num_items,
     )
 
-    num_selected2 = int(d_num_selected[0].get())
-    got2 = d_out.get()[:num_selected2]
+    num_selected2 = _read_count(d_num_selected)
+    got2 = d_out.copy_to_host()[:num_selected2]
     expected2, expected_count2 = _host_select(h_in2, positive_op)
 
     assert num_selected2 == expected_count2
@@ -363,10 +366,9 @@ class Point:
     def in_first_quadrant(p: Point) -> np.uint8:
         return (p.x > 50) and (p.y > 50)
 
-    d_in = cp.empty(num_items, dtype=Point.dtype)
-    d_in.set(h_in)
-    d_out = cp.empty_like(d_in)
-    d_num_selected = cp.empty(2, dtype=np.uint64)
+    d_in = DeviceArray.from_numpy(h_in)
+    d_out = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_num_selected = DeviceArray.empty(2, np.uint64)
 
     cuda.compute.select(
         d_in=d_in,
@@ -376,8 +378,8 @@ def in_first_quadrant(p: Point) -> np.uint8:
         num_items=num_items,
     )
 
-    num_selected = int(d_num_selected[0].get())
-    got = d_out.get()[:num_selected]
+    num_selected = _read_count(d_num_selected)
+    got = d_out.copy_to_host()[:num_selected]
 
     # Host reference
     def host_in_first_quadrant(p):
@@ -405,19 +407,19 @@ def condition(pair):
         return (pair[0] + pair[1]) < 70
 
     # Device arrays
-    d_in1 = cp.asarray(h_in1)
-    d_in2 = cp.asarray(h_in2)
+    d_in1 = DeviceArray.from_numpy(h_in1)
+    d_in2 = DeviceArray.from_numpy(h_in2)
 
     # Create zip iterator for input
     zip_in = ZipIterator(d_in1, d_in2)
 
     # Allocate output arrays
-    d_out1 = cp.empty_like(d_in1)
-    d_out2 = cp.empty_like(d_in2)
+    d_out1 = DeviceArray.empty(h_in1.shape, h_in1.dtype)
+    d_out2 = DeviceArray.empty(h_in2.shape, h_in2.dtype)
 
     # Create zip iterator for output
     zip_out = ZipIterator(d_out1, d_out2)
-    d_num_selected = cp.empty(1, dtype=np.int32)
+    d_num_selected = DeviceArray.empty(1, np.int32)
 
     cuda.compute.select(
         d_in=zip_in,
@@ -427,11 +429,11 @@ def condition(pair):
         num_items=num_items,
     )
 
-    num_selected = int(d_num_selected[0].get())
+    num_selected = _read_count(d_num_selected)
 
     # Get results
-    got1 = d_out1.get()[:num_selected]
-    got2 = d_out2.get()[:num_selected]
+    got1 = d_out1.copy_to_host()[:num_selected]
+    got2 = d_out2.copy_to_host()[:num_selected]
 
     # Verify results: all elements should satisfy the condition
     for i in range(num_selected):
@@ -451,15 +453,17 @@ def test_select_stateful_threshold():
 
     # Create device state containing threshold value
     threshold_value = 50
-    threshold_state = cp.array([threshold_value], dtype=np.int32)
+    threshold_state = DeviceArray.from_numpy(
+        np.array([threshold_value], dtype=np.int32)
+    )
 
     # Define condition that references state as closure
     def threshold_select(x):
         return x > threshold_state[0]
 
-    d_in = cp.asarray(h_in)
-    d_out = cp.empty_like(d_in)
-    d_num_selected = cp.empty(2, dtype=np.uint64)
+    d_in = DeviceArray.from_numpy(h_in)
+    d_out = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_num_selected = DeviceArray.empty(2, np.uint64)
 
     cuda.compute.select(
         d_in=d_in,
@@ -470,8 +474,8 @@ def threshold_select(x):
     )
 
     # Check selected output
-    num_selected = int(d_num_selected[0].get())
-    got = d_out.get()[:num_selected]
+    num_selected = _read_count(d_num_selected)
+    got = d_out.copy_to_host()[:num_selected]
 
     # Verify all output values are > threshold
     assert np.all(got > threshold_value)
@@ -494,7 +498,7 @@ def test_select_stateful_atomic():
     h_in = random_array(num_items, np.int32, max_value=100)
 
     # Create device state for counting rejected items
-    reject_counter = cp.zeros(1, dtype=np.int32)
+    reject_counter = DeviceArray.from_numpy(np.zeros(1, dtype=np.int32))
 
     # Define condition that references state as closure
     def count_rejects(x):
@@ -504,9 +508,9 @@ def count_rejects(x):
             numba_cuda.atomic.add(reject_counter, 0, 1)
             return False
 
-    d_in = cp.asarray(h_in)
-    d_out = cp.empty_like(d_in)
-    d_num_selected = cp.empty(2, dtype=np.uint64)
+    d_in = DeviceArray.from_numpy(h_in)
+    d_out = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_num_selected = DeviceArray.empty(2, np.uint64)
 
     cuda.compute.select(
         d_in=d_in,
@@ -517,8 +521,8 @@ def count_rejects(x):
     )
 
     # Check selected output
-    num_selected = int(d_num_selected[0].get())
-    got = d_out.get()[:num_selected]
+    num_selected = _read_count(d_num_selected)
+    got = d_out.copy_to_host()[:num_selected]
 
     # Verify all output values are > 50
     assert np.all(got > 50)
@@ -533,7 +537,7 @@ def count_rejects(x):
     assert np.array_equal(got, expected_selected)
 
     # Verify state contains count of rejected items
-    rejected_count = int(reject_counter[0].get())
+    rejected_count = _read_count(reject_counter)
     expected_rejected = len(h_in[h_in <= 50])
     assert rejected_count == expected_rejected, (
         f"Expected {expected_rejected} rejections, got {rejected_count}"
@@ -544,11 +548,12 @@ def test_select_with_side_effect_counting_rejects():
     """Select with side effect that counts rejected items"""
     from numba import cuda as numba_cuda
 
-    d_in = cp.arange(100, dtype=np.int32)
-    d_out = cp.empty_like(d_in)
-    d_num_selected = cp.empty(1, dtype=np.uint64)
+    h_in = np.arange(100, dtype=np.int32)
+    d_in = DeviceArray.from_numpy(h_in)
+    d_out = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_num_selected = DeviceArray.empty(1, np.uint64)
 
-    reject_count = cp.zeros(1, dtype=np.int32)
+    reject_count = DeviceArray.from_numpy(np.zeros(1, dtype=np.int32))
 
     # Define condition that references state as closure
     def count_rejects(x):
@@ -563,11 +568,11 @@ def count_rejects(x):
         d_out=d_out,
         d_num_selected_out=d_num_selected,
         cond=count_rejects,
-        num_items=len(d_in),
+        num_items=h_in.size,
     )
 
-    num_selected = int(d_num_selected.get()[0])
-    num_rejected = int(reject_count.get()[0])
+    num_selected = _read_count(d_num_selected)
+    num_rejected = _read_count(reject_count)
 
     assert num_selected == 50  # Values 50-99
     assert num_rejected == 50  # Values 0-49
@@ -578,9 +583,9 @@ def test_select_with_lambda():
     num_items = 100
     h_in = np.arange(num_items, dtype=np.int32)
 
-    d_in = cp.asarray(h_in)
-    d_out = cp.empty_like(d_in)
-    d_num_selected = cp.empty(2, dtype=np.uint64)
+    d_in = DeviceArray.from_numpy(h_in)
+    d_out = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_num_selected = DeviceArray.empty(2, np.uint64)
 
     # Use a lambda function directly as the predicate
     cuda.compute.select(
@@ -591,23 +596,26 @@ def test_select_with_lambda():
         num_items=num_items,
     )
 
-    num_selected = int(d_num_selected.get()[0])
+    num_selected = _read_count(d_num_selected)
     expected_selected = [x for x in h_in if x % 2 == 0]
 
     assert num_selected == len(expected_selected)
-    np.testing.assert_array_equal(d_out.get()[:num_selected], expected_selected)
+    np.testing.assert_array_equal(
+        d_out.copy_to_host()[:num_selected], expected_selected
+    )
 
 
 def test_select_stateful_state_updates():
     """Test that select correctly updates state between calls with different thresholds."""
     num_items = 20
-    d_in = cp.arange(num_items, dtype=np.int32)
-    d_out = cp.empty_like(d_in)
-    d_count = cp.zeros(2, dtype=np.uint64)
+    h_in = np.arange(num_items, dtype=np.int32)
+    d_in = DeviceArray.from_numpy(h_in)
+    d_out = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_count = DeviceArray.from_numpy(np.zeros(2, dtype=np.uint64))
 
     # Create two different thresholds
-    threshold_5 = cp.array([5], dtype=np.int32)
-    threshold_15 = cp.array([15], dtype=np.int32)
+    threshold_5 = DeviceArray.from_numpy(np.array([5], dtype=np.int32))
+    threshold_15 = DeviceArray.from_numpy(np.array([15], dtype=np.int32))
 
     # Call 1: Select items > 5 (should get 14 items: 6-19)
     def select_gt_5(x):
@@ -620,16 +628,16 @@ def select_gt_5(x):
         cond=select_gt_5,
         num_items=num_items,
     )
-    count1 = int(d_count[0].get())
+    count1 = _read_count(d_count)
     assert count1 == 14
     expected_1 = list(range(6, 20))
-    np.testing.assert_array_equal(d_out.get()[:count1], expected_1)
+    np.testing.assert_array_equal(d_out.copy_to_host()[:count1], expected_1)
 
     # Call 2: Select items > 15 (should get 4 items: 16-19)
     def select_gt_15(x):
         return x > threshold_15[0]
 
-    d_count.fill(0)
+    d_count.copy_from_host(np.zeros(2, dtype=np.uint64))
     cuda.compute.select(
         d_in=d_in,
         d_out=d_out,
@@ -637,13 +645,13 @@ def select_gt_15(x):
         cond=select_gt_15,
         num_items=num_items,
     )
-    count2 = int(d_count[0].get())
+    count2 = _read_count(d_count)
     assert count2 == 4
     expected_2 = list(range(16, 20))
-    np.testing.assert_array_equal(d_out.get()[:count2], expected_2)
+    np.testing.assert_array_equal(d_out.copy_to_host()[:count2], expected_2)
 
     # Call 3: Back to first threshold (test cache reuse with updated state)
-    d_count.fill(0)
+    d_count.copy_from_host(np.zeros(2, dtype=np.uint64))
     cuda.compute.select(
         d_in=d_in,
         d_out=d_out,
@@ -651,9 +659,9 @@ def select_gt_15(x):
         cond=select_gt_5,
         num_items=num_items,
     )
-    count3 = int(d_count[0].get())
+    count3 = _read_count(d_count)
     assert count3 == 14
-    np.testing.assert_array_equal(d_out.get()[:count3], expected_1)
+    np.testing.assert_array_equal(d_out.copy_to_host()[:count3], expected_1)
 
 
 def test_select_stateful_same_bytecode_different_state():
@@ -665,9 +673,10 @@ def test_select_stateful_same_bytecode_different_state():
     the same bytecode but different captured arrays would reuse stale state.
     """
     num_items = 20
-    d_in = cp.arange(num_items, dtype=np.int32)
-    d_out = cp.empty_like(d_in)
-    d_count = cp.zeros(2, dtype=np.uint64)
+    h_in = np.arange(num_items, dtype=np.int32)
+    d_in = DeviceArray.from_numpy(h_in)
+    d_out = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_count = DeviceArray.from_numpy(np.zeros(2, dtype=np.uint64))
 
     # Factory that creates functions with identical bytecode
     def make_selector(threshold_array):
@@ -676,8 +685,8 @@ def selector(x):
 
         return selector
 
-    threshold_5 = cp.array([5], dtype=np.int32)
-    threshold_15 = cp.array([15], dtype=np.int32)
+    threshold_5 = DeviceArray.from_numpy(np.array([5], dtype=np.int32))
+    threshold_15 = DeviceArray.from_numpy(np.array([15], dtype=np.int32))
 
     select_5 = make_selector(threshold_5)
     select_15 = make_selector(threshold_15)
@@ -690,11 +699,11 @@ def selector(x):
         cond=select_5,
         num_items=num_items,
     )
-    count1 = int(d_count[0].get())
+    count1 = _read_count(d_count)
     assert count1 == 14
 
     # Call 2: threshold > 15 (different state, same bytecode)
-    d_count.fill(0)
+    d_count.copy_from_host(np.zeros(2, dtype=np.uint64))
     cuda.compute.select(
         d_in=d_in,
         d_out=d_out,
@@ -702,7 +711,7 @@ def selector(x):
         cond=select_15,
         num_items=num_items,
     )
-    count2 = int(d_count[0].get())
+    count2 = _read_count(d_count)
     assert count2 == 4  # If this fails, cache collision bug is present
 
 
@@ -712,19 +721,15 @@ def test_stateful_caching_same_dtype_different_values():
     After transformation, values are runtime parameters, so they should use the
     same compiled code.
     """
-    import cupy as cp
-    import numpy as np
-
-    import cuda.compute
-
     num_items = 100
-    d_in = cp.arange(num_items, dtype=np.int32)
-    d_out = cp.empty_like(d_in)
-    d_count = cp.zeros(2, dtype=np.uint64)
+    h_in = np.arange(num_items, dtype=np.int32)
+    d_in = DeviceArray.from_numpy(h_in)
+    d_out = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_count = DeviceArray.from_numpy(np.zeros(2, dtype=np.uint64))
 
     # Two thresholds with SAME dtype, SAME size, DIFFERENT values
-    threshold_30 = cp.array([30], dtype=np.int32)
-    threshold_70 = cp.array([70], dtype=np.int32)
+    threshold_30 = DeviceArray.from_numpy(np.array([30], dtype=np.int32))
+    threshold_70 = DeviceArray.from_numpy(np.array([70], dtype=np.int32))
 
     # Test with threshold_30
     def select_gt_30(x):
@@ -737,14 +742,14 @@ def select_gt_30(x):
         cond=select_gt_30,
         num_items=num_items,
     )
-    count_30 = int(d_count[0].get())
+    count_30 = _read_count(d_count)
 
     # Test with threshold_70
     def select_gt_70(x):
         return x > threshold_70[0]
 
-    d_out.fill(0)
-    d_count.fill(0)
+    d_out.copy_from_host(np.zeros_like(h_in))
+    d_count.copy_from_host(np.zeros(2, dtype=np.uint64))
     cuda.compute.select(
         d_in=d_in,
         d_out=d_out,
@@ -752,7 +757,7 @@ def select_gt_70(x):
         cond=select_gt_70,
         num_items=num_items,
     )
-    count_70 = int(d_count[0].get())
+    count_70 = _read_count(d_count)
 
     # Verify correct results (not cache collision)
     assert count_30 == 69  # Values 31-99
diff --git a/python/cuda_cccl/tests/compute/test_shuffle_iterator.py b/python/cuda_cccl/tests/compute/test_shuffle_iterator.py
index 5c82fe1ad38..2d442abce02 100644
--- a/python/cuda_cccl/tests/compute/test_shuffle_iterator.py
+++ b/python/cuda_cccl/tests/compute/test_shuffle_iterator.py
@@ -2,9 +2,9 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-import cupy as cp
 import numpy as np
 import pytest
+from _utils.device_array import DeviceArray
 
 import cuda.compute
 from cuda.compute.iterators import (
@@ -19,12 +19,12 @@ def test_shuffle_iterator_bijectivity():
 
     shuffle_it = ShuffleIterator(num_items, seed)
 
-    d_output = cp.empty(num_items, dtype=np.int64)
+    d_output = DeviceArray.empty(num_items, np.int64)
     cuda.compute.unary_transform(
         d_in=shuffle_it, d_out=d_output, op=lambda x: x, num_items=num_items
     )
 
-    result = d_output.get()
+    result = d_output.copy_to_host()
 
     assert len(set(result)) == num_items
     assert set(result) == set(range(num_items))
@@ -37,8 +37,8 @@ def test_shuffle_iterator_determinism():
     shuffle_it1 = ShuffleIterator(num_items, seed)
     shuffle_it2 = ShuffleIterator(num_items, seed)
 
-    d_output1 = cp.empty(num_items, dtype=np.int64)
-    d_output2 = cp.empty(num_items, dtype=np.int64)
+    d_output1 = DeviceArray.empty(num_items, np.int64)
+    d_output2 = DeviceArray.empty(num_items, np.int64)
 
     cuda.compute.unary_transform(
         d_in=shuffle_it1, d_out=d_output1, op=lambda x: x, num_items=num_items
@@ -47,7 +47,7 @@ def test_shuffle_iterator_determinism():
         d_in=shuffle_it2, d_out=d_output2, op=lambda x: x, num_items=num_items
     )
 
-    cp.testing.assert_array_equal(d_output1, d_output2)
+    np.testing.assert_array_equal(d_output1.copy_to_host(), d_output2.copy_to_host())
 
 
 @pytest.mark.parametrize("num_items", [1, 2, 7, 16, 17, 100, 1000, 1023, 1024, 1025])
@@ -56,12 +56,12 @@ def test_shuffle_iterator_various_sizes(num_items):
 
     shuffle_it = ShuffleIterator(num_items, seed)
 
-    d_output = cp.empty(num_items, dtype=np.int64)
+    d_output = DeviceArray.empty(num_items, np.int64)
     cuda.compute.unary_transform(
         d_in=shuffle_it, d_out=d_output, op=lambda x: x, num_items=num_items
     )
 
-    result = d_output.get()
+    result = d_output.copy_to_host()
 
     assert len(set(result)) == num_items
     assert set(result) == set(range(num_items))
@@ -71,20 +71,21 @@ def test_shuffle_iterator_with_permutation_iterator():
     num_items = 10
     seed = 42
 
-    d_values = cp.asarray([10, 20, 30, 40, 50, 60, 70, 80, 90, 100], dtype=np.int32)
+    h_values = np.asarray([10, 20, 30, 40, 50, 60, 70, 80, 90, 100], dtype=np.int32)
+    d_values = DeviceArray.from_numpy(h_values)
 
     shuffle_it = ShuffleIterator(num_items, seed)
     perm_it = PermutationIterator(d_values, shuffle_it)
 
-    d_output = cp.empty(num_items, dtype=np.int32)
+    d_output = DeviceArray.empty(num_items, np.int32)
     cuda.compute.unary_transform(
         d_in=perm_it, d_out=d_output, op=lambda x: x, num_items=num_items
     )
 
-    result = d_output.get()
+    result = d_output.copy_to_host()
 
-    assert result.sum() == d_values.sum()
-    assert sorted(result) == sorted(d_values.get())
+    assert result.sum() == h_values.sum()
+    assert sorted(result) == sorted(h_values)
 
 
 def test_shuffle_iterator_invalid_num_items():
diff --git a/python/cuda_cccl/tests/compute/test_three_way_partition.py b/python/cuda_cccl/tests/compute/test_three_way_partition.py
index 3fb5b102275..51958c6e1f1 100644
--- a/python/cuda_cccl/tests/compute/test_three_way_partition.py
+++ b/python/cuda_cccl/tests/compute/test_three_way_partition.py
@@ -2,9 +2,9 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-import cupy as cp
 import numpy as np
 import pytest
+from _utils.device_array import DeviceArray
 
 import cuda.compute
 from cuda.compute import CacheModifiedInputIterator, gpu_struct
@@ -85,11 +85,11 @@ def less_than_op(x):
     def greater_equal_op(x):
         return x >= 42
 
-    d_in = cp.asarray(h_in)
-    d_first = cp.empty_like(d_in)
-    d_second = cp.empty_like(d_in)
-    d_unselected = cp.empty_like(d_in)
-    d_num_selected = cp.empty(2, dtype=np.int32)
+    d_in = DeviceArray.from_numpy(h_in)
+    d_first = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_second = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_unselected = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_num_selected = DeviceArray.empty(2, np.int32)
     cuda.compute.three_way_partition(
         d_in=d_in,
         d_first_part_out=d_first,
@@ -101,10 +101,10 @@ def greater_equal_op(x):
         num_items=num_items,
     )
 
-    num_selected = d_num_selected.get()
-    got_first = d_first.get()[: int(num_selected[0])]
-    got_second = d_second.get()[: int(num_selected[1])]
-    got_unselected = d_unselected.get()[
+    num_selected = d_num_selected.copy_to_host()
+    got_first = d_first.copy_to_host()[: int(num_selected[0])]
+    got_second = d_second.copy_to_host()[: int(num_selected[1])]
+    got_unselected = d_unselected.copy_to_host()[
         : int(num_items) - int(num_selected[0]) - int(num_selected[1])
     ]
 
@@ -120,11 +120,11 @@ def greater_equal_op(x):
 
 def test_three_way_partition_empty():
     dtype = np.int32
-    d_in = cp.empty(0, dtype=dtype)
-    d_first = cp.empty(0, dtype=dtype)
-    d_second = cp.empty(0, dtype=dtype)
-    d_unselected = cp.empty(0, dtype=dtype)
-    d_num_selected = cp.zeros(2, dtype=np.int64)
+    d_in = DeviceArray.empty(0, dtype)
+    d_first = DeviceArray.empty(0, dtype)
+    d_second = DeviceArray.empty(0, dtype)
+    d_unselected = DeviceArray.empty(0, dtype)
+    d_num_selected = DeviceArray.from_numpy(np.zeros(2, dtype=np.int64))
 
     def less_than_op(x):
         return x < 42
@@ -143,7 +143,7 @@ def greater_equal_op(x):
         num_items=0,
     )
 
-    np.testing.assert_array_equal(d_num_selected.get(), np.array([0, 0]))
+    np.testing.assert_array_equal(d_num_selected.copy_to_host(), np.array([0, 0]))
 
 
 def test_three_way_partition_with_iterators():
@@ -161,13 +161,13 @@ def greater_equal_op(x):
         _host_three_way_partition(h_in, less_than_op, greater_equal_op)
     )
 
-    d_in = cp.asarray(h_in)
+    d_in = DeviceArray.from_numpy(h_in)
     in_it = CacheModifiedInputIterator(d_in, modifier="stream")
 
-    d_first = cp.empty_like(d_in)
-    d_second = cp.empty_like(d_in)
-    d_unselected = cp.empty_like(d_in)
-    d_num_selected = cp.empty(2, dtype=np.uint32)
+    d_first = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_second = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_unselected = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_num_selected = DeviceArray.empty(2, np.uint32)
 
     cuda.compute.three_way_partition(
         d_in=in_it,
@@ -180,10 +180,10 @@ def greater_equal_op(x):
         num_items=num_items,
     )
 
-    num_selected = d_num_selected.get()
-    got_first = d_first.get()[: int(num_selected[0])]
-    got_second = d_second.get()[: int(num_selected[1])]
-    got_unselected = d_unselected.get()[
+    num_selected = d_num_selected.copy_to_host()
+    got_first = d_first.copy_to_host()[: int(num_selected[0])]
+    got_second = d_second.copy_to_host()[: int(num_selected[1])]
+    got_unselected = d_unselected.copy_to_host()[
         : int(num_items) - int(num_selected[0]) - int(num_selected[1])
     ]
 
@@ -224,12 +224,11 @@ def greater_equal_op(x: pair_type):
     expected_second = h_in[remaining_mask][expected_second_mask]
     expected_unselected = h_in[remaining_mask][~expected_second_mask]
 
-    h_in_i32 = h_in.view(np.int32).reshape(num_items, 4)
-    d_in = cp.asarray(h_in_i32).view(pair_type.dtype).reshape(num_items)
-    d_first = cp.empty_like(d_in)
-    d_second = cp.empty_like(d_in)
-    d_unselected = cp.empty_like(d_in)
-    d_num_selected = cp.empty(2, dtype=np.uint64)
+    d_in = DeviceArray.from_numpy(h_in)
+    d_first = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_second = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_unselected = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_num_selected = DeviceArray.empty(2, np.uint64)
 
     cuda.compute.three_way_partition(
         d_in=d_in,
@@ -242,10 +241,10 @@ def greater_equal_op(x: pair_type):
         num_items=num_items,
     )
 
-    num_selected = d_num_selected.get()
-    got_first = d_first.get()[: int(num_selected[0])]
-    got_second = d_second.get()[: int(num_selected[1])]
-    got_unselected = d_unselected.get()[
+    num_selected = d_num_selected.copy_to_host()
+    got_first = d_first.copy_to_host()[: int(num_selected[0])]
+    got_second = d_second.copy_to_host()[: int(num_selected[1])]
+    got_unselected = d_unselected.copy_to_host()[
         : int(num_items) - int(num_selected[0]) - int(num_selected[1])
     ]
 
@@ -269,13 +268,11 @@ def greater_equal_op(x):
         _host_three_way_partition(h_in, less_than_op, greater_equal_op)
     )
 
-    cp_stream = cp.cuda.ExternalStream(cuda_stream.ptr)
-    with cp_stream:
-        d_in = cp.asarray(h_in)
-        d_first = cp.empty_like(d_in)
-        d_second = cp.empty_like(d_in)
-        d_unselected = cp.empty_like(d_in)
-        d_num_selected = cp.empty(2, dtype=np.int64)
+    d_in = DeviceArray.from_numpy(h_in, stream=cuda_stream)
+    d_first = DeviceArray.empty(h_in.shape, h_in.dtype, stream=cuda_stream)
+    d_second = DeviceArray.empty(h_in.shape, h_in.dtype, stream=cuda_stream)
+    d_unselected = DeviceArray.empty(h_in.shape, h_in.dtype, stream=cuda_stream)
+    d_num_selected = DeviceArray.empty(2, np.int64, stream=cuda_stream)
 
     cuda.compute.three_way_partition(
         d_in=d_in,
@@ -289,13 +286,12 @@ def greater_equal_op(x):
         stream=cuda_stream,
     )
 
-    with cp_stream:
-        num_selected = d_num_selected.get()
-        got_first = d_first.get()[: int(num_selected[0])]
-        got_second = d_second.get()[: int(num_selected[1])]
-        got_unselected = d_unselected.get()[
-            : int(num_items) - int(num_selected[0]) - int(num_selected[1])
-        ]
+    num_selected = d_num_selected.copy_to_host(stream=cuda_stream)
+    got_first = d_first.copy_to_host(stream=cuda_stream)[: int(num_selected[0])]
+    got_second = d_second.copy_to_host(stream=cuda_stream)[: int(num_selected[1])]
+    got_unselected = d_unselected.copy_to_host(stream=cuda_stream)[
+        : int(num_items) - int(num_selected[0]) - int(num_selected[1])
+    ]
 
     np.testing.assert_array_equal(got_first, expected_first)
     np.testing.assert_array_equal(got_second, expected_second)
@@ -313,11 +309,11 @@ def less_than_op(x):
     def greater_equal_op(x):
         return x == 102
 
-    d_in = cp.asarray(h_in)
-    d_first = cp.empty_like(d_in)
-    d_second = cp.empty_like(d_in)
-    d_unselected = cp.empty_like(d_in)
-    d_num_selected = cp.empty(2, dtype=np.int64)
+    d_in = DeviceArray.from_numpy(h_in)
+    d_first = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_second = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_unselected = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_num_selected = DeviceArray.empty(2, np.int64)
 
     cuda.compute.three_way_partition(
         d_in=d_in,
@@ -330,12 +326,12 @@ def greater_equal_op(x):
         num_items=num_items,
     )
 
-    num_selected = d_num_selected.get()
+    num_selected = d_num_selected.copy_to_host()
     assert int(num_selected[0]) == 0 and int(num_selected[1]) == 0
 
-    got_first = d_first.get()[: int(num_selected[0])]
-    got_second = d_second.get()[: int(num_selected[1])]
-    got_unselected = d_unselected.get()[:num_items]
+    got_first = d_first.copy_to_host()[: int(num_selected[0])]
+    got_second = d_second.copy_to_host()[: int(num_selected[1])]
+    got_unselected = d_unselected.copy_to_host()[:num_items]
 
     np.testing.assert_array_equal(got_first, np.empty(0, dtype=dtype))
     np.testing.assert_array_equal(got_second, np.empty(0, dtype=dtype))
@@ -350,11 +346,11 @@ def test_three_way_partition_same_predicate():
     def always_true(x):
         return True
 
-    d_in = cp.asarray(h_in)
-    d_first = cp.empty_like(d_in)
-    d_second = cp.empty_like(d_in)
-    d_unselected = cp.empty_like(d_in)
-    d_num_selected = cp.empty(2, dtype=np.int64)
+    d_in = DeviceArray.from_numpy(h_in)
+    d_first = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_second = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_unselected = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_num_selected = DeviceArray.empty(2, np.int64)
 
     cuda.compute.three_way_partition(
         d_in=d_in,
@@ -367,7 +363,7 @@ def always_true(x):
         num_items=num_items,
     )
 
-    num_selected = d_num_selected.get()
+    num_selected = d_num_selected.copy_to_host()
     assert int(num_selected[0]) == num_items
     assert int(num_selected[1]) == 0
 
@@ -383,11 +379,11 @@ def less_than_op(x):
     def greater_equal_op(x):
         return x == 42
 
-    d_in = cp.asarray(h_in)
-    d_first = cp.empty_like(d_in)
-    d_second = cp.empty_like(d_in)
-    d_unselected = cp.empty_like(d_in)
-    d_num_selected = cp.empty(2, dtype=np.int64)
+    d_in = DeviceArray.from_numpy(h_in)
+    d_first = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_second = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_unselected = DeviceArray.empty(h_in.shape, h_in.dtype)
+    d_num_selected = DeviceArray.empty(2, np.int64)
 
     cuda.compute.three_way_partition(
         d_in=d_in,
@@ -400,12 +396,12 @@ def greater_equal_op(x):
         num_items=num_items,
     )
 
-    num_selected = d_num_selected.get()
+    num_selected = d_num_selected.copy_to_host()
     assert int(num_selected[0]) == num_items and int(num_selected[1]) == 0
 
-    got_first = d_first.get()[: int(num_selected[0])]
-    got_second = d_second.get()[: int(num_selected[1])]
-    got_unselected = d_unselected.get()[
+    got_first = d_first.copy_to_host()[: int(num_selected[0])]
+    got_second = d_second.copy_to_host()[: int(num_selected[1])]
+    got_unselected = d_unselected.copy_to_host()[
         : int(num_items) - int(num_selected[0]) - int(num_selected[1])
     ]
 
diff --git a/python/cuda_cccl/tests/compute/test_transform.py b/python/cuda_cccl/tests/compute/test_transform.py
index c7c3ca2818a..1b17bbc9212 100644
--- a/python/cuda_cccl/tests/compute/test_transform.py
+++ b/python/cuda_cccl/tests/compute/test_transform.py
@@ -2,9 +2,9 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-import cupy as cp
 import numpy as np
 import pytest
+from _utils.device_array import DeviceArray
 
 import cuda.compute
 from cuda.compute import (
@@ -46,13 +46,14 @@ def test_unary_transform(input_array):
     def op(a):
         return a + 1
 
-    d_in = input_array
-    d_out = cp.empty_like(d_in)
+    h_in = input_array
+    d_in = DeviceArray.from_numpy(h_in)
+    d_out = DeviceArray.empty(h_in.shape, h_in.dtype)
 
-    unary_transform_device(d_in, d_out, len(d_in), op)
+    unary_transform_device(d_in, d_out, h_in.size, op)
 
-    got = d_out.get()
-    expected = unary_transform_host(d_in.get(), op)
+    got = d_out.copy_to_host()
+    expected = unary_transform_host(h_in, op)
 
     np.testing.assert_allclose(expected, got, rtol=1e-5)
 
@@ -64,14 +65,16 @@ def test_binary_transform(input_array):
     def op(a, b):
         return a + b
 
-    d_in1 = input_array
-    d_in2 = input_array
-    d_out = cp.empty_like(d_in1)
+    h_in1 = input_array
+    h_in2 = input_array
+    d_in1 = DeviceArray.from_numpy(h_in1)
+    d_in2 = DeviceArray.from_numpy(h_in2)
+    d_out = DeviceArray.empty(h_in1.shape, h_in1.dtype)
 
-    binary_transform_device(d_in1, d_in2, d_out, len(d_in1), op)
+    binary_transform_device(d_in1, d_in2, d_out, h_in1.size, op)
 
-    got = d_out.get()
-    expected = binary_transform_host(d_in1.get(), d_in2.get(), op)
+    got = d_out.copy_to_host()
+    expected = binary_transform_host(h_in1, h_in2, op)
 
     np.testing.assert_allclose(expected, got, rtol=1e-5)
 
@@ -92,20 +95,12 @@ def op(a):
     h_in = np.empty(num_values, dtype=MyStruct.dtype)
     h_in["x"] = np.arange(num_values)
     h_in["y"] = 1
-    d_in = cp.empty_like(h_in)
+    d_in = DeviceArray.from_numpy(h_in)
+    d_out = DeviceArray.empty(h_in.shape, h_in.dtype)
 
-    cp.cuda.runtime.memcpy(
-        d_in.data.ptr,
-        h_in.__array_interface__["data"][0],
-        h_in.nbytes,
-        cp.cuda.runtime.memcpyHostToDevice,
-    )
-
-    d_out = cp.empty_like(d_in)
-
-    cuda.compute.unary_transform(d_in=d_in, d_out=d_out, op=op, num_items=len(d_in))
+    cuda.compute.unary_transform(d_in=d_in, d_out=d_out, op=op, num_items=h_in.size)
 
-    got = d_out.get()
+    got = d_out.copy_to_host()
 
     np.testing.assert_allclose(got["x"], np.arange(num_values) * 2)
     np.testing.assert_allclose(got["y"], np.ones(num_values) + 10)
@@ -132,29 +127,15 @@ def op(a, b):
     h_in2["x"] = np.random.randint(0, num_values, num_values, dtype="int16")
     h_in2["y"] = np.random.randint(0, num_values, num_values, dtype="uint64")
 
-    d_in1 = cp.empty_like(h_in1)
-    d_in2 = cp.empty_like(h_in2)
-
-    cp.cuda.runtime.memcpy(
-        d_in1.data.ptr,
-        h_in1.__array_interface__["data"][0],
-        h_in1.nbytes,
-        cp.cuda.runtime.memcpyHostToDevice,
-    )
-    cp.cuda.runtime.memcpy(
-        d_in2.data.ptr,
-        h_in2.__array_interface__["data"][0],
-        h_in2.nbytes,
-        cp.cuda.runtime.memcpyHostToDevice,
-    )
-
-    d_out = cp.empty_like(d_in1)
+    d_in1 = DeviceArray.from_numpy(h_in1)
+    d_in2 = DeviceArray.from_numpy(h_in2)
+    d_out = DeviceArray.empty(h_in1.shape, h_in1.dtype)
 
     cuda.compute.binary_transform(
-        d_in1=d_in1, d_in2=d_in2, d_out=d_out, op=op, num_items=len(d_in1)
+        d_in1=d_in1, d_in2=d_in2, d_out=d_out, op=op, num_items=h_in1.size
     )
 
-    got = d_out.get()
+    got = d_out.copy_to_host()
 
     np.testing.assert_allclose(got["x"], h_in1["x"] + h_in2["x"])
     np.testing.assert_allclose(got["y"], h_in1["y"] + h_in2["y"])
@@ -167,11 +148,11 @@ def op(a):
     d_in = CountingIterator(np.int32(0))
 
     num_items = 1024
-    d_out = cp.empty(num_items, dtype=np.int32)
+    d_out = DeviceArray.empty(num_items, np.int32)
 
     unary_transform_device(d_in, d_out, num_items, op)
 
-    got = d_out.get()
+    got = d_out.copy_to_host()
     expected = np.arange(1, num_items + 1, dtype=np.int32)
 
     np.testing.assert_allclose(expected, got)
@@ -185,11 +166,11 @@ def op(a, b):
     d_in2 = CountingIterator(np.int32(1))
 
     num_items = 1024
-    d_out = cp.empty(num_items, dtype=np.int32)
+    d_out = DeviceArray.empty(num_items, np.int32)
 
     binary_transform_device(d_in1, d_in2, d_out, num_items, op)
 
-    got = d_out.get()
+    got = d_out.copy_to_host()
     expected = np.arange(1, 2 * num_items + 1, step=2, dtype=np.int32)
 
     np.testing.assert_allclose(expected, got)
@@ -199,18 +180,15 @@ def test_unary_transform_with_stream(cuda_stream):
     def op(a):
         return a + 1
 
-    cp_stream = cp.cuda.ExternalStream(cuda_stream.ptr)
-
     n = 10
-
-    with cp_stream:
-        d_in = cp.arange(n, dtype=np.int32)
-        d_out = cp.empty_like(d_in)
+    h_in = np.arange(n, dtype=np.int32)
+    d_in = DeviceArray.from_numpy(h_in, stream=cuda_stream)
+    d_out = DeviceArray.empty(h_in.shape, h_in.dtype, stream=cuda_stream)
 
     unary_transform_device(d_in, d_out, n, op, stream=cuda_stream)
 
-    got = d_out.get()
-    expected = unary_transform_host(d_in.get(), op)
+    got = d_out.copy_to_host(stream=cuda_stream)
+    expected = unary_transform_host(h_in, op)
 
     np.testing.assert_allclose(expected, got, rtol=1e-5)
 
@@ -219,19 +197,17 @@ def test_binary_transform_with_stream(cuda_stream):
     def op(a, b):
         return a + b
 
-    cp_stream = cp.cuda.ExternalStream(cuda_stream.ptr)
-
     n = 10
-
-    with cp_stream:
-        d_in1 = cp.arange(n, dtype=np.int32)
-        d_in2 = cp.arange(n, dtype=np.int32)
-        d_out = cp.empty_like(d_in1)
+    h_in1 = np.arange(n, dtype=np.int32)
+    h_in2 = np.arange(n, dtype=np.int32)
+    d_in1 = DeviceArray.from_numpy(h_in1, stream=cuda_stream)
+    d_in2 = DeviceArray.from_numpy(h_in2, stream=cuda_stream)
+    d_out = DeviceArray.empty(h_in1.shape, h_in1.dtype, stream=cuda_stream)
 
     binary_transform_device(d_in1, d_in2, d_out, n, op, stream=cuda_stream)
 
-    got = d_out.get()
-    expected = binary_transform_host(d_in1.get(), d_in2.get(), op)
+    got = d_out.copy_to_host(stream=cuda_stream)
+    expected = binary_transform_host(h_in1, h_in2, op)
 
     np.testing.assert_allclose(expected, got, rtol=1e-5)
 
@@ -244,11 +220,11 @@ def op(a, b):
     d_in2 = CountingIterator(np.int32(1))
 
     num_items = 1024
-    d_out = cp.empty(num_items, dtype=np.int32)
+    d_out = DeviceArray.empty(num_items, np.int32)
 
     binary_transform_device(d_in1, d_in2, d_out, num_items, op)
 
-    got = d_out.get()
+    got = d_out.copy_to_host()
     expected = np.arange(1, 2 * num_items + 1, step=2, dtype=np.int32)
 
     np.testing.assert_allclose(expected, got)
@@ -260,7 +236,7 @@ def op2(a):
         return a + 1
 
     unary_transform_device(d_in2, d_out, num_items, op2)
-    got = d_out.get()
+    got = d_out.copy_to_host()
     expected = np.arange(1, num_items + 1, dtype=np.int32) + 1
 
     np.testing.assert_allclose(expected, got)
@@ -269,41 +245,45 @@ def op2(a):
 def test_unary_transform_well_known_negate():
     """Test unary transform with well-known NEGATE operation."""
     dtype = np.int32
-    d_input = cp.array([1, -2, 3, -4, 5], dtype=dtype)
-    d_output = cp.empty_like(d_input, dtype=dtype)
+    h_input = np.array([1, -2, 3, -4, 5], dtype=dtype)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(h_input.shape, dtype)
 
     # Run unary transform with well-known NEGATE operation
     cuda.compute.unary_transform(
-        d_in=d_input, d_out=d_output, op=OpKind.NEGATE, num_items=len(d_input)
+        d_in=d_input, d_out=d_output, op=OpKind.NEGATE, num_items=h_input.size
     )
 
     # Check the result is correct
     expected = np.array([-1, 2, -3, 4, -5])
-    np.testing.assert_equal(d_output.get(), expected)
+    np.testing.assert_equal(d_output.copy_to_host(), expected)
 
 
 def test_unary_transform_well_known_identity():
     """Test unary transform with well-known IDENTITY operation."""
     dtype = np.int32
-    d_input = cp.array([1, 2, 3, 4, 5], dtype=dtype)
-    d_output = cp.empty_like(d_input, dtype=dtype)
+    h_input = np.array([1, 2, 3, 4, 5], dtype=dtype)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(h_input.shape, dtype)
 
     # Run unary transform with well-known IDENTITY operation
     cuda.compute.unary_transform(
-        d_in=d_input, d_out=d_output, op=OpKind.IDENTITY, num_items=len(d_input)
+        d_in=d_input, d_out=d_output, op=OpKind.IDENTITY, num_items=h_input.size
     )
 
     # Check the result is correct
     expected = np.array([1, 2, 3, 4, 5])
-    np.testing.assert_equal(d_output.get(), expected)
+    np.testing.assert_equal(d_output.copy_to_host(), expected)
 
 
 @pytest.mark.parametrize("dtype", [np.int32, np.float16])
 def test_binary_transform_well_known_plus(dtype):
     """Test binary transform with well-known PLUS operation."""
-    d_input1 = cp.array([1, 2, 3, 4, 5], dtype=dtype)
-    d_input2 = cp.array([10, 20, 30, 40, 50], dtype=dtype)
-    d_output = cp.empty_like(d_input1, dtype=dtype)
+    h_input1 = np.array([1, 2, 3, 4, 5], dtype=dtype)
+    h_input2 = np.array([10, 20, 30, 40, 50], dtype=dtype)
+    d_input1 = DeviceArray.from_numpy(h_input1)
+    d_input2 = DeviceArray.from_numpy(h_input2)
+    d_output = DeviceArray.empty(h_input1.shape, dtype)
 
     # Run binary transform with well-known PLUS operation
     cuda.compute.binary_transform(
@@ -311,20 +291,22 @@ def test_binary_transform_well_known_plus(dtype):
         d_in2=d_input2,
         d_out=d_output,
         op=OpKind.PLUS,
-        num_items=len(d_input1),
+        num_items=h_input1.size,
     )
 
     # Check the result is correct
     expected = np.array([11, 22, 33, 44, 55])
-    np.testing.assert_equal(d_output.get(), expected)
+    np.testing.assert_equal(d_output.copy_to_host(), expected)
 
 
 def test_binary_transform_well_known_multiplies():
     """Test binary transform with well-known MULTIPLIES operation."""
     dtype = np.int32
-    d_input1 = cp.array([1, 2, 3, 4, 5], dtype=dtype)
-    d_input2 = cp.array([2, 3, 4, 5, 6], dtype=dtype)
-    d_output = cp.empty_like(d_input1, dtype=dtype)
+    h_input1 = np.array([1, 2, 3, 4, 5], dtype=dtype)
+    h_input2 = np.array([2, 3, 4, 5, 6], dtype=dtype)
+    d_input1 = DeviceArray.from_numpy(h_input1)
+    d_input2 = DeviceArray.from_numpy(h_input2)
+    d_output = DeviceArray.empty(h_input1.shape, dtype)
 
     # Run binary transform with well-known MULTIPLIES operation
     cuda.compute.binary_transform(
@@ -332,12 +314,12 @@ def test_binary_transform_well_known_multiplies():
         d_in2=d_input2,
         d_out=d_output,
         op=OpKind.MULTIPLIES,
-        num_items=len(d_input1),
+        num_items=h_input1.size,
     )
 
     # Check the result is correct
     expected = np.array([2, 6, 12, 20, 30])
-    np.testing.assert_equal(d_output.get(), expected)
+    np.testing.assert_equal(d_output.copy_to_host(), expected)
 
 
 def test_unary_transform_struct_type_with_annotations():
@@ -355,16 +337,14 @@ def scale_op(p: Point) -> Point:
     h_in["x"] = np.random.rand(num_items).astype(np.float32)
     h_in["y"] = np.random.rand(num_items).astype(np.float32)
 
-    d_in = cp.empty_like(h_in)
-    d_in.set(h_in)
-
-    d_out = cp.empty_like(d_in)
+    d_in = DeviceArray.from_numpy(h_in)
+    d_out = DeviceArray.empty(h_in.shape, h_in.dtype)
 
     cuda.compute.unary_transform(
         d_in=d_in, d_out=d_out, op=scale_op, num_items=num_items
     )
 
-    result = d_out.get()
+    result = d_out.copy_to_host()
 
     np.testing.assert_allclose(result["x"], h_in["x"] * 2.0, rtol=1e-5)
     np.testing.assert_allclose(result["y"], h_in["y"] * 3.0, rtol=1e-5)
@@ -389,19 +369,15 @@ def add_vectors(v1: Vec2D, v2: Vec2D) -> Vec2D:
     h_in2["x"] = np.random.randint(-100, 100, num_items, dtype=np.int32)
     h_in2["y"] = np.random.randint(-100, 100, num_items, dtype=np.int32)
 
-    d_in1 = cp.empty_like(h_in1)
-    d_in1.set(h_in1)
-
-    d_in2 = cp.empty_like(h_in2)
-    d_in2.set(h_in2)
-
-    d_out = cp.empty_like(d_in1)
+    d_in1 = DeviceArray.from_numpy(h_in1)
+    d_in2 = DeviceArray.from_numpy(h_in2)
+    d_out = DeviceArray.empty(h_in1.shape, h_in1.dtype)
 
     cuda.compute.binary_transform(
         d_in1=d_in1, d_in2=d_in2, d_out=d_out, op=add_vectors, num_items=num_items
     )
 
-    result = d_out.get()
+    result = d_out.copy_to_host()
 
     np.testing.assert_equal(result["x"], h_in1["x"] + h_in2["x"])
     np.testing.assert_equal(result["y"], h_in1["y"] + h_in2["y"])
@@ -411,10 +387,11 @@ def test_unary_transform_stateful_counting():
     """Test unary_transform with state that counts even numbers."""
     from numba import cuda as numba_cuda
 
-    d_in = cp.arange(100, dtype=np.int32)
-    d_out = cp.empty_like(d_in)
+    h_in = np.arange(100, dtype=np.int32)
+    d_in = DeviceArray.from_numpy(h_in)
+    d_out = DeviceArray.empty(h_in.shape, h_in.dtype)
 
-    even_count = cp.zeros(1, dtype=np.int32)
+    even_count = DeviceArray.from_numpy(np.zeros(1, dtype=np.int32))
 
     # Define op that references state as closure
     def count_evens(x):
@@ -423,25 +400,26 @@ def count_evens(x):
         return x * 2
 
     cuda.compute.unary_transform(
-        d_in=d_in, d_out=d_out, op=count_evens, num_items=len(d_in)
+        d_in=d_in, d_out=d_out, op=count_evens, num_items=h_in.size
     )
 
-    expected_output = cp.arange(100, dtype=np.int32) * 2
-    np.testing.assert_array_equal(d_out.get(), expected_output.get())
+    expected_output = h_in * 2
+    np.testing.assert_array_equal(d_out.copy_to_host(), expected_output)
 
-    num_evens = int(even_count.get()[0])
+    num_evens = int(even_count.copy_to_host()[0])
     assert num_evens == 50  # 0, 2, 4, ..., 98
 
 
 def test_unary_transform_stateful_state_updates():
     """Test that stateful transform correctly updates state between calls."""
     num_items = 20
-    d_in = cp.arange(num_items, dtype=np.int32)
-    d_out = cp.empty_like(d_in)
+    h_in = np.arange(num_items, dtype=np.int32)
+    d_in = DeviceArray.from_numpy(h_in)
+    d_out = DeviceArray.empty(h_in.shape, h_in.dtype)
 
     # Create two different thresholds
-    threshold_10 = cp.array([10], dtype=np.int32)
-    threshold_15 = cp.array([15], dtype=np.int32)
+    threshold_10 = DeviceArray.from_numpy(np.array([10], dtype=np.int32))
+    threshold_15 = DeviceArray.from_numpy(np.array([15], dtype=np.int32))
 
     # Call 1: x + 10
     def add_threshold_10(x):
@@ -450,41 +428,42 @@ def add_threshold_10(x):
     cuda.compute.unary_transform(
         d_in=d_in, d_out=d_out, op=add_threshold_10, num_items=num_items
     )
-    result_1 = d_out.get()
-    expected_1 = d_in.get() + 10
+    result_1 = d_out.copy_to_host()
+    expected_1 = h_in + 10
     np.testing.assert_array_equal(result_1, expected_1)
 
     # Call 2: x + 15 (different state)
     def add_threshold_15(x):
         return x + threshold_15[0]
 
-    d_out.fill(0)
+    d_out.copy_from_host(np.zeros_like(h_in))
     cuda.compute.unary_transform(
         d_in=d_in, d_out=d_out, op=add_threshold_15, num_items=num_items
     )
-    result_2 = d_out.get()
-    expected_2 = d_in.get() + 15
+    result_2 = d_out.copy_to_host()
+    expected_2 = h_in + 15
     np.testing.assert_array_equal(result_2, expected_2)
 
     # Call 3: Back to first threshold (test cache reuse with updated state)
-    d_out.fill(0)
+    d_out.copy_from_host(np.zeros_like(h_in))
     cuda.compute.unary_transform(
         d_in=d_in, d_out=d_out, op=add_threshold_10, num_items=num_items
     )
-    result_3 = d_out.get()
-    expected_3 = d_in.get() + 10
+    result_3 = d_out.copy_to_host()
+    expected_3 = h_in + 10
     np.testing.assert_array_equal(result_3, expected_3)
 
 
 def test_unary_transform_stateful_multiple_arrays():
     """Test stateful transform with multiple captured arrays."""
     num_items = 10
-    d_in = cp.arange(num_items, dtype=np.int32)
-    d_out = cp.empty_like(d_in)
+    h_in = np.arange(num_items, dtype=np.int32)
+    d_in = DeviceArray.from_numpy(h_in)
+    d_out = DeviceArray.empty(h_in.shape, h_in.dtype)
 
     # Multiple state arrays
-    offset = cp.array([5], dtype=np.int32)
-    multiplier = cp.array([2], dtype=np.int32)
+    offset = DeviceArray.from_numpy(np.array([5], dtype=np.int32))
+    multiplier = DeviceArray.from_numpy(np.array([2], dtype=np.int32))
 
     def transform_with_multiple_state(x):
         return (x + offset[0]) * multiplier[0]
@@ -492,23 +471,23 @@ def transform_with_multiple_state(x):
     cuda.compute.unary_transform(
         d_in=d_in, d_out=d_out, op=transform_with_multiple_state, num_items=num_items
     )
-    result = d_out.get()
-    expected = (d_in.get() + 5) * 2
+    result = d_out.copy_to_host()
+    expected = (h_in + 5) * 2
     np.testing.assert_array_equal(result, expected)
 
     # Update state and verify it works with new values
-    offset = cp.array([10], dtype=np.int32)
-    multiplier = cp.array([3], dtype=np.int32)
+    offset = DeviceArray.from_numpy(np.array([10], dtype=np.int32))
+    multiplier = DeviceArray.from_numpy(np.array([3], dtype=np.int32))
 
     def transform_with_updated_state(x):
         return (x + offset[0]) * multiplier[0]
 
-    d_out.fill(0)
+    d_out.copy_from_host(np.zeros_like(h_in))
     cuda.compute.unary_transform(
         d_in=d_in, d_out=d_out, op=transform_with_updated_state, num_items=num_items
     )
-    result = d_out.get()
-    expected = (d_in.get() + 10) * 3
+    result = d_out.copy_to_host()
+    expected = (h_in + 10) * 3
     np.testing.assert_array_equal(result, expected)
 
 
@@ -527,14 +506,18 @@ def func(x):
 
         return func
 
-    d_in = cp.array([0, 1, 2], dtype=np.int32)
-    d_out = cp.empty_like(d_in)
+    h_in = np.array([0, 1, 2], dtype=np.int32)
+    d_in = DeviceArray.from_numpy(h_in)
+    d_out = DeviceArray.empty(h_in.shape, h_in.dtype)
 
     # First call with offset 10
     cuda.compute.unary_transform(
-        d_in=d_in, d_out=d_out, op=make_adder(cp.array([10])), num_items=len(d_in)
+        d_in=d_in,
+        d_out=d_out,
+        op=make_adder(DeviceArray.from_numpy(np.array([10], dtype=np.int64))),
+        num_items=h_in.size,
     )
-    np.testing.assert_array_equal(d_out.get(), np.array([10, 11, 12]))
+    np.testing.assert_array_equal(d_out.copy_to_host(), np.array([10, 11, 12]))
 
     # Multiple calls with different offsets to test state re-detection
     for i in range(5):
@@ -542,12 +525,12 @@ def func(x):
         cuda.compute.unary_transform(
             d_in=d_in,
             d_out=d_out,
-            op=make_adder(cp.array([offset])),
-            num_items=len(d_in),
+            op=make_adder(DeviceArray.from_numpy(np.array([offset], dtype=np.int64))),
+            num_items=h_in.size,
         )
         expected = np.array([offset, offset + 1, offset + 2])
         np.testing.assert_array_equal(
-            d_out.get(),
+            d_out.copy_to_host(),
             expected,
             err_msg=f"Failed at iteration {i} with offset {offset}",
         )
@@ -555,23 +538,26 @@ def func(x):
 
 def test_unary_transform_with_lambda():
     """Test unary_transform with a lambda function."""
-    d_in = cp.array([1, 2, 3, 4, 5], dtype=np.int32)
-    d_out = cp.empty_like(d_in)
+    h_in = np.array([1, 2, 3, 4, 5], dtype=np.int32)
+    d_in = DeviceArray.from_numpy(h_in)
+    d_out = DeviceArray.empty(h_in.shape, h_in.dtype)
 
     # Use a lambda function directly
     cuda.compute.unary_transform(
-        d_in=d_in, d_out=d_out, op=lambda x: x * 2, num_items=len(d_in)
+        d_in=d_in, d_out=d_out, op=lambda x: x * 2, num_items=h_in.size
     )
 
     expected = np.array([2, 4, 6, 8, 10], dtype=np.int32)
-    np.testing.assert_array_equal(d_out.get(), expected)
+    np.testing.assert_array_equal(d_out.copy_to_host(), expected)
 
 
 def test_binary_transform_with_lambda():
     """Test binary_transform with a lambda function."""
-    d_in1 = cp.array([1, 2, 3, 4, 5], dtype=np.int32)
-    d_in2 = cp.array([10, 20, 30, 40, 50], dtype=np.int32)
-    d_out = cp.empty_like(d_in1)
+    h_in1 = np.array([1, 2, 3, 4, 5], dtype=np.int32)
+    h_in2 = np.array([10, 20, 30, 40, 50], dtype=np.int32)
+    d_in1 = DeviceArray.from_numpy(h_in1)
+    d_in2 = DeviceArray.from_numpy(h_in2)
+    d_out = DeviceArray.empty(h_in1.shape, h_in1.dtype)
 
     # Use a lambda function directly
     cuda.compute.binary_transform(
@@ -579,28 +565,30 @@ def test_binary_transform_with_lambda():
         d_in2=d_in2,
         d_out=d_out,
         op=lambda a, b: a + b,
-        num_items=len(d_in1),
+        num_items=h_in1.size,
     )
 
     expected = np.array([11, 22, 33, 44, 55], dtype=np.int32)
-    np.testing.assert_array_equal(d_out.get(), expected)
+    np.testing.assert_array_equal(d_out.copy_to_host(), expected)
 
 
 def test_binary_transform_bool_equal_to():
-    d_input1 = cp.array([True, False, True, False], dtype=np.bool_)
-    d_input2 = cp.array([True, True, False, False], dtype=np.bool_)
-    d_output = cp.empty_like(d_input1)
+    h_input1 = np.array([True, False, True, False], dtype=np.bool_)
+    h_input2 = np.array([True, True, False, False], dtype=np.bool_)
+    d_input1 = DeviceArray.from_numpy(h_input1)
+    d_input2 = DeviceArray.from_numpy(h_input2)
+    d_output = DeviceArray.empty(h_input1.shape, h_input1.dtype)
 
     cuda.compute.binary_transform(
         d_in1=d_input1,
         d_in2=d_input2,
         d_out=d_output,
         op=OpKind.EQUAL_TO,
-        num_items=len(d_input1),
+        num_items=h_input1.size,
     )
 
     expected = np.array([True, False, False, True], dtype=np.bool_)
-    np.testing.assert_array_equal(d_output.get(), expected)
+    np.testing.assert_array_equal(d_output.copy_to_host(), expected)
 
 
 def test_stateful_transform_same_bytecode_different_sizes():
@@ -615,16 +603,19 @@ def op(x):
 
         return op
 
-    d_in = cp.asarray([1, 2, 3])
-    d_out = cp.empty_like(d_in, dtype=bool)
-    op1 = make_op(cp.empty(1))  # len(arr) == 1
-    op2 = make_op(cp.empty(2))  # len(arr) == 2
+    h_in = np.asarray([1, 2, 3])
+    d_in = DeviceArray.from_numpy(h_in)
+    d_out = DeviceArray.empty(h_in.shape, bool)
+    op1 = make_op(DeviceArray.empty(1, np.float64))  # len(arr) == 1
+    op2 = make_op(DeviceArray.empty(2, np.float64))  # len(arr) == 2
 
-    cuda.compute.unary_transform(d_in=d_in, d_out=d_out, op=op1, num_items=len(d_in))
-    np.testing.assert_array_equal(np.asarray([False, True, True]), d_out.get())
+    cuda.compute.unary_transform(d_in=d_in, d_out=d_out, op=op1, num_items=h_in.size)
+    np.testing.assert_array_equal(np.asarray([False, True, True]), d_out.copy_to_host())
 
-    cuda.compute.unary_transform(d_in=d_in, d_out=d_out, op=op2, num_items=len(d_in))
-    np.testing.assert_array_equal(np.asarray([False, False, True]), d_out.get())
+    cuda.compute.unary_transform(d_in=d_in, d_out=d_out, op=op2, num_items=h_in.size)
+    np.testing.assert_array_equal(
+        np.asarray([False, False, True]), d_out.copy_to_host()
+    )
 
 
 def test_transform_caching_with_global_np_ufunc():
@@ -632,8 +623,9 @@ def test_transform_caching_with_global_np_ufunc():
     # ops referenced dotted globals like `np.<func>` those
     # ops would all hash to the same value.
 
-    d_in = cp.asarray([1.0, 2.0, 3.0])
-    d_out = cp.empty_like(d_in)
+    h_in = np.asarray([1.0, 2.0, 3.0])
+    d_in = DeviceArray.from_numpy(h_in)
+    d_out = DeviceArray.empty(h_in.shape, h_in.dtype)
 
     def make_op():
         sin = np.sin
@@ -643,12 +635,10 @@ def op(x):
 
         return op
 
-    d_out = cp.empty_like(d_in)
-
     cuda.compute.unary_transform(
-        d_in=d_in, d_out=d_out, op=make_op(), num_items=len(d_in)
+        d_in=d_in, d_out=d_out, op=make_op(), num_items=h_in.size
     )
-    cp.testing.assert_allclose(d_out, cp.sin(d_in))
+    np.testing.assert_allclose(d_out.copy_to_host(), np.sin(h_in))
 
     def make_op():
         cos = np.cos
@@ -659,8 +649,6 @@ def op(x):
         return op
 
     cuda.compute.unary_transform(
-        d_in=d_in, d_out=d_out, op=make_op(), num_items=len(d_in)
+        d_in=d_in, d_out=d_out, op=make_op(), num_items=h_in.size
     )
-    cp.testing.assert_allclose(d_out, cp.cos(d_in))
-
-    d_in = cp.asarray([1.0, 2.0, 3.0])
+    np.testing.assert_allclose(d_out.copy_to_host(), np.cos(h_in))
diff --git a/python/cuda_cccl/tests/compute/test_unique_by_key.py b/python/cuda_cccl/tests/compute/test_unique_by_key.py
index 83a4a17db67..668b6fb0a0f 100644
--- a/python/cuda_cccl/tests/compute/test_unique_by_key.py
+++ b/python/cuda_cccl/tests/compute/test_unique_by_key.py
@@ -3,10 +3,9 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 
-import cupy as cp
-import numba.cuda
 import numpy as np
 import pytest
+from _utils.device_array import DeviceArray, get_compute_capability
 
 import cuda.compute
 from cuda.compute import (
@@ -125,7 +124,7 @@ def compare_op(lhs, rhs):
 
 @pytest.mark.parametrize("dtype, num_items, op", unique_by_key_params)
 def test_unique_by_key(dtype, num_items, op, monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = get_compute_capability()
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -143,11 +142,13 @@ def test_unique_by_key(dtype, num_items, op, monkeypatch):
     h_out_items = np.empty(num_items, dtype=np.float32)
     h_out_num_selected = np.empty(1, np.int32)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_items = numba.cuda.to_device(h_in_items)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
-    d_out_items = numba.cuda.to_device(h_out_items)
-    d_out_num_selected = numba.cuda.to_device(h_out_num_selected)
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
+    d_in_items = DeviceArray.from_numpy(h_in_items)
+    d_out_keys = DeviceArray.empty(h_out_keys.shape, h_out_keys.dtype)
+    d_out_items = DeviceArray.empty(h_out_items.shape, h_out_items.dtype)
+    d_out_num_selected = DeviceArray.empty(
+        h_out_num_selected.shape, h_out_num_selected.dtype
+    )
 
     unique_by_key_device(
         d_in_keys,
@@ -172,7 +173,7 @@ def test_unique_by_key(dtype, num_items, op, monkeypatch):
 
 @pytest.mark.parametrize("dtype, num_items, op", unique_by_key_params)
 def test_unique_by_key_iterators(dtype, num_items, op, monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = get_compute_capability()
     # Skip sass verification for CC 9.0+, due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -190,11 +191,13 @@ def test_unique_by_key_iterators(dtype, num_items, op, monkeypatch):
     h_out_items = np.empty(num_items, dtype=np.float32)
     h_out_num_selected = np.empty(1, np.int64)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_items = numba.cuda.to_device(h_in_items)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
-    d_out_items = numba.cuda.to_device(h_out_items)
-    d_out_num_selected = numba.cuda.to_device(h_out_num_selected)
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
+    d_in_items = DeviceArray.from_numpy(h_in_items)
+    d_out_keys = DeviceArray.empty(h_out_keys.shape, h_out_keys.dtype)
+    d_out_items = DeviceArray.empty(h_out_items.shape, h_out_items.dtype)
+    d_out_num_selected = DeviceArray.empty(
+        h_out_num_selected.shape, h_out_num_selected.dtype
+    )
 
     i_in_keys = CacheModifiedInputIterator(d_in_keys, modifier="stream")
     i_in_items = CacheModifiedInputIterator(d_in_items, modifier="stream")
@@ -226,9 +229,11 @@ def test_unique_by_key_keys_only():
     h_out_keys = np.empty(num_items, dtype=np.int32)
     h_out_num_selected = np.empty(1, np.int32)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
-    d_out_num_selected = numba.cuda.to_device(h_out_num_selected)
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
+    d_out_keys = DeviceArray.empty(h_out_keys.shape, h_out_keys.dtype)
+    d_out_num_selected = DeviceArray.empty(
+        h_out_num_selected.shape, h_out_num_selected.dtype
+    )
 
     unique_by_key_device(
         d_in_keys,
@@ -267,11 +272,13 @@ def compare_complex(lhs, rhs):
     h_out_items = np.empty(num_items, dtype=np.float32)
     h_out_num_selected = np.empty(1, np.int32)
 
-    d_in_keys = numba.cuda.to_device(h_in_keys)
-    d_in_items = numba.cuda.to_device(h_in_items)
-    d_out_keys = numba.cuda.to_device(h_out_keys)
-    d_out_items = numba.cuda.to_device(h_out_items)
-    d_out_num_selected = numba.cuda.to_device(h_out_num_selected)
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
+    d_in_items = DeviceArray.from_numpy(h_in_items)
+    d_out_keys = DeviceArray.empty(h_out_keys.shape, h_out_keys.dtype)
+    d_out_items = DeviceArray.empty(h_out_items.shape, h_out_items.dtype)
+    d_out_num_selected = DeviceArray.empty(
+        h_out_num_selected.shape, h_out_num_selected.dtype
+    )
 
     unique_by_key_device(
         d_in_keys,
@@ -328,14 +335,13 @@ def struct_compare_op(lhs, rhs):
     h_in_items["a"] = a_items
     h_in_items["b"] = b_items
 
-    d_in_keys = cp.empty_like(h_in_keys)
-    d_in_items = cp.empty_like(h_in_items)
-    d_in_keys.set(h_in_keys)
-    d_in_items.set(h_in_items)
-
-    d_out_keys = cp.empty_like(d_in_keys)
-    d_out_items = cp.empty_like(d_in_items)
-    d_out_num_selected = cp.empty_like(h_out_num_selected)
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
+    d_in_items = DeviceArray.from_numpy(h_in_items)
+    d_out_keys = DeviceArray.empty(h_in_keys.shape, h_in_keys.dtype)
+    d_out_items = DeviceArray.empty(h_in_items.shape, h_in_items.dtype)
+    d_out_num_selected = DeviceArray.empty(
+        h_out_num_selected.shape, h_out_num_selected.dtype
+    )
 
     unique_by_key_device(
         d_in_keys,
@@ -347,10 +353,10 @@ def struct_compare_op(lhs, rhs):
         num_items,
     )
 
-    h_out_num_selected = d_out_num_selected.get()
+    h_out_num_selected = d_out_num_selected.copy_to_host()
     num_selected = h_out_num_selected[0]
-    h_out_keys = d_out_keys.get()[:num_selected]
-    h_out_items = d_out_items.get()[:num_selected]
+    h_out_keys = d_out_keys.copy_to_host()[:num_selected]
+    h_out_items = d_out_items.copy_to_host()[:num_selected]
 
     expected_keys, expected_items = unique_by_key_host(
         h_in_keys,
@@ -363,7 +369,7 @@ def struct_compare_op(lhs, rhs):
 
 
 def test_unique_by_key_with_stream(cuda_stream, monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = get_compute_capability()
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -375,7 +381,6 @@ def test_unique_by_key_with_stream(cuda_stream, monkeypatch):
             False,
         )
 
-    cp_stream = cp.cuda.ExternalStream(cuda_stream.ptr)
     num_items = 10000
 
     h_in_keys = random_array(num_items, np.int32, max_value=20)
@@ -384,13 +389,19 @@ def test_unique_by_key_with_stream(cuda_stream, monkeypatch):
     h_out_items = np.empty(num_items, dtype=np.float32)
     h_out_num_selected = np.empty(1, np.int32)
 
-    with cp_stream:
-        h_in_keys = random_array(num_items, np.int32)
-        d_in_keys = cp.asarray(h_in_keys)
-        d_in_items = cp.asarray(h_in_items)
-        d_out_keys = cp.empty_like(h_out_keys)
-        d_out_items = cp.empty_like(h_out_items)
-        d_out_num_selected = cp.empty_like(h_out_num_selected)
+    d_in_keys = DeviceArray.from_numpy(h_in_keys, stream=cuda_stream)
+    d_in_items = DeviceArray.from_numpy(h_in_items, stream=cuda_stream)
+    d_out_keys = DeviceArray.empty(
+        h_out_keys.shape, h_out_keys.dtype, stream=cuda_stream
+    )
+    d_out_items = DeviceArray.empty(
+        h_out_items.shape, h_out_items.dtype, stream=cuda_stream
+    )
+    d_out_num_selected = DeviceArray.empty(
+        h_out_num_selected.shape,
+        h_out_num_selected.dtype,
+        stream=cuda_stream,
+    )
 
     unique_by_key_device(
         d_in_keys,
@@ -403,9 +414,9 @@ def test_unique_by_key_with_stream(cuda_stream, monkeypatch):
         stream=cuda_stream,
     )
 
-    h_out_keys = d_out_keys.get()
-    h_out_items = d_out_items.get()
-    h_out_num_selected = d_out_num_selected.get()
+    h_out_keys = d_out_keys.copy_to_host(stream=cuda_stream)
+    h_out_items = d_out_items.copy_to_host(stream=cuda_stream)
+    h_out_num_selected = d_out_num_selected.copy_to_host(stream=cuda_stream)
 
     num_selected = h_out_num_selected[0]
     h_out_keys = h_out_keys[:num_selected]
@@ -418,7 +429,7 @@ def test_unique_by_key_with_stream(cuda_stream, monkeypatch):
 
 
 def test_unique_by_key_well_known_equal_to(monkeypatch):
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = get_compute_capability()
     # Skip sass verification for CC 9.0+ due to a bug in NVRTC.
     # TODO: add NVRTC version check, ref nvbug 5243118
     if cc_major >= 9:
@@ -433,11 +444,13 @@ def test_unique_by_key_well_known_equal_to(monkeypatch):
     dtype = np.int32
 
     # Create input keys and values: keys=[1,1,1,2,2,3] values=[10,20,30,40,50,60]
-    d_in_keys = cp.array([1, 1, 1, 2, 2, 3], dtype=dtype)
-    d_in_values = cp.array([10, 20, 30, 40, 50, 60], dtype=dtype)
-    d_out_keys = cp.empty_like(d_in_keys)
-    d_out_values = cp.empty_like(d_in_values)
-    d_num_selected = cp.empty(1, dtype=dtype)
+    h_in_keys = np.array([1, 1, 1, 2, 2, 3], dtype=dtype)
+    h_in_values = np.array([10, 20, 30, 40, 50, 60], dtype=dtype)
+    d_in_keys = DeviceArray.from_numpy(h_in_keys)
+    d_in_values = DeviceArray.from_numpy(h_in_values)
+    d_out_keys = DeviceArray.empty(h_in_keys.shape, h_in_keys.dtype)
+    d_out_values = DeviceArray.empty(h_in_values.shape, h_in_values.dtype)
+    d_num_selected = DeviceArray.empty(1, dtype)
 
     # Run unique by key with well-known EQUAL_TO operation
     cuda.compute.unique_by_key(
@@ -447,13 +460,13 @@ def test_unique_by_key_well_known_equal_to(monkeypatch):
         d_out_items=d_out_values,
         d_out_num_selected=d_num_selected,
         op=OpKind.EQUAL_TO,
-        num_items=len(d_in_keys),
+        num_items=h_in_keys.size,
     )
 
     # Check the result is correct
-    assert d_num_selected.get()[0] == 3  # three unique keys
+    assert d_num_selected.copy_to_host()[0] == 3  # three unique keys
     expected_keys = [1, 2, 3]
     expected_values = [10, 40, 60]  # first occurrence of each key
 
-    np.testing.assert_equal(d_out_keys.get()[:3], expected_keys)
-    np.testing.assert_equal(d_out_values.get()[:3], expected_values)
+    np.testing.assert_equal(d_out_keys.copy_to_host()[:3], expected_keys)
+    np.testing.assert_equal(d_out_values.copy_to_host()[:3], expected_values)
diff --git a/python/cuda_cccl/tests/compute/test_zip_iterator.py b/python/cuda_cccl/tests/compute/test_zip_iterator.py
index 85630bb4f5f..973e5a3c9c9 100644
--- a/python/cuda_cccl/tests/compute/test_zip_iterator.py
+++ b/python/cuda_cccl/tests/compute/test_zip_iterator.py
@@ -1,9 +1,9 @@
 # Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-import cupy as cp
 import numpy as np
 import pytest
+from _utils.device_array import DeviceArray, get_compute_capability
 
 import cuda.compute
 from cuda.compute import (
@@ -24,24 +24,26 @@ class Pair:
     def sum_pairs(p1, p2):
         return Pair(p1[0] + p2[0], p1[1] + p2[1])
 
-    d_input1 = cp.arange(num_items, dtype=np.int64)
-    d_input2 = cp.arange(num_items, dtype=np.float32)
+    h_input1 = np.arange(num_items, dtype=np.int64)
+    h_input2 = np.arange(num_items, dtype=np.float32)
+    d_input1 = DeviceArray.from_numpy(h_input1)
+    d_input2 = DeviceArray.from_numpy(h_input2)
 
     zip_it = ZipIterator(d_input1, d_input2)
 
-    d_output = cp.empty(1, dtype=Pair.dtype)
+    d_output = DeviceArray.empty(1, Pair.dtype)
     h_init = Pair(0, 0.0)
 
     cuda.compute.reduce_into(
         d_in=zip_it, d_out=d_output, num_items=num_items, op=sum_pairs, h_init=h_init
     )
 
-    expected_first = d_input1.sum().get()
-    expected_second = d_input2.sum().get()
+    expected_first = h_input1.sum()
+    expected_second = h_input2.sum()
 
-    result = d_output.get()[0]
-    cp.testing.assert_array_equal(result["first"], expected_first)
-    cp.testing.assert_allclose(result["second"], expected_second, rtol=1e-6)
+    result = d_output.copy_to_host()[0]
+    np.testing.assert_array_equal(result["first"], expected_first)
+    np.testing.assert_allclose(result["second"], expected_second, rtol=1e-6)
 
 
 @pytest.mark.parametrize("num_items", [10, 1_000, 100_000])
@@ -53,23 +55,24 @@ def max_by_value(p1, p2):
         return p1 if p1[1] > p2[1] else p2
 
     counting_it = CountingIterator(np.int32(0))
-    arr = cp.arange(num_items, dtype=np.int32)
+    h_arr = np.arange(num_items, dtype=np.int32)
+    d_arr = DeviceArray.from_numpy(h_arr)
 
-    zip_it = ZipIterator(counting_it, arr)
+    zip_it = ZipIterator(counting_it, d_arr)
 
     dtype = np.dtype([("index", np.int32), ("value", np.int32)], align=True)
     h_init = np.asarray([(-1, -1)], dtype=dtype)
 
-    d_output = cp.empty(1, dtype=dtype)
+    d_output = DeviceArray.empty(1, dtype)
 
     cuda.compute.reduce_into(
         d_in=zip_it, d_out=d_output, num_items=num_items, op=max_by_value, h_init=h_init
     )
 
-    result = d_output.get()[0]
+    result = d_output.copy_to_host()[0]
 
-    expected_index = cp.argmax(arr).get()
-    expected_value = arr[expected_index].get()
+    expected_index = np.argmax(h_arr)
+    expected_value = h_arr[expected_index]
 
     assert result["index"] == expected_index
     assert result["value"] == expected_value
@@ -86,28 +89,27 @@ def max_by_value(p1, p2):
         return p1 if p1[1] > p2[1] else p2
 
     counting_it = CountingIterator(np.int32(0))
-    arr = cp.arange(num_items, dtype=np.int32)
+    h_arr = np.arange(num_items, dtype=np.int32)
+    d_arr = DeviceArray.from_numpy(h_arr)
 
     def double_op(x):
         return x * 2
 
-    transform_it = TransformIterator(arr, double_op)
+    transform_it = TransformIterator(d_arr, double_op)
 
     zip_it = ZipIterator(counting_it, transform_it)
 
-    d_output = cp.empty(1, dtype=IndexValuePair.dtype)
-
-    result = d_output.get()[0]
+    d_output = DeviceArray.empty(1, IndexValuePair.dtype)
     h_init = IndexValuePair(-1, -1)
 
     cuda.compute.reduce_into(
         d_in=zip_it, d_out=d_output, num_items=num_items, op=max_by_value, h_init=h_init
     )
 
-    result = d_output.get()[0]
+    result = d_output.copy_to_host()[0]
 
-    expected_index = cp.argmax(arr).get()
-    expected_value = arr[expected_index].get() * 2
+    expected_index = np.argmax(h_arr)
+    expected_value = h_arr[expected_index] * 2
 
     assert result["index"] == expected_index
     assert result["value"] == expected_value
@@ -126,28 +128,30 @@ class Triple:
     def sum_triples(t1, t2):
         return Triple(t1[0] + t2[0], t1[1] + t2[1], t1[2] + t2[2])
 
-    d_input1 = cp.arange(num_items, dtype=np.int64)
-    d_input2 = cp.arange(num_items, dtype=np.float32)
+    h_input1 = np.arange(num_items, dtype=np.int64)
+    h_input2 = np.arange(num_items, dtype=np.float32)
+    d_input1 = DeviceArray.from_numpy(h_input1)
+    d_input2 = DeviceArray.from_numpy(h_input2)
     counting_it = CountingIterator(np.int64(10))
 
     zip_it = ZipIterator(d_input1, d_input2, counting_it)
 
-    d_output = cp.empty(1, dtype=Triple.dtype)
+    d_output = DeviceArray.empty(1, Triple.dtype)
     h_init = Triple(0, 0.0, 0)
 
     cuda.compute.reduce_into(
         d_in=zip_it, d_out=d_output, num_items=num_items, op=sum_triples, h_init=h_init
     )
 
-    result = d_output.get()[0]
+    result = d_output.copy_to_host()[0]
 
-    expected_first = d_input1.sum().get()
-    expected_second = d_input2.sum().get()
-    expected_third = cp.arange(10, 10 + num_items).sum().get()
+    expected_first = h_input1.sum()
+    expected_second = h_input2.sum()
+    expected_third = np.arange(10, 10 + num_items).sum()
 
-    cp.testing.assert_array_equal(result["first"], expected_first)
-    cp.testing.assert_allclose(result["second"], expected_second, rtol=1e-6)
-    cp.testing.assert_array_equal(result["third"], expected_third)
+    np.testing.assert_array_equal(result["first"], expected_first)
+    np.testing.assert_allclose(result["second"], expected_second, rtol=1e-6)
+    np.testing.assert_array_equal(result["third"], expected_third)
 
 
 @pytest.mark.parametrize("num_items", [10, 1_000, 100_000])
@@ -161,20 +165,21 @@ class Single:
     def sum_singles(s1, s2):
         return Single(s1[0] + s2[0])
 
-    d_input = cp.arange(num_items, dtype=np.int64)
+    h_input = np.arange(num_items, dtype=np.int64)
+    d_input = DeviceArray.from_numpy(h_input)
 
     zip_it = ZipIterator(d_input)
 
-    d_output = cp.empty(1, dtype=Single.dtype)
+    d_output = DeviceArray.empty(1, Single.dtype)
     h_init = Single(0)
 
     cuda.compute.reduce_into(
         d_in=zip_it, d_out=d_output, num_items=num_items, op=sum_singles, h_init=h_init
     )
 
-    result = d_output.get()[0]
+    result = d_output.copy_to_host()[0]
 
-    expected_value = d_input.sum().get()
+    expected_value = h_input.sum()
     assert result["value"] == expected_value
 
 
@@ -189,14 +194,16 @@ def binary_transform(pair1, pair2):
         return TransformedPair(pair1[0] + pair2[0], pair1[1] * pair2[1])
 
     counting_it1 = CountingIterator(np.int32(0))
-    arr1 = cp.arange(num_items, dtype=np.int32)
-    zip_it1 = ZipIterator(counting_it1, arr1)
+    h_arr1 = np.arange(num_items, dtype=np.int32)
+    d_arr1 = DeviceArray.from_numpy(h_arr1)
+    zip_it1 = ZipIterator(counting_it1, d_arr1)
 
     counting_it2 = CountingIterator(np.int32(0))
-    arr2 = cp.arange(num_items, dtype=np.int32)
-    zip_it2 = ZipIterator(counting_it2, arr2)
+    h_arr2 = np.arange(num_items, dtype=np.int32)
+    d_arr2 = DeviceArray.from_numpy(h_arr2)
+    zip_it2 = ZipIterator(counting_it2, d_arr2)
 
-    d_output = cp.empty(num_items, dtype=TransformedPair.dtype)
+    d_output = DeviceArray.empty(num_items, TransformedPair.dtype)
 
     cuda.compute.binary_transform(
         d_in1=zip_it1,
@@ -206,10 +213,10 @@ def binary_transform(pair1, pair2):
         num_items=num_items,
     )
 
-    result = d_output.get()
+    result = d_output.copy_to_host()
 
-    expected_sum_indices = (arr1 + arr2).get()
-    expected_product_values = (arr1 * arr2).get()
+    expected_sum_indices = h_arr1 + h_arr2
+    expected_product_values = h_arr1 * h_arr2
 
     for i, result_item in enumerate(result):
         assert result_item["sum_indices"] == expected_sum_indices[i]
@@ -231,13 +238,15 @@ def min_pairs(p1, p2):
         return Pair(min(p1[0], p2[0]), min(p1[1], p2[1]))
 
     # Create two randomized arrays to make min operations interesting
-    arr1 = cp.random.randint(0, 1000, num_items, dtype=np.int64)
-    arr2 = cp.random.randint(0, 1000, num_items, dtype=np.int64)
+    h_arr1 = np.random.randint(0, 1000, num_items, dtype=np.int64)
+    h_arr2 = np.random.randint(0, 1000, num_items, dtype=np.int64)
+    d_arr1 = DeviceArray.from_numpy(h_arr1)
+    d_arr2 = DeviceArray.from_numpy(h_arr2)
 
-    zip_it = ZipIterator(arr1, arr2)
+    zip_it = ZipIterator(d_arr1, d_arr2)
 
-    d_output = cp.empty(num_items, dtype=Pair.dtype)
-    h_init = Pair(cp.iinfo(np.int64).max, cp.iinfo(np.int64).max)
+    d_output = DeviceArray.empty(num_items, Pair.dtype)
+    h_init = Pair(np.iinfo(np.int64).max, np.iinfo(np.int64).max)
 
     cuda.compute.inclusive_scan(
         d_in=zip_it,
@@ -247,11 +256,11 @@ def min_pairs(p1, p2):
         num_items=num_items,
     )
 
-    result = d_output.get()
+    result = d_output.copy_to_host()
 
     # Verify the scan operation produces running minimums for both arrays
-    expected_first_running_mins = np.minimum.accumulate(arr1.get())
-    expected_second_running_mins = np.minimum.accumulate(arr2.get())
+    expected_first_running_mins = np.minimum.accumulate(h_arr1)
+    expected_second_running_mins = np.minimum.accumulate(h_arr2)
 
     for i, result_item in enumerate(result):
         assert result_item["first_min"] == expected_first_running_mins[i]
@@ -261,10 +270,8 @@ def min_pairs(p1, p2):
 @pytest.mark.parametrize("num_items", [10, 1000])
 def test_output_zip_iterator_with_scan(monkeypatch, num_items):
     """Test ZipIterator as output iterator with scan operations."""
-    import numba.cuda
-
     # Skip SASS check for CC 8.0+ due to LDL/STL CI failure.
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = get_compute_capability()
     if cc_major >= 8:
         monkeypatch.setattr(
             cuda.compute._cccl_interop,
@@ -272,13 +279,15 @@ def test_output_zip_iterator_with_scan(monkeypatch, num_items):
             False,
         )
 
-    d_in1 = cp.random.randint(0, 1000, num_items, dtype=np.int64)
-    d_in2 = cp.random.randint(0, 1000, num_items, dtype=np.int64)
+    h_in1 = np.random.randint(0, 1000, num_items, dtype=np.int64)
+    h_in2 = np.random.randint(0, 1000, num_items, dtype=np.int64)
+    d_in1 = DeviceArray.from_numpy(h_in1)
+    d_in2 = DeviceArray.from_numpy(h_in2)
 
     zip_it = ZipIterator(d_in1, d_in2)
 
-    d_out1 = cp.empty_like(d_in1)
-    d_out2 = cp.empty_like(d_in2)
+    d_out1 = DeviceArray.empty(h_in1.shape, h_in1.dtype)
+    d_out2 = DeviceArray.empty(h_in2.shape, h_in2.dtype)
 
     zip_out_it = ZipIterator(d_out1, d_out2)
 
@@ -293,20 +302,18 @@ def add_pairs(p1, p2):
         num_items=num_items,
     )
 
-    in1 = d_in1.get()
-    in2 = d_in2.get()
-    expected_out1 = np.empty_like(in1)
-    expected_out2 = np.empty_like(in2)
+    expected_out1 = np.empty_like(h_in1)
+    expected_out2 = np.empty_like(h_in2)
 
     # First element is just the input
-    expected_out1[0] = in1[0]
-    expected_out2[0] = in2[0]
+    expected_out1[0] = h_in1[0]
+    expected_out2[0] = h_in2[0]
     for i in range(1, num_items):
-        expected_out1[i] = expected_out1[i - 1] + in1[i]
-        expected_out2[i] = expected_out2[i - 1] + in2[i]
+        expected_out1[i] = expected_out1[i - 1] + h_in1[i]
+        expected_out2[i] = expected_out2[i - 1] + h_in2[i]
 
-    np.testing.assert_array_equal(d_out1.get(), expected_out1)
-    np.testing.assert_array_equal(d_out2.get(), expected_out2)
+    np.testing.assert_array_equal(d_out1.copy_to_host(), expected_out1)
+    np.testing.assert_array_equal(d_out2.copy_to_host(), expected_out2)
 
 
 def test_nested_zip_iterators():
@@ -330,9 +337,12 @@ def sum_nested_zips(v1, v2):
     num_items = 100
 
     # Create three input arrays
-    d_input_a = cp.arange(num_items, dtype=np.int32)
-    d_input_b = cp.arange(num_items, dtype=np.int64) * 2
-    d_input_c = cp.arange(num_items, dtype=np.float32) * 3.0
+    h_input_a = np.arange(num_items, dtype=np.int32)
+    h_input_b = np.arange(num_items, dtype=np.int64) * 2
+    h_input_c = np.arange(num_items, dtype=np.float32) * 3.0
+    d_input_a = DeviceArray.from_numpy(h_input_a)
+    d_input_b = DeviceArray.from_numpy(h_input_b)
+    d_input_c = DeviceArray.from_numpy(h_input_c)
 
     # Create an inner zip iterator combining a and b
     inner_zip = ZipIterator(d_input_a, d_input_b)
@@ -341,7 +351,7 @@ def sum_nested_zips(v1, v2):
     outer_zip = ZipIterator(inner_zip, d_input_c)
 
     # Perform reduction
-    d_output = cp.empty(1, dtype=OuterTriple.dtype)
+    d_output = DeviceArray.empty(1, OuterTriple.dtype)
     h_init = OuterTriple(InnerPair(0, 0), 0.0)
 
     cuda.compute.reduce_into(
@@ -352,12 +362,12 @@ def sum_nested_zips(v1, v2):
         h_init=h_init,
     )
 
-    result = d_output.get()[0]
+    result = d_output.copy_to_host()[0]
 
     # Calculate expected values
-    expected_first = d_input_a.sum().get()
-    expected_second = d_input_b.sum().get()
-    expected_third = d_input_c.sum().get()
+    expected_first = h_input_a.sum()
+    expected_second = h_input_b.sum()
+    expected_third = h_input_c.sum()
 
     assert result["inner"]["first"] == expected_first, (
         f"Expected inner.first={expected_first}, got {result['inner']['first']}"
@@ -386,14 +396,17 @@ def sum_nested_zips(v1, v2):
 
     num_items = 100
 
-    d_input_a = cp.arange(num_items, dtype=np.int32)
-    d_input_b = cp.arange(num_items, dtype=np.float32)
-    d_input_c = cp.arange(num_items, dtype=np.int64)
+    h_input_a = np.arange(num_items, dtype=np.int32)
+    h_input_b = np.arange(num_items, dtype=np.float32)
+    h_input_c = np.arange(num_items, dtype=np.int64)
+    d_input_a = DeviceArray.from_numpy(h_input_a)
+    d_input_b = DeviceArray.from_numpy(h_input_b)
+    d_input_c = DeviceArray.from_numpy(h_input_c)
 
     inner_zip = ZipIterator(d_input_a, d_input_b)
     outer_zip = ZipIterator(inner_zip, d_input_c)
 
-    d_output = cp.empty(1, dtype=OuterPair.dtype)
+    d_output = DeviceArray.empty(1, OuterPair.dtype)
     h_init = OuterPair(InnerPair(0, 0.0), 0)
 
     cuda.compute.reduce_into(
@@ -404,13 +417,13 @@ def sum_nested_zips(v1, v2):
         h_init=h_init,
     )
 
-    result = d_output.get()[0]
+    result = d_output.copy_to_host()[0]
 
     # outer_zip produces: {value_0: {value_0: int32, value_1: float32}, value_1: int64}
     # which maps to our OuterPair: {inner: {a: int32, b: float32}, c: int64}
-    expected_a = d_input_a.sum().get()  # int32
-    expected_b = d_input_b.sum().get()  # float32
-    expected_c = d_input_c.sum().get()  # int64
+    expected_a = h_input_a.sum()  # int32
+    expected_b = h_input_b.sum()  # float32
+    expected_c = h_input_c.sum()  # int64
 
     assert result["inner"]["a"] == expected_a
     assert np.isclose(result["inner"]["b"], expected_b)
@@ -426,9 +439,7 @@ def sum_nested_zips(v1, v2):
     ],
 )
 def test_nested_output_zip_iterator_with_scan(monkeypatch, num_items, dtype_map):
-    import numba.cuda
-
-    cc_major, _ = numba.cuda.get_current_device().compute_capability
+    cc_major, _ = get_compute_capability()
     if cc_major >= 8:
         monkeypatch.setattr(
             cuda.compute._cccl_interop,
@@ -446,15 +457,13 @@ def test_nested_output_zip_iterator_with_scan(monkeypatch, num_items, dtype_map)
         h_in2[i]["x"] = float(i * 10)
         h_in2[i]["y"] = float(i * 20)
 
-    d_in1 = cp.empty(num_items, dtype=Vec2.dtype)
-    d_in2 = cp.empty(num_items, dtype=Vec2.dtype)
-    d_in1.set(h_in1)
-    d_in2.set(h_in2)
+    d_in1 = DeviceArray.from_numpy(h_in1)
+    d_in2 = DeviceArray.from_numpy(h_in2)
 
     zip_it = ZipIterator(d_in1, d_in2)
 
-    d_out1 = cp.empty_like(d_in1)
-    d_out2 = cp.empty_like(d_in2)
+    d_out1 = DeviceArray.empty(h_in1.shape, h_in1.dtype)
+    d_out2 = DeviceArray.empty(h_in2.shape, h_in2.dtype)
 
     zip_out_it = ZipIterator(d_out1, d_out2)
 
@@ -471,25 +480,23 @@ def add_vec2_pairs(v1, v2):
         num_items=num_items,
     )
 
-    in1 = d_in1.get()
-    in2 = d_in2.get()
-    expected_out1 = np.empty_like(in1)
-    expected_out2 = np.empty_like(in2)
+    expected_out1 = np.empty_like(h_in1)
+    expected_out2 = np.empty_like(h_in2)
 
-    expected_out1[0] = in1[0]
-    expected_out2[0] = in2[0]
+    expected_out1[0] = h_in1[0]
+    expected_out2[0] = h_in2[0]
     for i in range(1, num_items):
-        expected_out1[i]["x"] = expected_out1[i - 1]["x"] + in1[i]["x"]
-        expected_out1[i]["y"] = expected_out1[i - 1]["y"] + in1[i]["y"]
-        expected_out2[i]["x"] = expected_out2[i - 1]["x"] + in2[i]["x"]
-        expected_out2[i]["y"] = expected_out2[i - 1]["y"] + in2[i]["y"]
+        expected_out1[i]["x"] = expected_out1[i - 1]["x"] + h_in1[i]["x"]
+        expected_out1[i]["y"] = expected_out1[i - 1]["y"] + h_in1[i]["y"]
+        expected_out2[i]["x"] = expected_out2[i - 1]["x"] + h_in2[i]["x"]
+        expected_out2[i]["y"] = expected_out2[i - 1]["y"] + h_in2[i]["y"]
 
-    np.testing.assert_array_equal(d_out1.get(), expected_out1)
-    np.testing.assert_array_equal(d_out2.get(), expected_out2)
+    np.testing.assert_array_equal(d_out1.copy_to_host(), expected_out1)
+    np.testing.assert_array_equal(d_out2.copy_to_host(), expected_out2)
 
 
 def test_zip_iterator_of_transform_iterator_kind():
-    arr = cp.arange(10, dtype=np.int64)
+    d_arr = DeviceArray.from_numpy(np.arange(10, dtype=np.int64))
 
     def f(x):
         return x
@@ -497,8 +504,8 @@ def f(x):
     def g(x):
         return x + 1
 
-    it1 = ZipIterator(TransformIterator(arr, f))
-    it2 = ZipIterator(TransformIterator(arr, g))
+    it1 = ZipIterator(TransformIterator(d_arr, f))
+    it2 = ZipIterator(TransformIterator(d_arr, g))
     assert it1.kind != it2.kind
 
 
@@ -522,8 +529,10 @@ def test_caching_zip_iterator():
     # Create multiple instances with same structure
     iterators = []
     for i in range(5):
-        arr = cp.arange(i * 10, (i + 1) * 10, dtype=np.float32)
-        z = ZipIterator(arr)
+        d_arr = DeviceArray.from_numpy(
+            np.arange(i * 10, (i + 1) * 10, dtype=np.float32)
+        )
+        z = ZipIterator(d_arr)
         # Trigger compilation by accessing LTOIR
         z.get_advance_op()
         z.get_input_deref_op()
@@ -542,12 +551,12 @@ def test_caching_zip_iterator():
     # Test 4: Arrays with different dtypes should not share cache
     compile_cpp_op_code.cache_clear()
 
-    z_int32 = ZipIterator(cp.arange(10, dtype=np.int32))
+    z_int32 = ZipIterator(DeviceArray.from_numpy(np.arange(10, dtype=np.int32)))
     z_int32.get_advance_op()
     z_int32.get_input_deref_op()
     misses_after_first = compile_cpp_op_code.cache_info().misses
 
-    z_int64 = ZipIterator(cp.arange(10, dtype=np.int64))
+    z_int64 = ZipIterator(DeviceArray.from_numpy(np.arange(10, dtype=np.int64)))
     z_int64.get_advance_op()
     z_int64.get_input_deref_op()
     misses_after_second = compile_cpp_op_code.cache_info().misses
@@ -647,8 +656,10 @@ class Pair:
     num_items = 100
     offset = 10
 
-    d_input1 = cp.arange(num_items, dtype=np.int32)
-    d_input2 = cp.arange(num_items, dtype=np.int32) * 2
+    h_input1 = np.arange(num_items, dtype=np.int32)
+    h_input2 = np.arange(num_items, dtype=np.int32) * 2
+    d_input1 = DeviceArray.from_numpy(h_input1)
+    d_input2 = DeviceArray.from_numpy(h_input2)
 
     # Create base zip iterator
     zip_it = ZipIterator(d_input1, d_input2)
@@ -661,7 +672,7 @@ def sum_pairs(p1, p2):
         return Pair(p1[0] + p2[0], p1[1] + p2[1])
 
     h_init = Pair(0, 0)
-    d_output = cp.empty(1, dtype=Pair.dtype)
+    d_output = DeviceArray.empty(1, Pair.dtype)
 
     remaining_items = num_items - offset
     cuda.compute.reduce_into(
@@ -672,11 +683,11 @@ def sum_pairs(p1, p2):
         h_init=h_init,
     )
 
-    result = d_output.get()[0]
+    result = d_output.copy_to_host()[0]
 
     # Expected values should be sum from offset onwards
-    expected_first = d_input1[offset:].sum().get()
-    expected_second = d_input2[offset:].sum().get()
+    expected_first = h_input1[offset:].sum()
+    expected_second = h_input2[offset:].sum()
 
     assert result["first"] == expected_first
     assert result["second"] == expected_second
@@ -699,9 +710,12 @@ def sum_nested_zips(v1, v2):
     offset = 15
 
     # Create three input arrays
-    d_input_a = cp.arange(num_items, dtype=np.int32)
-    d_input_b = cp.arange(num_items, dtype=np.int64) * 2
-    d_input_c = cp.arange(num_items, dtype=np.float32) * 3.0
+    h_input_a = np.arange(num_items, dtype=np.int32)
+    h_input_b = np.arange(num_items, dtype=np.int64) * 2
+    h_input_c = np.arange(num_items, dtype=np.float32) * 3.0
+    d_input_a = DeviceArray.from_numpy(h_input_a)
+    d_input_b = DeviceArray.from_numpy(h_input_b)
+    d_input_c = DeviceArray.from_numpy(h_input_c)
 
     # Create nested zip: ZipIterator(ZipIterator(a, b), c)
     inner_zip = ZipIterator(d_input_a, d_input_b)
@@ -711,7 +725,7 @@ def sum_nested_zips(v1, v2):
     advanced_outer_zip = outer_zip + offset
 
     # Perform reduction from the advanced position
-    d_output = cp.empty(1, dtype=OuterTriple.dtype)
+    d_output = DeviceArray.empty(1, OuterTriple.dtype)
     h_init = OuterTriple(InnerPair(0, 0), 0.0)
 
     remaining_items = num_items - offset
@@ -723,12 +737,12 @@ def sum_nested_zips(v1, v2):
         h_init=h_init,
     )
 
-    result = d_output.get()[0]
+    result = d_output.copy_to_host()[0]
 
     # Calculate expected values from offset onwards
-    expected_first = d_input_a[offset:].sum().get()
-    expected_second = d_input_b[offset:].sum().get()
-    expected_third = d_input_c[offset:].sum().get()
+    expected_first = h_input_a[offset:].sum()
+    expected_second = h_input_b[offset:].sum()
+    expected_third = h_input_c[offset:].sum()
 
     assert result["inner"]["first"] == expected_first, (
         f"Expected inner.first={expected_first}, got {result['inner']['first']}"
diff --git a/python/cuda_cccl/tests/coop/_experimental/test_block_exchange.py b/python/cuda_cccl/tests/coop/_experimental/test_block_exchange.py
index add2bc6d0c2..c81c4c1e6d9 100644
--- a/python/cuda_cccl/tests/coop/_experimental/test_block_exchange.py
+++ b/python/cuda_cccl/tests/coop/_experimental/test_block_exchange.py
@@ -14,6 +14,7 @@
 import numba
 import numpy as np
 import pytest
+from _utils.device_array import DeviceArray
 from helpers import (
     NUMBA_TYPES_TO_NP,
     Complex,
@@ -119,11 +120,10 @@ def kernel(input_arr, output_arr):
     total_items = num_threads * items_per_thread
     h_input = random_int(total_items, T_np)
 
-    d_input = cuda.to_device(h_input)
-    d_output = cuda.device_array(total_items, dtype=T_np)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(total_items, dtype=T_np)
 
     kernel[1, threads_per_block](d_input, d_output)
-    cuda.synchronize()
 
     output = d_output.copy_to_host()
 
@@ -229,11 +229,10 @@ def kernel(input_arr, output_arr):
     h_input_imag = random_int(total_complex_items, T_complex_np_component)
     h_input_combined = np.concatenate((h_input_real, h_input_imag))
 
-    d_input = cuda.to_device(h_input_combined)
-    d_output = cuda.device_array(2 * total_complex_items, dtype=T_complex_np_component)
+    d_input = DeviceArray.from_numpy(h_input_combined)
+    d_output = DeviceArray.empty(2 * total_complex_items, dtype=T_complex_np_component)
 
     kernel[1, threads_per_block](d_input, d_output)
-    cuda.synchronize()
 
     output_combined = d_output.copy_to_host()
     output_real = output_combined[:total_complex_items]
diff --git a/python/cuda_cccl/tests/coop/_experimental/test_block_load.py b/python/cuda_cccl/tests/coop/_experimental/test_block_load.py
index 2693d642395..617ff27210e 100644
--- a/python/cuda_cccl/tests/coop/_experimental/test_block_load.py
+++ b/python/cuda_cccl/tests/coop/_experimental/test_block_load.py
@@ -7,6 +7,7 @@
 
 import numba
 import pytest
+from _utils.device_array import DeviceArray
 from helpers import NUMBA_TYPES_TO_NP, random_int, row_major_tid
 from numba import cuda, types
 
@@ -61,10 +62,9 @@ def kernel(d_input, d_output):
     dtype = NUMBA_TYPES_TO_NP[T]
     items_per_tile = num_threads_per_block * items_per_thread
     h_input = random_int(items_per_tile, dtype)
-    d_input = cuda.to_device(h_input)
-    d_output = cuda.device_array(items_per_tile, dtype=dtype)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(items_per_tile, dtype=dtype)
     kernel[1, threads_per_block](d_input, d_output)
-    cuda.synchronize()
 
     output = d_output.copy_to_host()
     reference = h_input
diff --git a/python/cuda_cccl/tests/coop/_experimental/test_block_load_store_api.py b/python/cuda_cccl/tests/coop/_experimental/test_block_load_store_api.py
index 99da797c97c..2bd3ecd1fc7 100644
--- a/python/cuda_cccl/tests/coop/_experimental/test_block_load_store_api.py
+++ b/python/cuda_cccl/tests/coop/_experimental/test_block_load_store_api.py
@@ -2,6 +2,9 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+from _utils.device_array import DeviceArray
+
+# isort: split
 # example-begin imports
 import numba
 import numpy as np
@@ -36,8 +39,8 @@ def kernel(input, output):
     h_input = np.random.randint(
         0, 42, threads_per_block * items_per_thread, dtype=np.int32
     )
-    d_input = cuda.to_device(h_input)
-    d_output = cuda.device_array_like(d_input)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(h_input.shape, h_input.dtype)
     kernel[1, threads_per_block](d_input, d_output)
     h_output = d_output.copy_to_host()
 
diff --git a/python/cuda_cccl/tests/coop/_experimental/test_block_merge_sort.py b/python/cuda_cccl/tests/coop/_experimental/test_block_merge_sort.py
index fe339b1d27e..88943761c80 100644
--- a/python/cuda_cccl/tests/coop/_experimental/test_block_merge_sort.py
+++ b/python/cuda_cccl/tests/coop/_experimental/test_block_merge_sort.py
@@ -8,6 +8,7 @@
 import numba
 import numpy as np
 import pytest
+from _utils.device_array import DeviceArray
 from helpers import NUMBA_TYPES_TO_NP, random_int, row_major_tid
 from numba import cuda, types
 
@@ -51,10 +52,9 @@ def kernel(input, output):
     dtype = NUMBA_TYPES_TO_NP[T]
     items_per_tile = num_threads_per_block * items_per_thread
     input = random_int(items_per_tile, dtype)
-    d_input = cuda.to_device(input)
-    d_output = cuda.device_array(items_per_tile, dtype=dtype)
+    d_input = DeviceArray.from_numpy(input)
+    d_output = DeviceArray.empty(items_per_tile, dtype=dtype)
     kernel[1, threads_per_block](d_input, d_output)
-    cuda.synchronize()
 
     output = d_output.copy_to_host()
     reference = sorted(input)
@@ -103,10 +103,9 @@ def kernel(input, output):
     dtype = NUMBA_TYPES_TO_NP[T]
     items_per_tile = num_threads_per_block * items_per_thread
     input = random_int(items_per_tile, dtype)
-    d_input = cuda.to_device(input)
-    d_output = cuda.device_array(items_per_tile, dtype=dtype)
+    d_input = DeviceArray.from_numpy(input)
+    d_output = DeviceArray.empty(items_per_tile, dtype=dtype)
     kernel[1, threads_per_block](d_input, d_output)
-    cuda.synchronize()
 
     output = d_output.copy_to_host()
     reference = sorted(input, reverse=True)
@@ -151,10 +150,9 @@ def kernel(input, output):
     items_per_tile = threads_per_block * items_per_thread
     input = np.random.random(items_per_tile) + 1j * np.random.random(items_per_tile)
     input = input.astype(dtype)
-    d_input = cuda.to_device(input)
-    d_output = cuda.device_array(items_per_tile, dtype=dtype)
+    d_input = DeviceArray.from_numpy(input)
+    d_output = DeviceArray.empty(items_per_tile, dtype=dtype)
     kernel[1, threads_per_block](d_input, d_output)
-    cuda.synchronize()
 
     output = d_output.copy_to_host()
     reference = sorted(input, reverse=True, key=lambda x: x.real)
diff --git a/python/cuda_cccl/tests/coop/_experimental/test_block_merge_sort_api.py b/python/cuda_cccl/tests/coop/_experimental/test_block_merge_sort_api.py
index 0bb68952d67..024df9604a6 100644
--- a/python/cuda_cccl/tests/coop/_experimental/test_block_merge_sort_api.py
+++ b/python/cuda_cccl/tests/coop/_experimental/test_block_merge_sort_api.py
@@ -4,6 +4,7 @@
 
 import numba
 import numpy as np
+from _utils.device_array import DeviceArray
 from numba import cuda
 
 import cuda.coop._experimental as coop
@@ -47,7 +48,7 @@ def kernel(keys):
     tile_size = threads_per_block * items_per_thread
 
     h_keys = np.arange(0, tile_size, dtype=np.int32)
-    d_keys = cuda.to_device(h_keys)
+    d_keys = DeviceArray.from_numpy(h_keys)
     kernel[1, threads_per_block](d_keys)
     h_keys = d_keys.copy_to_host()
     for i in range(tile_size):
diff --git a/python/cuda_cccl/tests/coop/_experimental/test_block_radix_sort.py b/python/cuda_cccl/tests/coop/_experimental/test_block_radix_sort.py
index 072e443662d..961b1c259c6 100644
--- a/python/cuda_cccl/tests/coop/_experimental/test_block_radix_sort.py
+++ b/python/cuda_cccl/tests/coop/_experimental/test_block_radix_sort.py
@@ -7,6 +7,7 @@
 
 import numba
 import pytest
+from _utils.device_array import DeviceArray
 from helpers import NUMBA_TYPES_TO_NP, random_int, row_major_tid
 from numba import cuda, types
 
@@ -47,10 +48,9 @@ def kernel(input, output):
     dtype = NUMBA_TYPES_TO_NP[T]
     items_per_tile = num_threads_per_block * items_per_thread
     input = random_int(items_per_tile, dtype)
-    d_input = cuda.to_device(input)
-    d_output = cuda.device_array(items_per_tile, dtype=dtype)
+    d_input = DeviceArray.from_numpy(input)
+    d_output = DeviceArray.empty(items_per_tile, dtype=dtype)
     kernel[1, threads_per_block](d_input, d_output)
-    cuda.synchronize()
 
     output = d_output.copy_to_host()
     reference = sorted(input, reverse=True)
@@ -92,10 +92,9 @@ def kernel(input, output):
 
     dtype = NUMBA_TYPES_TO_NP[T]
     input = random_int(items_per_tile, dtype)
-    d_input = cuda.to_device(input)
-    d_output = cuda.device_array(items_per_tile, dtype=dtype)
+    d_input = DeviceArray.from_numpy(input)
+    d_output = DeviceArray.empty(items_per_tile, dtype=dtype)
     kernel[1, threads_per_block](d_input, d_output)
-    cuda.synchronize()
 
     output = d_output.copy_to_host()
     reference = sorted(input)
@@ -133,10 +132,9 @@ def kernel(input, output):
 
     dtype = NUMBA_TYPES_TO_NP[T]
     input = random_int(items_per_tile, dtype)
-    d_input = cuda.to_device(input)
-    d_output = cuda.device_array(items_per_tile, dtype=dtype)
+    d_input = DeviceArray.from_numpy(input)
+    d_output = DeviceArray.empty(items_per_tile, dtype=dtype)
     kernel[1, threads_per_block](d_input, d_output)
-    cuda.synchronize()
 
     output = d_output.copy_to_host()
     reference = sorted(input)
@@ -189,15 +187,14 @@ def kernel(int_input, int_output, double_input, double_output):
             double_output[tid * items_per_thread + i] = double_thread_data[i]
 
     int_input = random_int(items_per_tile, "int32")
-    d_int_input = cuda.to_device(int_input)
-    d_int_output = cuda.device_array(items_per_tile, dtype="int32")
+    d_int_input = DeviceArray.from_numpy(int_input)
+    d_int_output = DeviceArray.empty(items_per_tile, dtype="int32")
     double_input = random_int(items_per_tile, "float64")
-    d_double_input = cuda.to_device(double_input)
-    d_double_output = cuda.device_array(items_per_tile, dtype="float64")
+    d_double_input = DeviceArray.from_numpy(double_input)
+    d_double_output = DeviceArray.empty(items_per_tile, dtype="float64")
     kernel[1, threads_per_block](
         d_int_input, d_int_output, d_double_input, d_double_output
     )
-    cuda.synchronize()
 
     int_output = d_int_output.copy_to_host()
     int_reference = sorted(int_input)
diff --git a/python/cuda_cccl/tests/coop/_experimental/test_block_radix_sort_api.py b/python/cuda_cccl/tests/coop/_experimental/test_block_radix_sort_api.py
index 5747cd8e161..9f7c36b722d 100644
--- a/python/cuda_cccl/tests/coop/_experimental/test_block_radix_sort_api.py
+++ b/python/cuda_cccl/tests/coop/_experimental/test_block_radix_sort_api.py
@@ -4,6 +4,7 @@
 
 import numba
 import numpy as np
+from _utils.device_array import DeviceArray
 from numba import cuda
 
 import cuda.coop._experimental as coop
@@ -43,7 +44,7 @@ def kernel(keys):
     tile_size = threads_per_block * items_per_thread
 
     h_keys = np.arange(tile_size - 1, -1, -1, dtype=np.int32)
-    d_keys = cuda.to_device(h_keys)
+    d_keys = DeviceArray.from_numpy(h_keys)
     kernel[1, threads_per_block](d_keys)
     h_keys = d_keys.copy_to_host()
     for i in range(tile_size):
@@ -80,7 +81,7 @@ def kernel(keys):
     tile_size = threads_per_block * items_per_thread
 
     h_keys = np.arange(0, tile_size, dtype=np.int32)
-    d_keys = cuda.to_device(h_keys)
+    d_keys = DeviceArray.from_numpy(h_keys)
     kernel[1, threads_per_block](d_keys)
     h_keys = d_keys.copy_to_host()
     for i in range(tile_size):
diff --git a/python/cuda_cccl/tests/coop/_experimental/test_block_reduce.py b/python/cuda_cccl/tests/coop/_experimental/test_block_reduce.py
index f777b1acb83..874ff498f55 100644
--- a/python/cuda_cccl/tests/coop/_experimental/test_block_reduce.py
+++ b/python/cuda_cccl/tests/coop/_experimental/test_block_reduce.py
@@ -8,6 +8,7 @@
 import numba
 import numpy as np
 import pytest
+from _utils.device_array import DeviceArray
 from helpers import (
     NUMBA_TYPES_TO_NP,
     Complex,
@@ -65,10 +66,9 @@ def kernel(input, output):
             output[1] = block_output.imag
 
     h_input = random_int(2 * num_threads_per_block, "int32")
-    d_input = cuda.to_device(h_input)
-    d_output = cuda.device_array(2, dtype="int32")
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(2, dtype="int32")
     kernel[1, threads_per_block](d_input, d_output)
-    cuda.synchronize()
     h_output = d_output.copy_to_host()
     h_expected = (
         np.sum(h_input[:num_threads_per_block]),
@@ -129,10 +129,9 @@ def kernel(input, output):
             output[1] = block_output.imag
 
     h_input = random_int(2 * num_threads_per_block, "int32")
-    d_input = cuda.to_device(h_input)
-    d_output = cuda.device_array(2, dtype="int32")
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(2, dtype="int32")
     kernel[1, threads_per_block](d_input, d_output)
-    cuda.synchronize()
     h_output = d_output.copy_to_host()
     h_expected = (
         np.sum(h_input[:num_threads_per_block]),
@@ -182,10 +181,9 @@ def kernel(input, output):
 
     dtype = NUMBA_TYPES_TO_NP[T]
     h_input = random_int(num_threads_per_block, dtype)
-    d_input = cuda.to_device(h_input)
-    d_output = cuda.device_array(1, dtype=dtype)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, dtype=dtype)
     kernel[1, threads_per_block](d_input, d_output)
-    cuda.synchronize()
     h_output = d_output.copy_to_host()
     h_expected = np.min(h_input)
 
@@ -234,10 +232,9 @@ def kernel(input, output):
     dtype = NUMBA_TYPES_TO_NP[T]
     h_input = random_int(num_threads_per_block, dtype)
     h_input[-1] = 0
-    d_input = cuda.to_device(h_input)
-    d_output = cuda.device_array(1, dtype=dtype)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, dtype=dtype)
     kernel[1, threads_per_block](d_input, d_output)
-    cuda.synchronize()
     h_output = d_output.copy_to_host()
     h_expected = np.min(h_input[: num_threads_per_block // 2])
 
@@ -296,10 +293,9 @@ def kernel(input, output):
 
     dtype = NUMBA_TYPES_TO_NP[T]
     h_input = random_int(items_per_thread * num_threads_per_block, dtype)
-    d_input = cuda.to_device(h_input)
-    d_output = cuda.device_array(1, dtype=dtype)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, dtype=dtype)
     kernel[1, threads_per_block](d_input, d_output)
-    cuda.synchronize()
     h_output = d_output.copy_to_host()
     h_expected = np.min(h_input)
 
@@ -357,10 +353,9 @@ def kernel(input, output):
 
     dtype = NUMBA_TYPES_TO_NP[T]
     h_input = random_int(items_per_thread * num_threads_per_block, dtype)
-    d_input = cuda.to_device(h_input)
-    d_output = cuda.device_array(1, dtype=dtype)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, dtype=dtype)
     kernel[1, threads_per_block](d_input, d_output)
-    cuda.synchronize()
     h_output = d_output.copy_to_host()
     h_expected = np.min(h_input)
 
@@ -403,10 +398,9 @@ def kernel(input, output):
 
     dtype = NUMBA_TYPES_TO_NP[T]
     h_input = random_int(num_threads_per_block, dtype)
-    d_input = cuda.to_device(h_input)
-    d_output = cuda.device_array(1, dtype=dtype)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, dtype=dtype)
     kernel[1, threads_per_block](d_input, d_output)
-    cuda.synchronize()
     h_output = d_output.copy_to_host()
     h_expected = np.sum(h_input)
 
@@ -452,10 +446,9 @@ def kernel(input, output):
     dtype = NUMBA_TYPES_TO_NP[T]
     h_input = random_int(num_threads_per_block, dtype)
     h_input[-1] = 0
-    d_input = cuda.to_device(h_input)
-    d_output = cuda.device_array(1, dtype=dtype)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, dtype=dtype)
     kernel[1, threads_per_block](d_input, d_output)
-    cuda.synchronize()
     h_output = d_output.copy_to_host()
     h_expected = np.sum(h_input[: num_threads_per_block // 2])
 
@@ -510,10 +503,9 @@ def kernel(input, output):
 
     dtype = NUMBA_TYPES_TO_NP[T]
     h_input = random_int(items_per_thread * num_threads_per_block, dtype)
-    d_input = cuda.to_device(h_input)
-    d_output = cuda.device_array(1, dtype=dtype)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, dtype=dtype)
     kernel[1, threads_per_block](d_input, d_output)
-    cuda.synchronize()
     h_output = d_output.copy_to_host()
     h_expected = np.sum(h_input)
 
@@ -565,10 +557,9 @@ def kernel(input, output):
 
     dtype = NUMBA_TYPES_TO_NP[T]
     h_input = random_int(items_per_thread * num_threads_per_block, dtype)
-    d_input = cuda.to_device(h_input)
-    d_output = cuda.device_array(1, dtype=dtype)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, dtype=dtype)
     kernel[1, threads_per_block](d_input, d_output)
-    cuda.synchronize()
     h_output = d_output.copy_to_host()
     h_expected = np.sum(h_input)
 
diff --git a/python/cuda_cccl/tests/coop/_experimental/test_block_reduce_api.py b/python/cuda_cccl/tests/coop/_experimental/test_block_reduce_api.py
index 3c795ac1722..349b8ea61b1 100644
--- a/python/cuda_cccl/tests/coop/_experimental/test_block_reduce_api.py
+++ b/python/cuda_cccl/tests/coop/_experimental/test_block_reduce_api.py
@@ -2,6 +2,9 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+from _utils.device_array import DeviceArray
+
+# isort: split
 # example-begin imports
 import numba
 import numpy as np
@@ -32,8 +35,8 @@ def kernel(input, output):
     # example-end reduce
 
     h_input = np.random.randint(0, 42, threads_per_block, dtype=np.int32)
-    d_input = cuda.to_device(h_input)
-    d_output = cuda.device_array(1, dtype=np.int32)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, dtype=np.int32)
     kernel[1, threads_per_block](d_input, d_output)
     h_output = d_output.copy_to_host()
     h_expected = np.max(h_input)
@@ -56,8 +59,8 @@ def kernel(input, output):
     # example-end sum
 
     h_input = np.ones(threads_per_block, dtype=np.int32)
-    d_input = cuda.to_device(h_input)
-    d_output = cuda.device_array(1, dtype=np.int32)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, dtype=np.int32)
     kernel[1, threads_per_block](d_input, d_output)
     h_output = d_output.copy_to_host()
 
diff --git a/python/cuda_cccl/tests/coop/_experimental/test_block_scan.py b/python/cuda_cccl/tests/coop/_experimental/test_block_scan.py
index 1b0ea5ac3ed..e700301b8ae 100644
--- a/python/cuda_cccl/tests/coop/_experimental/test_block_scan.py
+++ b/python/cuda_cccl/tests/coop/_experimental/test_block_scan.py
@@ -17,6 +17,7 @@
 import numba
 import numpy as np
 import pytest
+from _utils.device_array import DeviceArray
 from helpers import (
     NUMBA_TYPES_TO_NP,
     Complex,
@@ -152,11 +153,10 @@ def kernel(input_arr, output_arr):
     dtype_np = NUMBA_TYPES_TO_NP[T]
     items_per_tile = num_threads * items_per_thread
     h_input = random_int(items_per_tile, dtype_np)
-    d_input = cuda.to_device(h_input)
-    d_output = cuda.device_array(items_per_tile, dtype=dtype_np)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(items_per_tile, dtype=dtype_np)
 
     kernel[1, threads_per_block](d_input, d_output)
-    cuda.synchronize()
 
     output = d_output.copy_to_host()
     if mode == "inclusive":
@@ -248,11 +248,10 @@ def kernel(input_arr, output_arr):
             tile_offset += tile_items
 
     h_input = np.arange(num_elements, dtype=np.int32)
-    d_input = cuda.to_device(h_input)
-    d_output = cuda.to_device(np.zeros(num_elements, dtype=np.int32))
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.from_numpy(np.zeros(num_elements, dtype=np.int32))
 
     kernel[num_segments, threads_per_block](d_input, d_output)
-    cuda.synchronize()
 
     h_output = d_output.copy_to_host()
     ref = np.zeros_like(h_input)
@@ -438,10 +437,9 @@ def kernel(input_arr, output_arr):
     # Account for a Complex type containing two int32 values.
     total_items = num_threads * items_per_thread * 2
     h_input = random_int(total_items, "int32")
-    d_input = cuda.to_device(h_input)
-    d_output = cuda.device_array(total_items, dtype=np.int32)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(total_items, dtype=np.int32)
     kernel[1, threads_per_block](d_input, d_output)
-    cuda.synchronize()
 
     h_output = d_output.copy_to_host()
     real_vals = h_input[:num_elements]
@@ -538,11 +536,10 @@ def kernel(input_arr, output_arr):
     dtype_np = NUMBA_TYPES_TO_NP[T]
     total_items = num_threads * items_per_thread
     h_input = random_int(total_items, dtype_np)
-    d_input = cuda.to_device(h_input)
-    d_output = cuda.device_array(total_items, dtype=dtype_np)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(total_items, dtype=dtype_np)
 
     kernel[1, threads_per_block](d_input, d_output)
-    cuda.synchronize()
 
     output = d_output.copy_to_host()
 
@@ -709,11 +706,10 @@ def kernel(input_arr, output_arr):
     dtype_np = NUMBA_TYPES_TO_NP[T]
     total_items = num_threads * items_per_thread
     h_input = random_int(total_items, dtype_np)
-    d_input = cuda.to_device(h_input)
-    d_output = cuda.device_array(total_items, dtype=dtype_np)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(total_items, dtype=dtype_np)
 
     kernel[1, threads_per_block](d_input, d_output)
-    cuda.synchronize()
 
     output = d_output.copy_to_host()
 
@@ -822,12 +818,11 @@ def kernel(input_arr, output_arr):
     else:
         h_input = random_int(total_items, dtype_np)
 
-    d_input = cuda.to_device(h_input)
-    d_output = cuda.device_array(total_items, dtype=dtype_np)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(total_items, dtype=dtype_np)
 
     k = kernel[1, threads_per_block]
     k(d_input, d_output)
-    cuda.synchronize()
 
     output = d_output.copy_to_host()
 
diff --git a/python/cuda_cccl/tests/coop/_experimental/test_block_scan_api.py b/python/cuda_cccl/tests/coop/_experimental/test_block_scan_api.py
index 8b40e57d7f7..b0b90bea662 100644
--- a/python/cuda_cccl/tests/coop/_experimental/test_block_scan_api.py
+++ b/python/cuda_cccl/tests/coop/_experimental/test_block_scan_api.py
@@ -4,6 +4,7 @@
 
 import numba
 import numpy as np
+from _utils.device_array import DeviceArray
 from numba import cuda
 
 import cuda.coop._experimental as coop
@@ -43,7 +44,7 @@ def kernel(data):
     tile_size = threads_per_block * items_per_thread
 
     h_keys = np.ones(tile_size, dtype=np.int32)
-    d_keys = cuda.to_device(h_keys)
+    d_keys = DeviceArray.from_numpy(h_keys)
     kernel[1, threads_per_block](d_keys)
     h_keys = d_keys.copy_to_host()
     for i in range(tile_size):
@@ -77,7 +78,7 @@ def kernel(data):
     tile_size = threads_per_block
 
     h_keys = np.ones(tile_size, dtype=np.int32)
-    d_keys = cuda.to_device(h_keys)
+    d_keys = DeviceArray.from_numpy(h_keys)
     kernel[1, threads_per_block](d_keys)
     h_keys = d_keys.copy_to_host()
     for i in range(tile_size):
diff --git a/python/cuda_cccl/tests/coop/_experimental/test_block_store.py b/python/cuda_cccl/tests/coop/_experimental/test_block_store.py
index 1f6a7aeb9cf..2616d3f3aa9 100644
--- a/python/cuda_cccl/tests/coop/_experimental/test_block_store.py
+++ b/python/cuda_cccl/tests/coop/_experimental/test_block_store.py
@@ -7,6 +7,7 @@
 
 import numba
 import pytest
+from _utils.device_array import DeviceArray
 from helpers import NUMBA_TYPES_TO_NP, random_int, row_major_tid
 from numba import cuda, types
 
@@ -63,10 +64,9 @@ def kernel(d_input, d_output):
     dtype = NUMBA_TYPES_TO_NP[T]
     items_per_tile = num_threads_per_block * items_per_thread
     h_input = random_int(items_per_tile, dtype)
-    d_input = cuda.to_device(h_input)
-    d_output = cuda.device_array(items_per_tile, dtype=dtype)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(items_per_tile, dtype=dtype)
     kernel[1, threads_per_block](d_input, d_output)
-    cuda.synchronize()
 
     output = d_output.copy_to_host()
     reference = h_input
diff --git a/python/cuda_cccl/tests/coop/_experimental/test_warp_merge_sort.py b/python/cuda_cccl/tests/coop/_experimental/test_warp_merge_sort.py
index ec9a88fa634..08abf8260e1 100644
--- a/python/cuda_cccl/tests/coop/_experimental/test_warp_merge_sort.py
+++ b/python/cuda_cccl/tests/coop/_experimental/test_warp_merge_sort.py
@@ -4,6 +4,7 @@
 
 import numba
 import pytest
+from _utils.device_array import DeviceArray
 from helpers import NUMBA_TYPES_TO_NP, random_int
 from numba import cuda, types
 
@@ -35,10 +36,9 @@ def kernel(input, output):
     dtype = NUMBA_TYPES_TO_NP[T]
     items_per_tile = 32 * items_per_thread
     input = random_int(items_per_tile, dtype)
-    d_input = cuda.to_device(input)
-    d_output = cuda.device_array(items_per_tile, dtype=dtype)
+    d_input = DeviceArray.from_numpy(input)
+    d_output = DeviceArray.empty(items_per_tile, dtype=dtype)
     kernel[1, 32](d_input, d_output)
-    cuda.synchronize()
 
     output = d_output.copy_to_host()
     reference = sorted(input)
@@ -79,10 +79,9 @@ def kernel(input, output):
 
     dtype = NUMBA_TYPES_TO_NP[T]
     h_input = random_int(items_per_tile, dtype)
-    d_input = cuda.to_device(h_input)
-    d_output = cuda.device_array(items_per_tile, dtype=dtype)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(items_per_tile, dtype=dtype)
     kernel[1, threads_per_block](d_input, d_output)
-    cuda.synchronize()
 
     output = d_output.copy_to_host()
     for wid in range(threads_per_block // warp_threads):
diff --git a/python/cuda_cccl/tests/coop/_experimental/test_warp_merge_sort_api.py b/python/cuda_cccl/tests/coop/_experimental/test_warp_merge_sort_api.py
index 4ee9b55b267..c7c088fdb59 100644
--- a/python/cuda_cccl/tests/coop/_experimental/test_warp_merge_sort_api.py
+++ b/python/cuda_cccl/tests/coop/_experimental/test_warp_merge_sort_api.py
@@ -4,6 +4,7 @@
 
 import numba
 import numpy as np
+from _utils.device_array import DeviceArray
 from numba import cuda
 
 import cuda.coop._experimental as coop
@@ -42,7 +43,7 @@ def kernel(keys):
     tile_size = 32 * items_per_thread
 
     h_keys = np.arange(0, tile_size, dtype=np.int32)
-    d_keys = cuda.to_device(h_keys)
+    d_keys = DeviceArray.from_numpy(h_keys)
     kernel[1, 32](d_keys)
     h_keys = d_keys.copy_to_host()
     for i in range(tile_size):
diff --git a/python/cuda_cccl/tests/coop/_experimental/test_warp_reduce.py b/python/cuda_cccl/tests/coop/_experimental/test_warp_reduce.py
index 2aa53b651f2..28401a74900 100644
--- a/python/cuda_cccl/tests/coop/_experimental/test_warp_reduce.py
+++ b/python/cuda_cccl/tests/coop/_experimental/test_warp_reduce.py
@@ -5,6 +5,7 @@
 import numba
 import numpy as np
 import pytest
+from _utils.device_array import DeviceArray
 from helpers import NUMBA_TYPES_TO_NP, random_int
 from numba import cuda, types
 
@@ -31,10 +32,9 @@ def kernel(input, output):
 
     dtype = NUMBA_TYPES_TO_NP[T]
     h_input = random_int(32, dtype)
-    d_input = cuda.to_device(h_input)
-    d_output = cuda.device_array(1, dtype=dtype)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, dtype=dtype)
     kernel[1, 32](d_input, d_output)
-    cuda.synchronize()
     h_output = d_output.copy_to_host()
     h_expected = np.min(h_input)
 
@@ -62,10 +62,9 @@ def kernel(input, output):
 
     dtype = NUMBA_TYPES_TO_NP[T]
     h_input = random_int(32, dtype)
-    d_input = cuda.to_device(h_input)
-    d_output = cuda.device_array(1, dtype=dtype)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, dtype=dtype)
     kernel[1, 32](d_input, d_output)
-    cuda.synchronize()
     h_output = d_output.copy_to_host()
     h_expected = np.sum(h_input)
 
diff --git a/python/cuda_cccl/tests/coop/_experimental/test_warp_reduce_api.py b/python/cuda_cccl/tests/coop/_experimental/test_warp_reduce_api.py
index 4e79f22b913..3ade32a0203 100644
--- a/python/cuda_cccl/tests/coop/_experimental/test_warp_reduce_api.py
+++ b/python/cuda_cccl/tests/coop/_experimental/test_warp_reduce_api.py
@@ -4,6 +4,7 @@
 
 import numba
 import numpy as np
+from _utils.device_array import DeviceArray
 from numba import cuda
 
 import cuda.coop._experimental as coop
@@ -30,8 +31,8 @@ def kernel(input, output):
     # example-end reduce
 
     h_input = np.random.randint(0, 42, 32, dtype=np.int32)
-    d_input = cuda.to_device(h_input)
-    d_output = cuda.device_array(1, dtype=np.int32)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, dtype=np.int32)
     kernel[1, 32](d_input, d_output)
     h_output = d_output.copy_to_host()
     h_expected = np.max(h_input)
@@ -53,8 +54,8 @@ def kernel(input, output):
     # example-end sum
 
     h_input = np.ones(32, dtype=np.int32)
-    d_input = cuda.to_device(h_input)
-    d_output = cuda.device_array(1, dtype=np.int32)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(1, dtype=np.int32)
     kernel[1, 32](d_input, d_output)
     h_output = d_output.copy_to_host()
 
diff --git a/python/cuda_cccl/tests/coop/_experimental/test_warp_scan.py b/python/cuda_cccl/tests/coop/_experimental/test_warp_scan.py
index a82809de342..650baf446c8 100644
--- a/python/cuda_cccl/tests/coop/_experimental/test_warp_scan.py
+++ b/python/cuda_cccl/tests/coop/_experimental/test_warp_scan.py
@@ -5,6 +5,7 @@
 import numba
 import numpy as np
 import pytest
+from _utils.device_array import DeviceArray
 from helpers import NUMBA_TYPES_TO_NP, random_int
 from numba import cuda, types
 
@@ -26,10 +27,9 @@ def kernel(input, output):
 
     dtype = NUMBA_TYPES_TO_NP[T]
     h_input = random_int(32, dtype)
-    d_input = cuda.to_device(h_input)
-    d_output = cuda.device_array(32, dtype=dtype)
+    d_input = DeviceArray.from_numpy(h_input)
+    d_output = DeviceArray.empty(32, dtype=dtype)
     kernel[1, 32](d_input, d_output)
-    cuda.synchronize()
 
     output = d_output.copy_to_host()
     reference = np.cumsum(h_input) - h_input
diff --git a/python/cuda_cccl/tests/coop/_experimental/test_warp_scan_api.py b/python/cuda_cccl/tests/coop/_experimental/test_warp_scan_api.py
index aa5a2497d70..82ccd22dee3 100644
--- a/python/cuda_cccl/tests/coop/_experimental/test_warp_scan_api.py
+++ b/python/cuda_cccl/tests/coop/_experimental/test_warp_scan_api.py
@@ -4,6 +4,7 @@
 
 import numba
 import numpy as np
+from _utils.device_array import DeviceArray
 from numba import cuda
 
 import cuda.coop._experimental as coop
@@ -29,7 +30,7 @@ def kernel(data):
     tile_size = 32
 
     h_keys = np.ones(tile_size, dtype=np.int32)
-    d_keys = cuda.to_device(h_keys)
+    d_keys = DeviceArray.from_numpy(h_keys)
     kernel[1, 32](d_keys)
     h_keys = d_keys.copy_to_host()
     for i in range(tile_size):
diff --git a/python/cuda_cccl/tests/test_examples.py b/python/cuda_cccl/tests/test_examples.py
index a78165905c5..5c6759130e5 100644
--- a/python/cuda_cccl/tests/test_examples.py
+++ b/python/cuda_cccl/tests/test_examples.py
@@ -11,11 +11,14 @@
 """
 
 import importlib
+import importlib.util
 import inspect
 import sys
 import traceback
 from pathlib import Path
 
+import pytest
+
 
 def discover_examples():
     """Automatically discover all example files and their functions."""
@@ -153,6 +156,11 @@ def test_func():
         globals()[test_name] = make_test_func(module_name, display_name)
         globals()[test_name].__name__ = test_name
         globals()[test_name].__doc__ = f"Test {display_name} examples"
+        if module_name.startswith("compute.examples."):
+            globals()[test_name] = pytest.mark.skipif(
+                importlib.util.find_spec("cupy") is None,
+                reason="cuda.compute examples require the optional CuPy dependency",
+            )(globals()[test_name])
 
 
 # Create test functions for pytest