From 7557967b2ddc43bcb04cea633ac22de732e0d484 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Sat, 27 Sep 2025 04:51:20 +0000
Subject: [PATCH 01/27] set up build system for targeting different cuda-bindings major versions

---
 cuda_core/build_hooks.py | 59 ++++++++++++++++++++++++++++++++++++++++
 cuda_core/pyproject.toml |  3 +-
 2 files changed, 61 insertions(+), 1 deletion(-)
 create mode 100644 cuda_core/build_hooks.py

diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py
new file mode 100644
index 000000000..71354e57c
--- /dev/null
+++ b/cuda_core/build_hooks.py
@@ -0,0 +1,59 @@
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# This module implements basic PEP 517 backend support, see e.g.
+# - https://peps.python.org/pep-0517/
+# - https://setuptools.pypa.io/en/latest/build_meta.html#dynamic-build-dependencies-and-other-build-meta-tweaks
+# Specifically, there are 5 APIs required to create a proper build backend, see below.
+# For now it's mostly a pass-through to setuptools, except that we need to determine
+# some dependencies at build time.
+#
+# TODO: also implement PEP-660 API hooks
+
+import os
+import re
+import subprocess  # nosec: B404
+
+from setuptools import build_meta as _build_meta
+
+prepare_metadata_for_build_wheel = _build_meta.prepare_metadata_for_build_wheel
+build_wheel = _build_meta.build_wheel
+build_sdist = _build_meta.build_sdist
+get_requires_for_build_sdist = _build_meta.get_requires_for_build_sdist
+
+
+def _get_proper_cuda_bindings_major_version() -> str:
+    # for local development (with/without build isolation)
+    try:
+        import cuda.bindings
+
+        return cuda.bindings.__version__.split(".")[0]
+    except ImportError:
+        pass
+
+    # for custom overwrite, e.g. in CI
+    cuda_major = os.environ.get("CUDA_CORE_BUILD_MAJOR")
+    if cuda_major is not None:
+        return cuda_major
+
+    # also for local development
+    try:
+        out = subprocess.run("nvidia-smi", env=os.environ, capture_output=True, check=True)  # nosec: B603, B607
+        m = re.search(r"CUDA Version:\s*([\d\.]+)", out.stdout.decode())
+        if m:
+            return m.group(1).split(".")[0]
+    except FileNotFoundError:
+        # the build machine has no driver installed
+        pass
+
+    # default fallback
+    return "13"
+
+
+# Note: this function returns a list of *build-time* dependencies, so it's not affected
+# by "--no-deps" based on the PEP-517 design.
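+#
+# For illustration (assuming a machine where CUDA 13 is detected): a plain
+# `pip wheel cuda_core/` first calls get_requires_for_build_wheel() below, installs
+# the returned extra requirement "cuda-bindings==13.*" into the isolated build
+# environment (on top of the static requires in pyproject.toml), and only then
+# invokes build_wheel().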
+def get_requires_for_build_wheel(config_settings=None): + cuda_major = _get_proper_cuda_bindings_major_version() + cuda_bindings_require = [f"cuda-bindings=={cuda_major}.*"] + return _build_meta.get_requires_for_build_wheel(config_settings) + cuda_bindings_require diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index 9e3be132f..27fa5ce19 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -4,7 +4,8 @@ [build-system] requires = ["setuptools>=77.0.0", "Cython>=3.1"] -build-backend = "setuptools.build_meta" +build-backend = "build_hooks" +backend-path = ["."] [project] From 19765972a528d318db76dca3fc843decea4a6566 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 27 Sep 2025 05:53:13 +0000 Subject: [PATCH 02/27] defer cythonization until cuda-bindings is installed --- cuda_core/build_hooks.py | 44 +++++++++++++++++++++++++++++++++++----- cuda_core/setup.py | 29 +++----------------------- 2 files changed, 42 insertions(+), 31 deletions(-) diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index 71354e57c..fd1692ad3 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -6,23 +6,25 @@ # - https://peps.python.org/pep-0517/ # - https://setuptools.pypa.io/en/latest/build_meta.html#dynamic-build-dependencies-and-other-build-meta-tweaks # Specifically, there are 5 APIs required to create a proper build backend, see below. -# For now it's mostly a pass-through to setuptools, except that we need to determine -# some dependencies at build time. # # TODO: also implement PEP-660 API hooks +import functools +import glob import os import re import subprocess # nosec: B404 +from Cython.Build import cythonize +from setuptools import Extension from setuptools import build_meta as _build_meta prepare_metadata_for_build_wheel = _build_meta.prepare_metadata_for_build_wheel -build_wheel = _build_meta.build_wheel build_sdist = _build_meta.build_sdist get_requires_for_build_sdist = _build_meta.get_requires_for_build_sdist +@functools.cache def _get_proper_cuda_bindings_major_version() -> str: # for local development (with/without build isolation) try: @@ -51,8 +53,40 @@ def _get_proper_cuda_bindings_major_version() -> str: return "13" -# Note: this function returns a list of *build-time* dependencies, so it's not affected -# by "--no-deps" based on the PEP-517 design. +# used later by setup() +_extensions = None + + +def build_wheel(wheel_directory, config_settings=None, metadata_directory=None): + # Customizing this hook is needed because we must defer cythonization until cuda-bindings, + # now a required build-time dependency that's dynamically installed via the other hook below, + # is installed. Otherwise, cimport any cuda.bindings modules would fail! + + # It seems setuptools' wildcard support has problems for namespace packages, + # so we explicitly spell out all Extension instances. 
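+    # (illustration, not exhaustive) e.g. the glob + name-stripping logic below turns
+    # cuda/core/experimental/_stream.pyx into
+    # Extension("cuda.core.experimental._stream", sources=["cuda/core/experimental/_stream.pyx"], ...)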
+ root_module = "cuda.core.experimental" + root_path = f"{os.path.sep}".join(root_module.split(".")) + os.path.sep + ext_files = glob.glob(f"{root_path}/**/*.pyx", recursive=True) + + def strip_prefix_suffix(filename): + return filename[len(root_path) : -4] + + module_names = (strip_prefix_suffix(f) for f in ext_files) + ext_modules = tuple( + Extension( + f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}", + sources=[f"cuda/core/experimental/{mod}.pyx"], + language="c++", + ) + for mod in module_names + ) + + global _extensions + _extensions = cythonize(ext_modules, verbose=True, language_level=3, compiler_directives={"embedsignature": True}) + + return _build_meta.build_wheel(wheel_directory, config_settings, metadata_directory) + + def get_requires_for_build_wheel(config_settings=None): cuda_major = _get_proper_cuda_bindings_major_version() cuda_bindings_require = [f"cuda-bindings=={cuda_major}.*"] diff --git a/cuda_core/setup.py b/cuda_core/setup.py index d93eec45d..4a501edc1 100644 --- a/cuda_core/setup.py +++ b/cuda_core/setup.py @@ -2,38 +2,15 @@ # # SPDX-License-Identifier: Apache-2.0 -import glob import os -from Cython.Build import cythonize -from setuptools import Extension, setup +import build_hooks # our build backend +from setuptools import setup from setuptools.command.build_ext import build_ext as _build_ext nthreads = int(os.environ.get("CUDA_PYTHON_PARALLEL_LEVEL", os.cpu_count() // 2)) -# It seems setuptools' wildcard support has problems for namespace packages, -# so we explicitly spell out all Extension instances. -root_module = "cuda.core.experimental" -root_path = f"{os.path.sep}".join(root_module.split(".")) + os.path.sep -ext_files = glob.glob(f"{root_path}/**/*.pyx", recursive=True) - - -def strip_prefix_suffix(filename): - return filename[len(root_path) : -4] - - -module_names = (strip_prefix_suffix(f) for f in ext_files) -ext_modules = tuple( - Extension( - f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}", - sources=[f"cuda/core/experimental/{mod}.pyx"], - language="c++", - ) - for mod in module_names -) - - class build_ext(_build_ext): def build_extensions(self): self.parallel = nthreads @@ -41,7 +18,7 @@ def build_extensions(self): setup( - ext_modules=cythonize(ext_modules, verbose=True, language_level=3, compiler_directives={"embedsignature": True}), + ext_modules=build_hooks._extensions, cmdclass={ "build_ext": build_ext, }, From 67db25e9ea4ce36712f876f03a8f7b88dbc87e7e Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 28 Sep 2025 02:17:58 +0000 Subject: [PATCH 03/27] cythonize stream module --- .../{_launcher.py => _launcher.pyx} | 8 +- cuda_core/cuda/core/experimental/_stream.pxd | 9 ++ cuda_core/cuda/core/experimental/_stream.pyx | 85 +++++++++++++------ 3 files changed, 72 insertions(+), 30 deletions(-) rename cuda_core/cuda/core/experimental/{_launcher.py => _launcher.pyx} (93%) create mode 100644 cuda_core/cuda/core/experimental/_stream.pxd diff --git a/cuda_core/cuda/core/experimental/_launcher.py b/cuda_core/cuda/core/experimental/_launcher.pyx similarity index 93% rename from cuda_core/cuda/core/experimental/_launcher.py rename to cuda_core/cuda/core/experimental/_launcher.pyx index 2d0c274c7..ae808be89 100644 --- a/cuda_core/cuda/core/experimental/_launcher.py +++ b/cuda_core/cuda/core/experimental/_launcher.pyx @@ -2,12 +2,16 @@ # # SPDX-License-Identifier: Apache-2.0 +from libc.stdint cimport uintptr_t + +from cuda.core.experimental._stream cimport _try_to_get_stream_ptr + from typing import Union from 
cuda.core.experimental._kernel_arg_handler import ParamHolder from cuda.core.experimental._launch_config import LaunchConfig, _to_native_launch_config from cuda.core.experimental._module import Kernel -from cuda.core.experimental._stream import IsStreamT, Stream, _try_to_get_stream_ptr +from cuda.core.experimental._stream import IsStreamT, Stream from cuda.core.experimental._utils.clear_error_support import assert_type from cuda.core.experimental._utils.cuda_utils import ( _reduce_3_tuple, @@ -60,7 +64,7 @@ def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kerne stream_handle = stream.handle except AttributeError: try: - stream_handle = _try_to_get_stream_ptr(stream) + stream_handle = driver.CUstream((_try_to_get_stream_ptr(stream))) except Exception: raise ValueError( f"stream must either be a Stream object or support __cuda_stream__ (got {type(stream)})" diff --git a/cuda_core/cuda/core/experimental/_stream.pxd b/cuda_core/cuda/core/experimental/_stream.pxd new file mode 100644 index 000000000..f7d97de33 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_stream.pxd @@ -0,0 +1,9 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# TODO: how about cuda.bindings < 12.6.2? +from cuda.bindings cimport cydriver + + +cdef cydriver.CUstream _try_to_get_stream_ptr(obj: IsStreamT) except* diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index a2c1a90b9..284831cd6 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -4,10 +4,16 @@ from __future__ import annotations +from libc.stdint cimport uintptr_t + +# TODO: how about cuda.bindings < 12.6.2? +from cuda.bindings cimport cydriver + from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, check_or_create_options, ) + import sys import cython @@ -59,7 +65,7 @@ class IsStreamT(Protocol): ... -def _try_to_get_stream_ptr(obj: IsStreamT): +cdef cydriver.CUstream _try_to_get_stream_ptr(obj: IsStreamT) except*: try: cuda_stream_attr = obj.__cuda_stream__ except AttributeError: @@ -86,7 +92,7 @@ def _try_to_get_stream_ptr(obj: IsStreamT): raise RuntimeError( f"The first element of the sequence returned by obj.__cuda_stream__ must be 0, got {repr(info[0])}" ) - return driver.CUstream(info[1]) + return (info[1]) cdef class Stream: @@ -108,7 +114,7 @@ cdef class Stream: """ cdef: - object _handle + cydriver.CUstream _handle object _owner object _builtin object _nonblocking @@ -116,6 +122,9 @@ cdef class Stream: object _device_id object _ctx_handle + def __cinit__(self, *args, **kwargs): + self._handle = (NULL) + def __init__(self, *args, **kwargs): raise RuntimeError( "Stream objects cannot be instantiated directly. 
" @@ -125,7 +134,7 @@ cdef class Stream: @classmethod def _legacy_default(cls): cdef Stream self = Stream.__new__(cls) - self._handle = driver.CUstream(driver.CU_STREAM_LEGACY) + self._handle = (cydriver.CU_STREAM_LEGACY) self._owner = None self._builtin = True self._nonblocking = None # delayed @@ -137,7 +146,7 @@ cdef class Stream: @classmethod def _per_thread_default(cls): cdef Stream self = Stream.__new__(cls) - self._handle = driver.CUstream(driver.CU_STREAM_PER_THREAD) + self._handle = (cydriver.CU_STREAM_PER_THREAD) self._owner = None self._builtin = True self._nonblocking = None # delayed @@ -149,7 +158,6 @@ cdef class Stream: @classmethod def _init(cls, obj: Optional[IsStreamT] = None, options=None, device_id: int = None): cdef Stream self = Stream.__new__(cls) - self._handle = None self._owner = None self._builtin = False @@ -169,16 +177,20 @@ cdef class Stream: nonblocking = opts.nonblocking priority = opts.priority - flags = driver.CUstream_flags.CU_STREAM_NON_BLOCKING if nonblocking else driver.CUstream_flags.CU_STREAM_DEFAULT - err, high, low = driver.cuCtxGetStreamPriorityRange() - raise_if_driver_error(err) + flags = cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING if nonblocking else cydriver.CUstream_flags.CU_STREAM_DEFAULT + # TODO: use HANDLE_RETURN + cdef int high, low + err = cydriver.cuCtxGetStreamPriorityRange(&high, &low) if priority is not None: if not (low <= priority <= high): raise ValueError(f"{priority=} is out of range {[low, high]}") else: priority = high - self._handle = handle_return(driver.cuStreamCreateWithPriority(flags, priority)) + cdef cydriver.CUstream s + # TODO: add HANDLE_RETURN macro to check driver error code? + err = cydriver.cuStreamCreateWithPriority(&s, flags, priority) + self._handle = s self._owner = None self._nonblocking = nonblocking self._priority = priority @@ -195,10 +207,11 @@ cdef class Stream: if self._owner is None: if self._handle and not self._builtin: - handle_return(driver.cuStreamDestroy(self._handle)) + # TODO: use HANDLE_RETURN + err = cydriver.cuStreamDestroy(self._handle) else: self._owner = None - self._handle = None + self._handle = (NULL) cpdef close(self): """Destroy the stream. @@ -222,14 +235,16 @@ cdef class Stream: This handle is a Python object. To get the memory address of the underlying C handle, call ``int(Stream.handle)``. 
""" - return self._handle + return driver.CUstream((self._handle)) @property def is_nonblocking(self) -> bool: """Return True if this is a nonblocking stream, otherwise False.""" + cdef unsigned int flags if self._nonblocking is None: - flag = handle_return(driver.cuStreamGetFlags(self._handle)) - if flag == driver.CUstream_flags.CU_STREAM_NON_BLOCKING: + # TODO: switch to HANDLE_RETURN + err = cydriver.cuStreamGetFlags(self._handle, &flags) + if flags & cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING: self._nonblocking = True else: self._nonblocking = False @@ -238,14 +253,17 @@ cdef class Stream: @property def priority(self) -> int: """Return the stream priority.""" + cdef int prio if self._priority is None: - prio = handle_return(driver.cuStreamGetPriority(self._handle)) + # TODO: switch to HANDLE_RETURN + err = cydriver.cuStreamGetPriority(self._handle, &prio) self._priority = prio return self._priority def sync(self): """Synchronize the stream.""" - handle_return(driver.cuStreamSynchronize(self._handle)) + # TODO: switch to HANDLE_RETURN + err = cydriver.cuStreamSynchronize(self._handle) def record(self, event: Event = None, options: EventOptions = None) -> Event: """Record an event onto the stream. @@ -272,8 +290,9 @@ cdef class Stream: if event is None: self._get_device_and_context() event = Event._init(self._device_id, self._ctx_handle, options) - err, = driver.cuEventRecord(event.handle, self._handle) - raise_if_driver_error(err) + # TODO: switch to HANDLE_RETURN + # TODO: revisit after Event is cythonized + err = cydriver.cuEventRecord((event.handle), self._handle) return event def wait(self, event_or_stream: Union[Event, Stream]): @@ -286,28 +305,35 @@ cdef class Stream: on the stream and then waiting on it. """ + cdef cydriver.CUevent event + cdef cydriver.CUstream stream + cdef bint discard_event + if isinstance(event_or_stream, Event): - event = event_or_stream.handle + event = (event_or_stream.handle) discard_event = False else: if isinstance(event_or_stream, Stream): - stream = event_or_stream + stream = (event_or_stream.handle) else: try: - stream = Stream._init(obj=event_or_stream) + s = Stream._init(obj=event_or_stream) except Exception as e: raise ValueError( "only an Event, Stream, or object supporting __cuda_stream__ can be waited," f" got {type(event_or_stream)}" ) from e - event = handle_return(driver.cuEventCreate(driver.CUevent_flags.CU_EVENT_DISABLE_TIMING)) - handle_return(driver.cuEventRecord(event, stream.handle)) + stream = (s.handle) + # TODO: switch to HANDLE_RETURN + err = cydriver.cuEventCreate(&event, cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) + err = cydriver.cuEventRecord(event, stream) discard_event = True # TODO: support flags other than 0? - handle_return(driver.cuStreamWaitEvent(self._handle, event, 0)) + # TODO: switch to HANDLE_RETURN + err = cydriver.cuStreamWaitEvent(self._handle, event, 0) if discard_event: - handle_return(driver.cuEventDestroy(event)) + err = cydriver.cuEventDestroy(event) @property def device(self) -> Device: @@ -325,9 +351,12 @@ cdef class Stream: return Device(self._device_id) cdef int _get_context(Stream self) except?-1: + # TODO: consider making self._ctx_handle typed? 
+ cdef cydriver.CUcontext ctx if self._ctx_handle is None: - err, self._ctx_handle = driver.cuStreamGetCtx(self._handle) - raise_if_driver_error(err) + # TODO: switch to HANDLE_RETURN + err = cydriver.cuStreamGetCtx(self._handle, &ctx) + self._ctx_handle = driver.CUcontext(ctx) return 0 cdef int _get_device_and_context(Stream self) except?-1: From 07df441f8e7e9b9c8080ea2a123600efb7c0a977 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 28 Sep 2025 19:39:51 +0000 Subject: [PATCH 04/27] nit: move dlpack.h to the include dir --- cuda_core/cuda/core/experimental/_dlpack.pxd | 2 +- cuda_core/cuda/core/experimental/{ => include}/dlpack.h | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename cuda_core/cuda/core/experimental/{ => include}/dlpack.h (100%) diff --git a/cuda_core/cuda/core/experimental/_dlpack.pxd b/cuda_core/cuda/core/experimental/_dlpack.pxd index 843beb873..d61b6a2bc 100644 --- a/cuda_core/cuda/core/experimental/_dlpack.pxd +++ b/cuda_core/cuda/core/experimental/_dlpack.pxd @@ -14,7 +14,7 @@ from libc.stdint cimport uint64_t from libc.stdint cimport intptr_t -cdef extern from "dlpack.h" nogil: +cdef extern from "include/dlpack.h" nogil: """ #define DLPACK_TENSOR_UNUSED_NAME "dltensor" #define DLPACK_VERSIONED_TENSOR_UNUSED_NAME "dltensor_versioned" diff --git a/cuda_core/cuda/core/experimental/dlpack.h b/cuda_core/cuda/core/experimental/include/dlpack.h similarity index 100% rename from cuda_core/cuda/core/experimental/dlpack.h rename to cuda_core/cuda/core/experimental/include/dlpack.h From 6be8e7d0b6f0523ff63ed291b73680b7d6b0f503 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 28 Sep 2025 19:40:19 +0000 Subject: [PATCH 05/27] purge cu11 --- cuda_core/pyproject.toml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index 27fa5ce19..ee0030f1c 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -39,22 +39,20 @@ classifiers = [ "Programming Language :: Python :: 3.13", "Programming Language :: Python :: Implementation :: CPython", "Environment :: GPU :: NVIDIA CUDA", - "Environment :: GPU :: NVIDIA CUDA :: 11", "Environment :: GPU :: NVIDIA CUDA :: 12", + "Environment :: GPU :: NVIDIA CUDA :: 13", ] dependencies = [ "numpy", ] [project.optional-dependencies] -cu11 = ["cuda-bindings[all]==11.8.*"] cu12 = ["cuda-bindings[all]==12.*"] cu13 = ["cuda-bindings[all]==13.*"] # TODO: these should all be in development dependencies; optional dependencies # are for features exposed to *users*, not a dumping ground for all tooling # needed to build and test the project test = ["cython>=3.1", "setuptools", "pytest>=6.2.4"] -test-cu11 = ["cuda-core[test]", "cupy-cuda11x; python_version < '3.14'", "cuda-toolkit[cudart]==11.*"] # runtime headers needed by CuPy test-cu12 = ["cuda-core[test]", "cupy-cuda12x; python_version < '3.14'", "cuda-toolkit[cudart]==12.*"] # runtime headers needed by CuPy test-cu13 = ["cuda-core[test]", "cupy-cuda13x; python_version < '3.14'", "cuda-toolkit[cudart]==13.*"] # runtime headers needed by CuPy # free threaded build, cupy doesn't support free-threaded builds yet, so avoid installing it for now From 021e0f3406aceaccb1828abb392be5c0de681775 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 28 Sep 2025 19:47:55 +0000 Subject: [PATCH 06/27] check in a working merger script --- ci/tools/merge_cuda_core_wheels.py | 200 +++++++++++++++++++++++++++++ 1 file changed, 200 insertions(+) create mode 100644 ci/tools/merge_cuda_core_wheels.py diff --git 
a/ci/tools/merge_cuda_core_wheels.py b/ci/tools/merge_cuda_core_wheels.py new file mode 100644 index 000000000..3b47a9f7f --- /dev/null +++ b/ci/tools/merge_cuda_core_wheels.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python3 +""" +Script to merge CUDA-specific wheels into a single multi-CUDA wheel. + +This script takes wheels built for different CUDA versions (cu12, cu13) and merges them +into a single wheel that supports both CUDA versions. + +In particular, each wheel contains a CUDA-specific build of the `cuda.core` library +and the associated bindings. This script merges these directories into a single wheel +that supports both CUDA versions, i.e., containing both `cuda/core/experimental/cu12` +and `cuda/core/experimental/cu13`. At runtime, the code in `cuda/core/experimental/__init__.py` +is used to import the appropriate CUDA-specific bindings. +""" + +import argparse +import os +import shutil +import subprocess +import sys +import tempfile +from pathlib import Path +from typing import List + + +def run_command( + cmd: List[str], cwd: Path = None, env: dict = None +) -> subprocess.CompletedProcess: + """Run a command with error handling.""" + print(f"Running: {' '.join(cmd)}") + if cwd: + print(f" Working directory: {cwd}") + + result = subprocess.run(cmd, cwd=cwd, env=env, capture_output=True, text=True) + + if result.returncode != 0: + print(f"Command failed with return code {result.returncode}") + print("STDOUT:", result.stdout) + print("STDERR:", result.stderr) + result.check_returncode() + + return result + + +def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: + """Merge multiple wheels into a single wheel with version-specific binaries.""" + print("\n=== Merging wheels ===") + print(f"Input wheels: {[w.name for w in wheels]}") + + if len(wheels) == 1: + raise RuntimeError("only one wheel is provided, nothing to merge") + + # Extract all wheels to temporary directories + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + extracted_wheels = [] + + for i, wheel in enumerate(wheels): + print(f"Extracting wheel {i + 1}/{len(wheels)}: {wheel.name}") + # Extract wheel - wheel unpack creates the directory itself + run_command( + [ + "python", + "-m", + "wheel", + "unpack", + str(wheel), + "--dest", + str(temp_path), + ] + ) + + # Find the extracted directory (wheel unpack creates a subdirectory) + extract_dir = None + for item in temp_path.iterdir(): + if item.is_dir() and item.name.startswith("cuda_core"): + extract_dir = item + break + + if not extract_dir: + raise RuntimeError( + f"Could not find extracted wheel directory for {wheel.name}" + ) + + # Rename to our expected name + expected_name = temp_path / f"wheel_{i}" + extract_dir.rename(expected_name) + extract_dir = expected_name + + extracted_wheels.append(extract_dir) + + # Use the first wheel as the base and merge binaries from others + base_wheel = extracted_wheels[0] + + # now copy the version-specific directory from other wheels + # into the appropriate place in the base wheel + for i, wheel_dir in enumerate(extracted_wheels): + cuda_version = wheels[i].name.split(".cu")[1].split(".")[0] + base_dir = ( + Path("cuda") + / "core" + / "experimental" + ) + # Copy from other wheels + print(f" Copying {wheel_dir} to {base_wheel}") + shutil.copytree(wheel_dir / base_dir, base_wheel / base_dir / f"cu{cuda_version}") + + # Overwrite the __init__.py in versioned dirs + open(base_wheel / base_dir / f"cu{cuda_version}" / "__init__.py", "w").close() + + # The base dir should only contain __init__.py, 
the include dir, and the versioned dirs + files_to_remove = os.listdir(base_wheel / base_dir) + for f in files_to_remove: + f_abspath = base_wheel / base_dir / f + if f not in ("__init__.py", "cu12", "cu13", "include"): + if os.path.isdir(f_abspath): + shutil.rmtree(f_abspath) + else: + os.remove(f_abspath) + + # Repack the merged wheel + output_dir.mkdir(parents=True, exist_ok=True) + + # Create a clean wheel name without CUDA version suffixes + base_wheel_name = wheels[0].name + # Remove any .cu* suffix from the wheel name + if ".cu" in base_wheel_name: + base_wheel_name = base_wheel_name.split(".cu")[0] + ".whl" + + print(f"Repacking merged wheel as: {base_wheel_name}") + run_command( + [ + "python", + "-m", + "wheel", + "pack", + str(base_wheel), + "--dest-dir", + str(output_dir), + ] + ) + + # Find the output wheel + output_wheels = list(output_dir.glob("*.whl")) + if not output_wheels: + raise RuntimeError("Failed to create merged wheel") + + merged_wheel = output_wheels[0] + print(f"Successfully merged wheel: {merged_wheel}") + return merged_wheel + + +def main(): + """Main merge script.""" + parser = argparse.ArgumentParser( + description="Merge CUDA-specific wheels into a single multi-CUDA wheel" + ) + parser.add_argument( + "wheels", nargs="+", help="Paths to the CUDA-specific wheels to merge" + ) + parser.add_argument( + "--output-dir", "-o", default="dist", help="Output directory for merged wheel" + ) + + args = parser.parse_args() + + print("cuda.core Wheel Merger") + print("======================") + + # Convert wheel paths to Path objects and validate + wheels = [] + for wheel_path in args.wheels: + wheel = Path(wheel_path) + if not wheel.exists(): + print(f"Error: Wheel not found: {wheel}") + sys.exit(1) + if not wheel.name.endswith(".whl"): + print(f"Error: Not a wheel file: {wheel}") + sys.exit(1) + wheels.append(wheel) + + if not wheels: + print("Error: No wheels provided") + sys.exit(1) + + output_dir = Path(args.output_dir) + + # Check that we have wheel tool available + try: + run_command(["python", "-m", "wheel", "--help"]) + except Exception: + print("Error: wheel package not available. Install with: pip install wheel") + sys.exit(1) + + # Merge the wheels + merged_wheel = merge_wheels(wheels, output_dir) + print(f"\nMerge complete! 
Output: {merged_wheel}") + + +if __name__ == "__main__": + main() From 19020b2768cf60087bb316996077affd7cf66b8d Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 28 Sep 2025 20:21:58 +0000 Subject: [PATCH 07/27] support loading from the versioned module if any exists --- cuda_core/cuda/core/experimental/__init__.py | 23 ++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index a06119321..40d10c3aa 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -2,6 +2,29 @@ # # SPDX-License-Identifier: Apache-2.0 +try: + import cuda.bindings +except ImportError as e: + raise ImportError("cuda.bindings 12.x or 13.x must be installed") +else: + cuda_major, cuda_minor = cuda.bindings.__version__.split(".")[:2] + if cuda_major not in ("12", "13"): + raise ImportError("cuda.bindings 12.x or 13.x must be installed") + +import importlib +subdir = f"cu{cuda_major}" +try: + verioned_mod = importlib.import_module(f".{subdir}", __package__) + # Import all symbols from the module + globals().update(verioned_mod.__dict__) +except ImportError: + # This is not a wheel build, but a conda or local build, do nothing + pass +else: + del verioned_mod +finally: + del cuda.bindings, importlib, subdir, cuda_major, cuda_minor + from cuda.core.experimental import utils from cuda.core.experimental._device import Device from cuda.core.experimental._event import Event, EventOptions From e51f9107fdabd6bb4ba6b1283304fa0f6b4491bd Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 28 Sep 2025 20:53:03 +0000 Subject: [PATCH 08/27] fix linter errors --- .spdx-ignore | 2 +- ci/tools/merge_cuda_core_wheels.py | 36 ++++++++------------ cuda_core/cuda/core/experimental/__init__.py | 29 ++++++++-------- 3 files changed, 30 insertions(+), 37 deletions(-) diff --git a/.spdx-ignore b/.spdx-ignore index 60435ebb5..84f051faf 100644 --- a/.spdx-ignore +++ b/.spdx-ignore @@ -10,4 +10,4 @@ requirements*.txt cuda_bindings/examples/* # Vendored -cuda_core/cuda/core/experimental/dlpack.h +cuda_core/cuda/core/experimental/include/dlpack.h diff --git a/ci/tools/merge_cuda_core_wheels.py b/ci/tools/merge_cuda_core_wheels.py index 3b47a9f7f..14c380b76 100644 --- a/ci/tools/merge_cuda_core_wheels.py +++ b/ci/tools/merge_cuda_core_wheels.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + #!/usr/bin/env python3 """ Script to merge CUDA-specific wheels into a single multi-CUDA wheel. @@ -10,27 +14,27 @@ that supports both CUDA versions, i.e., containing both `cuda/core/experimental/cu12` and `cuda/core/experimental/cu13`. At runtime, the code in `cuda/core/experimental/__init__.py` is used to import the appropriate CUDA-specific bindings. + +This script is based on the one in NVIDIA/CCCL. 
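+
+Example invocation (illustrative file names):
+
+    python ci/tools/merge_cuda_core_wheels.py \
+        wheelhouse/cu12/cuda_core-0.4.0-cp312-cp312-manylinux_x86_64.cu12.whl \
+        wheelhouse/cu13/cuda_core-0.4.0-cp312-cp312-manylinux_x86_64.cu13.whl \
+        --output-dir dist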
""" import argparse import os import shutil -import subprocess +import subprocess # nosec: B404 import sys import tempfile from pathlib import Path from typing import List -def run_command( - cmd: List[str], cwd: Path = None, env: dict = None -) -> subprocess.CompletedProcess: +def run_command(cmd: List[str], cwd: Path = None, env: dict = os.environ) -> subprocess.CompletedProcess: """Run a command with error handling.""" print(f"Running: {' '.join(cmd)}") if cwd: print(f" Working directory: {cwd}") - result = subprocess.run(cmd, cwd=cwd, env=env, capture_output=True, text=True) + result = subprocess.run(cmd, cwd=cwd, env=env, capture_output=True, text=True) # nosec: B603 if result.returncode != 0: print(f"Command failed with return code {result.returncode}") @@ -77,9 +81,7 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: break if not extract_dir: - raise RuntimeError( - f"Could not find extracted wheel directory for {wheel.name}" - ) + raise RuntimeError(f"Could not find extracted wheel directory for {wheel.name}") # Rename to our expected name expected_name = temp_path / f"wheel_{i}" @@ -95,11 +97,7 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: # into the appropriate place in the base wheel for i, wheel_dir in enumerate(extracted_wheels): cuda_version = wheels[i].name.split(".cu")[1].split(".")[0] - base_dir = ( - Path("cuda") - / "core" - / "experimental" - ) + base_dir = Path("cuda") / "core" / "experimental" # Copy from other wheels print(f" Copying {wheel_dir} to {base_wheel}") shutil.copytree(wheel_dir / base_dir, base_wheel / base_dir / f"cu{cuda_version}") @@ -151,15 +149,9 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: def main(): """Main merge script.""" - parser = argparse.ArgumentParser( - description="Merge CUDA-specific wheels into a single multi-CUDA wheel" - ) - parser.add_argument( - "wheels", nargs="+", help="Paths to the CUDA-specific wheels to merge" - ) - parser.add_argument( - "--output-dir", "-o", default="dist", help="Output directory for merged wheel" - ) + parser = argparse.ArgumentParser(description="Merge CUDA-specific wheels into a single multi-CUDA wheel") + parser.add_argument("wheels", nargs="+", help="Paths to the CUDA-specific wheels to merge") + parser.add_argument("--output-dir", "-o", default="dist", help="Output directory for merged wheel") args = parser.parse_args() diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 40d10c3aa..90d520d78 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -4,14 +4,15 @@ try: import cuda.bindings -except ImportError as e: - raise ImportError("cuda.bindings 12.x or 13.x must be installed") +except ImportError: + raise ImportError("cuda.bindings 12.x or 13.x must be installed") from None else: cuda_major, cuda_minor = cuda.bindings.__version__.split(".")[:2] if cuda_major not in ("12", "13"): raise ImportError("cuda.bindings 12.x or 13.x must be installed") import importlib + subdir = f"cu{cuda_major}" try: verioned_mod = importlib.import_module(f".{subdir}", __package__) @@ -25,29 +26,29 @@ finally: del cuda.bindings, importlib, subdir, cuda_major, cuda_minor -from cuda.core.experimental import utils -from cuda.core.experimental._device import Device -from cuda.core.experimental._event import Event, EventOptions -from cuda.core.experimental._graph import ( +from cuda.core.experimental import utils # noqa: E402 +from 
cuda.core.experimental._device import Device # noqa: E402 +from cuda.core.experimental._event import Event, EventOptions # noqa: E402 +from cuda.core.experimental._graph import ( # noqa: E402 Graph, GraphBuilder, GraphCompleteOptions, GraphDebugPrintOptions, ) -from cuda.core.experimental._launch_config import LaunchConfig -from cuda.core.experimental._launcher import launch -from cuda.core.experimental._linker import Linker, LinkerOptions -from cuda.core.experimental._memory import ( +from cuda.core.experimental._launch_config import LaunchConfig # noqa: E402 +from cuda.core.experimental._launcher import launch # noqa: E402 +from cuda.core.experimental._linker import Linker, LinkerOptions # noqa: E402 +from cuda.core.experimental._memory import ( # noqa: E402 Buffer, DeviceMemoryResource, IPCChannel, LegacyPinnedMemoryResource, MemoryResource, ) -from cuda.core.experimental._module import Kernel, ObjectCode -from cuda.core.experimental._program import Program, ProgramOptions -from cuda.core.experimental._stream import Stream, StreamOptions -from cuda.core.experimental._system import System +from cuda.core.experimental._module import Kernel, ObjectCode # noqa: E402 +from cuda.core.experimental._program import Program, ProgramOptions # noqa: E402 +from cuda.core.experimental._stream import Stream, StreamOptions # noqa: E402 +from cuda.core.experimental._system import System # noqa: E402 system = System() __import__("sys").modules[__spec__.name + ".system"] = system From 61617cfbf1e6b083693aa6ba9c983642b46b83cb Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 28 Sep 2025 23:26:33 +0000 Subject: [PATCH 09/27] set up double-build CI workflow --- .github/actions/fetch_ctk/action.yml | 17 +++- .github/workflows/build-wheel.yml | 136 +++++++++++++++++++++++---- .github/workflows/ci.yml | 9 +- ci/tools/env-vars | 3 + ci/versions.json | 3 + cuda_bindings/pyproject.toml | 4 - cuda_core/pyproject.toml | 2 +- 7 files changed, 145 insertions(+), 29 deletions(-) diff --git a/.github/actions/fetch_ctk/action.yml b/.github/actions/fetch_ctk/action.yml index 83b447f0c..be7536c63 100644 --- a/.github/actions/fetch_ctk/action.yml +++ b/.github/actions/fetch_ctk/action.yml @@ -18,6 +18,11 @@ inputs: required: false type: string default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink,libcufile" + cuda-path: + description: "where the CTK components will be installed to, relative to $PWD" + required: false + type: string + default: "./cuda_toolkit" runs: using: composite @@ -159,18 +164,24 @@ runs: exit 1 fi + - name: Move CTK to the specified location + if: ${{ inputs.cuda-path != './cuda_toolkit' }} + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + mv ./cuda_toolkit ${{ inputs.cuda-path }} + - name: Set output environment variables shell: bash --noprofile --norc -xeuo pipefail {0} run: | # mimics actual CTK installation if [[ "${{ inputs.host-platform }}" == linux* ]]; then - CUDA_PATH=$(realpath "./cuda_toolkit") - echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${CUDA_PATH}/lib" >> $GITHUB_ENV + CUDA_PATH=$(realpath "${{ inputs.cuda-path }}") + echo "LD_LIBRARY_PATH=${CUDA_PATH}/lib:${LD_LIBRARY_PATH:-}" >> $GITHUB_ENV elif [[ "${{ inputs.host-platform }}" == win* ]]; then function normpath() { echo "$(echo $(cygpath -w $1) | sed 's/\\/\\\\/g')" } - CUDA_PATH=$(normpath $(realpath "./cuda_toolkit")) + CUDA_PATH=$(normpath $(realpath "${{ inputs.cuda-path }}")) echo "$(normpath ${CUDA_PATH}/bin)" >> $GITHUB_PATH fi echo "CUDA_PATH=${CUDA_PATH}" >> 
$GITHUB_ENV diff --git a/.github/workflows/build-wheel.yml b/.github/workflows/build-wheel.yml index db006be32..fa11ba76f 100644 --- a/.github/workflows/build-wheel.yml +++ b/.github/workflows/build-wheel.yml @@ -11,6 +11,9 @@ on: cuda-version: required: true type: string + prev-cuda-version: + required: true + type: string defaults: run: @@ -109,13 +112,33 @@ jobs: path: cuda_pathfinder/*.whl if-no-files-found: error + - name: Set up mini CTK + uses: ./.github/actions/fetch_ctk + continue-on-error: false + with: + host-platform: ${{ inputs.host-platform }} + cuda-version: ${{ inputs.cuda-version }} + + # TODO: this currently builds against the public cuda.bindings wheel. Consider + # building against the wheel from main instead (the below step). - name: Build cuda.core wheel uses: pypa/cibuildwheel@c923d83ad9c1bc00211c5041d0c3f73294ff88f6 # v3.1.4 with: package-dir: ./cuda_core/ output-dir: ${{ env.CUDA_CORE_ARTIFACTS_DIR }} - - - name: List the cuda.core artifacts directory + env: + CIBW_BUILD: ${{ env.CIBW_BUILD }} + CIBW_ENVIRONMENT: > + CUDA_CORE_BUILD_MAJOR=${{ env.BUILD_CUDA_MAJOR }} + # CIBW mounts the host filesystem under /host + CIBW_ENVIRONMENT_LINUX: > + CUDA_PATH=/host/${{ env.CUDA_PATH }} + CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }} + CIBW_ENVIRONMENT_WINDOWS: > + CUDA_PATH="$(cygpath -w ${{ env.CUDA_PATH }})" + CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }} + + - name: List the cuda.core artifacts directory and rename run: | if [[ "${{ inputs.host-platform }}" == win* ]]; then export CHOWN=chown @@ -123,31 +146,34 @@ jobs: export CHOWN="sudo chown" fi $CHOWN -R $(whoami) ${{ env.CUDA_CORE_ARTIFACTS_DIR }} - ls -lahR ${{ env.CUDA_CORE_ARTIFACTS_DIR }} - - - name: Check cuda.core wheel - run: | - twine check --strict ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl - - name: Upload cuda.core build artifacts - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 - with: - name: ${{ env.CUDA_CORE_ARTIFACT_NAME }} - path: ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl - if-no-files-found: error + # Rename wheel to include CUDA version suffix + mkdir -p "${{ env.CUDA_CORE_ARTIFACTS_DIR }}/cu${BUILD_CUDA_MAJOR}" + for wheel in ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl; do + if [[ -f "${wheel}" ]]; then + base_name=$(basename "${wheel}" .whl) + new_name="${base_name}.cu${BUILD_CUDA_MAJOR}.whl" + mv "${wheel}" "${{ env.CUDA_CORE_ARTIFACTS_DIR }}/cu${BUILD_CUDA_MAJOR}/${new_name}" + echo "Renamed wheel to: ${new_name}" + fi + done - - name: Set up mini CTK - uses: ./.github/actions/fetch_ctk - continue-on-error: false - with: - host-platform: ${{ inputs.host-platform }} - cuda-version: ${{ inputs.cuda-version }} + ls -lahR ${{ env.CUDA_CORE_ARTIFACTS_DIR }} - name: Build cuda.bindings wheel uses: pypa/cibuildwheel@c923d83ad9c1bc00211c5041d0c3f73294ff88f6 # v3.1.4 with: package-dir: ./cuda_bindings/ output-dir: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} + env: + CIBW_BUILD: ${{ env.CIBW_BUILD }} + # CIBW mounts the host filesystem under /host + CIBW_ENVIRONMENT_LINUX: > + CUDA_PATH=/host/${{ env.CUDA_PATH }} + CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }} + CIBW_ENVIRONMENT_WINDOWS: > + CUDA_PATH="$(cygpath -w ${{ env.CUDA_PATH }})" + CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }} - name: List the cuda.bindings artifacts directory run: | @@ -241,7 +267,7 @@ jobs: - name: Build cuda.core Cython tests run: | - pip install $(ls ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl)[test] + pip install $(ls ${{ 
env.CUDA_CORE_ARTIFACTS_DIR }}/"cu${BUILD_CUDA_MAJOR}"/*.whl)[test] pushd ${{ env.CUDA_CORE_CYTHON_TESTS_DIR }} bash build_tests.sh popd @@ -252,3 +278,73 @@ jobs: name: ${{ env.CUDA_CORE_ARTIFACT_NAME }}-tests path: ${{ env.CUDA_CORE_CYTHON_TESTS_DIR }}/test_*${{ env.PY_EXT_SUFFIX }} if-no-files-found: error + + # Note: This overwrites CUDA_PATH etc + - name: Set up mini CTK + uses: ./.github/actions/fetch_ctk + continue-on-error: false + with: + host-platform: ${{ inputs.host-platform }} + cuda-version: ${{ inputs.prev-cuda-version }} + cuda-path: "./cuda_toolkit_prev" + + # TODO: this currently builds against the public cuda.bindings wheel. Consider + # building against the wheel from the backport branch instead. + - name: Build cuda.core wheel + uses: pypa/cibuildwheel@c923d83ad9c1bc00211c5041d0c3f73294ff88f6 # v3.1.4 + with: + package-dir: ./cuda_core/ + output-dir: ${{ env.CUDA_CORE_ARTIFACTS_DIR }} + env: + CIBW_BUILD: ${{ env.CIBW_BUILD }} + CIBW_ENVIRONMENT: > + CUDA_CORE_BUILD_MAJOR=${{ env.BUILD_PREV_CUDA_MAJOR }} + # CIBW mounts the host filesystem under /host + CIBW_ENVIRONMENT_LINUX: > + CUDA_PATH=/host/${{ env.CUDA_PATH }} + CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }} + CIBW_ENVIRONMENT_WINDOWS: > + CUDA_PATH="$(cygpath -w ${{ env.CUDA_PATH }})" + CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }} + + - name: List the cuda.core artifacts directory and rename + run: | + if [[ "${{ inputs.host-platform }}" == win* ]]; then + export CHOWN=chown + else + export CHOWN="sudo chown" + fi + $CHOWN -R $(whoami) ${{ env.CUDA_CORE_ARTIFACTS_DIR }} + ls -lahR ${{ env.CUDA_CORE_ARTIFACTS_DIR }} + + # Rename wheel to include CUDA version suffix + mkdir -p "${{ env.CUDA_CORE_ARTIFACTS_DIR }}/cu${BUILD_PREV_CUDA_MAJOR}" + for wheel in ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl; do + if [[ -f "${wheel}" ]]; then + base_name=$(basename "${wheel}" .whl) + new_name="${base_name}.cu${BUILD_PREV_CUDA_MAJOR}.whl" + mv "${wheel}" "${{ env.CUDA_CORE_ARTIFACTS_DIR }}/cu${BUILD_PREV_CUDA_MAJOR}/${new_name}" + echo "Renamed wheel to: ${new_name}" + fi + done + + ls -lahR ${{ env.CUDA_CORE_ARTIFACTS_DIR }} + + - name: Merge cuda.core wheels + run: | + pip install wheel + python ci/tools/merge_cuda_core_wheels.py \ + "${{ env.CUDA_CORE_ARTIFACTS_DIR }}"/cu"${BUILD_CUDA_MAJOR}"/cuda_core*.whl \ + "${{ env.CUDA_CORE_ARTIFACTS_DIR }}"/cu"${BUILD_PREV_CUDA_MAJOR}"/cuda_core*.whl \ + --output-dir "${{ env.CUDA_CORE_ARTIFACTS_DIR }}" + + - name: Check cuda.core wheel + run: | + twine check --strict ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl + + - name: Upload cuda.core build artifacts + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + with: + name: ${{ env.CUDA_CORE_ARTIFACT_NAME }} + path: ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl + if-no-files-found: error diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1b2bb241f..fbc267135 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,17 +21,21 @@ jobs: runs-on: ubuntu-latest outputs: CUDA_BUILD_VER: ${{ steps.get-vars.outputs.cuda_build_ver }} + CUDA_PREV_BUILD_VER: ${{ steps.get-vars.outputs.cuda_prev_build_ver }} steps: - name: Checkout repository uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: fetch-depth: 0 - - name: Get CUDA build version + - name: Get CUDA build versions id: get-vars run: | cuda_build_ver=$(jq -r .cuda.build.version ci/versions.json) echo "cuda_build_ver=$cuda_build_ver" >> $GITHUB_OUTPUT + 
cuda_prev_build_ver=$(jq -r .cuda.prev_build.version ci/versions.json) + echo "cuda_prev_build_ver=$cuda_prev_build_ver" >> $GITHUB_OUTPUT + # WARNING: make sure all of the build jobs are in sync build-linux-64: needs: @@ -48,6 +52,7 @@ jobs: with: host-platform: ${{ matrix.host-platform }} cuda-version: ${{ needs.ci-vars.outputs.CUDA_BUILD_VER }} + prev-cuda-version: ${{ needs.ci-vars.outputs.CUDA_PREV_BUILD_VER }} # WARNING: make sure all of the build jobs are in sync build-linux-aarch64: @@ -65,6 +70,7 @@ jobs: with: host-platform: ${{ matrix.host-platform }} cuda-version: ${{ needs.ci-vars.outputs.CUDA_BUILD_VER }} + prev-cuda-version: ${{ needs.ci-vars.outputs.CUDA_PREV_BUILD_VER }} # WARNING: make sure all of the build jobs are in sync build-windows: @@ -82,6 +88,7 @@ jobs: with: host-platform: ${{ matrix.host-platform }} cuda-version: ${{ needs.ci-vars.outputs.CUDA_BUILD_VER }} + prev-cuda-version: ${{ needs.ci-vars.outputs.CUDA_PREV_BUILD_VER }} # WARNING: make sure both Linux test jobs are in sync test-linux-64: diff --git a/ci/tools/env-vars b/ci/tools/env-vars index de4a5a6b9..f7db5179d 100755 --- a/ci/tools/env-vars +++ b/ci/tools/env-vars @@ -41,6 +41,9 @@ if [[ "${1}" == "build" ]]; then # platform is handled by the default value of platform (`auto`) in cibuildwheel # here we only need to specify the python version we want echo "CIBW_BUILD=cp${PYTHON_VERSION_FORMATTED}-*" >> $GITHUB_ENV + BUILD_CUDA_MAJOR="$(cut -d '.' -f 1 <<< ${CUDA_VER})" + echo "BUILD_CUDA_MAJOR=${BUILD_CUDA_MAJOR}" >> $GITHUB_ENV + echo "BUILD_PREV_CUDA_MAJOR=$((${BUILD_CUDA_MAJOR} - 1))" >> $GITHUB_ENV CUDA_BINDINGS_ARTIFACT_BASENAME="cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda${CUDA_VER}-${HOST_PLATFORM}" elif [[ "${1}" == "test" ]]; then BUILD_CUDA_MAJOR="$(cut -d '.' 
-f 1 <<< ${BUILD_CUDA_VER})" diff --git a/ci/versions.json b/ci/versions.json index 271c69ac3..2acfae1e3 100644 --- a/ci/versions.json +++ b/ci/versions.json @@ -2,6 +2,9 @@ "cuda": { "build": { "version": "13.0.1" + }, + "prev_build": { + "version": "12.9.1" } } } diff --git a/cuda_bindings/pyproject.toml b/cuda_bindings/pyproject.toml index 97901678a..dc6c87eef 100644 --- a/cuda_bindings/pyproject.toml +++ b/cuda_bindings/pyproject.toml @@ -63,12 +63,8 @@ environment-pass = ["CUDA_PATH", "CUDA_PYTHON_PARALLEL_LEVEL"] [tool.cibuildwheel.linux] archs = "native" -# CIBW mounts the host filesystem under /host -environment-pass = ["CUDA_PATH"] -environment = { CUDA_HOME = "/host/$CUDA_PATH" } [tool.cibuildwheel.windows] archs = "AMD64" before-build = "pip install delvewheel" repair-wheel-command = "delvewheel repair --namespace-pkg cuda -w {dest_dir} {wheel}" -environment = { CUDA_HOME = "$(cygpath -w $CUDA_PATH)" } diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index ee0030f1c..75cc43abf 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -79,7 +79,7 @@ readme = { file = ["DESCRIPTION.rst"], content-type = "text/x-rst" } skip = "*-musllinux_*" enable = "cpython-freethreading" build-verbosity = 1 -environment-pass = ["CUDA_PYTHON_PARALLEL_LEVEL"] +environment-pass = ["CUDA_PATH", "CUDA_PYTHON_PARALLEL_LEVEL"] [tool.cibuildwheel.linux] archs = "native" From 9e799e40785eac42e9bd9ed63e0ef331445b5622 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sun, 28 Sep 2025 23:44:37 +0000 Subject: [PATCH 10/27] ensure CUDA_PATH is honored by the build backend --- cuda_core/build_hooks.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index fd1692ad3..82bb77869 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -24,7 +24,6 @@ get_requires_for_build_sdist = _build_meta.get_requires_for_build_sdist -@functools.cache def _get_proper_cuda_bindings_major_version() -> str: # for local development (with/without build isolation) try: @@ -72,10 +71,21 @@ def strip_prefix_suffix(filename): return filename[len(root_path) : -4] module_names = (strip_prefix_suffix(f) for f in ext_files) + + @functools.cache + def get_cuda_paths(): + CUDA_PATH = os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME", None)) + if not CUDA_PATH: + raise RuntimeError("Environment variable CUDA_PATH or CUDA_HOME is not set") + CUDA_PATH = CUDA_PATH.split(os.pathsep) + print("CUDA paths:", CUDA_PATH) + return CUDA_PATH + ext_modules = tuple( Extension( f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}", sources=[f"cuda/core/experimental/{mod}.pyx"], + include_dirs=list(os.path.join(root, "include") for root in get_cuda_paths()), language="c++", ) for mod in module_names From d5001d4f8a4ef3eb9db2dcd238fda4a037100b77 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 29 Sep 2025 01:26:05 +0000 Subject: [PATCH 11/27] try to reuse cuda-bindings wheels for 3.13t/3.14/3.14t --- .github/workflows/build-wheel.yml | 114 ++++++++++++++++++++---------- cuda_core/pyproject.toml | 1 - 2 files changed, 75 insertions(+), 40 deletions(-) diff --git a/.github/workflows/build-wheel.yml b/.github/workflows/build-wheel.yml index fa11ba76f..3b47f4615 100644 --- a/.github/workflows/build-wheel.yml +++ b/.github/workflows/build-wheel.yml @@ -119,17 +119,13 @@ jobs: host-platform: ${{ inputs.host-platform }} cuda-version: ${{ inputs.cuda-version }} - # TODO: this currently builds against the public cuda.bindings wheel. 
Consider - # building against the wheel from main instead (the below step). - - name: Build cuda.core wheel + - name: Build cuda.bindings wheel uses: pypa/cibuildwheel@c923d83ad9c1bc00211c5041d0c3f73294ff88f6 # v3.1.4 with: - package-dir: ./cuda_core/ - output-dir: ${{ env.CUDA_CORE_ARTIFACTS_DIR }} + package-dir: ./cuda_bindings/ + output-dir: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} env: CIBW_BUILD: ${{ env.CIBW_BUILD }} - CIBW_ENVIRONMENT: > - CUDA_CORE_BUILD_MAJOR=${{ env.BUILD_CUDA_MAJOR }} # CIBW mounts the host filesystem under /host CIBW_ENVIRONMENT_LINUX: > CUDA_PATH=/host/${{ env.CUDA_PATH }} @@ -138,63 +134,68 @@ jobs: CUDA_PATH="$(cygpath -w ${{ env.CUDA_PATH }})" CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }} - - name: List the cuda.core artifacts directory and rename + - name: List the cuda.bindings artifacts directory run: | if [[ "${{ inputs.host-platform }}" == win* ]]; then export CHOWN=chown else export CHOWN="sudo chown" fi - $CHOWN -R $(whoami) ${{ env.CUDA_CORE_ARTIFACTS_DIR }} + $CHOWN -R $(whoami) ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} + ls -lahR ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} - # Rename wheel to include CUDA version suffix - mkdir -p "${{ env.CUDA_CORE_ARTIFACTS_DIR }}/cu${BUILD_CUDA_MAJOR}" - for wheel in ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl; do - if [[ -f "${wheel}" ]]; then - base_name=$(basename "${wheel}" .whl) - new_name="${base_name}.cu${BUILD_CUDA_MAJOR}.whl" - mv "${wheel}" "${{ env.CUDA_CORE_ARTIFACTS_DIR }}/cu${BUILD_CUDA_MAJOR}/${new_name}" - echo "Renamed wheel to: ${new_name}" - fi - done + - name: Check cuda.bindings wheel + run: | + twine check --strict ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}/*.whl - ls -lahR ${{ env.CUDA_CORE_ARTIFACTS_DIR }} + - name: Upload cuda.bindings build artifacts + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + with: + name: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }} + path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}/*.whl + if-no-files-found: error - - name: Build cuda.bindings wheel + # TODO: ideally we want to build against public cuda-bindings + - name: Build cuda.core wheel uses: pypa/cibuildwheel@c923d83ad9c1bc00211c5041d0c3f73294ff88f6 # v3.1.4 with: - package-dir: ./cuda_bindings/ - output-dir: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} + package-dir: ./cuda_core/ + output-dir: ${{ env.CUDA_CORE_ARTIFACTS_DIR }} env: CIBW_BUILD: ${{ env.CIBW_BUILD }} # CIBW mounts the host filesystem under /host CIBW_ENVIRONMENT_LINUX: > CUDA_PATH=/host/${{ env.CUDA_PATH }} CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }} + CUDA_CORE_BUILD_MAJOR=${{ env.BUILD_CUDA_MAJOR }} + PIP_FIND_LINKS=/host/${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} CIBW_ENVIRONMENT_WINDOWS: > CUDA_PATH="$(cygpath -w ${{ env.CUDA_PATH }})" CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }} + CUDA_CORE_BUILD_MAJOR=${{ env.BUILD_CUDA_MAJOR }} + PIP_FIND_LINKS="$(cygpath -w ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }})" - - name: List the cuda.bindings artifacts directory + - name: List the cuda.core artifacts directory and rename run: | if [[ "${{ inputs.host-platform }}" == win* ]]; then export CHOWN=chown else export CHOWN="sudo chown" fi - $CHOWN -R $(whoami) ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} - ls -lahR ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} + $CHOWN -R $(whoami) ${{ env.CUDA_CORE_ARTIFACTS_DIR }} - - name: Check cuda.bindings wheel - run: | - twine check --strict ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}/*.whl + # Rename wheel to include CUDA version suffix + mkdir -p "${{ 
env.CUDA_CORE_ARTIFACTS_DIR }}/cu${BUILD_CUDA_MAJOR}" + for wheel in ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl; do + if [[ -f "${wheel}" ]]; then + base_name=$(basename "${wheel}" .whl) + new_name="${base_name}.cu${BUILD_CUDA_MAJOR}.whl" + mv "${wheel}" "${{ env.CUDA_CORE_ARTIFACTS_DIR }}/cu${BUILD_CUDA_MAJOR}/${new_name}" + echo "Renamed wheel to: ${new_name}" + fi + done - - name: Upload cuda.bindings build artifacts - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 - with: - name: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }} - path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}/*.whl - if-no-files-found: error + ls -lahR ${{ env.CUDA_CORE_ARTIFACTS_DIR }} # We only need/want a single pure python wheel, pick linux-64 index 0. - name: Build and check cuda-python wheel @@ -288,8 +289,41 @@ jobs: cuda-version: ${{ inputs.prev-cuda-version }} cuda-path: "./cuda_toolkit_prev" - # TODO: this currently builds against the public cuda.bindings wheel. Consider - # building against the wheel from the backport branch instead. + # TODO: ideally we want to build against public cuda-bindings + - name: Download cuda.bindings build artifacts from the prior branch + if: ${{ matrix.python-version == '3.13t' + || matrix.python-version == '3.14' + || matrix.python-version == '3.14t' }} + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + if ! (command -v gh 2>&1 >/dev/null); then + # See https://github.com/cli/cli/blob/trunk/docs/install_linux.md#debian-ubuntu-linux-raspberry-pi-os-apt. + # gh is needed for artifact fetching. + mkdir -p -m 755 /etc/apt/keyrings \ + && out=$(mktemp) && wget -nv -O$out https://cli.github.com/packages/githubcli-archive-keyring.gpg \ + && cat $out | tee /etc/apt/keyrings/githubcli-archive-keyring.gpg > /dev/null \ + && chmod go+r /etc/apt/keyrings/githubcli-archive-keyring.gpg \ + && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | tee /etc/apt/sources.list.d/github-cli.list > /dev/null \ + && apt update \ + && apt install gh -y + fi + + OLD_BRANCH=$(cat .github/BACKPORT_BRANCH) + OLD_BASENAME="cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda*-${{ inputs.host-platform }}*" + LATEST_PRIOR_RUN_ID=$(gh run list -b ${OLD_BRANCH} -L 1 -w "ci.yml" -s completed -R NVIDIA/cuda-python --json databaseId | jq '.[]| .databaseId') + if [[ "$LATEST_PRIOR_RUN_ID" == "" ]]; then + echo "LATEST_PRIOR_RUN_ID not found!" 
+ exit 1 + fi + + gh run download $LATEST_PRIOR_RUN_ID -p ${OLD_BASENAME} -R NVIDIA/cuda-python + rm -rf ${OLD_BASENAME}-tests # exclude cython test artifacts + ls -al $OLD_BASENAME + mkdir -p "${{ env.CUDA_CORE_ARTIFACTS_DIR }}" + mv $OLD_BASENAME/*.whl "${{ env.CUDA_CORE_ARTIFACTS_DIR }}" + rmdir $OLD_BASENAME + - name: Build cuda.core wheel uses: pypa/cibuildwheel@c923d83ad9c1bc00211c5041d0c3f73294ff88f6 # v3.1.4 with: @@ -297,15 +331,17 @@ jobs: output-dir: ${{ env.CUDA_CORE_ARTIFACTS_DIR }} env: CIBW_BUILD: ${{ env.CIBW_BUILD }} - CIBW_ENVIRONMENT: > - CUDA_CORE_BUILD_MAJOR=${{ env.BUILD_PREV_CUDA_MAJOR }} # CIBW mounts the host filesystem under /host CIBW_ENVIRONMENT_LINUX: > CUDA_PATH=/host/${{ env.CUDA_PATH }} CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }} + CUDA_CORE_BUILD_MAJOR=${{ env.BUILD_PREV_CUDA_MAJOR }} + PIP_FIND_LINKS=/host/${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} CIBW_ENVIRONMENT_WINDOWS: > CUDA_PATH="$(cygpath -w ${{ env.CUDA_PATH }})" CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }} + CUDA_CORE_BUILD_MAJOR=${{ env.BUILD_PREV_CUDA_MAJOR }} + PIP_FIND_LINKS="$(cygpath -w ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }})" - name: List the cuda.core artifacts directory and rename run: | diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index 75cc43abf..d107f0d6e 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -79,7 +79,6 @@ readme = { file = ["DESCRIPTION.rst"], content-type = "text/x-rst" } skip = "*-musllinux_*" enable = "cpython-freethreading" build-verbosity = 1 -environment-pass = ["CUDA_PATH", "CUDA_PYTHON_PARALLEL_LEVEL"] [tool.cibuildwheel.linux] archs = "native" From 1180ab6bf2fd5d4d03dd1376e1db4985171f2145 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 29 Sep 2025 02:32:11 +0000 Subject: [PATCH 12/27] disable building/testing 313t/314/314t for now --- .github/workflows/build-wheel.yml | 6 +++--- ci/test-matrix.json | 16 ++-------------- 2 files changed, 5 insertions(+), 17 deletions(-) diff --git a/.github/workflows/build-wheel.yml b/.github/workflows/build-wheel.yml index 3b47f4615..c0b2a421b 100644 --- a/.github/workflows/build-wheel.yml +++ b/.github/workflows/build-wheel.yml @@ -33,9 +33,9 @@ jobs: - "3.11" - "3.12" - "3.13" - - "3.13t" - - "3.14" - - "3.14t" +# - "3.13t" +# - "3.14" +# - "3.14t" name: py${{ matrix.python-version }} runs-on: ${{ (inputs.host-platform == 'linux-64' && 'linux-amd64-cpu8') || (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu8') || diff --git a/ci/test-matrix.json b/ci/test-matrix.json index 10721659b..41cf03018 100644 --- a/ci/test-matrix.json +++ b/ci/test-matrix.json @@ -14,9 +14,6 @@ { "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "13.0.1", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" }, { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" }, { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, - { "ARCH": "amd64", "PY_VER": "3.13t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, - { "ARCH": "amd64", "PY_VER": "3.14", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, - { "ARCH": "amd64", "PY_VER": "3.14t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, { "ARCH": "arm64", "PY_VER": "3.9", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" }, { "ARCH": "arm64", "PY_VER": "3.9", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "a100", 
"DRIVER": "latest" }, { "ARCH": "arm64", "PY_VER": "3.10", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }, @@ -26,10 +23,7 @@ { "ARCH": "arm64", "PY_VER": "3.12", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }, { "ARCH": "arm64", "PY_VER": "3.12", "CUDA_VER": "13.0.1", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" }, { "ARCH": "arm64", "PY_VER": "3.13", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" }, - { "ARCH": "arm64", "PY_VER": "3.13", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }, - { "ARCH": "arm64", "PY_VER": "3.13t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }, - { "ARCH": "arm64", "PY_VER": "3.14", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }, - { "ARCH": "arm64", "PY_VER": "3.14t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" } + { "ARCH": "arm64", "PY_VER": "3.13", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" } ], "nightly": [ { "ARCH": "amd64", "PY_VER": "3.9", "CUDA_VER": "11.8.0", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "earliest" }, @@ -94,13 +88,7 @@ { "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" }, { "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "t4", "DRIVER": "latest" }, { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.1", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" }, - { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, - { "ARCH": "amd64", "PY_VER": "3.13t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" }, - { "ARCH": "amd64", "PY_VER": "3.13t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, - { "ARCH": "amd64", "PY_VER": "3.14", "CUDA_VER": "13.0.1", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" }, - { "ARCH": "amd64", "PY_VER": "3.14", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, - { "ARCH": "amd64", "PY_VER": "3.14t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" }, - { "ARCH": "amd64", "PY_VER": "3.14t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" } + { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" } ], "nightly": [ { "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "11.8.0", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" }, From 8cbf40c6597358fddd3a9224016b7ea1381a42d3 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 29 Sep 2025 03:06:20 +0000 Subject: [PATCH 13/27] deprecate PY39 as per #846 --- cuda_core/cuda/core/experimental/__init__.py | 11 +++++++++++ cuda_core/docs/source/release/0.X.Y-notes.rst | 1 + 2 files changed, 12 insertions(+) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 90d520d78..fb0a2f469 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -26,6 +26,17 @@ finally: del cuda.bindings, importlib, subdir, cuda_major, cuda_minor +import sys # noqa: E402 +import warnings # noqa: E402 + +if sys.version_info < (3, 10): + warnings.warn( + "support for Python 3.9 and below is deprecated and subject to future removal", + category=UserWarning, + stacklevel=1, + ) +del sys, warnings + from cuda.core.experimental import utils # 
noqa: E402 from cuda.core.experimental._device import Device # noqa: E402 from cuda.core.experimental._event import Event, EventOptions # noqa: E402 diff --git a/cuda_core/docs/source/release/0.X.Y-notes.rst b/cuda_core/docs/source/release/0.X.Y-notes.rst index 433e34353..551cbe65c 100644 --- a/cuda_core/docs/source/release/0.X.Y-notes.rst +++ b/cuda_core/docs/source/release/0.X.Y-notes.rst @@ -12,6 +12,7 @@ Released on TBD Highlights ---------- +- This is the last release that officially supports Python 3.9. - Fix for :class:`LaunchConfig` grid parameter unit conversion when thread block clusters are used. From 9519904d0914be3009540bbbb78e56bed0d42bd1 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 29 Sep 2025 03:12:13 +0000 Subject: [PATCH 14/27] also turn on parallel cythonization --- cuda_core/build_hooks.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index 82bb77869..73bfbe4a9 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -91,8 +91,12 @@ def get_cuda_paths(): for mod in module_names ) + nthreads = int(os.environ.get("CUDA_PYTHON_PARALLEL_LEVEL", os.cpu_count() // 2)) + global _extensions - _extensions = cythonize(ext_modules, verbose=True, language_level=3, compiler_directives={"embedsignature": True}) + _extensions = cythonize( + ext_modules, verbose=True, language_level=3, nthreads=nthreads, compiler_directives={"embedsignature": True} + ) return _build_meta.build_wheel(wheel_directory, config_settings, metadata_directory) From 96ce48055c5175047077f1e49cde00b232d4d106 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 29 Sep 2025 03:32:29 +0000 Subject: [PATCH 15/27] cythonize event --- cuda_core/cuda/core/experimental/_event.pyx | 59 +++++++++++--------- cuda_core/cuda/core/experimental/_stream.pyx | 4 +- 2 files changed, 36 insertions(+), 27 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index 41c0b1ce6..14f65a90d 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -4,6 +4,11 @@ from __future__ import annotations +from libc.stdint cimport uintptr_t + +# TODO: how about cuda.bindings < 12.6.2? +from cuda.bindings cimport cydriver + from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, check_or_create_options, @@ -78,12 +83,15 @@ cdef class Event: """ cdef: - object _handle + cydriver.CUevent _handle bint _timing_disabled bint _busy_waited int _device_id object _ctx_handle + def __cinit__(self): + self._handle = (NULL) + def __init__(self, *args, **kwargs): raise RuntimeError("Event objects cannot be instantiated directly. 
Please use Stream APIs (record).") @@ -91,19 +99,19 @@ cdef class Event: def _init(cls, device_id: int, ctx_handle: Context, options=None): cdef Event self = Event.__new__(cls) cdef EventOptions opts = check_or_create_options(EventOptions, options, "Event options") - flags = 0x0 + cdef unsigned int flags = 0x0 self._timing_disabled = False self._busy_waited = False if not opts.enable_timing: - flags |= driver.CUevent_flags.CU_EVENT_DISABLE_TIMING + flags |= cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING self._timing_disabled = True if opts.busy_waited_sync: - flags |= driver.CUevent_flags.CU_EVENT_BLOCKING_SYNC + flags |= cydriver.CUevent_flags.CU_EVENT_BLOCKING_SYNC self._busy_waited = True if opts.support_ipc: raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/103") - err, self._handle = driver.cuEventCreate(flags) - raise_if_driver_error(err) + # TODO: use HANDLE_RETURN + err = cydriver.cuEventCreate(&self._handle, flags) self._device_id = device_id self._ctx_handle = ctx_handle return self @@ -111,10 +119,10 @@ cdef class Event: cdef _shutdown_safe_close(self, is_shutting_down=sys.is_finalizing): if is_shutting_down and is_shutting_down(): return - if self._handle is not None: - err, = driver.cuEventDestroy(self._handle) - self._handle = None - raise_if_driver_error(err) + if self._handle != NULL: + # TODO: use HANDLE_RETURN + err = cydriver.cuEventDestroy(self._handle) + self._handle = (NULL) cpdef close(self): """Destroy the event.""" @@ -129,14 +137,14 @@ cdef class Event: def __rsub__(self, other): return NotImplemented - def __sub__(self, other): + def __sub__(self, other: Event): # return self - other (in milliseconds) - err, timing = driver.cuEventElapsedTime(other.handle, self._handle) - try: - raise_if_driver_error(err) + cdef float timing + err = cydriver.cuEventElapsedTime(&timing, other._handle, self._handle) + if err == 0: return timing - except CUDAError as e: - if err == driver.CUresult.CUDA_ERROR_INVALID_HANDLE: + else: + if err == cydriver.CUresult.CUDA_ERROR_INVALID_HANDLE: if self.is_timing_disabled or other.is_timing_disabled: explanation = ( "Both Events must be created with timing enabled in order to subtract them; " @@ -147,15 +155,15 @@ cdef class Event: "Both Events must be recorded before they can be subtracted; " "use Stream.record() to record both events to a stream." ) - elif err == driver.CUresult.CUDA_ERROR_NOT_READY: + elif err == cydriver.CUresult.CUDA_ERROR_NOT_READY: explanation = ( "One or both events have not completed; " "use Event.sync(), Stream.sync(), or Device.sync() to wait for the events to complete " "before subtracting them." ) else: - raise e - raise RuntimeError(explanation) from e + raise CUDAError(err) + raise RuntimeError(explanation) @property def is_timing_disabled(self) -> bool: @@ -182,17 +190,18 @@ cdef class Event: has been completed. 
""" - handle_return(driver.cuEventSynchronize(self._handle)) + # TODO: use HANDLE_RETURN + err = cydriver.cuEventSynchronize(self._handle) @property def is_done(self) -> bool: """Return True if all captured works have been completed, otherwise False.""" - result, = driver.cuEventQuery(self._handle) - if result == driver.CUresult.CUDA_SUCCESS: + result = cydriver.cuEventQuery(self._handle) + if result == cydriver.CUresult.CUDA_SUCCESS: return True - if result == driver.CUresult.CUDA_ERROR_NOT_READY: + if result == cydriver.CUresult.CUDA_ERROR_NOT_READY: return False - handle_return(result) + # TODO: use HANDLE_RETURN @property def handle(self) -> cuda.bindings.driver.CUevent: @@ -203,7 +212,7 @@ cdef class Event: This handle is a Python object. To get the memory address of the underlying C handle, call ``int(Event.handle)``. """ - return self._handle + return driver.CUevent((self._handle)) @property def device(self) -> Device: diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 284831cd6..cad5612fa 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -122,7 +122,7 @@ cdef class Stream: object _device_id object _ctx_handle - def __cinit__(self, *args, **kwargs): + def __cinit__(self): self._handle = (NULL) def __init__(self, *args, **kwargs): @@ -235,7 +235,7 @@ cdef class Stream: This handle is a Python object. To get the memory address of the underlying C handle, call ``int(Stream.handle)``. """ - return driver.CUstream((self._handle)) + return driver.CUstream((self._handle)) @property def is_nonblocking(self) -> bool: From e702b5ef328819336affb0221b10cd480e73e9b2 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 29 Sep 2025 04:14:01 +0000 Subject: [PATCH 16/27] fix error handling --- cuda_core/cuda/core/experimental/_event.pyx | 13 +++---- cuda_core/cuda/core/experimental/_stream.pyx | 36 +++++++------------ .../core/experimental/_utils/cuda_utils.pxd | 18 ++++++++-- .../core/experimental/_utils/cuda_utils.pyx | 6 ++++ 4 files changed, 39 insertions(+), 34 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index 14f65a90d..0d5737e37 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -10,8 +10,8 @@ from libc.stdint cimport uintptr_t from cuda.bindings cimport cydriver from cuda.core.experimental._utils.cuda_utils cimport ( - _check_driver_error as raise_if_driver_error, check_or_create_options, + HANDLE_RETURN ) from dataclasses import dataclass @@ -110,8 +110,7 @@ cdef class Event: self._busy_waited = True if opts.support_ipc: raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/103") - # TODO: use HANDLE_RETURN - err = cydriver.cuEventCreate(&self._handle, flags) + HANDLE_RETURN(cydriver.cuEventCreate(&self._handle, flags)) self._device_id = device_id self._ctx_handle = ctx_handle return self @@ -120,8 +119,7 @@ cdef class Event: if is_shutting_down and is_shutting_down(): return if self._handle != NULL: - # TODO: use HANDLE_RETURN - err = cydriver.cuEventDestroy(self._handle) + HANDLE_RETURN(cydriver.cuEventDestroy(self._handle)) self._handle = (NULL) cpdef close(self): @@ -190,8 +188,7 @@ cdef class Event: has been completed. 
""" - # TODO: use HANDLE_RETURN - err = cydriver.cuEventSynchronize(self._handle) + HANDLE_RETURN(cydriver.cuEventSynchronize(self._handle)) @property def is_done(self) -> bool: @@ -201,7 +198,7 @@ cdef class Event: return True if result == cydriver.CUresult.CUDA_ERROR_NOT_READY: return False - # TODO: use HANDLE_RETURN + HANDLE_RETURN(result) @property def handle(self) -> cuda.bindings.driver.CUevent: diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index cad5612fa..ee6f6be01 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -10,8 +10,8 @@ from libc.stdint cimport uintptr_t from cuda.bindings cimport cydriver from cuda.core.experimental._utils.cuda_utils cimport ( - _check_driver_error as raise_if_driver_error, check_or_create_options, + HANDLE_RETURN, ) import sys @@ -178,9 +178,8 @@ cdef class Stream: priority = opts.priority flags = cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING if nonblocking else cydriver.CUstream_flags.CU_STREAM_DEFAULT - # TODO: use HANDLE_RETURN cdef int high, low - err = cydriver.cuCtxGetStreamPriorityRange(&high, &low) + HANDLE_RETURN(cydriver.cuCtxGetStreamPriorityRange(&high, &low)) if priority is not None: if not (low <= priority <= high): raise ValueError(f"{priority=} is out of range {[low, high]}") @@ -188,8 +187,7 @@ cdef class Stream: priority = high cdef cydriver.CUstream s - # TODO: add HANDLE_RETURN macro to check driver error code? - err = cydriver.cuStreamCreateWithPriority(&s, flags, priority) + HANDLE_RETURN(cydriver.cuStreamCreateWithPriority(&s, flags, priority)) self._handle = s self._owner = None self._nonblocking = nonblocking @@ -207,8 +205,7 @@ cdef class Stream: if self._owner is None: if self._handle and not self._builtin: - # TODO: use HANDLE_RETURN - err = cydriver.cuStreamDestroy(self._handle) + HANDLE_RETURN(cydriver.cuStreamDestroy(self._handle)) else: self._owner = None self._handle = (NULL) @@ -242,8 +239,7 @@ cdef class Stream: """Return True if this is a nonblocking stream, otherwise False.""" cdef unsigned int flags if self._nonblocking is None: - # TODO: switch to HANDLE_RETURN - err = cydriver.cuStreamGetFlags(self._handle, &flags) + HANDLE_RETURN(cydriver.cuStreamGetFlags(self._handle, &flags)) if flags & cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING: self._nonblocking = True else: @@ -255,15 +251,13 @@ cdef class Stream: """Return the stream priority.""" cdef int prio if self._priority is None: - # TODO: switch to HANDLE_RETURN - err = cydriver.cuStreamGetPriority(self._handle, &prio) + HANDLE_RETURN(cydriver.cuStreamGetPriority(self._handle, &prio)) self._priority = prio return self._priority def sync(self): """Synchronize the stream.""" - # TODO: switch to HANDLE_RETURN - err = cydriver.cuStreamSynchronize(self._handle) + HANDLE_RETURN(cydriver.cuStreamSynchronize(self._handle)) def record(self, event: Event = None, options: EventOptions = None) -> Event: """Record an event onto the stream. 
@@ -290,9 +284,8 @@ cdef class Stream: if event is None: self._get_device_and_context() event = Event._init(self._device_id, self._ctx_handle, options) - # TODO: switch to HANDLE_RETURN # TODO: revisit after Event is cythonized - err = cydriver.cuEventRecord((event.handle), self._handle) + HANDLE_RETURN(cydriver.cuEventRecord((event.handle), self._handle)) return event def wait(self, event_or_stream: Union[Event, Stream]): @@ -324,16 +317,14 @@ cdef class Stream: f" got {type(event_or_stream)}" ) from e stream = (s.handle) - # TODO: switch to HANDLE_RETURN - err = cydriver.cuEventCreate(&event, cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) - err = cydriver.cuEventRecord(event, stream) + HANDLE_RETURN(cydriver.cuEventCreate(&event, cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING)) + HANDLE_RETURN(cydriver.cuEventRecord(event, stream)) discard_event = True # TODO: support flags other than 0? - # TODO: switch to HANDLE_RETURN - err = cydriver.cuStreamWaitEvent(self._handle, event, 0) + HANDLE_RETURN(cydriver.cuStreamWaitEvent(self._handle, event, 0)) if discard_event: - err = cydriver.cuEventDestroy(event) + HANDLE_RETURN(cydriver.cuEventDestroy(event)) @property def device(self) -> Device: @@ -354,8 +345,7 @@ cdef class Stream: # TODO: consider making self._ctx_handle typed? cdef cydriver.CUcontext ctx if self._ctx_handle is None: - # TODO: switch to HANDLE_RETURN - err = cydriver.cuStreamGetCtx(self._handle, &ctx) + HANDLE_RETURN(cydriver.cuStreamGetCtx(self._handle, &ctx)) self._ctx_handle = driver.CUcontext(ctx) return 0 diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd index 601736c47..c58f32610 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd @@ -2,18 +2,30 @@ # # SPDX-License-Identifier: Apache-2.0 - cimport cpython -cimport libc.stdint +from libc.stdint cimport int64_t + +# TODO: how about cuda.bindings < 12.6.2? +from cuda.bindings cimport cydriver + + +ctypedef fused supported_error_type: + cydriver.CUresult +cdef int HANDLE_RETURN(supported_error_type err) except?-1 + + +# TODO: stop exposing these within the codebase? 
cpdef int _check_driver_error(error) except?-1 cpdef int _check_runtime_error(error) except?-1 cpdef int _check_nvrtc_error(error) except?-1 + + cpdef check_or_create_options(type cls, options, str options_description=*, bint keep_none=*) -cdef inline tuple carray_int64_t_to_tuple(libc.stdint.int64_t *ptr, int length): +cdef inline tuple carray_int64_t_to_tuple(int64_t *ptr, int length): # Construct shape and strides tuples using the Python/C API for speed result = cpython.PyTuple_New(length) for i in range(length): diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx index 86588f733..c095e7564 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx @@ -52,6 +52,12 @@ def _reduce_3_tuple(t: tuple): return t[0] * t[1] * t[2] +cdef int HANDLE_RETURN(supported_error_type err) except?-1: + if supported_error_type is cydriver.CUresult: + if err != cydriver.CUresult.CUDA_SUCCESS: + return _check_driver_error(err) + + cdef object _DRIVER_SUCCESS = driver.CUresult.CUDA_SUCCESS cdef object _RUNTIME_SUCCESS = runtime.cudaError_t.cudaSuccess cdef object _NVRTC_SUCCESS = nvrtc.nvrtcResult.NVRTC_SUCCESS From 1f5159e231d409b9604ec50625a100a688f8ec75 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 29 Sep 2025 20:33:37 +0000 Subject: [PATCH 17/27] Revert "disable building/testing 313t/314/314t for now" This reverts commit 1180ab6bf2fd5d4d03dd1376e1db4985171f2145. --- .github/workflows/build-wheel.yml | 6 +++--- ci/test-matrix.json | 16 ++++++++++++++-- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build-wheel.yml b/.github/workflows/build-wheel.yml index c0b2a421b..3b47f4615 100644 --- a/.github/workflows/build-wheel.yml +++ b/.github/workflows/build-wheel.yml @@ -33,9 +33,9 @@ jobs: - "3.11" - "3.12" - "3.13" -# - "3.13t" -# - "3.14" -# - "3.14t" + - "3.13t" + - "3.14" + - "3.14t" name: py${{ matrix.python-version }} runs-on: ${{ (inputs.host-platform == 'linux-64' && 'linux-amd64-cpu8') || (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-cpu8') || diff --git a/ci/test-matrix.json b/ci/test-matrix.json index 41cf03018..10721659b 100644 --- a/ci/test-matrix.json +++ b/ci/test-matrix.json @@ -14,6 +14,9 @@ { "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "13.0.1", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" }, { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" }, { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, + { "ARCH": "amd64", "PY_VER": "3.13t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, + { "ARCH": "amd64", "PY_VER": "3.14", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, + { "ARCH": "amd64", "PY_VER": "3.14t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, { "ARCH": "arm64", "PY_VER": "3.9", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" }, { "ARCH": "arm64", "PY_VER": "3.9", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }, { "ARCH": "arm64", "PY_VER": "3.10", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }, @@ -23,7 +26,10 @@ { "ARCH": "arm64", "PY_VER": "3.12", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }, { "ARCH": "arm64", "PY_VER": "3.12", "CUDA_VER": "13.0.1", "LOCAL_CTK": "0", "GPU": 
"a100", "DRIVER": "latest" }, { "ARCH": "arm64", "PY_VER": "3.13", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" }, - { "ARCH": "arm64", "PY_VER": "3.13", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" } + { "ARCH": "arm64", "PY_VER": "3.13", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }, + { "ARCH": "arm64", "PY_VER": "3.13t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }, + { "ARCH": "arm64", "PY_VER": "3.14", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }, + { "ARCH": "arm64", "PY_VER": "3.14t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" } ], "nightly": [ { "ARCH": "amd64", "PY_VER": "3.9", "CUDA_VER": "11.8.0", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "earliest" }, @@ -88,7 +94,13 @@ { "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" }, { "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "t4", "DRIVER": "latest" }, { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.1", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" }, - { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" } + { "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, + { "ARCH": "amd64", "PY_VER": "3.13t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" }, + { "ARCH": "amd64", "PY_VER": "3.13t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, + { "ARCH": "amd64", "PY_VER": "3.14", "CUDA_VER": "13.0.1", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" }, + { "ARCH": "amd64", "PY_VER": "3.14", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" }, + { "ARCH": "amd64", "PY_VER": "3.14t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "0", "GPU": "t4", "DRIVER": "latest" }, + { "ARCH": "amd64", "PY_VER": "3.14t", "CUDA_VER": "13.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" } ], "nightly": [ { "ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "11.8.0", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" }, From dc8d076ffa7a76524aa25d9507ee0a076f218643 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 29 Sep 2025 20:55:27 +0000 Subject: [PATCH 18/27] fix artifact location --- .github/workflows/build-wheel.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-wheel.yml b/.github/workflows/build-wheel.yml index 3b47f4615..4e97d36fb 100644 --- a/.github/workflows/build-wheel.yml +++ b/.github/workflows/build-wheel.yml @@ -320,8 +320,8 @@ jobs: gh run download $LATEST_PRIOR_RUN_ID -p ${OLD_BASENAME} -R NVIDIA/cuda-python rm -rf ${OLD_BASENAME}-tests # exclude cython test artifacts ls -al $OLD_BASENAME - mkdir -p "${{ env.CUDA_CORE_ARTIFACTS_DIR }}" - mv $OLD_BASENAME/*.whl "${{ env.CUDA_CORE_ARTIFACTS_DIR }}" + mkdir -p "${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}" + mv $OLD_BASENAME/*.whl "${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}" rmdir $OLD_BASENAME - name: Build cuda.core wheel From a501cc751c0d3bdd5c04b13efe2993f446b328eb Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 30 Sep 2025 13:39:08 +0000 Subject: [PATCH 19/27] cythonize device --- cuda_core/build_hooks.py | 6 +- .../experimental/{_device.py => _device.pyx} | 164 +++++++++--------- 2 files changed, 91 insertions(+), 79 deletions(-) rename cuda_core/cuda/core/experimental/{_device.py => 
_device.pyx} (91%) diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index 73bfbe4a9..c712e92cb 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -24,6 +24,7 @@ get_requires_for_build_sdist = _build_meta.get_requires_for_build_sdist +@functools.cache def _get_proper_cuda_bindings_major_version() -> str: # for local development (with/without build isolation) try: @@ -92,10 +93,13 @@ def get_cuda_paths(): ) nthreads = int(os.environ.get("CUDA_PYTHON_PARALLEL_LEVEL", os.cpu_count() // 2)) + compile_time_env = {"CUDA_CORE_BUILD_MAJOR": _get_proper_cuda_bindings_major_version()} global _extensions _extensions = cythonize( - ext_modules, verbose=True, language_level=3, nthreads=nthreads, compiler_directives={"embedsignature": True} + ext_modules, verbose=True, language_level=3, nthreads=nthreads, + compiler_directives={"embedsignature": True, "warn.deprecated.IF": False}, + compile_time_env=compile_time_env ) return _build_meta.build_wheel(wheel_directory, config_settings, metadata_directory) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.pyx similarity index 91% rename from cuda_core/cuda/core/experimental/_device.py rename to cuda_core/cuda/core/experimental/_device.pyx index 0499baa58..589d5a42c 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.pyx @@ -2,6 +2,13 @@ # # SPDX-License-Identifier: Apache-2.0 +from libc.stdint cimport uintptr_t + +# TODO: how about cuda.bindings < 12.6.2? +from cuda.bindings cimport cydriver + +from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN + import threading from typing import Optional, Union @@ -14,41 +21,44 @@ from cuda.core.experimental._utils.cuda_utils import ( ComputeCapability, CUDAError, - _check_driver_error, driver, handle_return, runtime, ) + _tls = threading.local() _lock = threading.Lock() -_is_cuInit = False +cdef bint _is_cuInit = False -class DeviceProperties: +cdef class DeviceProperties: """ A class to query various attributes of a CUDA device. Attributes are read-only and provide information about the device. """ + cdef: + int _handle + dict _cache - def __new__(self, *args, **kwargs): + def __init__(self, *args, **kwargs): raise RuntimeError("DeviceProperties cannot be instantiated directly. 
Please use Device APIs.") - __slots__ = ("_handle", "_cache") - @classmethod def _init(cls, handle): - self = super().__new__(cls) + cdef DeviceProperties self = DeviceProperties.__new__(cls) self._handle = handle self._cache = {} return self - def _get_attribute(self, attr): + cdef inline _get_attribute(self, cydriver.CUdevice_attribute attr): """Retrieve the attribute value directly from the driver.""" - return handle_return(driver.cuDeviceGetAttribute(attr, self._handle)) + cdef int val + HANDLE_RETURN(cydriver.cuDeviceGetAttribute(&val, attr, self._handle)) + return val - def _get_cached_attribute(self, attr): + cdef _get_cached_attribute(self, attr): """Retrieve the attribute value, using cache if applicable.""" if attr not in self._cache: self._cache[attr] = self._get_attribute(attr) @@ -931,8 +941,17 @@ def multicast_supported(self) -> bool: return bool(self._get_cached_attribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED)) -_SUCCESS = driver.CUresult.CUDA_SUCCESS -_INVALID_CTX = driver.CUresult.CUDA_ERROR_INVALID_CONTEXT +cdef cydriver.CUcontext _get_primary_context(int dev_id) except?NULL: + try: + primary_ctxs = _tls.primary_ctxs + except AttributeError: + total = len(_tls.devices) + primary_ctxs = _tls.primary_ctxs = [0] * total + cdef cydriver.CUcontext ctx = (primary_ctxs[dev_id]) + if ctx == NULL: + HANDLE_RETURN(cydriver.cuDevicePrimaryCtxRetain(&ctx, dev_id)) + primary_ctxs[dev_id] = (ctx) + return ctx class Device: @@ -961,55 +980,56 @@ class Device: Default value of `None` return the currently used device. """ - __slots__ = ("_id", "_mr", "_has_inited", "_properties") def __new__(cls, device_id: Optional[int] = None): global _is_cuInit if _is_cuInit is False: with _lock: - handle_return(driver.cuInit(0)) + HANDLE_RETURN(cydriver.cuInit(0)) _is_cuInit = True # important: creating a Device instance does not initialize the GPU! + cdef cydriver.CUdevice dev + cdef cydriver.CUcontext ctx if device_id is None: - err, dev = driver.cuCtxGetDevice() - if err == _SUCCESS: + err = cydriver.cuCtxGetDevice(&dev) + if err == cydriver.CUresult.CUDA_SUCCESS: device_id = int(dev) - elif err == _INVALID_CTX: - ctx = handle_return(driver.cuCtxGetCurrent()) - assert int(ctx) == 0 + elif err == cydriver.CUresult.CUDA_ERROR_INVALID_CONTEXT: + HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx)) + assert (ctx) == NULL device_id = 0 # cudart behavior else: - _check_driver_error(err) + HANDLE_RETURN(err) elif device_id < 0: raise ValueError(f"device_id must be >= 0, got {device_id}") # ensure Device is singleton + cdef int total, attr try: devices = _tls.devices except AttributeError: - total = handle_return(driver.cuDeviceGetCount()) + HANDLE_RETURN(cydriver.cuDeviceGetCount(&total)) devices = _tls.devices = [] for dev_id in range(total): - dev = super().__new__(cls) - dev._id = dev_id + device = super().__new__(cls) + device._id = dev_id # If the device is in TCC mode, or does not support memory pools for some other reason, # use the SynchronousMemoryResource which does not use memory pools. 
- if ( - handle_return( - driver.cuDeviceGetAttribute( - driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev_id - ) + HANDLE_RETURN( + cydriver.cuDeviceGetAttribute( + &attr, cydriver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev_id ) - ) == 1: - dev._mr = DeviceMemoryResource(dev_id) + ) + if attr == 1: + device._mr = DeviceMemoryResource(dev_id) else: - dev._mr = _SynchronousMemoryResource(dev_id) + device._mr = _SynchronousMemoryResource(dev_id) - dev._has_inited = False - dev._properties = None - devices.append(dev) + device._has_inited = False + device._properties = None + devices.append(device) try: return devices[device_id] @@ -1022,36 +1042,17 @@ def _check_context_initialized(self): f"Device {self._id} is not yet initialized, perhaps you forgot to call .set_current() first?" ) - def _get_primary_context(self) -> driver.CUcontext: - try: - primary_ctxs = _tls.primary_ctxs - except AttributeError: - total = len(_tls.devices) - primary_ctxs = _tls.primary_ctxs = [None] * total - ctx = primary_ctxs[self._id] - if ctx is None: - ctx = handle_return(driver.cuDevicePrimaryCtxRetain(self._id)) - primary_ctxs[self._id] = ctx - return ctx - def _get_current_context(self, check_consistency=False) -> driver.CUcontext: - err, ctx = driver.cuCtxGetCurrent() - - # TODO: We want to just call this: - # _check_driver_error(err) - # but even the simplest success check causes 50-100 ns. Wait until we cythonize this file... - if ctx is None: - _check_driver_error(err) - - if int(ctx) == 0: + cdef cydriver.CUcontext ctx + HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx)) + if ctx == NULL: raise CUDAError("No context is bound to the calling CPU thread.") + cdef cydriver.CUdevice dev if check_consistency: - err, dev = driver.cuCtxGetDevice() - if err != _SUCCESS: - handle_return((err,)) - if int(dev) != self._id: + HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev)) + if (dev) != self._id: raise CUDAError("Internal error (current device is not equal to Device.device_id)") - return ctx + return driver.CUcontext(ctx) @property def device_id(self) -> int: @@ -1078,20 +1079,23 @@ def uuid(self) -> str: driver is older than CUDA 11.4. 
""" - driver_ver = handle_return(driver.cuDriverGetVersion()) - if 11040 <= driver_ver < 13000: - uuid = handle_return(driver.cuDeviceGetUuid_v2(self._id)) - else: - uuid = handle_return(driver.cuDeviceGetUuid(self._id)) - uuid = uuid.bytes.hex() + cdef cydriver.CUuuid uuid + IF CUDA_CORE_BUILD_MAJOR == "12": + HANDLE_RETURN(cydriver.cuDeviceGetUuid_v2(&uuid, self._id)) + ELSE: # 13.0+ + HANDLE_RETURN(cydriver.cuDeviceGetUuid(&uuid, self._id)) + cdef bytes uuid_b = uuid.bytes + cdef str uuid_hex = uuid_b.hex() # 8-4-4-4-12 - return f"{uuid[:8]}-{uuid[8:12]}-{uuid[12:16]}-{uuid[16:20]}-{uuid[20:]}" + return f"{uuid_hex[:8]}-{uuid_hex[8:12]}-{uuid_hex[12:16]}-{uuid_hex[16:20]}-{uuid_hex[20:]}" @property def name(self) -> str: """Return the device name.""" # Use 256 characters to be consistent with CUDA Runtime - name = handle_return(driver.cuDeviceGetName(256, self._id)) + cdef int LENGTH = 256 + cdef bytes name = bytes(LENGTH) + HANDLE_RETURN(cydriver.cuDeviceGetName(name, LENGTH, self._id)) name = name.split(b"\0")[0] return name.decode() @@ -1106,10 +1110,11 @@ def properties(self) -> DeviceProperties: @property def compute_capability(self) -> ComputeCapability: """Return a named tuple with 2 fields: major and minor.""" - if "compute_capability" in self.properties._cache: - return self.properties._cache["compute_capability"] - cc = ComputeCapability(self.properties.compute_capability_major, self.properties.compute_capability_minor) - self.properties._cache["compute_capability"] = cc + cdef DeviceProperties prop = self.properties + if "compute_capability" in prop._cache: + return prop._cache["compute_capability"] + cc = ComputeCapability(prop.compute_capability_major, prop.compute_capability_minor) + prop._cache["compute_capability"] = cc return cc @property @@ -1190,22 +1195,25 @@ def set_current(self, ctx: Context = None) -> Union[Context, None]: >>> # ... do work on device 0 ... 
""" + cdef cydriver.CUcontext _ctx if ctx is not None: + # TODO: revisit once Context is cythonized assert_type(ctx, Context) if ctx._id != self._id: raise RuntimeError( "the provided context was created on the device with" f" id={ctx._id}, which is different from the target id={self._id}" ) - prev_ctx = handle_return(driver.cuCtxPopCurrent()) - handle_return(driver.cuCtxPushCurrent(ctx._handle)) + # _ctx is the previous context + HANDLE_RETURN(cydriver.cuCtxPopCurrent(&_ctx)) + HANDLE_RETURN(cydriver.cuCtxPushCurrent((ctx._handle))) self._has_inited = True - if int(prev_ctx) != 0: - return Context._from_ctx(prev_ctx, self._id) + if _ctx != NULL: + return Context._from_ctx((_ctx), self._id) else: # use primary ctx - ctx = self._get_primary_context() - handle_return(driver.cuCtxSetCurrent(ctx)) + _ctx = _get_primary_context(self._id) + HANDLE_RETURN(cydriver.cuCtxSetCurrent(_ctx)) self._has_inited = True def create_context(self, options: ContextOptions = None) -> Context: From 04d3f5dc58c88a832a79477541d9ec8ac652841b Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 30 Sep 2025 13:44:20 +0000 Subject: [PATCH 20/27] making the linter happy, again --- ci/tools/merge_cuda_core_wheels.py | 4 ++-- cuda_core/build_hooks.py | 11 +++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/ci/tools/merge_cuda_core_wheels.py b/ci/tools/merge_cuda_core_wheels.py index 14c380b76..51cc97ba3 100644 --- a/ci/tools/merge_cuda_core_wheels.py +++ b/ci/tools/merge_cuda_core_wheels.py @@ -21,7 +21,7 @@ import argparse import os import shutil -import subprocess # nosec: B404 +import subprocess import sys import tempfile from pathlib import Path @@ -34,7 +34,7 @@ def run_command(cmd: List[str], cwd: Path = None, env: dict = os.environ) -> sub if cwd: print(f" Working directory: {cwd}") - result = subprocess.run(cmd, cwd=cwd, env=env, capture_output=True, text=True) # nosec: B603 + result = subprocess.run(cmd, cwd=cwd, env=env, capture_output=True, text=True) # noqa: S603 if result.returncode != 0: print(f"Command failed with return code {result.returncode}") diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index c712e92cb..7c5fd4672 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -13,7 +13,7 @@ import glob import os import re -import subprocess # nosec: B404 +import subprocess from Cython.Build import cythonize from setuptools import Extension @@ -41,7 +41,7 @@ def _get_proper_cuda_bindings_major_version() -> str: # also for local development try: - out = subprocess.run("nvidia-smi", env=os.environ, capture_output=True, check=True) # nosec: B603, B607 + out = subprocess.run("nvidia-smi", env=os.environ, capture_output=True, check=True) # noqa: S603, S607 m = re.search(r"CUDA Version:\s*([\d\.]+)", out.stdout.decode()) if m: return m.group(1).split(".")[0] @@ -97,9 +97,12 @@ def get_cuda_paths(): global _extensions _extensions = cythonize( - ext_modules, verbose=True, language_level=3, nthreads=nthreads, + ext_modules, + verbose=True, + language_level=3, + nthreads=nthreads, compiler_directives={"embedsignature": True, "warn.deprecated.IF": False}, - compile_time_env=compile_time_env + compile_time_env=compile_time_env, ) return _build_meta.build_wheel(wheel_directory, config_settings, metadata_directory) From a5d6826ac129879febd07abb19f413a1cdfca81c Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 30 Sep 2025 14:31:03 +0000 Subject: [PATCH 21/27] fix uuid handling --- cuda_core/cuda/core/experimental/_device.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 
deletion(-) diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx index 589d5a42c..0d05679f1 100644 --- a/cuda_core/cuda/core/experimental/_device.pyx +++ b/cuda_core/cuda/core/experimental/_device.pyx @@ -1084,7 +1084,8 @@ class Device: HANDLE_RETURN(cydriver.cuDeviceGetUuid_v2(&uuid, self._id)) ELSE: # 13.0+ HANDLE_RETURN(cydriver.cuDeviceGetUuid(&uuid, self._id)) - cdef bytes uuid_b = uuid.bytes + cdef bytearray uuid_b = bytearray(sizeof(uuid.bytes)) + uuid_b[:] = uuid.bytes cdef str uuid_hex = uuid_b.hex() # 8-4-4-4-12 return f"{uuid_hex[:8]}-{uuid_hex[8:12]}-{uuid_hex[12:16]}-{uuid_hex[16:20]}-{uuid_hex[20:]}" From 578984dcac4d6ef8f3465277930e75498683e59d Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 30 Sep 2025 14:43:39 +0000 Subject: [PATCH 22/27] update release notes to note about compatibility requirement --- cuda_core/cuda/core/experimental/_device.pyx | 1 - cuda_core/cuda/core/experimental/_event.pyx | 1 - cuda_core/cuda/core/experimental/_stream.pxd | 1 - cuda_core/cuda/core/experimental/_stream.pyx | 1 - cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd | 1 - cuda_core/docs/source/release/0.X.Y-notes.rst | 2 ++ 6 files changed, 2 insertions(+), 5 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx index 0d05679f1..d3cd4bf4c 100644 --- a/cuda_core/cuda/core/experimental/_device.pyx +++ b/cuda_core/cuda/core/experimental/_device.pyx @@ -4,7 +4,6 @@ from libc.stdint cimport uintptr_t -# TODO: how about cuda.bindings < 12.6.2? from cuda.bindings cimport cydriver from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index 0d5737e37..db243717f 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -6,7 +6,6 @@ from __future__ import annotations from libc.stdint cimport uintptr_t -# TODO: how about cuda.bindings < 12.6.2? from cuda.bindings cimport cydriver from cuda.core.experimental._utils.cuda_utils cimport ( diff --git a/cuda_core/cuda/core/experimental/_stream.pxd b/cuda_core/cuda/core/experimental/_stream.pxd index f7d97de33..6b8a7f0f6 100644 --- a/cuda_core/cuda/core/experimental/_stream.pxd +++ b/cuda_core/cuda/core/experimental/_stream.pxd @@ -2,7 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -# TODO: how about cuda.bindings < 12.6.2? from cuda.bindings cimport cydriver diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index ee6f6be01..737fd13f9 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -6,7 +6,6 @@ from __future__ import annotations from libc.stdint cimport uintptr_t -# TODO: how about cuda.bindings < 12.6.2? from cuda.bindings cimport cydriver from cuda.core.experimental._utils.cuda_utils cimport ( diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd index c58f32610..bf570965f 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd @@ -5,7 +5,6 @@ cimport cpython from libc.stdint cimport int64_t -# TODO: how about cuda.bindings < 12.6.2? 
from cuda.bindings cimport cydriver diff --git a/cuda_core/docs/source/release/0.X.Y-notes.rst b/cuda_core/docs/source/release/0.X.Y-notes.rst index 551cbe65c..5ed53c723 100644 --- a/cuda_core/docs/source/release/0.X.Y-notes.rst +++ b/cuda_core/docs/source/release/0.X.Y-notes.rst @@ -20,6 +20,7 @@ Breaking Changes ---------------- - **CUDA 11 support dropped**: CUDA 11 support is no longer tested and it may or may not work with cuda.bindings and CTK 11.x. Users are encouraged to migrate to CUDA 12.x or 13.x. +- Support for ``cuda-bindings`` (and ``cuda-python``) < 12.6.2 is dropped. Internally, ``cuda.core`` now always requires the `new binding module layout `_. As per the ``cuda-bindings`` `support policy `_), CUDA 12 users are encouraged to use the latest ``cuda-bindings`` 12.9.x, which is backward-compatible with all CUDA Toolkit 12.y. - **LaunchConfig grid parameter interpretation**: When :attr:`LaunchConfig.cluster` is specified, the :attr:`LaunchConfig.grid` parameter now correctly represents the number of clusters instead of blocks. Previously, the grid parameter was incorrectly interpreted as blocks, causing a mismatch with the expected C++ behavior. This change ensures that ``LaunchConfig(grid=4, cluster=2, block=32)`` correctly produces 4 clusters × 2 blocks/cluster = 8 total blocks, matching the C++ equivalent ``cudax::make_hierarchy(cudax::grid_dims(4), cudax::cluster_dims(2), cudax::block_dims(32))``. - When :class:`Buffer` is closed, :attr:`Buffer.handle` is now set to ``None``. It was previously set to ``0`` by accident. @@ -49,3 +50,4 @@ Fixes and enhancements - Make :class:`Buffer` creation more performant. - Enabled :class:`MemoryResource` subclasses to accept :class:`Device` objects, in addition to previously supported device ordinals. - Fixed a bug in :class:`Stream` and other classes where object cleanup would error during interpreter shutdown. +- General performance improvement. 
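The release note above pins a hard floor on cuda-bindings. A minimal sketch, assuming cuda.bindings exposes __version__ and that the third-party packaging module is available, of how a build script or CI step could verify that floor before building; the helper name is illustrative and not part of this series:

    # Illustrative check of the >= 12.6.2 floor described in the release notes.
    from packaging.version import Version

    import cuda.bindings  # assumed to expose __version__, e.g. "12.9.1" or "13.0.1"


    def bindings_meets_floor(minimum: str = "12.6.2") -> bool:
        # Version() compares dotted version numbers numerically, unlike a
        # plain string comparison.
        return Version(cuda.bindings.__version__) >= Version(minimum)


    if not bindings_meets_floor():
        raise RuntimeError(
            f"cuda-bindings {cuda.bindings.__version__} is too old; "
            "cuda.core requires >= 12.6.2 (new binding module layout)"
        )
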
From 0e6f9278739d75c2f62495a6770c4cc2c2395804 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 30 Sep 2025 14:48:23 +0000 Subject: [PATCH 23/27] fix env vars being passed twice --- cuda_bindings/pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/cuda_bindings/pyproject.toml b/cuda_bindings/pyproject.toml index dc6c87eef..f6a3c5f40 100644 --- a/cuda_bindings/pyproject.toml +++ b/cuda_bindings/pyproject.toml @@ -59,7 +59,6 @@ readme = { file = ["DESCRIPTION.rst"], content-type = "text/x-rst" } skip = "*-musllinux_*" enable = "cpython-freethreading" build-verbosity = 1 -environment-pass = ["CUDA_PATH", "CUDA_PYTHON_PARALLEL_LEVEL"] [tool.cibuildwheel.linux] archs = "native" From 0e28aa19e2eb8c5d0ab9acb61ee37b081c833448 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 30 Sep 2025 15:24:47 +0000 Subject: [PATCH 24/27] fix uuid handling, again --- cuda_core/cuda/core/experimental/_device.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx index d3cd4bf4c..2808d025b 100644 --- a/cuda_core/cuda/core/experimental/_device.pyx +++ b/cuda_core/cuda/core/experimental/_device.pyx @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 +cimport cpython from libc.stdint cimport uintptr_t from cuda.bindings cimport cydriver @@ -1083,8 +1084,7 @@ class Device: HANDLE_RETURN(cydriver.cuDeviceGetUuid_v2(&uuid, self._id)) ELSE: # 13.0+ HANDLE_RETURN(cydriver.cuDeviceGetUuid(&uuid, self._id)) - cdef bytearray uuid_b = bytearray(sizeof(uuid.bytes)) - uuid_b[:] = uuid.bytes + cdef bytes uuid_b = cpython.PyBytes_FromStringAndSize(uuid.bytes, sizeof(uuid.bytes)) cdef str uuid_hex = uuid_b.hex() # 8-4-4-4-12 return f"{uuid_hex[:8]}-{uuid_hex[8:12]}-{uuid_hex[12:16]}-{uuid_hex[16:20]}-{uuid_hex[20:]}" From 4ba0090a740cac0743186aa1bca6dd7cfc0c04fd Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 30 Sep 2025 12:51:17 -0400 Subject: [PATCH 25/27] Apply suggestions from code review Co-authored-by: Phillip Cloud <417981+cpcloud@users.noreply.github.com> --- ci/tools/merge_cuda_core_wheels.py | 47 ++++++++++++++---------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/ci/tools/merge_cuda_core_wheels.py b/ci/tools/merge_cuda_core_wheels.py index 51cc97ba3..992a13185 100644 --- a/ci/tools/merge_cuda_core_wheels.py +++ b/ci/tools/merge_cuda_core_wheels.py @@ -47,8 +47,8 @@ def run_command(cmd: List[str], cwd: Path = None, env: dict = os.environ) -> sub def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: """Merge multiple wheels into a single wheel with version-specific binaries.""" - print("\n=== Merging wheels ===") - print(f"Input wheels: {[w.name for w in wheels]}") + print("\n=== Merging wheels ===", file=sys.stderr) + print(f"Input wheels: {[w.name for w in wheels]}", file=sys.stderr) if len(wheels) == 1: raise RuntimeError("only one wheel is provided, nothing to merge") @@ -59,11 +59,11 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: extracted_wheels = [] for i, wheel in enumerate(wheels): - print(f"Extracting wheel {i + 1}/{len(wheels)}: {wheel.name}") + print(f"Extracting wheel {i + 1}/{len(wheels)}: {wheel.name}", file=sys.stderr) # Extract wheel - wheel unpack creates the directory itself run_command( [ - "python", + sys.executable, "-m", "wheel", "unpack", @@ -99,18 +99,18 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: cuda_version = wheels[i].name.split(".cu")[1].split(".")[0] base_dir = Path("cuda") / "core" / 
"experimental" # Copy from other wheels - print(f" Copying {wheel_dir} to {base_wheel}") + print(f" Copying {wheel_dir} to {base_wheel}", file=sys.stderr) shutil.copytree(wheel_dir / base_dir, base_wheel / base_dir / f"cu{cuda_version}") # Overwrite the __init__.py in versioned dirs - open(base_wheel / base_dir / f"cu{cuda_version}" / "__init__.py", "w").close() + os.truncate(base_wheel / base_dir / f"cu{cuda_version}" / "__init__.py", 0) # The base dir should only contain __init__.py, the include dir, and the versioned dirs - files_to_remove = os.listdir(base_wheel / base_dir) + files_to_remove = os.scandir(base_wheel / base_dir) for f in files_to_remove: - f_abspath = base_wheel / base_dir / f - if f not in ("__init__.py", "cu12", "cu13", "include"): - if os.path.isdir(f_abspath): + f_abspath = f.path + if f.name not in ("__init__.py", "cu12", "cu13", "include"): + if f.is_dir(): shutil.rmtree(f_abspath) else: os.remove(f_abspath) @@ -119,15 +119,12 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: output_dir.mkdir(parents=True, exist_ok=True) # Create a clean wheel name without CUDA version suffixes - base_wheel_name = wheels[0].name - # Remove any .cu* suffix from the wheel name - if ".cu" in base_wheel_name: - base_wheel_name = base_wheel_name.split(".cu")[0] + ".whl" + base_wheel_name = wheels[0].with_suffix(".whl").name - print(f"Repacking merged wheel as: {base_wheel_name}") + print(f"Repacking merged wheel as: {base_wheel_name}", file=sys.stderr) run_command( [ - "python", + sys.executable, "-m", "wheel", "pack", @@ -143,7 +140,7 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: raise RuntimeError("Failed to create merged wheel") merged_wheel = output_wheels[0] - print(f"Successfully merged wheel: {merged_wheel}") + print(f"Successfully merged wheel: {merged_wheel}", file=sys.stderr) return merged_wheel @@ -155,32 +152,32 @@ def main(): args = parser.parse_args() - print("cuda.core Wheel Merger") - print("======================") + print("cuda.core Wheel Merger", file=sys.stderr) + print("======================", file=sys.stderr) # Convert wheel paths to Path objects and validate wheels = [] for wheel_path in args.wheels: wheel = Path(wheel_path) if not wheel.exists(): - print(f"Error: Wheel not found: {wheel}") + print(f"Error: Wheel not found: {wheel}", file=sys.stderr) sys.exit(1) if not wheel.name.endswith(".whl"): - print(f"Error: Not a wheel file: {wheel}") + print(f"Error: Not a wheel file: {wheel}", file=sys.stderr) sys.exit(1) wheels.append(wheel) if not wheels: - print("Error: No wheels provided") + print("Error: No wheels provided", file=sys.stderr) sys.exit(1) output_dir = Path(args.output_dir) # Check that we have wheel tool available try: - run_command(["python", "-m", "wheel", "--help"]) - except Exception: - print("Error: wheel package not available. Install with: pip install wheel") + import wheel + except ImportError: + print("Error: wheel package not available. 
Install with: pip install wheel", file=sys.stderr) sys.exit(1) # Merge the wheels From 20c5a99130aa5a6eec640b88ace6d8a7ae19dda1 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 1 Oct 2025 23:13:44 +0000 Subject: [PATCH 26/27] address review comments --- .github/workflows/build-wheel.yml | 2 -- ci/tools/merge_cuda_core_wheels.py | 3 ++- cuda_core/cuda/core/experimental/__init__.py | 6 +++--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build-wheel.yml b/.github/workflows/build-wheel.yml index ab7672e7f..2c4be6695 100644 --- a/.github/workflows/build-wheel.yml +++ b/.github/workflows/build-wheel.yml @@ -155,7 +155,6 @@ jobs: path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}/*.whl if-no-files-found: error - # TODO: ideally we want to build against public cuda-bindings - name: Build cuda.core wheel uses: pypa/cibuildwheel@c923d83ad9c1bc00211c5041d0c3f73294ff88f6 # v3.1.4 with: @@ -289,7 +288,6 @@ jobs: cuda-version: ${{ inputs.prev-cuda-version }} cuda-path: "./cuda_toolkit_prev" - # TODO: ideally we want to build against public cuda-bindings - name: Download cuda.bindings build artifacts from the prior branch if: ${{ matrix.python-version == '3.13t' || matrix.python-version == '3.14' diff --git a/ci/tools/merge_cuda_core_wheels.py b/ci/tools/merge_cuda_core_wheels.py index 51cc97ba3..71ff069f3 100644 --- a/ci/tools/merge_cuda_core_wheels.py +++ b/ci/tools/merge_cuda_core_wheels.py @@ -1,8 +1,9 @@ +#!/usr/bin/env python3 + # SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 -#!/usr/bin/env python3 """ Script to merge CUDA-specific wheels into a single multi-CUDA wheel. diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index fb0a2f469..b0383e408 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -15,14 +15,14 @@ subdir = f"cu{cuda_major}" try: - verioned_mod = importlib.import_module(f".{subdir}", __package__) + versioned_mod = importlib.import_module(f".{subdir}", __package__) # Import all symbols from the module - globals().update(verioned_mod.__dict__) + globals().update(versioned_mod.__dict__) except ImportError: # This is not a wheel build, but a conda or local build, do nothing pass else: - del verioned_mod + del versioned_mod finally: del cuda.bindings, importlib, subdir, cuda_major, cuda_minor From d79e317cd11e450a8013da6cb66ebaeffed64d3b Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Thu, 2 Oct 2025 10:14:28 -0400 Subject: [PATCH 27/27] switch to use FutureWarning --- cuda_core/cuda/core/experimental/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index b0383e408..2bdcc4f83 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -32,7 +32,7 @@ if sys.version_info < (3, 10): warnings.warn( "support for Python 3.9 and below is deprecated and subject to future removal", - category=UserWarning, + category=FutureWarning, stacklevel=1, ) del sys, warnings
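
The final patch switches the Python 3.9 deprecation to a FutureWarning. A minimal usage sketch (not part of the series) of how a downstream test suite could escalate that warning to an error; note the warning is only emitted on the first import of cuda.core.experimental in a process, and on Python 3.10+ nothing is emitted at all:

    # Illustrative only: fail fast when running on a deprecated interpreter.
    import warnings

    with warnings.catch_warnings():
        warnings.simplefilter("error", FutureWarning)
        import cuda.core.experimental  # noqa: F401  # raises FutureWarning on Python 3.9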