diff --git a/.github/actions/fetch_ctk/action.yml b/.github/actions/fetch_ctk/action.yml index 83b447f0c..be7536c63 100644 --- a/.github/actions/fetch_ctk/action.yml +++ b/.github/actions/fetch_ctk/action.yml @@ -18,6 +18,11 @@ inputs: required: false type: string default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink,libcufile" + cuda-path: + description: "where the CTK components will be installed to, relative to $PWD" + required: false + type: string + default: "./cuda_toolkit" runs: using: composite @@ -159,18 +164,24 @@ runs: exit 1 fi + - name: Move CTK to the specified location + if: ${{ inputs.cuda-path != './cuda_toolkit' }} + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + mv ./cuda_toolkit ${{ inputs.cuda-path }} + - name: Set output environment variables shell: bash --noprofile --norc -xeuo pipefail {0} run: | # mimics actual CTK installation if [[ "${{ inputs.host-platform }}" == linux* ]]; then - CUDA_PATH=$(realpath "./cuda_toolkit") - echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${CUDA_PATH}/lib" >> $GITHUB_ENV + CUDA_PATH=$(realpath "${{ inputs.cuda-path }}") + echo "LD_LIBRARY_PATH=${CUDA_PATH}/lib:${LD_LIBRARY_PATH:-}" >> $GITHUB_ENV elif [[ "${{ inputs.host-platform }}" == win* ]]; then function normpath() { echo "$(echo $(cygpath -w $1) | sed 's/\\/\\\\/g')" } - CUDA_PATH=$(normpath $(realpath "./cuda_toolkit")) + CUDA_PATH=$(normpath $(realpath "${{ inputs.cuda-path }}")) echo "$(normpath ${CUDA_PATH}/bin)" >> $GITHUB_PATH fi echo "CUDA_PATH=${CUDA_PATH}" >> $GITHUB_ENV diff --git a/.github/workflows/build-wheel.yml b/.github/workflows/build-wheel.yml index 472833faa..90bd005d4 100644 --- a/.github/workflows/build-wheel.yml +++ b/.github/workflows/build-wheel.yml @@ -11,6 +11,9 @@ on: cuda-version: required: true type: string + prev-cuda-version: + required: true + type: string defaults: run: @@ -109,33 +112,6 @@ jobs: path: cuda_pathfinder/*.whl if-no-files-found: error - - name: Build 
cuda.core wheel - uses: pypa/cibuildwheel@7c619efba910c04005a835b110b057fc28fd6e93 # v3.2.0 - with: - package-dir: ./cuda_core/ - output-dir: ${{ env.CUDA_CORE_ARTIFACTS_DIR }} - - - name: List the cuda.core artifacts directory - run: | - if [[ "${{ inputs.host-platform }}" == win* ]]; then - export CHOWN=chown - else - export CHOWN="sudo chown" - fi - $CHOWN -R $(whoami) ${{ env.CUDA_CORE_ARTIFACTS_DIR }} - ls -lahR ${{ env.CUDA_CORE_ARTIFACTS_DIR }} - - - name: Check cuda.core wheel - run: | - twine check --strict ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl - - - name: Upload cuda.core build artifacts - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 - with: - name: ${{ env.CUDA_CORE_ARTIFACT_NAME }} - path: ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl - if-no-files-found: error - - name: Set up mini CTK uses: ./.github/actions/fetch_ctk continue-on-error: false @@ -148,6 +124,15 @@ jobs: with: package-dir: ./cuda_bindings/ output-dir: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} + env: + CIBW_BUILD: ${{ env.CIBW_BUILD }} + # CIBW mounts the host filesystem under /host + CIBW_ENVIRONMENT_LINUX: > + CUDA_PATH=/host/${{ env.CUDA_PATH }} + CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }} + CIBW_ENVIRONMENT_WINDOWS: > + CUDA_PATH="$(cygpath -w ${{ env.CUDA_PATH }})" + CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }} - name: List the cuda.bindings artifacts directory run: | @@ -170,6 +155,47 @@ jobs: path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}/*.whl if-no-files-found: error + - name: Build cuda.core wheel + uses: pypa/cibuildwheel@c923d83ad9c1bc00211c5041d0c3f73294ff88f6 # v3.1.4 + with: + package-dir: ./cuda_core/ + output-dir: ${{ env.CUDA_CORE_ARTIFACTS_DIR }} + env: + CIBW_BUILD: ${{ env.CIBW_BUILD }} + # CIBW mounts the host filesystem under /host + CIBW_ENVIRONMENT_LINUX: > + CUDA_PATH=/host/${{ env.CUDA_PATH }} + CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }} + CUDA_CORE_BUILD_MAJOR=${{ 
env.BUILD_CUDA_MAJOR }} + PIP_FIND_LINKS=/host/${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} + CIBW_ENVIRONMENT_WINDOWS: > + CUDA_PATH="$(cygpath -w ${{ env.CUDA_PATH }})" + CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }} + CUDA_CORE_BUILD_MAJOR=${{ env.BUILD_CUDA_MAJOR }} + PIP_FIND_LINKS="$(cygpath -w ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }})" + + - name: List the cuda.core artifacts directory and rename + run: | + if [[ "${{ inputs.host-platform }}" == win* ]]; then + export CHOWN=chown + else + export CHOWN="sudo chown" + fi + $CHOWN -R $(whoami) ${{ env.CUDA_CORE_ARTIFACTS_DIR }} + + # Rename wheel to include CUDA version suffix + mkdir -p "${{ env.CUDA_CORE_ARTIFACTS_DIR }}/cu${BUILD_CUDA_MAJOR}" + for wheel in ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl; do + if [[ -f "${wheel}" ]]; then + base_name=$(basename "${wheel}" .whl) + new_name="${base_name}.cu${BUILD_CUDA_MAJOR}.whl" + mv "${wheel}" "${{ env.CUDA_CORE_ARTIFACTS_DIR }}/cu${BUILD_CUDA_MAJOR}/${new_name}" + echo "Renamed wheel to: ${new_name}" + fi + done + + ls -lahR ${{ env.CUDA_CORE_ARTIFACTS_DIR }} + # We only need/want a single pure python wheel, pick linux-64 index 0. 
- name: Build and check cuda-python wheel if: ${{ strategy.job-index == 0 && inputs.host-platform == 'linux-64' }} @@ -241,7 +267,7 @@ jobs: - name: Build cuda.core Cython tests run: | - pip install ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl --group ./cuda_core/pyproject.toml:test + pip install ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/"cu${BUILD_CUDA_MAJOR}"/*.whl --group ./cuda_core/pyproject.toml:test pushd ${{ env.CUDA_CORE_CYTHON_TESTS_DIR }} bash build_tests.sh popd @@ -252,3 +278,107 @@ jobs: name: ${{ env.CUDA_CORE_ARTIFACT_NAME }}-tests path: ${{ env.CUDA_CORE_CYTHON_TESTS_DIR }}/test_*${{ env.PY_EXT_SUFFIX }} if-no-files-found: error + + # Note: This overwrites CUDA_PATH etc + - name: Set up mini CTK + uses: ./.github/actions/fetch_ctk + continue-on-error: false + with: + host-platform: ${{ inputs.host-platform }} + cuda-version: ${{ inputs.prev-cuda-version }} + cuda-path: "./cuda_toolkit_prev" + + - name: Download cuda.bindings build artifacts from the prior branch + if: ${{ matrix.python-version == '3.13t' + || matrix.python-version == '3.14' + || matrix.python-version == '3.14t' }} + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + if ! (command -v gh 2>&1 >/dev/null); then + # See https://github.com/cli/cli/blob/trunk/docs/install_linux.md#debian-ubuntu-linux-raspberry-pi-os-apt. + # gh is needed for artifact fetching. 
+ mkdir -p -m 755 /etc/apt/keyrings \ + && out=$(mktemp) && wget -nv -O$out https://cli.github.com/packages/githubcli-archive-keyring.gpg \ + && cat $out | tee /etc/apt/keyrings/githubcli-archive-keyring.gpg > /dev/null \ + && chmod go+r /etc/apt/keyrings/githubcli-archive-keyring.gpg \ + && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | tee /etc/apt/sources.list.d/github-cli.list > /dev/null \ + && apt update \ + && apt install gh -y + fi + + OLD_BRANCH=$(cat .github/BACKPORT_BRANCH) + OLD_BASENAME="cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda*-${{ inputs.host-platform }}*" + LATEST_PRIOR_RUN_ID=$(gh run list -b ${OLD_BRANCH} -L 1 -w "ci.yml" -s completed -R NVIDIA/cuda-python --json databaseId | jq '.[]| .databaseId') + if [[ "$LATEST_PRIOR_RUN_ID" == "" ]]; then + echo "LATEST_PRIOR_RUN_ID not found!" + exit 1 + fi + + gh run download $LATEST_PRIOR_RUN_ID -p ${OLD_BASENAME} -R NVIDIA/cuda-python + rm -rf ${OLD_BASENAME}-tests # exclude cython test artifacts + ls -al $OLD_BASENAME + mkdir -p "${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}" + mv $OLD_BASENAME/*.whl "${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}" + rmdir $OLD_BASENAME + + - name: Build cuda.core wheel + uses: pypa/cibuildwheel@c923d83ad9c1bc00211c5041d0c3f73294ff88f6 # v3.1.4 + with: + package-dir: ./cuda_core/ + output-dir: ${{ env.CUDA_CORE_ARTIFACTS_DIR }} + env: + CIBW_BUILD: ${{ env.CIBW_BUILD }} + # CIBW mounts the host filesystem under /host + CIBW_ENVIRONMENT_LINUX: > + CUDA_PATH=/host/${{ env.CUDA_PATH }} + CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }} + CUDA_CORE_BUILD_MAJOR=${{ env.BUILD_PREV_CUDA_MAJOR }} + PIP_FIND_LINKS=/host/${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }} + CIBW_ENVIRONMENT_WINDOWS: > + CUDA_PATH="$(cygpath -w ${{ env.CUDA_PATH }})" + CUDA_PYTHON_PARALLEL_LEVEL=${{ env.CUDA_PYTHON_PARALLEL_LEVEL }} + CUDA_CORE_BUILD_MAJOR=${{ env.BUILD_PREV_CUDA_MAJOR }} + 
PIP_FIND_LINKS="$(cygpath -w ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }})" + + - name: List the cuda.core artifacts directory and rename + run: | + if [[ "${{ inputs.host-platform }}" == win* ]]; then + export CHOWN=chown + else + export CHOWN="sudo chown" + fi + $CHOWN -R $(whoami) ${{ env.CUDA_CORE_ARTIFACTS_DIR }} + ls -lahR ${{ env.CUDA_CORE_ARTIFACTS_DIR }} + + # Rename wheel to include CUDA version suffix + mkdir -p "${{ env.CUDA_CORE_ARTIFACTS_DIR }}/cu${BUILD_PREV_CUDA_MAJOR}" + for wheel in ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl; do + if [[ -f "${wheel}" ]]; then + base_name=$(basename "${wheel}" .whl) + new_name="${base_name}.cu${BUILD_PREV_CUDA_MAJOR}.whl" + mv "${wheel}" "${{ env.CUDA_CORE_ARTIFACTS_DIR }}/cu${BUILD_PREV_CUDA_MAJOR}/${new_name}" + echo "Renamed wheel to: ${new_name}" + fi + done + + ls -lahR ${{ env.CUDA_CORE_ARTIFACTS_DIR }} + + - name: Merge cuda.core wheels + run: | + pip install wheel + python ci/tools/merge_cuda_core_wheels.py \ + "${{ env.CUDA_CORE_ARTIFACTS_DIR }}"/cu"${BUILD_CUDA_MAJOR}"/cuda_core*.whl \ + "${{ env.CUDA_CORE_ARTIFACTS_DIR }}"/cu"${BUILD_PREV_CUDA_MAJOR}"/cuda_core*.whl \ + --output-dir "${{ env.CUDA_CORE_ARTIFACTS_DIR }}" + + - name: Check cuda.core wheel + run: | + twine check --strict ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl + + - name: Upload cuda.core build artifacts + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + with: + name: ${{ env.CUDA_CORE_ARTIFACT_NAME }} + path: ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl + if-no-files-found: error diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cfe8bc7a6..e8641119a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,17 +21,21 @@ jobs: runs-on: ubuntu-latest outputs: CUDA_BUILD_VER: ${{ steps.get-vars.outputs.cuda_build_ver }} + CUDA_PREV_BUILD_VER: ${{ steps.get-vars.outputs.cuda_prev_build_ver }} steps: - name: Checkout repository uses: 
actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: fetch-depth: 0 - - name: Get CUDA build version + - name: Get CUDA build versions id: get-vars run: | cuda_build_ver=$(jq -r .cuda.build.version ci/versions.json) echo "cuda_build_ver=$cuda_build_ver" >> $GITHUB_OUTPUT + cuda_prev_build_ver=$(jq -r .cuda.prev_build.version ci/versions.json) + echo "cuda_prev_build_ver=$cuda_prev_build_ver" >> $GITHUB_OUTPUT + should-skip: runs-on: ubuntu-latest outputs: @@ -69,6 +73,7 @@ jobs: with: host-platform: ${{ matrix.host-platform }} cuda-version: ${{ needs.ci-vars.outputs.CUDA_BUILD_VER }} + prev-cuda-version: ${{ needs.ci-vars.outputs.CUDA_PREV_BUILD_VER }} # WARNING: make sure all of the build jobs are in sync build-linux-aarch64: @@ -87,6 +92,7 @@ jobs: with: host-platform: ${{ matrix.host-platform }} cuda-version: ${{ needs.ci-vars.outputs.CUDA_BUILD_VER }} + prev-cuda-version: ${{ needs.ci-vars.outputs.CUDA_PREV_BUILD_VER }} # WARNING: make sure all of the build jobs are in sync build-windows: @@ -105,6 +111,7 @@ jobs: with: host-platform: ${{ matrix.host-platform }} cuda-version: ${{ needs.ci-vars.outputs.CUDA_BUILD_VER }} + prev-cuda-version: ${{ needs.ci-vars.outputs.CUDA_PREV_BUILD_VER }} # WARNING: make sure both Linux test jobs are in sync test-linux-64: diff --git a/.spdx-ignore b/.spdx-ignore index 60435ebb5..84f051faf 100644 --- a/.spdx-ignore +++ b/.spdx-ignore @@ -10,4 +10,4 @@ requirements*.txt cuda_bindings/examples/* # Vendored -cuda_core/cuda/core/experimental/dlpack.h +cuda_core/cuda/core/experimental/include/dlpack.h diff --git a/ci/tools/env-vars b/ci/tools/env-vars index de4a5a6b9..f7db5179d 100755 --- a/ci/tools/env-vars +++ b/ci/tools/env-vars @@ -41,6 +41,9 @@ if [[ "${1}" == "build" ]]; then # platform is handled by the default value of platform (`auto`) in cibuildwheel # here we only need to specify the python version we want echo "CIBW_BUILD=cp${PYTHON_VERSION_FORMATTED}-*" >> $GITHUB_ENV + BUILD_CUDA_MAJOR="$(cut -d '.' 
-f 1 <<< ${CUDA_VER})" + echo "BUILD_CUDA_MAJOR=${BUILD_CUDA_MAJOR}" >> $GITHUB_ENV + echo "BUILD_PREV_CUDA_MAJOR=$((${BUILD_CUDA_MAJOR} - 1))" >> $GITHUB_ENV CUDA_BINDINGS_ARTIFACT_BASENAME="cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda${CUDA_VER}-${HOST_PLATFORM}" elif [[ "${1}" == "test" ]]; then BUILD_CUDA_MAJOR="$(cut -d '.' -f 1 <<< ${BUILD_CUDA_VER})"
diff --git a/ci/tools/merge_cuda_core_wheels.py b/ci/tools/merge_cuda_core_wheels.py
new file mode 100644
index 000000000..359b98d6a
--- /dev/null
+++ b/ci/tools/merge_cuda_core_wheels.py
@@ -0,0 +1,242 @@
+#!/usr/bin/env python3
+
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Script to merge CUDA-specific wheels into a single multi-CUDA wheel.
+
+This script takes wheels built for different CUDA versions (cu12, cu13) and merges them
+into a single wheel that supports both CUDA versions.
+
+In particular, each wheel contains a CUDA-specific build of the `cuda.core` library
+and the associated bindings. This script merges these directories into a single wheel
+that supports both CUDA versions, i.e., containing both `cuda/core/experimental/cu12`
+and `cuda/core/experimental/cu13`. At runtime, the code in `cuda/core/experimental/__init__.py`
+is used to import the appropriate CUDA-specific bindings.
+
+This script is based on the one in NVIDIA/CCCL.
+"""
+
+import argparse
+import importlib.util
+import os
+import re
+import shutil
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+from typing import List, Optional
+
+# Input wheels carry a ".cu<major>" marker right before ".whl" (appended by the CI
+# rename step), e.g. "cuda_core-<version>-<tags>.cu12.whl".
+_CUDA_SUFFIX_RE = re.compile(r"\.cu(\d+)\.whl$")
+
+
+def run_command(cmd: List[str], cwd: Optional[Path] = None, env: Optional[dict] = None) -> subprocess.CompletedProcess:
+    """Run ``cmd`` and return the completed process, echoing its output on failure.
+
+    Args:
+        cmd: Program and arguments, passed to ``subprocess.run`` without a shell.
+        cwd: Working directory for the child process; ``None`` keeps the current one.
+        env: Environment for the child process; ``None`` inherits the current one.
+
+    Returns:
+        The ``subprocess.CompletedProcess`` with stdout/stderr captured as text.
+
+    Raises:
+        subprocess.CalledProcessError: If the command exits with a non-zero status.
+    """
+    print(f"Running: {' '.join(cmd)}")
+    if cwd:
+        print(f"  Working directory: {cwd}")
+
+    result = subprocess.run(cmd, cwd=cwd, env=env, capture_output=True, text=True)  # noqa: S603
+
+    if result.returncode != 0:
+        print(f"Command failed with return code {result.returncode}")
+        print("STDOUT:", result.stdout)
+        print("STDERR:", result.stderr)
+        result.check_returncode()
+
+    return result
+
+
+def merge_wheels(wheels: List[Path], output_dir: Path) -> Path:
+    """Merge per-CUDA-version wheels into one wheel with ``cu<major>`` subpackages.
+
+    Every input wheel is unpacked and its ``cuda/core/experimental`` tree is copied to
+    ``cuda/core/experimental/cu<major>`` inside the first wheel, which is then repacked.
+
+    Args:
+        wheels: At least two wheel paths whose names end in ``.cu<major>.whl``.
+        output_dir: Directory that receives the merged wheel (created if missing).
+
+    Returns:
+        Path of the merged wheel inside ``output_dir``.
+
+    Raises:
+        RuntimeError: If fewer than two wheels are given, a name lacks the
+            ``.cu<major>.whl`` suffix, two wheels share a CUDA major, an unpacked
+            wheel cannot be located, or no merged wheel exists after repacking.
+        subprocess.CalledProcessError: If ``wheel unpack`` or ``wheel pack`` fails.
+    """
+    print("\n=== Merging wheels ===", file=sys.stderr)
+    print(f"Input wheels: {[w.name for w in wheels]}", file=sys.stderr)
+
+    if len(wheels) < 2:
+        raise RuntimeError(f"need at least two wheels to merge, got {len(wheels)}")
+
+    # Validate the names up front so that a bad input fails before anything is unpacked
+    cuda_versions = []
+    for wheel in wheels:
+        match = _CUDA_SUFFIX_RE.search(wheel.name)
+        if match is None:
+            raise RuntimeError(f"Wheel name has no '.cu<major>.whl' suffix: {wheel.name}")
+        cuda_versions.append(match.group(1))
+    if len(set(cuda_versions)) != len(cuda_versions):
+        raise RuntimeError(f"Wheels must target distinct CUDA majors, got {cuda_versions}")
+
+    base_dir = Path("cuda") / "core" / "experimental"
+
+    # Extract all wheels to temporary directories
+    with tempfile.TemporaryDirectory() as temp_dir:
+        temp_path = Path(temp_dir)
+        extracted_wheels = []
+
+        for i, wheel in enumerate(wheels):
+            print(f"Extracting wheel {i + 1}/{len(wheels)}: {wheel.name}", file=sys.stderr)
+            # Extract wheel - wheel unpack creates the directory itself
+            run_command(
+                [
+                    sys.executable,
+                    "-m",
+                    "wheel",
+                    "unpack",
+                    str(wheel),
+                    "--dest",
+                    str(temp_path),
+                ]
+            )
+
+            # Find the extracted directory (wheel unpack creates a subdirectory)
+            extract_dir = None
+            for item in temp_path.iterdir():
+                if item.is_dir() and item.name.startswith("cuda_core"):
+                    extract_dir = item
+                    break
+
+            if not extract_dir:
+                raise RuntimeError(f"Could not find extracted wheel directory for {wheel.name}")
+
+            # Rename to our expected name
+            expected_name = temp_path / f"wheel_{i}"
+            extract_dir.rename(expected_name)
+            extract_dir = expected_name
+
+            extracted_wheels.append(extract_dir)
+
+        # Use the first wheel as the base and merge binaries from others
+        base_wheel = extracted_wheels[0]
+
+        # Copy every wheel's package tree (the base wheel's own included) into its
+        # versioned subdirectory of the base wheel
+        for cuda_version, wheel_dir in zip(cuda_versions, extracted_wheels):
+            versioned_dir = base_wheel / base_dir / f"cu{cuda_version}"
+            print(f"  Copying {wheel_dir} to {base_wheel}", file=sys.stderr)
+            shutil.copytree(wheel_dir / base_dir, versioned_dir)
+
+            # Overwrite the __init__.py in versioned dirs
+            os.truncate(versioned_dir / "__init__.py", 0)
+
+        # The base dir should only contain __init__.py, the include dir, and the versioned dirs
+        keep = {"__init__.py", "include", *(f"cu{v}" for v in cuda_versions)}
+        # Snapshot the listing first, because entries are deleted while walking it
+        for entry in list((base_wheel / base_dir).iterdir()):
+            if entry.name in keep:
+                continue
+            if entry.is_dir():
+                shutil.rmtree(entry)
+            else:
+                entry.unlink()
+
+        # Repack the merged wheel
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        # The merged wheel is expected to be named like the input minus the ".cu<major>" marker
+        base_wheel_name = _CUDA_SUFFIX_RE.sub(".whl", wheels[0].name)
+
+        print(f"Repacking merged wheel as: {base_wheel_name}", file=sys.stderr)
+        run_command(
+            [
+                sys.executable,
+                "-m",
+                "wheel",
+                "pack",
+                str(base_wheel),
+                "--dest-dir",
+                str(output_dir),
+            ]
+        )
+
+        # Find the output wheel. Prefer the expected name so that an unrelated wheel already
+        # present in output_dir is not reported; otherwise take the most recently written one.
+        merged_wheel = output_dir / base_wheel_name
+        if not merged_wheel.exists():
+            output_wheels = sorted(output_dir.glob("*.whl"), key=lambda p: p.stat().st_mtime)
+            if not output_wheels:
+                raise RuntimeError("Failed to create merged wheel")
+            merged_wheel = output_wheels[-1]
+
+        print(f"Successfully merged wheel: {merged_wheel}", file=sys.stderr)
+        return merged_wheel
+
+
+def main():
+    """Parse the command line, validate the input wheels, and merge them.
+
+    Exits with status 1 if an input path does not exist, is not a ``.whl`` file, or
+    the ``wheel`` package (used for ``wheel unpack``/``wheel pack``) is not installed.
+    """
+    parser = argparse.ArgumentParser(description="Merge CUDA-specific wheels into a single multi-CUDA wheel")
+    parser.add_argument("wheels", nargs="+", help="Paths to the CUDA-specific wheels to merge")
+    parser.add_argument("--output-dir", "-o", default="dist", help="Output directory for merged wheel")
+
+    args = parser.parse_args()
+
+    print("cuda.core Wheel Merger", file=sys.stderr)
+    print("======================", file=sys.stderr)
+
+    # Convert wheel paths to Path objects and validate
+    wheels = []
+    for wheel_path in args.wheels:
+        wheel = Path(wheel_path)
+        if not wheel.exists():
+            print(f"Error: Wheel not found: {wheel}", file=sys.stderr)
+            sys.exit(1)
+        if not wheel.name.endswith(".whl"):
+            print(f"Error: Not a wheel file: {wheel}", file=sys.stderr)
+            sys.exit(1)
+        wheels.append(wheel)
+
+    if not wheels:
+        print("Error: No wheels provided", file=sys.stderr)
+        sys.exit(1)
+
+    output_dir = Path(args.output_dir)
+
+    # Check that the wheel tool is available. find_spec avoids importing the package,
+    # which would rebind the ``wheel`` name used for the input paths above.
+    if importlib.util.find_spec("wheel") is None:
+        print("Error: wheel package not available. Install with: pip install wheel", file=sys.stderr)
+        sys.exit(1)
+
+    # Merge the wheels
+    merged_wheel = merge_wheels(wheels, output_dir)
+    print(f"\nMerge complete!
Output: {merged_wheel}") + + +if __name__ == "__main__": + main() diff --git a/ci/versions.json b/ci/versions.json index 271c69ac3..2acfae1e3 100644 --- a/ci/versions.json +++ b/ci/versions.json @@ -2,6 +2,9 @@ "cuda": { "build": { "version": "13.0.1" + }, + "prev_build": { + "version": "12.9.1" } } } diff --git a/cuda_bindings/pyproject.toml b/cuda_bindings/pyproject.toml index 5545c966c..250f8e407 100644 --- a/cuda_bindings/pyproject.toml +++ b/cuda_bindings/pyproject.toml @@ -59,16 +59,11 @@ readme = { file = ["DESCRIPTION.rst"], content-type = "text/x-rst" } skip = "*-musllinux_*" enable = "cpython-freethreading" build-verbosity = 1 -environment-pass = ["CUDA_PATH", "CUDA_PYTHON_PARALLEL_LEVEL"] [tool.cibuildwheel.linux] archs = "native" -# CIBW mounts the host filesystem under /host -environment-pass = ["CUDA_PATH"] -environment = { CUDA_HOME = "/host/$CUDA_PATH" } [tool.cibuildwheel.windows] archs = "AMD64" before-build = "pip install delvewheel" repair-wheel-command = "delvewheel repair --namespace-pkg cuda -w {dest_dir} {wheel}" -environment = { CUDA_HOME = "$(cygpath -w $CUDA_PATH)" }
diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py
new file mode 100644
index 000000000..7c5fd4672
--- /dev/null
+++ b/cuda_core/build_hooks.py
@@ -0,0 +1,136 @@
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# This module implements basic PEP 517 backend support, see e.g.
+# - https://peps.python.org/pep-0517/
+# - https://setuptools.pypa.io/en/latest/build_meta.html#dynamic-build-dependencies-and-other-build-meta-tweaks
+# Specifically, there are 5 APIs required to create a proper build backend, see below.
+#
+# TODO: also implement PEP-660 API hooks
+
+import functools
+import glob
+import os
+import re
+import subprocess
+
+from Cython.Build import cythonize
+from setuptools import Extension
+from setuptools import build_meta as _build_meta
+
+prepare_metadata_for_build_wheel = _build_meta.prepare_metadata_for_build_wheel
+build_sdist = _build_meta.build_sdist
+get_requires_for_build_sdist = _build_meta.get_requires_for_build_sdist
+
+
+@functools.cache
+def _get_proper_cuda_bindings_major_version() -> str:
+    """Return the CUDA major version (as a string) to build cuda.core against.
+
+    Sources are tried in order and the first hit wins: an importable
+    ``cuda.bindings``, the ``CUDA_CORE_BUILD_MAJOR`` environment variable, the
+    "CUDA Version" printed by ``nvidia-smi``, and finally the default ``"13"``.
+    The result is cached for the lifetime of the process.
+    """
+    # for local development (with/without build isolation)
+    try:
+        import cuda.bindings
+
+        return cuda.bindings.__version__.split(".")[0]
+    except ImportError:
+        pass
+
+    # for custom override, e.g. in CI (only consulted when cuda.bindings is not importable)
+    cuda_major = os.environ.get("CUDA_CORE_BUILD_MAJOR")
+    if cuda_major is not None:
+        return cuda_major
+
+    # also for local development
+    try:
+        out = subprocess.run("nvidia-smi", env=os.environ, capture_output=True, check=True)  # noqa: S603, S607
+        m = re.search(r"CUDA Version:\s*([\d\.]+)", out.stdout.decode())
+        if m:
+            return m.group(1).split(".")[0]
+    except (OSError, subprocess.CalledProcessError):
+        # nvidia-smi is missing or cannot be executed (OSError), or it exited with a
+        # non-zero status (CalledProcessError): fall through to the default below
+        pass
+
+    # default fallback
+    return "13"
+
+
+# used later by setup()
+_extensions = None
+
+
+def build_wheel(wheel_directory, config_settings=None, metadata_directory=None):
+    """PEP 517 ``build_wheel`` hook: cythonize the ``.pyx`` sources, then defer to setuptools.
+
+    The cythonized extensions are stored in the module-level ``_extensions``.
+
+    Raises:
+        RuntimeError: If an extension is found and neither ``CUDA_PATH`` nor
+            ``CUDA_HOME`` is set.
+    """
+    # Customizing this hook is needed because we must defer cythonization until cuda-bindings,
+    # now a required build-time dependency that's dynamically installed via the other hook below,
+    # is installed. Otherwise, cimport any cuda.bindings modules would fail!
+
+    # It seems setuptools' wildcard support has problems for namespace packages,
+    # so we explicitly spell out all Extension instances.
+    root_module = "cuda.core.experimental"
+    root_path = f"{os.path.sep}".join(root_module.split(".")) + os.path.sep
+    ext_files = glob.glob(f"{root_path}/**/*.pyx", recursive=True)
+
+    def strip_prefix_suffix(filename):
+        return filename[len(root_path) : -4]
+
+    module_names = (strip_prefix_suffix(f) for f in ext_files)
+
+    @functools.cache
+    def get_cuda_paths():
+        CUDA_PATH = os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME", None))
+        if not CUDA_PATH:
+            raise RuntimeError("Environment variable CUDA_PATH or CUDA_HOME is not set")
+        CUDA_PATH = CUDA_PATH.split(os.pathsep)
+        print("CUDA paths:", CUDA_PATH)
+        return CUDA_PATH
+
+    ext_modules = tuple(
+        Extension(
+            f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}",
+            sources=[f"cuda/core/experimental/{mod}.pyx"],
+            include_dirs=list(os.path.join(root, "include") for root in get_cuda_paths()),
+            language="c++",
+        )
+        for mod in module_names
+    )
+
+    # os.cpu_count() may return None; the default is evaluated even when the variable is set
+    nthreads = int(os.environ.get("CUDA_PYTHON_PARALLEL_LEVEL", (os.cpu_count() or 1) // 2))
+    compile_time_env = {"CUDA_CORE_BUILD_MAJOR": _get_proper_cuda_bindings_major_version()}
+
+    global _extensions
+    _extensions = cythonize(
+        ext_modules,
+        verbose=True,
+        language_level=3,
+        nthreads=nthreads,
+        compiler_directives={"embedsignature": True, "warn.deprecated.IF": False},
+        compile_time_env=compile_time_env,
+    )
+
+    return _build_meta.build_wheel(wheel_directory, config_settings, metadata_directory)
+
+
+def get_requires_for_build_wheel(config_settings=None):
+    """PEP 517 hook: setuptools' build requirements plus a matching ``cuda-bindings`` pin.
+
+    The pin is ``cuda-bindings==<major>.*``, with the major chosen by
+    ``_get_proper_cuda_bindings_major_version``.
+    """
+    cuda_major = _get_proper_cuda_bindings_major_version()
+    cuda_bindings_require = [f"cuda-bindings=={cuda_major}.*"]
+    return _build_meta.get_requires_for_build_wheel(config_settings) + cuda_bindings_require
diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index a01134373..dc4d20222 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -2,29 +2,64 @@ # # SPDX-License-Identifier: Apache-2.0 -from
cuda.core.experimental import utils -from cuda.core.experimental._device import Device -from cuda.core.experimental._event import Event, EventOptions -from cuda.core.experimental._graph import ( +try: + import cuda.bindings +except ImportError: + raise ImportError("cuda.bindings 12.x or 13.x must be installed") from None +else: + cuda_major, cuda_minor = cuda.bindings.__version__.split(".")[:2] + if cuda_major not in ("12", "13"): + raise ImportError("cuda.bindings 12.x or 13.x must be installed") + +import importlib + +subdir = f"cu{cuda_major}" +try: + versioned_mod = importlib.import_module(f".{subdir}", __package__) + # Import all symbols from the module + globals().update(versioned_mod.__dict__) +except ImportError: + # This is not a wheel build, but a conda or local build, do nothing + pass +else: + del versioned_mod +finally: + del cuda.bindings, importlib, subdir, cuda_major, cuda_minor + +import sys # noqa: E402 +import warnings # noqa: E402 + +if sys.version_info < (3, 10): + warnings.warn( + "support for Python 3.9 and below is deprecated and subject to future removal", + category=FutureWarning, + stacklevel=1, + ) +del sys, warnings + +from cuda.core.experimental import utils # noqa: E402 +from cuda.core.experimental._device import Device # noqa: E402 +from cuda.core.experimental._event import Event, EventOptions # noqa: E402 +from cuda.core.experimental._graph import ( # noqa: E402 Graph, GraphBuilder, GraphCompleteOptions, GraphDebugPrintOptions, ) -from cuda.core.experimental._launch_config import LaunchConfig -from cuda.core.experimental._launcher import launch -from cuda.core.experimental._linker import Linker, LinkerOptions -from cuda.core.experimental._memory import ( +from cuda.core.experimental._launch_config import LaunchConfig # noqa: E402 +from cuda.core.experimental._launcher import launch # noqa: E402 +from cuda.core.experimental._linker import Linker, LinkerOptions # noqa: E402 +from cuda.core.experimental._memory import ( # noqa: E402 
Buffer, DeviceMemoryResource, DeviceMemoryResourceOptions, LegacyPinnedMemoryResource, MemoryResource, ) -from cuda.core.experimental._module import Kernel, ObjectCode -from cuda.core.experimental._program import Program, ProgramOptions -from cuda.core.experimental._stream import Stream, StreamOptions -from cuda.core.experimental._system import System +from cuda.core.experimental._module import Kernel, ObjectCode # noqa: E402 +from cuda.core.experimental._program import Program, ProgramOptions # noqa: E402 +from cuda.core.experimental._stream import Stream, StreamOptions # noqa: E402 +from cuda.core.experimental._system import System # noqa: E402 system = System() __import__("sys").modules[__spec__.name + ".system"] = system diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.pyx similarity index 91% rename from cuda_core/cuda/core/experimental/_device.py rename to cuda_core/cuda/core/experimental/_device.pyx index be8c5170a..ae1c7f38c 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.pyx @@ -2,6 +2,13 @@ # # SPDX-License-Identifier: Apache-2.0 +cimport cpython +from libc.stdint cimport uintptr_t + +from cuda.bindings cimport cydriver + +from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN + import threading from typing import Optional, Union @@ -14,41 +21,44 @@ from cuda.core.experimental._utils.cuda_utils import ( ComputeCapability, CUDAError, - _check_driver_error, driver, handle_return, runtime, ) + _tls = threading.local() _lock = threading.Lock() -_is_cuInit = False +cdef bint _is_cuInit = False -class DeviceProperties: +cdef class DeviceProperties: """ A class to query various attributes of a CUDA device. Attributes are read-only and provide information about the device. 
""" + cdef: + int _handle + dict _cache - def __new__(self, *args, **kwargs): + def __init__(self, *args, **kwargs): raise RuntimeError("DeviceProperties cannot be instantiated directly. Please use Device APIs.") - __slots__ = ("_handle", "_cache") - @classmethod def _init(cls, handle): - self = super().__new__(cls) + cdef DeviceProperties self = DeviceProperties.__new__(cls) self._handle = handle self._cache = {} return self - def _get_attribute(self, attr): + cdef inline _get_attribute(self, cydriver.CUdevice_attribute attr): """Retrieve the attribute value directly from the driver.""" - return handle_return(driver.cuDeviceGetAttribute(attr, self._handle)) + cdef int val + HANDLE_RETURN(cydriver.cuDeviceGetAttribute(&val, attr, self._handle)) + return val - def _get_cached_attribute(self, attr): + cdef _get_cached_attribute(self, attr): """Retrieve the attribute value, using cache if applicable.""" if attr not in self._cache: self._cache[attr] = self._get_attribute(attr) @@ -931,8 +941,17 @@ def multicast_supported(self) -> bool: return bool(self._get_cached_attribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED)) -_SUCCESS = driver.CUresult.CUDA_SUCCESS -_INVALID_CTX = driver.CUresult.CUDA_ERROR_INVALID_CONTEXT +cdef cydriver.CUcontext _get_primary_context(int dev_id) except?NULL: + try: + primary_ctxs = _tls.primary_ctxs + except AttributeError: + total = len(_tls.devices) + primary_ctxs = _tls.primary_ctxs = [0] * total + cdef cydriver.CUcontext ctx = (primary_ctxs[dev_id]) + if ctx == NULL: + HANDLE_RETURN(cydriver.cuDevicePrimaryCtxRetain(&ctx, dev_id)) + primary_ctxs[dev_id] = (ctx) + return ctx class Device: @@ -961,55 +980,56 @@ class Device: Default value of `None` return the currently used device. 
""" - __slots__ = ("_id", "_mr", "_has_inited", "_properties") def __new__(cls, device_id: Optional[int] = None): global _is_cuInit if _is_cuInit is False: with _lock: - handle_return(driver.cuInit(0)) + HANDLE_RETURN(cydriver.cuInit(0)) _is_cuInit = True # important: creating a Device instance does not initialize the GPU! + cdef cydriver.CUdevice dev + cdef cydriver.CUcontext ctx if device_id is None: - err, dev = driver.cuCtxGetDevice() - if err == _SUCCESS: + err = cydriver.cuCtxGetDevice(&dev) + if err == cydriver.CUresult.CUDA_SUCCESS: device_id = int(dev) - elif err == _INVALID_CTX: - ctx = handle_return(driver.cuCtxGetCurrent()) - assert int(ctx) == 0 + elif err == cydriver.CUresult.CUDA_ERROR_INVALID_CONTEXT: + HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx)) + assert (ctx) == NULL device_id = 0 # cudart behavior else: - _check_driver_error(err) + HANDLE_RETURN(err) elif device_id < 0: raise ValueError(f"device_id must be >= 0, got {device_id}") # ensure Device is singleton + cdef int total, attr try: devices = _tls.devices except AttributeError: - total = handle_return(driver.cuDeviceGetCount()) + HANDLE_RETURN(cydriver.cuDeviceGetCount(&total)) devices = _tls.devices = [] for dev_id in range(total): - dev = super().__new__(cls) - dev._id = dev_id + device = super().__new__(cls) + device._id = dev_id # If the device is in TCC mode, or does not support memory pools for some other reason, # use the SynchronousMemoryResource which does not use memory pools. 
- if ( - handle_return( - driver.cuDeviceGetAttribute( - driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev_id - ) + HANDLE_RETURN( + cydriver.cuDeviceGetAttribute( + &attr, cydriver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev_id ) - ) == 1: - dev._mr = DeviceMemoryResource(dev_id) + ) + if attr == 1: + device._mr = DeviceMemoryResource(dev_id) else: - dev._mr = _SynchronousMemoryResource(dev_id) + device._mr = _SynchronousMemoryResource(dev_id) - dev._has_inited = False - dev._properties = None - devices.append(dev) + device._has_inited = False + device._properties = None + devices.append(device) try: return devices[device_id] @@ -1022,36 +1042,17 @@ def _check_context_initialized(self): f"Device {self._id} is not yet initialized, perhaps you forgot to call .set_current() first?" ) - def _get_primary_context(self) -> driver.CUcontext: - try: - primary_ctxs = _tls.primary_ctxs - except AttributeError: - total = len(_tls.devices) - primary_ctxs = _tls.primary_ctxs = [None] * total - ctx = primary_ctxs[self._id] - if ctx is None: - ctx = handle_return(driver.cuDevicePrimaryCtxRetain(self._id)) - primary_ctxs[self._id] = ctx - return ctx - def _get_current_context(self, check_consistency=False) -> driver.CUcontext: - err, ctx = driver.cuCtxGetCurrent() - - # TODO: We want to just call this: - # _check_driver_error(err) - # but even the simplest success check causes 50-100 ns. Wait until we cythonize this file... 
- if ctx is None: - _check_driver_error(err) - - if int(ctx) == 0: + cdef cydriver.CUcontext ctx + HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx)) + if ctx == NULL: raise CUDAError("No context is bound to the calling CPU thread.") + cdef cydriver.CUdevice dev if check_consistency: - err, dev = driver.cuCtxGetDevice() - if err != _SUCCESS: - handle_return((err,)) - if int(dev) != self._id: + HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev)) + if (dev) != self._id: raise CUDAError("Internal error (current device is not equal to Device.device_id)") - return ctx + return driver.CUcontext(ctx) @property def device_id(self) -> int: @@ -1078,20 +1079,23 @@ def uuid(self) -> str: driver is older than CUDA 11.4. """ - driver_ver = handle_return(driver.cuDriverGetVersion()) - if 11040 <= driver_ver < 13000: - uuid = handle_return(driver.cuDeviceGetUuid_v2(self._id)) - else: - uuid = handle_return(driver.cuDeviceGetUuid(self._id)) - uuid = uuid.bytes.hex() + cdef cydriver.CUuuid uuid + IF CUDA_CORE_BUILD_MAJOR == "12": + HANDLE_RETURN(cydriver.cuDeviceGetUuid_v2(&uuid, self._id)) + ELSE: # 13.0+ + HANDLE_RETURN(cydriver.cuDeviceGetUuid(&uuid, self._id)) + cdef bytes uuid_b = cpython.PyBytes_FromStringAndSize(uuid.bytes, sizeof(uuid.bytes)) + cdef str uuid_hex = uuid_b.hex() # 8-4-4-4-12 - return f"{uuid[:8]}-{uuid[8:12]}-{uuid[12:16]}-{uuid[16:20]}-{uuid[20:]}" + return f"{uuid_hex[:8]}-{uuid_hex[8:12]}-{uuid_hex[12:16]}-{uuid_hex[16:20]}-{uuid_hex[20:]}" @property def name(self) -> str: """Return the device name.""" # Use 256 characters to be consistent with CUDA Runtime - name = handle_return(driver.cuDeviceGetName(256, self._id)) + cdef int LENGTH = 256 + cdef bytes name = bytes(LENGTH) + HANDLE_RETURN(cydriver.cuDeviceGetName(name, LENGTH, self._id)) name = name.split(b"\0")[0] return name.decode() @@ -1106,10 +1110,11 @@ def properties(self) -> DeviceProperties: @property def compute_capability(self) -> ComputeCapability: """Return a named tuple with 2 fields: major and 
minor.""" - if "compute_capability" in self.properties._cache: - return self.properties._cache["compute_capability"] - cc = ComputeCapability(self.properties.compute_capability_major, self.properties.compute_capability_minor) - self.properties._cache["compute_capability"] = cc + cdef DeviceProperties prop = self.properties + if "compute_capability" in prop._cache: + return prop._cache["compute_capability"] + cc = ComputeCapability(prop.compute_capability_major, prop.compute_capability_minor) + prop._cache["compute_capability"] = cc return cc @property @@ -1193,22 +1198,25 @@ def set_current(self, ctx: Context = None) -> Union[Context, None]: >>> # ... do work on device 0 ... """ + cdef cydriver.CUcontext _ctx if ctx is not None: + # TODO: revisit once Context is cythonized assert_type(ctx, Context) if ctx._id != self._id: raise RuntimeError( "the provided context was created on the device with" f" id={ctx._id}, which is different from the target id={self._id}" ) - prev_ctx = handle_return(driver.cuCtxPopCurrent()) - handle_return(driver.cuCtxPushCurrent(ctx._handle)) + # _ctx is the previous context + HANDLE_RETURN(cydriver.cuCtxPopCurrent(&_ctx)) + HANDLE_RETURN(cydriver.cuCtxPushCurrent((ctx._handle))) self._has_inited = True - if int(prev_ctx) != 0: - return Context._from_ctx(prev_ctx, self._id) + if _ctx != NULL: + return Context._from_ctx((_ctx), self._id) else: # use primary ctx - ctx = self._get_primary_context() - handle_return(driver.cuCtxSetCurrent(ctx)) + _ctx = _get_primary_context(self._id) + HANDLE_RETURN(cydriver.cuCtxSetCurrent(_ctx)) self._has_inited = True def create_context(self, options: ContextOptions = None) -> Context: diff --git a/cuda_core/cuda/core/experimental/_dlpack.pxd b/cuda_core/cuda/core/experimental/_dlpack.pxd index 843beb873..d61b6a2bc 100644 --- a/cuda_core/cuda/core/experimental/_dlpack.pxd +++ b/cuda_core/cuda/core/experimental/_dlpack.pxd @@ -14,7 +14,7 @@ from libc.stdint cimport uint64_t from libc.stdint cimport intptr_t 
-cdef extern from "dlpack.h" nogil: +cdef extern from "include/dlpack.h" nogil: """ #define DLPACK_TENSOR_UNUSED_NAME "dltensor" #define DLPACK_VERSIONED_TENSOR_UNUSED_NAME "dltensor_versioned" diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index 41c0b1ce6..db243717f 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -4,9 +4,13 @@ from __future__ import annotations +from libc.stdint cimport uintptr_t + +from cuda.bindings cimport cydriver + from cuda.core.experimental._utils.cuda_utils cimport ( - _check_driver_error as raise_if_driver_error, check_or_create_options, + HANDLE_RETURN ) from dataclasses import dataclass @@ -78,12 +82,15 @@ cdef class Event: """ cdef: - object _handle + cydriver.CUevent _handle bint _timing_disabled bint _busy_waited int _device_id object _ctx_handle + def __cinit__(self): + self._handle = (NULL) + def __init__(self, *args, **kwargs): raise RuntimeError("Event objects cannot be instantiated directly. 
Please use Stream APIs (record).") @@ -91,19 +98,18 @@ cdef class Event: def _init(cls, device_id: int, ctx_handle: Context, options=None): cdef Event self = Event.__new__(cls) cdef EventOptions opts = check_or_create_options(EventOptions, options, "Event options") - flags = 0x0 + cdef unsigned int flags = 0x0 self._timing_disabled = False self._busy_waited = False if not opts.enable_timing: - flags |= driver.CUevent_flags.CU_EVENT_DISABLE_TIMING + flags |= cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING self._timing_disabled = True if opts.busy_waited_sync: - flags |= driver.CUevent_flags.CU_EVENT_BLOCKING_SYNC + flags |= cydriver.CUevent_flags.CU_EVENT_BLOCKING_SYNC self._busy_waited = True if opts.support_ipc: raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/103") - err, self._handle = driver.cuEventCreate(flags) - raise_if_driver_error(err) + HANDLE_RETURN(cydriver.cuEventCreate(&self._handle, flags)) self._device_id = device_id self._ctx_handle = ctx_handle return self @@ -111,10 +117,9 @@ cdef class Event: cdef _shutdown_safe_close(self, is_shutting_down=sys.is_finalizing): if is_shutting_down and is_shutting_down(): return - if self._handle is not None: - err, = driver.cuEventDestroy(self._handle) - self._handle = None - raise_if_driver_error(err) + if self._handle != NULL: + HANDLE_RETURN(cydriver.cuEventDestroy(self._handle)) + self._handle = (NULL) cpdef close(self): """Destroy the event.""" @@ -129,14 +134,14 @@ cdef class Event: def __rsub__(self, other): return NotImplemented - def __sub__(self, other): + def __sub__(self, other: Event): # return self - other (in milliseconds) - err, timing = driver.cuEventElapsedTime(other.handle, self._handle) - try: - raise_if_driver_error(err) + cdef float timing + err = cydriver.cuEventElapsedTime(&timing, other._handle, self._handle) + if err == 0: return timing - except CUDAError as e: - if err == driver.CUresult.CUDA_ERROR_INVALID_HANDLE: + else: + if err == 
cydriver.CUresult.CUDA_ERROR_INVALID_HANDLE: if self.is_timing_disabled or other.is_timing_disabled: explanation = ( "Both Events must be created with timing enabled in order to subtract them; " @@ -147,15 +152,15 @@ cdef class Event: "Both Events must be recorded before they can be subtracted; " "use Stream.record() to record both events to a stream." ) - elif err == driver.CUresult.CUDA_ERROR_NOT_READY: + elif err == cydriver.CUresult.CUDA_ERROR_NOT_READY: explanation = ( "One or both events have not completed; " "use Event.sync(), Stream.sync(), or Device.sync() to wait for the events to complete " "before subtracting them." ) else: - raise e - raise RuntimeError(explanation) from e + raise CUDAError(err) + raise RuntimeError(explanation) @property def is_timing_disabled(self) -> bool: @@ -182,17 +187,17 @@ cdef class Event: has been completed. """ - handle_return(driver.cuEventSynchronize(self._handle)) + HANDLE_RETURN(cydriver.cuEventSynchronize(self._handle)) @property def is_done(self) -> bool: """Return True if all captured works have been completed, otherwise False.""" - result, = driver.cuEventQuery(self._handle) - if result == driver.CUresult.CUDA_SUCCESS: + result = cydriver.cuEventQuery(self._handle) + if result == cydriver.CUresult.CUDA_SUCCESS: return True - if result == driver.CUresult.CUDA_ERROR_NOT_READY: + if result == cydriver.CUresult.CUDA_ERROR_NOT_READY: return False - handle_return(result) + HANDLE_RETURN(result) @property def handle(self) -> cuda.bindings.driver.CUevent: @@ -203,7 +208,7 @@ cdef class Event: This handle is a Python object. To get the memory address of the underlying C handle, call ``int(Event.handle)``. 
""" - return self._handle + return driver.CUevent((self._handle)) @property def device(self) -> Device: diff --git a/cuda_core/cuda/core/experimental/_launcher.py b/cuda_core/cuda/core/experimental/_launcher.pyx similarity index 93% rename from cuda_core/cuda/core/experimental/_launcher.py rename to cuda_core/cuda/core/experimental/_launcher.pyx index 2d0c274c7..ae808be89 100644 --- a/cuda_core/cuda/core/experimental/_launcher.py +++ b/cuda_core/cuda/core/experimental/_launcher.pyx @@ -2,12 +2,16 @@ # # SPDX-License-Identifier: Apache-2.0 +from libc.stdint cimport uintptr_t + +from cuda.core.experimental._stream cimport _try_to_get_stream_ptr + from typing import Union from cuda.core.experimental._kernel_arg_handler import ParamHolder from cuda.core.experimental._launch_config import LaunchConfig, _to_native_launch_config from cuda.core.experimental._module import Kernel -from cuda.core.experimental._stream import IsStreamT, Stream, _try_to_get_stream_ptr +from cuda.core.experimental._stream import IsStreamT, Stream from cuda.core.experimental._utils.clear_error_support import assert_type from cuda.core.experimental._utils.cuda_utils import ( _reduce_3_tuple, @@ -60,7 +64,7 @@ def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kerne stream_handle = stream.handle except AttributeError: try: - stream_handle = _try_to_get_stream_ptr(stream) + stream_handle = driver.CUstream((_try_to_get_stream_ptr(stream))) except Exception: raise ValueError( f"stream must either be a Stream object or support __cuda_stream__ (got {type(stream)})" diff --git a/cuda_core/cuda/core/experimental/_stream.pxd b/cuda_core/cuda/core/experimental/_stream.pxd new file mode 100644 index 000000000..6b8a7f0f6 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_stream.pxd @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +from cuda.bindings cimport cydriver + + +cdef cydriver.CUstream _try_to_get_stream_ptr(obj: IsStreamT) except* diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index a2c1a90b9..737fd13f9 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -4,10 +4,15 @@ from __future__ import annotations +from libc.stdint cimport uintptr_t + +from cuda.bindings cimport cydriver + from cuda.core.experimental._utils.cuda_utils cimport ( - _check_driver_error as raise_if_driver_error, check_or_create_options, + HANDLE_RETURN, ) + import sys import cython @@ -59,7 +64,7 @@ class IsStreamT(Protocol): ... -def _try_to_get_stream_ptr(obj: IsStreamT): +cdef cydriver.CUstream _try_to_get_stream_ptr(obj: IsStreamT) except*: try: cuda_stream_attr = obj.__cuda_stream__ except AttributeError: @@ -86,7 +91,7 @@ def _try_to_get_stream_ptr(obj: IsStreamT): raise RuntimeError( f"The first element of the sequence returned by obj.__cuda_stream__ must be 0, got {repr(info[0])}" ) - return driver.CUstream(info[1]) + return (info[1]) cdef class Stream: @@ -108,7 +113,7 @@ cdef class Stream: """ cdef: - object _handle + cydriver.CUstream _handle object _owner object _builtin object _nonblocking @@ -116,6 +121,9 @@ cdef class Stream: object _device_id object _ctx_handle + def __cinit__(self): + self._handle = (NULL) + def __init__(self, *args, **kwargs): raise RuntimeError( "Stream objects cannot be instantiated directly. 
" @@ -125,7 +133,7 @@ cdef class Stream: @classmethod def _legacy_default(cls): cdef Stream self = Stream.__new__(cls) - self._handle = driver.CUstream(driver.CU_STREAM_LEGACY) + self._handle = (cydriver.CU_STREAM_LEGACY) self._owner = None self._builtin = True self._nonblocking = None # delayed @@ -137,7 +145,7 @@ cdef class Stream: @classmethod def _per_thread_default(cls): cdef Stream self = Stream.__new__(cls) - self._handle = driver.CUstream(driver.CU_STREAM_PER_THREAD) + self._handle = (cydriver.CU_STREAM_PER_THREAD) self._owner = None self._builtin = True self._nonblocking = None # delayed @@ -149,7 +157,6 @@ cdef class Stream: @classmethod def _init(cls, obj: Optional[IsStreamT] = None, options=None, device_id: int = None): cdef Stream self = Stream.__new__(cls) - self._handle = None self._owner = None self._builtin = False @@ -169,16 +176,18 @@ cdef class Stream: nonblocking = opts.nonblocking priority = opts.priority - flags = driver.CUstream_flags.CU_STREAM_NON_BLOCKING if nonblocking else driver.CUstream_flags.CU_STREAM_DEFAULT - err, high, low = driver.cuCtxGetStreamPriorityRange() - raise_if_driver_error(err) + flags = cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING if nonblocking else cydriver.CUstream_flags.CU_STREAM_DEFAULT + cdef int high, low + HANDLE_RETURN(cydriver.cuCtxGetStreamPriorityRange(&high, &low)) if priority is not None: if not (low <= priority <= high): raise ValueError(f"{priority=} is out of range {[low, high]}") else: priority = high - self._handle = handle_return(driver.cuStreamCreateWithPriority(flags, priority)) + cdef cydriver.CUstream s + HANDLE_RETURN(cydriver.cuStreamCreateWithPriority(&s, flags, priority)) + self._handle = s self._owner = None self._nonblocking = nonblocking self._priority = priority @@ -195,10 +204,10 @@ cdef class Stream: if self._owner is None: if self._handle and not self._builtin: - handle_return(driver.cuStreamDestroy(self._handle)) + HANDLE_RETURN(cydriver.cuStreamDestroy(self._handle)) else: 
self._owner = None - self._handle = None + self._handle = (NULL) cpdef close(self): """Destroy the stream. @@ -222,14 +231,15 @@ cdef class Stream: This handle is a Python object. To get the memory address of the underlying C handle, call ``int(Stream.handle)``. """ - return self._handle + return driver.CUstream((self._handle)) @property def is_nonblocking(self) -> bool: """Return True if this is a nonblocking stream, otherwise False.""" + cdef unsigned int flags if self._nonblocking is None: - flag = handle_return(driver.cuStreamGetFlags(self._handle)) - if flag == driver.CUstream_flags.CU_STREAM_NON_BLOCKING: + HANDLE_RETURN(cydriver.cuStreamGetFlags(self._handle, &flags)) + if flags & cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING: self._nonblocking = True else: self._nonblocking = False @@ -238,14 +248,15 @@ cdef class Stream: @property def priority(self) -> int: """Return the stream priority.""" + cdef int prio if self._priority is None: - prio = handle_return(driver.cuStreamGetPriority(self._handle)) + HANDLE_RETURN(cydriver.cuStreamGetPriority(self._handle, &prio)) self._priority = prio return self._priority def sync(self): """Synchronize the stream.""" - handle_return(driver.cuStreamSynchronize(self._handle)) + HANDLE_RETURN(cydriver.cuStreamSynchronize(self._handle)) def record(self, event: Event = None, options: EventOptions = None) -> Event: """Record an event onto the stream. @@ -272,8 +283,8 @@ cdef class Stream: if event is None: self._get_device_and_context() event = Event._init(self._device_id, self._ctx_handle, options) - err, = driver.cuEventRecord(event.handle, self._handle) - raise_if_driver_error(err) + # TODO: revisit after Event is cythonized + HANDLE_RETURN(cydriver.cuEventRecord((event.handle), self._handle)) return event def wait(self, event_or_stream: Union[Event, Stream]): @@ -286,28 +297,33 @@ cdef class Stream: on the stream and then waiting on it. 
""" + cdef cydriver.CUevent event + cdef cydriver.CUstream stream + cdef bint discard_event + if isinstance(event_or_stream, Event): - event = event_or_stream.handle + event = (event_or_stream.handle) discard_event = False else: if isinstance(event_or_stream, Stream): - stream = event_or_stream + stream = (event_or_stream.handle) else: try: - stream = Stream._init(obj=event_or_stream) + s = Stream._init(obj=event_or_stream) except Exception as e: raise ValueError( "only an Event, Stream, or object supporting __cuda_stream__ can be waited," f" got {type(event_or_stream)}" ) from e - event = handle_return(driver.cuEventCreate(driver.CUevent_flags.CU_EVENT_DISABLE_TIMING)) - handle_return(driver.cuEventRecord(event, stream.handle)) + stream = (s.handle) + HANDLE_RETURN(cydriver.cuEventCreate(&event, cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING)) + HANDLE_RETURN(cydriver.cuEventRecord(event, stream)) discard_event = True # TODO: support flags other than 0? - handle_return(driver.cuStreamWaitEvent(self._handle, event, 0)) + HANDLE_RETURN(cydriver.cuStreamWaitEvent(self._handle, event, 0)) if discard_event: - handle_return(driver.cuEventDestroy(event)) + HANDLE_RETURN(cydriver.cuEventDestroy(event)) @property def device(self) -> Device: @@ -325,9 +341,11 @@ cdef class Stream: return Device(self._device_id) cdef int _get_context(Stream self) except?-1: + # TODO: consider making self._ctx_handle typed? 
+ cdef cydriver.CUcontext ctx if self._ctx_handle is None: - err, self._ctx_handle = driver.cuStreamGetCtx(self._handle) - raise_if_driver_error(err) + HANDLE_RETURN(cydriver.cuStreamGetCtx(self._handle, &ctx)) + self._ctx_handle = driver.CUcontext(ctx) return 0 cdef int _get_device_and_context(Stream self) except?-1: diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd index 601736c47..bf570965f 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd @@ -2,18 +2,29 @@ # # SPDX-License-Identifier: Apache-2.0 - cimport cpython -cimport libc.stdint +from libc.stdint cimport int64_t + +from cuda.bindings cimport cydriver + + +ctypedef fused supported_error_type: + cydriver.CUresult +cdef int HANDLE_RETURN(supported_error_type err) except?-1 + + +# TODO: stop exposing these within the codebase? cpdef int _check_driver_error(error) except?-1 cpdef int _check_runtime_error(error) except?-1 cpdef int _check_nvrtc_error(error) except?-1 + + cpdef check_or_create_options(type cls, options, str options_description=*, bint keep_none=*) -cdef inline tuple carray_int64_t_to_tuple(libc.stdint.int64_t *ptr, int length): +cdef inline tuple carray_int64_t_to_tuple(int64_t *ptr, int length): # Construct shape and strides tuples using the Python/C API for speed result = cpython.PyTuple_New(length) for i in range(length): diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx index 86588f733..c095e7564 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx @@ -52,6 +52,12 @@ def _reduce_3_tuple(t: tuple): return t[0] * t[1] * t[2] +cdef int HANDLE_RETURN(supported_error_type err) except?-1: + if supported_error_type is cydriver.CUresult: + if err != cydriver.CUresult.CUDA_SUCCESS: + return 
_check_driver_error(err) + + cdef object _DRIVER_SUCCESS = driver.CUresult.CUDA_SUCCESS cdef object _RUNTIME_SUCCESS = runtime.cudaError_t.cudaSuccess cdef object _NVRTC_SUCCESS = nvrtc.nvrtcResult.NVRTC_SUCCESS diff --git a/cuda_core/cuda/core/experimental/dlpack.h b/cuda_core/cuda/core/experimental/include/dlpack.h similarity index 100% rename from cuda_core/cuda/core/experimental/dlpack.h rename to cuda_core/cuda/core/experimental/include/dlpack.h diff --git a/cuda_core/docs/source/release/0.X.Y-notes.rst b/cuda_core/docs/source/release/0.X.Y-notes.rst index 454a9d465..e87cbdee3 100644 --- a/cuda_core/docs/source/release/0.X.Y-notes.rst +++ b/cuda_core/docs/source/release/0.X.Y-notes.rst @@ -12,6 +12,7 @@ Released on TBD Highlights ---------- +- This is the last release that officially supports Python 3.9. - Fix for :class:`LaunchConfig` grid parameter unit conversion when thread block clusters are used. @@ -19,6 +20,7 @@ Breaking Changes ---------------- - **CUDA 11 support dropped**: CUDA 11 support is no longer tested and it may or may not work with cuda.bindings and CTK 11.x. Users are encouraged to migrate to CUDA 12.x or 13.x. +- Support for ``cuda-bindings`` (and ``cuda-python``) < 12.6.2 is dropped. Internally, ``cuda.core`` now always requires the `new binding module layout `_. As per the ``cuda-bindings`` `support policy `_, CUDA 12 users are encouraged to use the latest ``cuda-bindings`` 12.9.x, which is backward-compatible with all CUDA Toolkit 12.y. - **LaunchConfig grid parameter interpretation**: When :attr:`LaunchConfig.cluster` is specified, the :attr:`LaunchConfig.grid` parameter now correctly represents the number of clusters instead of blocks. Previously, the grid parameter was incorrectly interpreted as blocks, causing a mismatch with the expected C++ behavior. 
This change ensures that ``LaunchConfig(grid=4, cluster=2, block=32)`` correctly produces 4 clusters × 2 blocks/cluster = 8 total blocks, matching the C++ equivalent ``cudax::make_hierarchy(cudax::grid_dims(4), cudax::cluster_dims(2), cudax::block_dims(32))``. - When :class:`Buffer` is closed, :attr:`Buffer.handle` is now set to ``None``. It was previously set to ``0`` by accident. @@ -49,3 +51,4 @@ Fixes and enhancements - Enabled :class:`MemoryResource` subclasses to accept :class:`Device` objects, in addition to previously supported device ordinals. - Fixed a bug in :class:`Stream` and other classes where object cleanup would error during interpreter shutdown. - :class:`StridedMemoryView` of an underlying array using the DLPack protocol will no longer leak memory. +- General performance improvement. diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index 4e4ab5028..8bbfca07d 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -4,7 +4,8 @@ [build-system] requires = ["setuptools>=77.0.0", "Cython>=3.1"] -build-backend = "setuptools.build_meta" +build-backend = "build_hooks" +backend-path = ["."] [project] @@ -38,21 +39,19 @@ classifiers = [ "Programming Language :: Python :: 3.13", "Programming Language :: Python :: Implementation :: CPython", "Environment :: GPU :: NVIDIA CUDA", - "Environment :: GPU :: NVIDIA CUDA :: 11", "Environment :: GPU :: NVIDIA CUDA :: 12", + "Environment :: GPU :: NVIDIA CUDA :: 13", ] dependencies = [ "numpy", ] [project.optional-dependencies] -cu11 = ["cuda-bindings[all]==11.8.*"] cu12 = ["cuda-bindings[all]==12.*"] cu13 = ["cuda-bindings[all]==13.*"] [dependency-groups] test = ["cython>=3.0", "setuptools", "pytest>=6.2.4"] -test-cu11 = ["cuda-core[test]", "cupy-cuda11x; python_version < '3.14'", "cuda-toolkit[cudart]==11.*"] # runtime headers needed by CuPy test-cu12 = ["cuda-core[test]", "cupy-cuda12x; python_version < '3.14'", "cuda-toolkit[cudart]==12.*"] # runtime headers needed by CuPy test-cu13 
= ["cuda-core[test]", "cupy-cuda13x; python_version < '3.14'", "cuda-toolkit[cudart]==13.*"] # runtime headers needed by CuPy # free threaded build, cupy doesn't support free-threaded builds yet, so avoid installing it for now @@ -79,7 +78,6 @@ readme = { file = ["DESCRIPTION.rst"], content-type = "text/x-rst" } skip = "*-musllinux_*" enable = "cpython-freethreading" build-verbosity = 1 -environment-pass = ["CUDA_PYTHON_PARALLEL_LEVEL"] [tool.cibuildwheel.linux] archs = "native" diff --git a/cuda_core/setup.py b/cuda_core/setup.py index d93eec45d..4a501edc1 100644 --- a/cuda_core/setup.py +++ b/cuda_core/setup.py @@ -2,38 +2,15 @@ # # SPDX-License-Identifier: Apache-2.0 -import glob import os -from Cython.Build import cythonize -from setuptools import Extension, setup +import build_hooks # our build backend +from setuptools import setup from setuptools.command.build_ext import build_ext as _build_ext nthreads = int(os.environ.get("CUDA_PYTHON_PARALLEL_LEVEL", os.cpu_count() // 2)) -# It seems setuptools' wildcard support has problems for namespace packages, -# so we explicitly spell out all Extension instances. -root_module = "cuda.core.experimental" -root_path = f"{os.path.sep}".join(root_module.split(".")) + os.path.sep -ext_files = glob.glob(f"{root_path}/**/*.pyx", recursive=True) - - -def strip_prefix_suffix(filename): - return filename[len(root_path) : -4] - - -module_names = (strip_prefix_suffix(f) for f in ext_files) -ext_modules = tuple( - Extension( - f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}", - sources=[f"cuda/core/experimental/{mod}.pyx"], - language="c++", - ) - for mod in module_names -) - - class build_ext(_build_ext): def build_extensions(self): self.parallel = nthreads @@ -41,7 +18,7 @@ def build_extensions(self): setup( - ext_modules=cythonize(ext_modules, verbose=True, language_level=3, compiler_directives={"embedsignature": True}), + ext_modules=build_hooks._extensions, cmdclass={ "build_ext": build_ext, },