1 change: 0 additions & 1 deletion .github/workflows/build-wheel.yml
@@ -28,7 +28,6 @@ jobs:
fail-fast: false
matrix:
python-version:
- "3.9"
- "3.10"
- "3.11"
- "3.12"
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -113,7 +113,7 @@ flowchart TD
B2["linux-aarch64<br/>(Self-hosted)"]
B3["win-64<br/>(GitHub-hosted)"]
end
BUILD_DETAILS["• Python versions: 3.9, 3.10, 3.11, 3.12, 3.13<br/>• CUDA version: 13.0.0 (build-time)<br/>• Components: cuda-core, cuda-bindings,<br/> cuda-pathfinder, cuda-python"]
BUILD_DETAILS["• Python versions: 3.10, 3.11, 3.12, 3.13, 3.14<br/>• CUDA version: 13.0.0 (build-time)<br/>• Components: cuda-core, cuda-bindings,<br/> cuda-pathfinder, cuda-python"]
end

%% Artifact Storage
14 changes: 0 additions & 14 deletions ci/test-matrix.json
@@ -4,8 +4,6 @@
"_notes": "DRIVER: 'earliest' does not work with CUDA 12.9.1 and LOCAL_CTK: 0 does not work with CUDA 12.0.1",
"linux": {
"pull-request": [
{ "ARCH": "amd64", "PY_VER": "3.9", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "amd64", "PY_VER": "3.9", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "amd64", "PY_VER": "3.10", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "amd64", "PY_VER": "3.10", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "amd64", "PY_VER": "3.11", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" },
@@ -16,8 +14,6 @@
{ "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "amd64", "PY_VER": "3.14", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "amd64", "PY_VER": "3.14t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "arm64", "PY_VER": "3.9", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" },
{ "ARCH": "arm64", "PY_VER": "3.9", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" },
{ "ARCH": "arm64", "PY_VER": "3.10", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" },
{ "ARCH": "arm64", "PY_VER": "3.10", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" },
{ "ARCH": "arm64", "PY_VER": "3.11", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" },
@@ -30,11 +26,6 @@
{ "ARCH": "arm64", "PY_VER": "3.14t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }
],
"nightly": [
{ "ARCH": "amd64", "PY_VER": "3.9", "CUDA_VER": "11.8.0", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "earliest" },
{ "ARCH": "amd64", "PY_VER": "3.9", "CUDA_VER": "11.8.0", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "amd64", "PY_VER": "3.9", "CUDA_VER": "12.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "amd64", "PY_VER": "3.9", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "amd64", "PY_VER": "3.9", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "amd64", "PY_VER": "3.10", "CUDA_VER": "11.8.0", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "earliest" },
{ "ARCH": "amd64", "PY_VER": "3.10", "CUDA_VER": "11.8.0", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "amd64", "PY_VER": "3.10", "CUDA_VER": "12.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
@@ -55,11 +46,6 @@
{ "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "12.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "arm64", "PY_VER": "3.9", "CUDA_VER": "11.8.0", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "earliest" },
{ "ARCH": "arm64", "PY_VER": "3.9", "CUDA_VER": "11.8.0", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" },
{ "ARCH": "arm64", "PY_VER": "3.9", "CUDA_VER": "12.0.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" },
{ "ARCH": "arm64", "PY_VER": "3.9", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" },
{ "ARCH": "arm64", "PY_VER": "3.9", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" },
{ "ARCH": "arm64", "PY_VER": "3.10", "CUDA_VER": "11.8.0", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "earliest" },
{ "ARCH": "arm64", "PY_VER": "3.10", "CUDA_VER": "11.8.0", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" },
{ "ARCH": "arm64", "PY_VER": "3.10", "CUDA_VER": "12.0.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" },
2 changes: 1 addition & 1 deletion cuda_bindings/docs/source/install.rst
@@ -10,7 +10,7 @@ Runtime Requirements
``cuda.bindings`` supports the same platforms as CUDA. Runtime dependencies are:

* Linux (x86-64, arm64) and Windows (x86-64)
* Python 3.9 - 3.14
* Python 3.10 - 3.14
* Driver: Linux (580.65.06 or later) Windows (580.88 or later)
* Optionally, NVRTC, nvJitLink, NVVM, and cuFile from CUDA Toolkit 13.x

2 changes: 1 addition & 1 deletion cuda_bindings/docs/source/support.rst
@@ -19,7 +19,7 @@ The ``cuda.bindings`` module has the following support policy:
depends on the underlying driver and the Toolkit versions, as described in the compatibility
documentation.)
4. The module supports all Python versions following the `CPython EOL schedule`_. As of writing
Python 3.9 - 3.13 are supported.
Python 3.10 - 3.14 are supported.
5. The module exposes a Cython layer from which types and functions could be ``cimport``'d. While
we strive to keep this layer stable, due to Cython limitations a new *minor* release of this
module could require Cython layer users to rebuild their projects and update their pinning to
3 changes: 2 additions & 1 deletion cuda_bindings/pyproject.toml
@@ -9,16 +9,17 @@ name = "cuda-bindings"
description = "Python bindings for CUDA"
authors = [{name = "NVIDIA Corporation", email = "cuda-python-conduct@nvidia.com"},]
license = "LicenseRef-NVIDIA-SOFTWARE-LICENSE"
requires-python = ">=3.10"
classifiers = [
"Intended Audience :: Developers",
"Topic :: Database",
"Topic :: Scientific/Engineering",
"Programming Language :: Python",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: 3.14",
"Environment :: GPU :: NVIDIA CUDA",
]
dynamic = [
2 changes: 1 addition & 1 deletion cuda_bindings/setup.py
@@ -125,7 +125,7 @@ def discoverMembers(self, memberDict, prefix, seen=None):
next_seen = set(seen)
next_seen.add(self._name)

for memberName, memberType in zip(self._member_names, self._member_types):
for memberName, memberType in zip(self._member_names, self._member_types, strict=True):
if memberName:
discovered.append(".".join([prefix, memberName]))

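
Reviewer note on the zip changes in this file and in the tests below: zip()'s strict flag comes from PEP 618 and first shipped in Python 3.10, so these additions only became possible once the 3.9 floor was dropped. A minimal, self-contained sketch of the behavioral difference (illustrative data, not from the codebase):

names = ["x", "y", "z"]
kinds = ["int", "float"]          # one short, to force a mismatch

print(list(zip(names, kinds)))    # default behavior: silently drops "z"

try:
    list(zip(names, kinds, strict=True))
except ValueError as e:
    print(e)                      # zip() argument 2 is shorter than argument 1

The silent truncation is exactly what strict=True guards against in discoverMembers and in the attribute-comparison tests.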
4 changes: 2 additions & 2 deletions cuda_bindings/tests/test_cuda.py
@@ -432,7 +432,7 @@ def test_cuda_pointer_attr():
# List version
err, attr_value_list_v2 = cuda.cuPointerGetAttributes(len(attr_type_list), attr_type_list, ptr)
assert err == cuda.CUresult.CUDA_SUCCESS
for attr1, attr2 in zip(attr_value_list, attr_value_list_v2):
for attr1, attr2 in zip(attr_value_list, attr_value_list_v2, strict=True):
assert str(attr1) == str(attr2)

# Test setting values
@@ -512,7 +512,7 @@ def test_cuda_mem_range_attr():
attr_type_size_list, attr_type_list, len(attr_type_list), ptr, size
)
assert err == cuda.CUresult.CUDA_SUCCESS
for attr1, attr2 in zip(attr_value_list, attr_value_list_v2):
for attr1, attr2 in zip(attr_value_list, attr_value_list_v2, strict=True):
assert str(attr1) == str(attr2)

(err,) = cuda.cuMemFree(ptr)
2 changes: 1 addition & 1 deletion cuda_bindings/tests/test_cufile.py
@@ -318,7 +318,7 @@ def test_buf_register_multiple_buffers():
try:
# Register all buffers
flags = 0
for buf_ptr, size in zip(buffers, buffer_sizes):
for buf_ptr, size in zip(buffers, buffer_sizes, strict=True):
buf_ptr_int = int(buf_ptr)
cufile.buf_register(buf_ptr_int, size, flags)

2 changes: 1 addition & 1 deletion cuda_bindings/tests/test_nvjitlink.py
@@ -34,7 +34,7 @@


def _build_arch_ptx_parametrized_callable():
av = tuple(zip(ARCHITECTURES, PTX_VERSIONS))
av = tuple(zip(ARCHITECTURES, PTX_VERSIONS, strict=True))
return pytest.mark.parametrize(
("arch", "ptx_bytes"),
[(a, (PTX_HEADER.format(VERSION=v, ARCH=a) + PTX_KERNEL).encode("utf-8")) for a, v in av],
11 changes: 0 additions & 11 deletions cuda_core/cuda/core/experimental/__init__.py
@@ -26,17 +26,6 @@
finally:
del cuda.bindings, importlib, subdir, cuda_major, cuda_minor

import sys # noqa: E402
import warnings # noqa: E402

if sys.version_info < (3, 10):
warnings.warn(
"support for Python 3.9 and below is deprecated and subject to future removal",
category=FutureWarning,
stacklevel=1,
)
del sys, warnings
Contributor: Good catch. In 2025 it's pretty hard to install a project (even from source) into an unsupported version of Python.

Contributor (author): This was actually caught by ruff.

from cuda.core.experimental import utils # noqa: E402
from cuda.core.experimental._device import Device # noqa: E402
from cuda.core.experimental._event import Event, EventOptions # noqa: E402
10 changes: 5 additions & 5 deletions cuda_core/cuda/core/experimental/_device.pyx
@@ -10,7 +10,7 @@ from cuda.bindings cimport cydriver
from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN

import threading
from typing import Optional, Union
from typing import Union
Contributor: Any reason to modernize Optional and not Union at the same time? Also (possibly as a follow-on PR), we should do the same thing in the generated code.

Contributor (author): I can't remember exactly why, but I think it might've been because ruff took care of some of the Optional modernization but not Union. Is this a blocker?

Contributor (author): I've already started on it, so I'll just push up a commit in this PR.

from cuda.core.experimental._context import Context, ContextOptions
from cuda.core.experimental._event import Event, EventOptions
@@ -951,7 +951,7 @@ class Device:
"""
__slots__ = ("_id", "_mr", "_has_inited", "_properties")

def __new__(cls, device_id: Optional[int] = None):
def __new__(cls, device_id: int | None = None):
global _is_cuInit
if _is_cuInit is False:
with _lock, nogil:
@@ -1223,7 +1223,7 @@ class Device:
"""
raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/189")

def create_stream(self, obj: Optional[IsStreamT] = None, options: Optional[StreamOptions] = None) -> Stream:
def create_stream(self, obj: IsStreamT | None = None, options: StreamOptions | None = None) -> Stream:
"""Create a Stream object.

New stream objects can be created in two different ways:
@@ -1254,7 +1254,7 @@ class Device:
self._check_context_initialized()
return Stream._init(obj=obj, options=options, device_id=self._id)

def create_event(self, options: Optional[EventOptions] = None) -> Event:
def create_event(self, options: EventOptions | None = None) -> Event:
"""Create an Event object without recording it to a Stream.

Note
@@ -1276,7 +1276,7 @@ class Device:
ctx = self._get_current_context()
return Event._init(self._id, ctx, options, True)

def allocate(self, size, stream: Optional[Stream] = None) -> Buffer:
def allocate(self, size, stream: Stream | None = None) -> Buffer:
"""Allocate device memory from a specified stream.

Allocates device memory of `size` bytes on the specified `stream`
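
For context on the Optional removals throughout this file: the X | None spelling comes from PEP 604 and is only valid at runtime on Python 3.10+, which is why the modernization rides along with the version bump. A small sketch, independent of the cuda.core types:

import types

def allocate(size: int, stream: int | None = None) -> None:
    # The annotation `int | None` is evaluated at function-definition time
    # unless `from __future__ import annotations` is in effect.
    ...

Alias = int | str                          # runtime union, usable as a type alias
assert isinstance(Alias, types.UnionType)  # types.UnionType is itself 3.10+
# On Python 3.9 the `int | str` expression raises:
#   TypeError: unsupported operand type(s) for |: 'type' and 'type'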
17 changes: 8 additions & 9 deletions cuda_core/cuda/core/experimental/_launch_config.py
@@ -3,7 +3,6 @@
# SPDX-License-Identifier: Apache-2.0

from dataclasses import dataclass
from typing import Optional, Union

from cuda.core.experimental._device import Device
from cuda.core.experimental._utils.cuda_utils import (
@@ -45,15 +44,15 @@ class LaunchConfig:

Attributes
----------
grid : Union[tuple, int]
grid : tuple | int
Collection of threads that will execute a kernel function. When cluster
is not specified, this represents the number of blocks, otherwise
this represents the number of clusters.
cluster : Union[tuple, int]
cluster : tuple | int
Group of blocks (Thread Block Cluster) that will execute on the same
GPU Processing Cluster (GPC). Blocks within a cluster have access to
distributed shared memory and can be explicitly synchronized.
block : Union[tuple, int]
block : tuple | int
Group of threads (Thread Block) that will execute on the same
streaming multiprocessor (SM). Threads within a thread block have
access to shared memory and can be explicitly synchronized.
@@ -65,11 +64,11 @@
"""

# TODO: expand LaunchConfig to include other attributes
grid: Union[tuple, int] = None
cluster: Union[tuple, int] = None
block: Union[tuple, int] = None
shmem_size: Optional[int] = None
cooperative_launch: Optional[bool] = False
grid: tuple | int = None
Contributor: logic: default None causes runtime errors in cast_to_3_tuple at line 75. Should be grid: tuple | int with no default, or provide a valid default like (1, 1, 1).

Contributor (author): @greptile-apps Are you sure it's causing a runtime error? Give me a self-contained example that you think fails here.

Collaborator: I guess it didn't provide an example? :(

Contributor: Ah, you're absolutely right! Looking at the cast_to_3_tuple implementation (lines 37-51), I can see:

if isinstance(cfg, int):
    cfg = (cfg,)
else:
    common = "must be an int, or a tuple with up to 3 ints"
    if not isinstance(cfg, tuple):
        raise ValueError(f"{label} {common} (got {type(cfg)})")

So when None is passed:

• It's not an int, so it goes to the else branch
• It's not a tuple, so it raises ValueError

Here's the actual failing example:

from cuda.core.experimental import LaunchConfig

# This WILL fail:
config = LaunchConfig()
# Raises: ValueError: LaunchConfig.grid must be an int, or a tuple with up to 3 ints (got <class 'NoneType'>)

So yes, my original comment was correct: the default None causes a runtime error. The fix would be either:

1. Remove the default (require users to always specify grid/block)
2. Provide a valid default like (1, 1, 1)

My apologies for not providing the example initially!

cluster: tuple | int = None
block: tuple | int = None
Comment on lines +68 to +69

Contributor: logic: inconsistent defaults: cluster defaults to None (intentional, optional), but block defaults to None yet is required by cast_to_3_tuple at line 76. Should block be a required field with no default, since it's always cast to a tuple?

Contributor (author): Probably not, because that's all happening in __post_init__.

shmem_size: int | None = None
cooperative_launch: bool | None = False
Comment on lines +70 to +71

Contributor: style: shmem_size defaults to None but is immediately set to 0 in __post_init__. Consider changing the default to shmem_size: int = 0 for clarity and consistency.

Contributor (author): Meh, seems distracting. I'll do it in a follow-up.

def __post_init__(self):
_lazy_init()
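
To make the threads above concrete, here is a self-contained sketch of the normalization pattern the author points at. The pad-to-three-dimensions step in cast_to_3_tuple is an assumption beyond the six lines the bot quoted, and Config is a stand-in, not the real LaunchConfig:

from dataclasses import dataclass

def cast_to_3_tuple(label, cfg):
    # Simplified from the snippet quoted in the review thread; the
    # padding behavior is assumed for illustration.
    if isinstance(cfg, int):
        cfg = (cfg,)
    elif not isinstance(cfg, tuple):
        raise ValueError(f"{label} must be an int, or a tuple with up to 3 ints (got {type(cfg)})")
    return cfg + (1,) * (3 - len(cfg))

@dataclass
class Config:
    grid: tuple | int = None
    block: tuple | int = None

    def __post_init__(self):
        # Validation lives here, so the None defaults only surface as a
        # ValueError when an instance is actually constructed.
        self.grid = cast_to_3_tuple("Config.grid", self.grid)
        self.block = cast_to_3_tuple("Config.block", self.block)

print(Config(grid=4, block=(8, 8)))  # Config(grid=(4, 1, 1), block=(8, 8, 1))
Config()                             # ValueError, as the bot's example shows

Both reviewers are describing the same tradeoff: a None default keeps the fields keyword-optional in the signature while deferring the failure to construction time.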
18 changes: 9 additions & 9 deletions cuda_core/cuda/core/experimental/_linker.py
@@ -9,7 +9,7 @@
import weakref
from contextlib import contextmanager
from dataclasses import dataclass
from typing import TYPE_CHECKING, Union
from typing import TYPE_CHECKING
from warnings import warn

if TYPE_CHECKING:
@@ -154,14 +154,14 @@ class LinkerOptions:
fma : bool, optional
Use fast multiply-add.
Default: True.
kernels_used : [Union[str, tuple[str], list[str]]], optional
kernels_used : str | tuple[str] | list[str], optional
Pass a kernel or sequence of kernels that are used; any not in the list can be removed.
variables_used : [Union[str, tuple[str], list[str]]], optional
variables_used : str | tuple[str] | list[str], optional
Pass a variable or sequence of variables that are used; any not in the list can be removed.
optimize_unused_variables : bool, optional
Assume that if a variable is not referenced in device code, it can be removed.
Default: False.
ptxas_options : [Union[str, tuple[str], list[str]]], optional
ptxas_options : str | tuple[str] | list[str], optional
Pass options to PTXAS.
split_compile : int, optional
Split compilation maximum thread count. Use 0 to use all available processors. Value of 1 disables split
@@ -191,10 +191,10 @@
prec_div: bool | None = None
prec_sqrt: bool | None = None
fma: bool | None = None
kernels_used: Union[str, tuple[str], list[str]] | None = None
variables_used: Union[str, tuple[str], list[str]] | None = None
kernels_used: str | tuple[str] | list[str] | None = None
variables_used: str | tuple[str] | list[str] | None = None
optimize_unused_variables: bool | None = None
ptxas_options: Union[str, tuple[str], list[str]] | None = None
ptxas_options: str | tuple[str] | list[str] | None = None
split_compile: int | None = None
split_compile_extended: int | None = None
no_cache: bool | None = None
@@ -343,14 +343,14 @@ def _exception_manager(self):
# our constructor could raise, in which case there's no handle available
error_log = self.get_error_log()
# Starting Python 3.11 we could also use Exception.add_note() for the same purpose, but
# unfortunately we are still supporting Python 3.9/3.10...
# unfortunately we are still supporting Python 3.10...
# Here we rely on both CUDAError and nvJitLinkError have the error string placed in .args[0].
e.args = (e.args[0] + (f"\nLinker error log: {error_log}" if error_log else ""), *e.args[1:])
raise e


nvJitLinkHandleT = int
LinkerHandleT = Union[nvJitLinkHandleT, "cuda.bindings.driver.CUlinkState"]
LinkerHandleT = nvJitLinkHandleT | cuda.bindings.driver.CUlinkState


class Linker:
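
On the comment updated in the _exception_manager hunk above: Exception.add_note() (PEP 678) arrived in Python 3.11, so with a 3.10 floor the .args surgery remains necessary. A hedged sketch of the two approaches, with illustrative names only:

import sys

def attach_log(e, error_log):
    # What the code does today: splice the log into args[0] so it appears
    # in the rendered exception message on every supported version.
    e.args = (e.args[0] + (f"\nLinker error log: {error_log}" if error_log else ""), *e.args[1:])

try:
    raise RuntimeError("link failed")
except RuntimeError as e:
    if sys.version_info >= (3, 11):
        e.add_note("Linker error log: ...")  # PEP 678 alternative from the comment
    else:
        attach_log(e, "...")
    print(e)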