1 change: 0 additions & 1 deletion .github/workflows/build-wheel.yml
@@ -28,7 +28,6 @@ jobs:
fail-fast: false
matrix:
python-version:
- "3.9"
- "3.10"
- "3.11"
- "3.12"
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -113,7 +113,7 @@ flowchart TD
B2["linux-aarch64<br/>(Self-hosted)"]
B3["win-64<br/>(GitHub-hosted)"]
end
BUILD_DETAILS["• Python versions: 3.9, 3.10, 3.11, 3.12, 3.13<br/>• CUDA version: 13.0.0 (build-time)<br/>• Components: cuda-core, cuda-bindings,<br/> cuda-pathfinder, cuda-python"]
BUILD_DETAILS["• Python versions: 3.10, 3.11, 3.12, 3.13, 3.14<br/>• CUDA version: 13.0.0 (build-time)<br/>• Components: cuda-core, cuda-bindings,<br/> cuda-pathfinder, cuda-python"]
end

%% Artifact Storage
14 changes: 0 additions & 14 deletions ci/test-matrix.json
@@ -4,8 +4,6 @@
"_notes": "DRIVER: 'earliest' does not work with CUDA 12.9.1 and LOCAL_CTK: 0 does not work with CUDA 12.0.1",
"linux": {
"pull-request": [
{ "ARCH": "amd64", "PY_VER": "3.9", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "amd64", "PY_VER": "3.9", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "amd64", "PY_VER": "3.10", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "amd64", "PY_VER": "3.10", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "amd64", "PY_VER": "3.11", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" },
@@ -16,8 +14,6 @@
{ "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "amd64", "PY_VER": "3.14", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "amd64", "PY_VER": "3.14t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "arm64", "PY_VER": "3.9", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" },
{ "ARCH": "arm64", "PY_VER": "3.9", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" },
{ "ARCH": "arm64", "PY_VER": "3.10", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" },
{ "ARCH": "arm64", "PY_VER": "3.10", "CUDA_VER": "13.0.2", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" },
{ "ARCH": "arm64", "PY_VER": "3.11", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" },
@@ -30,11 +26,6 @@
{ "ARCH": "arm64", "PY_VER": "3.14t", "CUDA_VER": "13.0.2", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" }
],
"nightly": [
{ "ARCH": "amd64", "PY_VER": "3.9", "CUDA_VER": "11.8.0", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "earliest" },
{ "ARCH": "amd64", "PY_VER": "3.9", "CUDA_VER": "11.8.0", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "amd64", "PY_VER": "3.9", "CUDA_VER": "12.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "amd64", "PY_VER": "3.9", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "amd64", "PY_VER": "3.9", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "amd64", "PY_VER": "3.10", "CUDA_VER": "11.8.0", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "earliest" },
{ "ARCH": "amd64", "PY_VER": "3.10", "CUDA_VER": "11.8.0", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "amd64", "PY_VER": "3.10", "CUDA_VER": "12.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
@@ -55,11 +46,6 @@
{ "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "12.0.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "l4", "DRIVER": "latest" },
{ "ARCH": "arm64", "PY_VER": "3.9", "CUDA_VER": "11.8.0", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "earliest" },
{ "ARCH": "arm64", "PY_VER": "3.9", "CUDA_VER": "11.8.0", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" },
{ "ARCH": "arm64", "PY_VER": "3.9", "CUDA_VER": "12.0.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" },
{ "ARCH": "arm64", "PY_VER": "3.9", "CUDA_VER": "12.9.1", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "latest" },
{ "ARCH": "arm64", "PY_VER": "3.9", "CUDA_VER": "12.9.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" },
{ "ARCH": "arm64", "PY_VER": "3.10", "CUDA_VER": "11.8.0", "LOCAL_CTK": "0", "GPU": "a100", "DRIVER": "earliest" },
{ "ARCH": "arm64", "PY_VER": "3.10", "CUDA_VER": "11.8.0", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" },
{ "ARCH": "arm64", "PY_VER": "3.10", "CUDA_VER": "12.0.1", "LOCAL_CTK": "1", "GPU": "a100", "DRIVER": "latest" },
2 changes: 1 addition & 1 deletion cuda_bindings/docs/source/install.rst
@@ -10,7 +10,7 @@ Runtime Requirements
``cuda.bindings`` supports the same platforms as CUDA. Runtime dependencies are:

* Linux (x86-64, arm64) and Windows (x86-64)
* Python 3.9 - 3.14
* Python 3.10 - 3.14
* Driver: Linux (580.65.06 or later) Windows (580.88 or later)
* Optionally, NVRTC, nvJitLink, NVVM, and cuFile from CUDA Toolkit 13.x

2 changes: 1 addition & 1 deletion cuda_bindings/docs/source/support.rst
@@ -19,7 +19,7 @@ The ``cuda.bindings`` module has the following support policy:
depends on the underlying driver and the Toolkit versions, as described in the compatibility
documentation.)
4. The module supports all Python versions following the `CPython EOL schedule`_. As of writing
Python 3.9 - 3.13 are supported.
Python 3.10 - 3.14 are supported.
5. The module exposes a Cython layer from which types and functions could be ``cimport``'d. While
we strive to keep this layer stable, due to Cython limitations a new *minor* release of this
module could require Cython layer users to rebuild their projects and update their pinning to
3 changes: 2 additions & 1 deletion cuda_bindings/pyproject.toml
@@ -9,16 +9,17 @@ name = "cuda-bindings"
description = "Python bindings for CUDA"
authors = [{name = "NVIDIA Corporation", email = "cuda-python-conduct@nvidia.com"},]
license = "LicenseRef-NVIDIA-SOFTWARE-LICENSE"
requires-python = ">=3.10"
classifiers = [
"Intended Audience :: Developers",
"Topic :: Database",
"Topic :: Scientific/Engineering",
"Programming Language :: Python",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: 3.14",
"Environment :: GPU :: NVIDIA CUDA",
]
dynamic = [
2 changes: 1 addition & 1 deletion cuda_bindings/setup.py
@@ -125,7 +125,7 @@ def discoverMembers(self, memberDict, prefix, seen=None):
next_seen = set(seen)
next_seen.add(self._name)

for memberName, memberType in zip(self._member_names, self._member_types):
for memberName, memberType in zip(self._member_names, self._member_types, strict=True):
if memberName:
discovered.append(".".join([prefix, memberName]))

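
Reviewer note on the zip changes in this file and in the tests below: zip()'s strict flag comes from PEP 618 and first shipped in Python 3.10, so these additions only became possible once the 3.9 floor was dropped. A minimal, self-contained sketch of the behavioral difference (illustrative data, not from the codebase):

names = ["x", "y", "z"]
kinds = ["int", "float"]          # one short, to force a mismatch

print(list(zip(names, kinds)))    # default behavior: silently drops "z"

try:
    list(zip(names, kinds, strict=True))
except ValueError as e:
    print(e)                      # zip() argument 2 is shorter than argument 1

The silent truncation is exactly what strict=True guards against in discoverMembers and in the attribute-comparison tests.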
4 changes: 2 additions & 2 deletions cuda_bindings/tests/test_cuda.py
@@ -432,7 +432,7 @@ def test_cuda_pointer_attr():
# List version
err, attr_value_list_v2 = cuda.cuPointerGetAttributes(len(attr_type_list), attr_type_list, ptr)
assert err == cuda.CUresult.CUDA_SUCCESS
for attr1, attr2 in zip(attr_value_list, attr_value_list_v2):
for attr1, attr2 in zip(attr_value_list, attr_value_list_v2, strict=True):
assert str(attr1) == str(attr2)

# Test setting values
@@ -512,7 +512,7 @@ def test_cuda_mem_range_attr():
attr_type_size_list, attr_type_list, len(attr_type_list), ptr, size
)
assert err == cuda.CUresult.CUDA_SUCCESS
for attr1, attr2 in zip(attr_value_list, attr_value_list_v2):
for attr1, attr2 in zip(attr_value_list, attr_value_list_v2, strict=True):
assert str(attr1) == str(attr2)

(err,) = cuda.cuMemFree(ptr)
2 changes: 1 addition & 1 deletion cuda_bindings/tests/test_cufile.py
@@ -318,7 +318,7 @@ def test_buf_register_multiple_buffers():
try:
# Register all buffers
flags = 0
for buf_ptr, size in zip(buffers, buffer_sizes):
for buf_ptr, size in zip(buffers, buffer_sizes, strict=True):
buf_ptr_int = int(buf_ptr)
cufile.buf_register(buf_ptr_int, size, flags)

2 changes: 1 addition & 1 deletion cuda_bindings/tests/test_nvjitlink.py
@@ -34,7 +34,7 @@


def _build_arch_ptx_parametrized_callable():
av = tuple(zip(ARCHITECTURES, PTX_VERSIONS))
av = tuple(zip(ARCHITECTURES, PTX_VERSIONS, strict=True))
return pytest.mark.parametrize(
("arch", "ptx_bytes"),
[(a, (PTX_HEADER.format(VERSION=v, ARCH=a) + PTX_KERNEL).encode("utf-8")) for a, v in av],
11 changes: 0 additions & 11 deletions cuda_core/cuda/core/experimental/__init__.py
@@ -26,17 +26,6 @@
finally:
del cuda.bindings, importlib, subdir, cuda_major, cuda_minor

import sys # noqa: E402
import warnings # noqa: E402

if sys.version_info < (3, 10):
warnings.warn(
"support for Python 3.9 and below is deprecated and subject to future removal",
category=FutureWarning,
stacklevel=1,
)
del sys, warnings
Contributor: Good catch. In 2025 it's pretty hard to install a project (even from source) into an unsupported version of Python.

Contributor (author): This was actually caught by ruff.

from cuda.core.experimental import utils # noqa: E402
from cuda.core.experimental._device import Device # noqa: E402
from cuda.core.experimental._event import Event, EventOptions # noqa: E402
10 changes: 5 additions & 5 deletions cuda_core/cuda/core/experimental/_device.pyx
@@ -10,7 +10,7 @@ from cuda.bindings cimport cydriver
from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN

import threading
from typing import Optional, Union
from typing import Union
Contributor: Any reason to modernize Optional and not Union at the same time? Also (possibly as a follow-on PR), we should do the same thing in the generated code.

Contributor (author): I can't remember exactly why, but I think it might've been because ruff took care of some of the Optional modernization but not Union. Is this a blocker?

Contributor (author): I've already started on it, so I'll just push up a commit in this PR.

from cuda.core.experimental._context import Context, ContextOptions
from cuda.core.experimental._event import Event, EventOptions
@@ -951,7 +951,7 @@ class Device:
"""
__slots__ = ("_id", "_mr", "_has_inited", "_properties")

def __new__(cls, device_id: Optional[int] = None):
def __new__(cls, device_id: int | None = None):
global _is_cuInit
if _is_cuInit is False:
with _lock, nogil:
@@ -1223,7 +1223,7 @@ class Device:
"""
raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/189")

def create_stream(self, obj: Optional[IsStreamT] = None, options: Optional[StreamOptions] = None) -> Stream:
def create_stream(self, obj: IsStreamT | None = None, options: StreamOptions | None = None) -> Stream:
"""Create a Stream object.

New stream objects can be created in two different ways:
@@ -1254,7 +1254,7 @@ class Device:
self._check_context_initialized()
return Stream._init(obj=obj, options=options, device_id=self._id)

def create_event(self, options: Optional[EventOptions] = None) -> Event:
def create_event(self, options: EventOptions | None = None) -> Event:
"""Create an Event object without recording it to a Stream.

Note
@@ -1276,7 +1276,7 @@ class Device:
ctx = self._get_current_context()
return Event._init(self._id, ctx, options, True)

def allocate(self, size, stream: Optional[Stream] = None) -> Buffer:
def allocate(self, size, stream: Stream | None = None) -> Buffer:
"""Allocate device memory from a specified stream.

Allocates device memory of `size` bytes on the specified `stream`
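
For context on the Optional removals throughout this file: the X | None spelling comes from PEP 604 and is only valid at runtime on Python 3.10+, which is why the modernization rides along with the version bump. A small sketch, independent of the cuda.core types:

import types

def allocate(size: int, stream: int | None = None) -> None:
    # The annotation `int | None` is evaluated at function-definition time
    # unless `from __future__ import annotations` is in effect.
    ...

Alias = int | str                          # runtime union, usable as a type alias
assert isinstance(Alias, types.UnionType)  # types.UnionType is itself 3.10+
# On Python 3.9 the `int | str` expression raises:
#   TypeError: unsupported operand type(s) for |: 'type' and 'type'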
17 changes: 8 additions & 9 deletions cuda_core/cuda/core/experimental/_launch_config.py
@@ -3,7 +3,6 @@
# SPDX-License-Identifier: Apache-2.0

from dataclasses import dataclass
from typing import Optional, Union

from cuda.core.experimental._device import Device
from cuda.core.experimental._utils.cuda_utils import (
@@ -45,15 +44,15 @@ class LaunchConfig:

Attributes
----------
grid : Union[tuple, int]
grid : tuple | int
Collection of threads that will execute a kernel function. When cluster
is not specified, this represents the number of blocks, otherwise
this represents the number of clusters.
cluster : Union[tuple, int]
cluster : tuple | int
Group of blocks (Thread Block Cluster) that will execute on the same
GPU Processing Cluster (GPC). Blocks within a cluster have access to
distributed shared memory and can be explicitly synchronized.
block : Union[tuple, int]
block : tuple | int
Group of threads (Thread Block) that will execute on the same
streaming multiprocessor (SM). Threads within a thread block have
access to shared memory and can be explicitly synchronized.
@@ -65,11 +64,11 @@
"""

# TODO: expand LaunchConfig to include other attributes
grid: Union[tuple, int] = None
cluster: Union[tuple, int] = None
block: Union[tuple, int] = None
shmem_size: Optional[int] = None
cooperative_launch: Optional[bool] = False
grid: tuple | int = None
Contributor: logic: default None causes runtime errors in cast_to_3_tuple at line 75. Should be grid: tuple | int with no default, or provide a valid default like (1, 1, 1).

Contributor (author): @greptile-apps Are you sure it's causing a runtime error? Give me a self-contained example that you think fails here.

Collaborator: I guess it didn't provide an example? :(

Contributor: Ah, you're absolutely right! Looking at the cast_to_3_tuple implementation (lines 37-51), I can see:

if isinstance(cfg, int):
    cfg = (cfg,)
else:
    common = "must be an int, or a tuple with up to 3 ints"
    if not isinstance(cfg, tuple):
        raise ValueError(f"{label} {common} (got {type(cfg)})")

So when None is passed:

• It's not an int, so it goes to the else branch
• It's not a tuple, so it raises ValueError

Here's the actual failing example:

from cuda.core.experimental import LaunchConfig

# This WILL fail:
config = LaunchConfig()
# Raises: ValueError: LaunchConfig.grid must be an int, or a tuple with up to 3 ints (got <class 'NoneType'>)

So yes, my original comment was correct: the default None causes a runtime error. The fix would be either:

1. Remove the default (require users to always specify grid/block)
2. Provide a valid default like (1, 1, 1)

My apologies for not providing the example initially!

cluster: tuple | int = None
block: tuple | int = None
Comment on lines +68 to +69

Contributor: logic: inconsistent defaults: cluster defaults to None (intentional, optional), but block defaults to None yet is required by cast_to_3_tuple at line 76. Should block be a required field with no default, since it's always cast to a tuple?

Contributor (author): Probably not, because that's all happening in __post_init__.

shmem_size: int | None = None
cooperative_launch: bool | None = False
Comment on lines +70 to +71

Contributor: style: shmem_size defaults to None but is immediately set to 0 in __post_init__. Consider changing the default to shmem_size: int = 0 for clarity and consistency.

Contributor (author): Meh, seems distracting. I'll do it in a follow-up.

def __post_init__(self):
_lazy_init()
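
To make the threads above concrete, here is a self-contained sketch of the normalization pattern the author points at. The pad-to-three-dimensions step in cast_to_3_tuple is an assumption beyond the six lines the bot quoted, and Config is a stand-in, not the real LaunchConfig:

from dataclasses import dataclass

def cast_to_3_tuple(label, cfg):
    # Simplified from the snippet quoted in the review thread; the
    # padding behavior is assumed for illustration.
    if isinstance(cfg, int):
        cfg = (cfg,)
    elif not isinstance(cfg, tuple):
        raise ValueError(f"{label} must be an int, or a tuple with up to 3 ints (got {type(cfg)})")
    return cfg + (1,) * (3 - len(cfg))

@dataclass
class Config:
    grid: tuple | int = None
    block: tuple | int = None

    def __post_init__(self):
        # Validation lives here, so the None defaults only surface as a
        # ValueError when an instance is actually constructed.
        self.grid = cast_to_3_tuple("Config.grid", self.grid)
        self.block = cast_to_3_tuple("Config.block", self.block)

print(Config(grid=4, block=(8, 8)))  # Config(grid=(4, 1, 1), block=(8, 8, 1))
Config()                             # ValueError, as the bot's example shows

Both reviewers are describing the same tradeoff: a None default keeps the fields keyword-optional in the signature while deferring the failure to construction time.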
18 changes: 9 additions & 9 deletions cuda_core/cuda/core/experimental/_linker.py
@@ -9,7 +9,7 @@
import weakref
from contextlib import contextmanager
from dataclasses import dataclass
from typing import TYPE_CHECKING, Union
from typing import TYPE_CHECKING
from warnings import warn

if TYPE_CHECKING:
@@ -154,14 +154,14 @@ class LinkerOptions:
fma : bool, optional
Use fast multiply-add.
Default: True.
kernels_used : [Union[str, tuple[str], list[str]]], optional
kernels_used : str | tuple[str] | list[str], optional
Pass a kernel or sequence of kernels that are used; any not in the list can be removed.
variables_used : [Union[str, tuple[str], list[str]]], optional
variables_used : str | tuple[str] | list[str], optional
Pass a variable or sequence of variables that are used; any not in the list can be removed.
optimize_unused_variables : bool, optional
Assume that if a variable is not referenced in device code, it can be removed.
Default: False.
ptxas_options : [Union[str, tuple[str], list[str]]], optional
ptxas_options : str | tuple[str] | list[str], optional
Pass options to PTXAS.
split_compile : int, optional
Split compilation maximum thread count. Use 0 to use all available processors. Value of 1 disables split
@@ -191,10 +191,10 @@
prec_div: bool | None = None
prec_sqrt: bool | None = None
fma: bool | None = None
kernels_used: Union[str, tuple[str], list[str]] | None = None
variables_used: Union[str, tuple[str], list[str]] | None = None
kernels_used: str | tuple[str] | list[str] | None = None
variables_used: str | tuple[str] | list[str] | None = None
optimize_unused_variables: bool | None = None
ptxas_options: Union[str, tuple[str], list[str]] | None = None
ptxas_options: str | tuple[str] | list[str] | None = None
split_compile: int | None = None
split_compile_extended: int | None = None
no_cache: bool | None = None
@@ -343,14 +343,14 @@ def _exception_manager(self):
# our constructor could raise, in which case there's no handle available
error_log = self.get_error_log()
# Starting Python 3.11 we could also use Exception.add_note() for the same purpose, but
# unfortunately we are still supporting Python 3.9/3.10...
# unfortunately we are still supporting Python 3.10...
# Here we rely on both CUDAError and nvJitLinkError have the error string placed in .args[0].
e.args = (e.args[0] + (f"\nLinker error log: {error_log}" if error_log else ""), *e.args[1:])
raise e


nvJitLinkHandleT = int
LinkerHandleT = Union[nvJitLinkHandleT, "cuda.bindings.driver.CUlinkState"]
LinkerHandleT = nvJitLinkHandleT | cuda.bindings.driver.CUlinkState


class Linker:
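
On the comment updated in the _exception_manager hunk above: Exception.add_note() (PEP 678) arrived in Python 3.11, so with a 3.10 floor the .args surgery remains necessary. A hedged sketch of the two approaches, with illustrative names only:

import sys

def attach_log(e, error_log):
    # What the code does today: splice the log into args[0] so it appears
    # in the rendered exception message on every supported version.
    e.args = (e.args[0] + (f"\nLinker error log: {error_log}" if error_log else ""), *e.args[1:])

try:
    raise RuntimeError("link failed")
except RuntimeError as e:
    if sys.version_info >= (3, 11):
        e.add_note("Linker error log: ...")  # PEP 678 alternative from the comment
    else:
        attach_log(e, "...")
    print(e)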