From 6c3b48be0eb60097b505a2642d9e725b17632a38 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 7 Oct 2025 19:51:20 +0000 Subject: [PATCH 1/5] bump cuda.core to v0.4.0 --- cuda_core/cuda/core/_version.py | 2 +- cuda_core/docs/nv-versions.json | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/_version.py b/cuda_core/cuda/core/_version.py index 8326aa224b..44667d4a0f 100644 --- a/cuda_core/cuda/core/_version.py +++ b/cuda_core/cuda/core/_version.py @@ -2,4 +2,4 @@ # # SPDX-License-Identifier: Apache-2.0 -__version__ = "0.3.3a0" +__version__ = "0.4.0" diff --git a/cuda_core/docs/nv-versions.json b/cuda_core/docs/nv-versions.json index d1c10914cd..d9dd20e5cd 100644 --- a/cuda_core/docs/nv-versions.json +++ b/cuda_core/docs/nv-versions.json @@ -3,6 +3,10 @@ "version": "latest", "url": "https://nvidia.github.io/cuda-python/cuda-core/latest/" }, + { + "version": "0.4.0", + "url": "https://nvidia.github.io/cuda-python/cuda-core/0.4.0/" + }, { "version": "0.3.2", "url": "https://nvidia.github.io/cuda-python/cuda-core/0.3.2/" From 5157c1d8243d71171b28b275f0dfd00e7639b299 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 7 Oct 2025 22:35:00 +0000 Subject: [PATCH 2/5] document deallocation stream change --- cuda_core/docs/source/release/0.X.Y-notes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/cuda_core/docs/source/release/0.X.Y-notes.rst b/cuda_core/docs/source/release/0.X.Y-notes.rst index 7c14873293..bc5e9844bf 100644 --- a/cuda_core/docs/source/release/0.X.Y-notes.rst +++ b/cuda_core/docs/source/release/0.X.Y-notes.rst @@ -22,6 +22,7 @@ Breaking Changes - **CUDA 11 support dropped**: CUDA 11 support is no longer tested and it may or may not work with cuda.bindings and CTK 11.x. Users are encouraged to migrate to CUDA 12.x or 13.x. - Support for ``cuda-bindings`` (and ``cuda-python``) < 12.6.2 is dropped. Internally, ``cuda.core`` now always requires the `new binding module layout `_. 
As per the ``cuda-bindings`` `support policy `_), CUDA 12 users are encouraged to use the latest ``cuda-bindings`` 12.9.x, which is backward-compatible with all CUDA Toolkit 12.y. - **LaunchConfig grid parameter interpretation**: When :attr:`LaunchConfig.cluster` is specified, the :attr:`LaunchConfig.grid` parameter now correctly represents the number of clusters instead of blocks. Previously, the grid parameter was incorrectly interpreted as blocks, causing a mismatch with the expected C++ behavior. This change ensures that ``LaunchConfig(grid=4, cluster=2, block=32)`` correctly produces 4 clusters × 2 blocks/cluster = 8 total blocks, matching the C++ equivalent ``cudax::make_hierarchy(cudax::grid_dims(4), cudax::cluster_dims(2), cudax::block_dims(32))``. +- :class:`Buffer` objects now deallocate on the stream that was used to allocate them, instead of on the default stream. We encourage users to override the deallocation stream explicitly through the :meth:`~Buffer.close` method if desired. Establishing a proper stream order is the user's responsibility. New features From 43d8971c27eb1f52f4e8f871272f57296a4e7281 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 7 Oct 2025 22:36:20 +0000 Subject: [PATCH 3/5] prepare 0.4.0 rel notes --- cuda_core/docs/source/release.rst | 2 +- .../docs/source/release/{0.X.Y-notes.rst => 0.4.0-notes.rst} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename cuda_core/docs/source/release/{0.X.Y-notes.rst => 0.4.0-notes.rst} (100%) diff --git a/cuda_core/docs/source/release.rst b/cuda_core/docs/source/release.rst index dc28b31220..8be38864c5 100644 --- a/cuda_core/docs/source/release.rst +++ b/cuda_core/docs/source/release.rst @@ -7,7 +7,7 @@ Release Notes .. 
toctree:: :maxdepth: 3 - 0.X.Y + 0.4.0 0.3.2 0.3.1 0.3.0 diff --git a/cuda_core/docs/source/release/0.X.Y-notes.rst b/cuda_core/docs/source/release/0.4.0-notes.rst similarity index 100% rename from cuda_core/docs/source/release/0.X.Y-notes.rst rename to cuda_core/docs/source/release/0.4.0-notes.rst From 85f64f0bc1b4daa7861e2cf9716bb67798a695eb Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 7 Oct 2025 22:42:57 +0000 Subject: [PATCH 4/5] nit: avoid highlighting when applicable --- cuda_core/docs/source/release/0.4.0-notes.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/docs/source/release/0.4.0-notes.rst b/cuda_core/docs/source/release/0.4.0-notes.rst index bc5e9844bf..83632ad7b2 100644 --- a/cuda_core/docs/source/release/0.4.0-notes.rst +++ b/cuda_core/docs/source/release/0.4.0-notes.rst @@ -19,9 +19,9 @@ Highlights Breaking Changes ---------------- -- **CUDA 11 support dropped**: CUDA 11 support is no longer tested and it may or may not work with cuda.bindings and CTK 11.x. Users are encouraged to migrate to CUDA 12.x or 13.x. +- CUDA 11 support dropped: CUDA 11 is no longer tested and it may or may not work with ``cuda.bindings`` and CTK 11.x. Users are encouraged to migrate to CUDA 12.x or 13.x. - Support for ``cuda-bindings`` (and ``cuda-python``) < 12.6.2 is dropped. Internally, ``cuda.core`` now always requires the `new binding module layout `_. As per the ``cuda-bindings`` `support policy `_), CUDA 12 users are encouraged to use the latest ``cuda-bindings`` 12.9.x, which is backward-compatible with all CUDA Toolkit 12.y. -- **LaunchConfig grid parameter interpretation**: When :attr:`LaunchConfig.cluster` is specified, the :attr:`LaunchConfig.grid` parameter now correctly represents the number of clusters instead of blocks. Previously, the grid parameter was incorrectly interpreted as blocks, causing a mismatch with the expected C++ behavior. 
This change ensures that ``LaunchConfig(grid=4, cluster=2, block=32)`` correctly produces 4 clusters × 2 blocks/cluster = 8 total blocks, matching the C++ equivalent ``cudax::make_hierarchy(cudax::grid_dims(4), cudax::cluster_dims(2), cudax::block_dims(32))``. +- Change in :class:`LaunchConfig` grid parameter interpretation: When :attr:`LaunchConfig.cluster` is specified, the :attr:`LaunchConfig.grid` parameter now correctly represents the number of clusters instead of blocks. Previously, the grid parameter was incorrectly interpreted as blocks, causing a mismatch with the expected C++ behavior. This change ensures that ``LaunchConfig(grid=4, cluster=2, block=32)`` correctly produces 4 clusters × 2 blocks/cluster = 8 total blocks, matching the C++ equivalent ``cudax::make_hierarchy(cudax::grid_dims(4), cudax::cluster_dims(2), cudax::block_dims(32))``. - :class:`Buffer` objects now deallocate on the stream that was used to allocate them, instead of on the default stream. We encourage users to override the deallocation stream explicitly through the :meth:`~Buffer.close` method if desired. Establishing a proper stream order is the user's responsibility. From 6b1a7bb70bb519389984070bcc9c74b481a3b4a7 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 8 Oct 2025 17:48:33 +0000 Subject: [PATCH 5/5] emphasize VMM is linux only for now --- cuda_core/docs/source/release/0.4.0-notes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/docs/source/release/0.4.0-notes.rst b/cuda_core/docs/source/release/0.4.0-notes.rst index 01380ceba9..919892340d 100644 --- a/cuda_core/docs/source/release/0.4.0-notes.rst +++ b/cuda_core/docs/source/release/0.4.0-notes.rst @@ -33,7 +33,7 @@ New features - Stream-ordered memory allocation can now be shared on Linux via :class:`DeviceMemoryResource`. - Added NVVM IR support to :class:`Program`. NVVM IR is now understood with ``code_type="nvvm"``. 
- Added an :attr:`ObjectCode.code_type` attribute for querying the code type. -- Added :class:`VirtualMemoryResource` for low-level virtual memory management. +- Added :class:`VirtualMemoryResource` for low-level virtual memory management on Linux. New examples