NVIDIA · kkraus14 · Oct 6, 2025 · Oct 6, 2025 · Oct 6, 2025 · Oct 6, 2025
diff --git a/cuda_core/docs/source/getting-started.rst b/cuda_core/docs/source/getting-started.rst
@@ -47,7 +47,7 @@ First, we define a string containing the CUDA C++ kernel. Note that this is a te
                               size_t N) {
        const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
        for (size_t i=tid; i<N; i+=gridDim.x*blockDim.x) {
-           C[tid] = A[tid] + B[tid];
+           C[i] = A[i] + B[i];
        }
    }
    """

diff --git a/cuda_core/docs/source/release/0.X.Y-notes.rst b/cuda_core/docs/source/release/0.X.Y-notes.rst
@@ -52,3 +52,4 @@ Fixes and enhancements
 - Fixed a bug in :class:`Stream` and other classes where object cleanup would error during interpreter shutdown.
 - :class:`StridedMemoryView` of an underlying array using the DLPack protocol will no longer leak memory.
 - General performance improvement.
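+- Fixed the kernels in the getting-started guide and in the ``vector_add`` and ``simple_multi_gpu_example`` examples to index with the loop variable ``i`` instead of ``tid``, so every element is processed when the grid is smaller than the array.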
diff --git a/cuda_core/examples/simple_multi_gpu_example.py b/cuda_core/examples/simple_multi_gpu_example.py
@@ -35,7 +35,7 @@
                            size_t N) {
     const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
     for (size_t i=tid; i<N; i+=gridDim.x*blockDim.x) {
-        C[tid] = A[tid] + B[tid];
+        C[i] = A[i] + B[i];
     }
 }
 """

diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py
@@ -21,7 +21,7 @@
                            size_t N) {
     const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
     for (size_t i=tid; i<N; i+=gridDim.x*blockDim.x) {
-        C[tid] = A[tid] + B[tid];
+        C[i] = A[i] + B[i];
     }
 }
 """