From d3d36a5b4c12d284a2d0b5157aff025260e9b39e Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 15 Apr 2026 16:54:39 -0700
Subject: [PATCH 1/2] fix: re-enable and fix the conjugate gradient multi-block
 example

Remove the unconditional NVRTC waiver from the renamed example so CI can exercise its real execution path again. While re-enabling it, replace the standalone pytest.skip() checks with requirement_not_met() and simplify the platform gating.

The waived code path had been hiding several Python-side runtime bugs that required these fixes:
- replaced the invalid C-style %d f-string formatting
- fixed gen_tridiag() variable shadowing so the CSR row-offset array is actually populated
- passed the computed dynamic shared-memory size into cuLaunchCooperativeKernel() and made that size integer-valued
- stopped overwriting managed-memory pointer variables with loop indices before kernel launch and cleanup
- cached the residual before freeing dot_result, which removed the teardown segfault

Made-with: Cursor
---
 .../conjugate_gradient_multi_block_cg.py      | 93 +++++++++----------
 1 file changed, 46 insertions(+), 47 deletions(-)

diff --git a/cuda_bindings/examples/4_CUDA_Libraries/conjugate_gradient_multi_block_cg.py b/cuda_bindings/examples/4_CUDA_Libraries/conjugate_gradient_multi_block_cg.py
index 44ff57c6d7..94c9519100 100644
--- a/cuda_bindings/examples/4_CUDA_Libraries/conjugate_gradient_multi_block_cg.py
+++ b/cuda_bindings/examples/4_CUDA_Libraries/conjugate_gradient_multi_block_cg.py
@@ -26,6 +26,7 @@
     KernelHelper,
     check_cuda_errors,
     find_cuda_device,
+    requirement_not_met,
 )
 
 conjugate_gradient_multi_block_cg = """\
@@ -177,71 +178,68 @@
 """
 
 
-def gen_tridiag(i, j, val, n, nz):
-    i[0] = 0
-    j[0] = 0
-    j[1] = 0
+def gen_tridiag(row_offsets, col_indices, values, n, nz):
+    row_offsets[0] = 0
+    col_indices[0] = 0
+    col_indices[1] = 0
 
-    val[0] = float(random()) + 10.0
-    val[1] = float(random())
+    values[0] = float(random()) + 10.0
+    values[1] = float(random())
 
-    for i in range(1, n):
-        if i > 1:
-            i[i] = i[i - 1] + 3
+    for row_idx in range(1, n):
+        if row_idx > 1:
+            row_offsets[row_idx] = row_offsets[row_idx - 1] + 3
         else:
-            i[1] = 2
+            row_offsets[1] = 2
 
-        start = (i - 1) * 3 + 2
-        j[start] = i - 1
-        j[start + 1] = i
+        start = (row_idx - 1) * 3 + 2
+        col_indices[start] = row_idx - 1
+        col_indices[start + 1] = row_idx
 
-        if i < n - 1:
-            j[start + 2] = i + 1
+        if row_idx < n - 1:
+            col_indices[start + 2] = row_idx + 1
 
-        val[start] = val[start - 1]
-        val[start + 1] = float(random()) + 10.0
+        values[start] = values[start - 1]
+        values[start + 1] = float(random()) + 10.0
 
-        if i < n - 1:
-            val[start + 2] = float(random())
-    i[n] = nz
+        if row_idx < n - 1:
+            values[start + 2] = float(random())
+    row_offsets[n] = nz
 
 
 THREADS_PER_BLOCK = 512
 s_sd_kname = "conjugateGradientMultiBlockCG"
+UNSUPPORTED_SYSTEMS = {"Darwin", "QNX"}
+UNSUPPORTED_MACHINES = {"armv7l"}
 
 
 def main():
     tol = 1e-5
 
-    import pytest
+    system_name = platform.system()
+    if system_name in UNSUPPORTED_SYSTEMS:
+        requirement_not_met(f"{s_sd_kname} is not supported on {system_name}")
 
-    # WAIVE: Due to bug in NVRTC
-    return
-
-    if platform.system() == "Darwin":
-        pytest.skip("conjugateGradientMultiBlockCG is not supported on Mac OSX")
-
-    if platform.machine() == "armv7l":
-        pytest.skip("conjugateGradientMultiBlockCG is not supported on ARMv7")
-
-    if platform.machine() == "qnx":
-        pytest.skip("conjugateGradientMultiBlockCG is not supported on QNX")
+    machine_name = platform.machine()
+    if machine_name in UNSUPPORTED_MACHINES:
+        requirement_not_met(f"{s_sd_kname} is not supported on {machine_name}")
 
     # This will pick the best possible CUDA capable device
     dev_id = find_cuda_device()
     device_prop = check_cuda_errors(cudart.cudaGetDeviceProperties(dev_id))
 
     if not device_prop.managedMemory:
-        pytest.skip("Unified Memory not supported on this device")
+        requirement_not_met("Unified Memory not supported on this device")
 
     # This sample requires being run on a device that supports Cooperative Kernel
     # Launch
     if not device_prop.cooperativeLaunch:
-        pytest.skip(f"Selected GPU {dev_id} does not support Cooperative Kernel Launch")
+        requirement_not_met(f"Selected GPU {dev_id} does not support Cooperative Kernel Launch")
 
     # Statistics about the GPU device
     print(
-        f"> GPU device has {device_prop.multiProcessorCount:%d} Multi-Processors, SM {device_prop.major:%d}.{device_prop.minor:%d} compute capabilities\n"
+        f"> GPU device has {device_prop.multiProcessorCount} Multi-Processors, "
+        f"SM {device_prop.major}.{device_prop.minor} compute capabilities\n"
     )
 
     # Get kernel
@@ -267,7 +265,7 @@ def main():
     x_local = (ctypes.c_float * n).from_address(x)
     rhs_local = (ctypes.c_float * n).from_address(rhs)
     dot_result_local = (ctypes.c_double).from_address(dot_result)
-    dot_result_local = 0
+    dot_result_local.value = 0.0
 
     # temp memory for CG
     r = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * n, cudart.cudaMemAttachGlobal))
@@ -280,9 +278,9 @@ def main():
     start = check_cuda_errors(cudart.cudaEventCreate())
     stop = check_cuda_errors(cudart.cudaEventCreate())
 
-    for i in range(n):
-        r_local[i] = rhs_local[i] = 1.0
-        x_local[i] = 0.0
+    for idx in range(n):
+        r_local[idx] = rhs_local[idx] = 1.0
+        x_local[idx] = 0.0
 
     kernel_args_value = (i, j, val, x, ax, p, r, dot_result, nz, n, tol)
     kernel_args_types = (
@@ -300,7 +298,7 @@ def main():
     )
     kernel_args = (kernel_args_value, kernel_args_types)
 
-    s_mem_size = np.dtype(np.float64).itemsize * ((THREADS_PER_BLOCK / 32) + 1)
+    s_mem_size = np.dtype(np.float64).itemsize * ((THREADS_PER_BLOCK // 32) + 1)
     num_threads = THREADS_PER_BLOCK
     num_blocks_per_sm = check_cuda_errors(
         cuda.cuOccupancyMaxActiveBlocksPerMultiprocessor(_gpu_conjugate_gradient, num_threads, s_mem_size)
@@ -325,7 +323,7 @@ def main():
             dim_block.x,
             dim_block.y,
             dim_block.z,
-            0,
+            s_mem_size,
             0,
             kernel_args,
         )
@@ -334,16 +332,17 @@ def main():
     check_cuda_errors(cudart.cudaDeviceSynchronize())
 
     time = check_cuda_errors(cudart.cudaEventElapsedTime(start, stop))
-    print(f"GPU Final, residual = {math.sqrt(dot_result_local):e}, kernel execution time = {time:f} ms")
+    residual = math.sqrt(dot_result_local.value)
+    print(f"GPU Final, residual = {residual:e}, kernel execution time = {time:f} ms")
 
     err = 0.0
-    for i in range(n):
+    for row_idx in range(n):
         rsum = 0.0
 
-        for j in range(i_local[i], i_local[i + 1]):
-            rsum += val_local[j] * x_local[j_local[j]]
+        for elem_idx in range(i_local[row_idx], i_local[row_idx + 1]):
+            rsum += val_local[elem_idx] * x_local[j_local[elem_idx]]
 
-        diff = math.fabs(rsum - rhs_local[i])
+        diff = math.fabs(rsum - rhs_local[row_idx])
 
         if diff > err:
             err = diff
@@ -361,7 +360,7 @@ def main():
     check_cuda_errors(cudart.cudaEventDestroy(stop))
 
     print(f"Test Summary:  Error amount = {err:f}")
-    if math.sqrt(dot_result_local) >= tol:
+    if residual >= tol:
         print("conjugateGradientMultiBlockCG FAILED", file=sys.stderr)
         sys.exit(1)
 

From be43e41ca3385d9fea10ebc1795f3684e8f5db0f Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 15 Apr 2026 17:03:50 -0700
Subject: [PATCH 2/2] fix: check the QNX example gate via platform.system()

QNX is an operating system rather than a machine architecture, so checking platform.machine() can miss the requirement_not_met() path on QNX hosts. Use platform.system() so the example is waived consistently on that platform.

Made-with: Cursor
---
 .../examples/3_CUDA_Features/global_to_shmem_async_copy.py      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_bindings/examples/3_CUDA_Features/global_to_shmem_async_copy.py b/cuda_bindings/examples/3_CUDA_Features/global_to_shmem_async_copy.py
index 18f5c88e9d..d3ab1c53f1 100644
--- a/cuda_bindings/examples/3_CUDA_Features/global_to_shmem_async_copy.py
+++ b/cuda_bindings/examples/3_CUDA_Features/global_to_shmem_async_copy.py
@@ -1142,7 +1142,7 @@ def matrix_multiply(dims_a, dims_b, kernel_number):
 def main():
     check_compute_capability_too_low(find_cuda_device(), (7, 0))
 
-    if platform.machine() == "qnx":
+    if platform.system() == "QNX":
         requirement_not_met("globalToShmemAsyncCopy is not supported on QNX")
 
     version = check_cuda_errors(cuda.cuDriverGetVersion())