Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions benchmarks/cuda_bindings/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,13 @@ Driver APIs through cuda.bindings, relative to a similar C++ baseline.
The goal is to benchmark how much overhead the Python layer adds to calling
CUDA APIs and which operations fall outside our target of less than 1us of overhead.

Each Python benchmark has a C++ counterpart, which is used to compare the
operations. We try to make each implementation perform small operations
and nearly the same work as possible and are run under similar conditions.
Most Python benchmarks have a C++ counterpart that is used as a comparative
baseline. We try to make each implementation perform small operations, do
nearly the same work, and run under similar conditions.

A few benchmarks (e.g. in `bench_enum.py`) are intentionally Python-only
because they measure costs with no direct C++ equivalent — such as enum
construction and member access on `cuda.bindings` enum classes.

These are **not** throughput benchmarks to measure the overall performance
of kernels and applications.
Expand Down
17 changes: 17 additions & 0 deletions benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@
# Benchmark fixtures resolved once at import time.
# Check the cuDeviceGet status instead of discarding it, for consistency with
# the primary-context retain below — a bad device would otherwise surface as a
# confusing failure inside the benchmark loops.
_err, DEVICE = cuda.cuDeviceGet(0)
if _err != cuda.CUresult.CUDA_SUCCESS:
    raise RuntimeError(f"cuDeviceGet failed during setup: {_err}")
ATTRIBUTE = cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR

# Outer retain so the benchmarked retain/release pair just bumps the refcount.
_err, _PRIMARY_CTX = cuda.cuDevicePrimaryCtxRetain(DEVICE)
if _err != cuda.CUresult.CUDA_SUCCESS:
    raise RuntimeError(f"cuDevicePrimaryCtxRetain failed during setup: {_err}")


def bench_ctx_get_current(loops: int) -> float:
_fn = cuda.cuCtxGetCurrent
Expand Down Expand Up @@ -60,3 +65,15 @@ def bench_device_get_attribute(loops: int) -> float:
for _ in range(loops):
_fn(_attr, _dev)
return time.perf_counter() - t0


def bench_device_primary_ctx_retain(loops: int) -> float:
    """Time one retain/release pair on the device's primary context per iteration.

    The module-level outer retain keeps the primary context alive, so the pair
    timed here only adjusts the refcount rather than creating/destroying the
    context each iteration.
    """
    # Bind globals to locals so the hot loop uses fast local lookups.
    retain_fn = cuda.cuDevicePrimaryCtxRetain
    release_fn = cuda.cuDevicePrimaryCtxRelease
    dev = DEVICE

    start = time.perf_counter()
    for _i in range(loops):
        retain_fn(dev)
        release_fn(dev)
    return time.perf_counter() - start
34 changes: 34 additions & 0 deletions benchmarks/cuda_bindings/benchmarks/bench_enum.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

import time

from cuda.bindings import driver as cuda


def bench_curesult_construction(loops: int) -> float:
    """Time constructing a `CUresult` enum instance from an int, per iteration."""
    make = cuda.CUresult  # local alias: fast lookup in the hot loop

    start = time.perf_counter()
    for _i in range(loops):
        make(0)
    return time.perf_counter() - start


def bench_curesult_member_access(loops: int) -> float:
    """Time accessing the `CUDA_SUCCESS` member on the `CUresult` enum class."""
    enum_cls = cuda.CUresult  # local alias: fast lookup in the hot loop

    start = time.perf_counter()
    for _i in range(loops):
        enum_cls.CUDA_SUCCESS  # noqa: B018 -- deliberate bare attribute access
    return time.perf_counter() - start


def bench_device_attribute_construction(loops: int) -> float:
    """Time constructing a `CUdevice_attribute` enum instance from an int."""
    make = cuda.CUdevice_attribute  # local alias: fast lookup in the hot loop

    start = time.perf_counter()
    for _i in range(loops):
        make(1)
    return time.perf_counter() - start
20 changes: 20 additions & 0 deletions benchmarks/cuda_bindings/benchmarks/bench_pointer_attributes.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,14 @@
PTR = alloc_persistent(1 << 18)
ATTRIBUTE = cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE

# Attribute set for the batched cuPointerGetAttributes benchmark.
# NOTE(review): presumably kept in sync with the 4-attribute list in the C++
# counterpart benchmark — confirm if either side changes.
ATTRIBUTES = (
    cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
    cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_POINTER,
    cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_HOST_POINTER,
    cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_BUFFER_ID,
)
# Count passed as the first argument of cuPointerGetAttributes.
NUM_ATTRIBUTES = len(ATTRIBUTES)


def bench_pointer_get_attribute(loops: int) -> float:
# Local references to avoid global lookups in the hot loop
Expand All @@ -23,3 +31,15 @@ def bench_pointer_get_attribute(loops: int) -> float:
for _ in range(loops):
_fn(_attr, _ptr)
return time.perf_counter() - t0


def bench_pointer_get_attributes(loops: int) -> float:
    """Time one batched cuPointerGetAttributes query over ATTRIBUTES per iteration."""
    # Hoist globals into locals so the loop pays no global-lookup cost.
    query = cuda.cuPointerGetAttributes
    count, attr_list, target = NUM_ATTRIBUTES, ATTRIBUTES, PTR

    start = time.perf_counter()
    for _i in range(loops):
        query(count, attr_list, target)
    return time.perf_counter() - start
203 changes: 203 additions & 0 deletions benchmarks/cuda_bindings/benchmarks/bench_tensormap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

import time

from runner.runtime import alloc_persistent, ensure_context

from cuda.bindings import driver as cuda

# Make sure a CUDA context is current before any driver calls below.
ensure_context()

# Persistent 1 MiB device allocation reused as the global address for every
# tensor-map encode in this module.
PTR = alloc_persistent(1 << 20)

# Short aliases for the driver's fixed-width integer wrapper types.
cuuint32_t = cuda.cuuint32_t
cuuint64_t = cuda.cuuint64_t

# Tiled: rank-2 float32, 128x128, 64x64 tile.
TILED_DTYPE = cuda.CUtensorMapDataType.CU_TENSOR_MAP_DATA_TYPE_FLOAT32
TILED_RANK = 2
TILED_GLOBAL_DIM = (cuuint64_t(128), cuuint64_t(128))
# NOTE(review): only one stride entry for rank 2 — assumes the API takes
# rank-1 stride entries (innermost stride implied by dtype); confirm against
# the cuTensorMapEncodeTiled documentation.
TILED_GLOBAL_STRIDES = (cuuint64_t(128 * 4),)
TILED_BOX_DIM = (cuuint32_t(64), cuuint32_t(64))
TILED_ELEMENT_STRIDES = (cuuint32_t(1), cuuint32_t(1))
TILED_INTERLEAVE = cuda.CUtensorMapInterleave.CU_TENSOR_MAP_INTERLEAVE_NONE
TILED_SWIZZLE = cuda.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_NONE
TILED_L2 = cuda.CUtensorMapL2promotion.CU_TENSOR_MAP_L2_PROMOTION_NONE
TILED_OOB = cuda.CUtensorMapFloatOOBfill.CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE

# Im2col: rank-3 float16, 32x64x64.
IM2COL_DTYPE = cuda.CUtensorMapDataType.CU_TENSOR_MAP_DATA_TYPE_FLOAT16
IM2COL_RANK = 3
IM2COL_GLOBAL_DIM = (cuuint64_t(32), cuuint64_t(64), cuuint64_t(64))
IM2COL_GLOBAL_STRIDES = (cuuint64_t(32 * 2), cuuint64_t(32 * 64 * 2))
IM2COL_PIXEL_BOX_LOWER = (0,)
IM2COL_PIXEL_BOX_UPPER = (0,)
IM2COL_CHANNELS = 32
IM2COL_PIXELS = 32
IM2COL_ELEMENT_STRIDES = (cuuint32_t(1), cuuint32_t(1), cuuint32_t(1))
IM2COL_INTERLEAVE = cuda.CUtensorMapInterleave.CU_TENSOR_MAP_INTERLEAVE_NONE
IM2COL_SWIZZLE = cuda.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_NONE
IM2COL_L2 = cuda.CUtensorMapL2promotion.CU_TENSOR_MAP_L2_PROMOTION_NONE
IM2COL_OOB = cuda.CUtensorMapFloatOOBfill.CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE

# Success status compared against by the probe helpers below.
_SUCCESS = cuda.CUresult.CUDA_SUCCESS

# Resolve bindings once at module load. A missing attribute (old binding that
# predates a TMA API) is the only legitimate reason for a probe to skip —
# everything else (signature mismatches, unexpected TypeError, etc.) should
# surface loudly instead of being reclassified as "unsupported".
_ENCODE_TILED = getattr(cuda, "cuTensorMapEncodeTiled", None)
_ENCODE_IM2COL = getattr(cuda, "cuTensorMapEncodeIm2col", None)
_ENCODE_IM2COL_WIDE = getattr(cuda, "cuTensorMapEncodeIm2colWide", None)
_IM2COL_WIDE_MODE_CLS = getattr(cuda, "CUtensorMapIm2ColWideMode", None)


def _probe_tiled() -> bool:
    """Return True when a tiled tensor-map encode succeeds in this environment."""
    if _ENCODE_TILED is None:
        # Binding predates cuTensorMapEncodeTiled.
        return False
    tiled_args = (
        TILED_DTYPE,
        TILED_RANK,
        PTR,
        TILED_GLOBAL_DIM,
        TILED_GLOBAL_STRIDES,
        TILED_BOX_DIM,
        TILED_ELEMENT_STRIDES,
        TILED_INTERLEAVE,
        TILED_SWIZZLE,
        TILED_L2,
        TILED_OOB,
    )
    status, _tensor_map = _ENCODE_TILED(*tiled_args)
    return status == _SUCCESS


def _probe_im2col() -> bool:
    """Return True when an im2col tensor-map encode succeeds in this environment."""
    if _ENCODE_IM2COL is None:
        # Binding predates cuTensorMapEncodeIm2col.
        return False
    im2col_args = (
        IM2COL_DTYPE,
        IM2COL_RANK,
        PTR,
        IM2COL_GLOBAL_DIM,
        IM2COL_GLOBAL_STRIDES,
        IM2COL_PIXEL_BOX_LOWER,
        IM2COL_PIXEL_BOX_UPPER,
        IM2COL_CHANNELS,
        IM2COL_PIXELS,
        IM2COL_ELEMENT_STRIDES,
        IM2COL_INTERLEAVE,
        IM2COL_SWIZZLE,
        IM2COL_L2,
        IM2COL_OOB,
    )
    status, _tensor_map = _ENCODE_IM2COL(*im2col_args)
    return status == _SUCCESS


def _probe_im2col_wide() -> bool:
    """Return True when a wide-mode im2col encode succeeds in this environment."""
    if _ENCODE_IM2COL_WIDE is None or _IM2COL_WIDE_MODE_CLS is None:
        # Binding predates the im2col-wide API or its mode enum.
        return False
    wide_mode = _IM2COL_WIDE_MODE_CLS.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W
    wide_args = (
        IM2COL_DTYPE,
        IM2COL_RANK,
        PTR,
        IM2COL_GLOBAL_DIM,
        IM2COL_GLOBAL_STRIDES,
        0,  # lower corner (width)
        0,  # upper corner (width)
        IM2COL_CHANNELS,
        IM2COL_PIXELS,
        IM2COL_ELEMENT_STRIDES,
        IM2COL_INTERLEAVE,
        wide_mode,
        cuda.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_128B,
        IM2COL_L2,
        IM2COL_OOB,
    )
    status, _tensor_map = _ENCODE_IM2COL_WIDE(*wide_args)
    return status == _SUCCESS


# Probe each encode variant once at import time; each flag gates the matching
# benchmark below.
_TILED_OK = _probe_tiled()
_IM2COL_OK = _probe_im2col()
_IM2COL_WIDE_OK = _probe_im2col_wide()

if _IM2COL_WIDE_OK:
    # Only bound when the wide-mode probe succeeded; otherwise the wide
    # benchmark is listed in SKIPPED_BENCHMARKS and these names stay undefined.
    _IM2COL_WIDE_MODE_W = _IM2COL_WIDE_MODE_CLS.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W
    _IM2COL_WIDE_SWIZZLE = cuda.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_128B

# Names of benchmark functions to skip on this setup — presumably consumed by
# the benchmark runner; verify against the runner's skip handling.
SKIPPED_BENCHMARKS: set[str] = set()
if not _TILED_OK:
    SKIPPED_BENCHMARKS.add("bench_tensor_map_encode_tiled")
if not _IM2COL_OK:
    SKIPPED_BENCHMARKS.add("bench_tensor_map_encode_im2col")
if not _IM2COL_WIDE_OK:
    SKIPPED_BENCHMARKS.add("bench_tensor_map_encode_im2col_wide")


def bench_tensor_map_encode_tiled(loops: int) -> float:
    """Time cuTensorMapEncodeTiled with fixed rank-2 float32 parameters.

    Uses the binding resolved once at module load (`_ENCODE_TILED`) instead of
    re-reading `cuda.cuTensorMapEncodeTiled`, for consistency with the
    im2col-wide benchmark. When the binding is missing, this function is
    listed in SKIPPED_BENCHMARKS and is not expected to be called.
    """
    # Local aliases keep the hot loop free of global lookups.
    _fn = _ENCODE_TILED
    _dt = TILED_DTYPE
    _rank = TILED_RANK
    _addr = PTR
    _gdim = TILED_GLOBAL_DIM
    _gstr = TILED_GLOBAL_STRIDES
    _bdim = TILED_BOX_DIM
    _estr = TILED_ELEMENT_STRIDES
    _inter = TILED_INTERLEAVE
    _swz = TILED_SWIZZLE
    _l2 = TILED_L2
    _oob = TILED_OOB

    t0 = time.perf_counter()
    for _ in range(loops):
        _fn(_dt, _rank, _addr, _gdim, _gstr, _bdim, _estr, _inter, _swz, _l2, _oob)
    return time.perf_counter() - t0


def bench_tensor_map_encode_im2col(loops: int) -> float:
    """Time cuTensorMapEncodeIm2col with fixed rank-3 float16 parameters.

    Uses the binding resolved once at module load (`_ENCODE_IM2COL`) instead
    of re-reading `cuda.cuTensorMapEncodeIm2col`, for consistency with the
    im2col-wide benchmark. When the binding is missing, this function is
    listed in SKIPPED_BENCHMARKS and is not expected to be called.
    """
    # Local aliases keep the hot loop free of global lookups.
    _fn = _ENCODE_IM2COL
    _dt = IM2COL_DTYPE
    _rank = IM2COL_RANK
    _addr = PTR
    _gdim = IM2COL_GLOBAL_DIM
    _gstr = IM2COL_GLOBAL_STRIDES
    _lower = IM2COL_PIXEL_BOX_LOWER
    _upper = IM2COL_PIXEL_BOX_UPPER
    _ch = IM2COL_CHANNELS
    _px = IM2COL_PIXELS
    _estr = IM2COL_ELEMENT_STRIDES
    _inter = IM2COL_INTERLEAVE
    _swz = IM2COL_SWIZZLE
    _l2 = IM2COL_L2
    _oob = IM2COL_OOB

    t0 = time.perf_counter()
    for _ in range(loops):
        _fn(_dt, _rank, _addr, _gdim, _gstr, _lower, _upper, _ch, _px, _estr, _inter, _swz, _l2, _oob)
    return time.perf_counter() - t0


def bench_tensor_map_encode_im2col_wide(loops: int) -> float:
    """Time cuTensorMapEncodeIm2colWide with fixed rank-3 float16 parameters."""
    # Snapshot every argument into locals; the loop body is then one bound call.
    encode = _ENCODE_IM2COL_WIDE
    dtype, rank, addr = IM2COL_DTYPE, IM2COL_RANK, PTR
    gdim, gstr = IM2COL_GLOBAL_DIM, IM2COL_GLOBAL_STRIDES
    lower_w = upper_w = 0
    channels, pixels = IM2COL_CHANNELS, IM2COL_PIXELS
    estr, inter = IM2COL_ELEMENT_STRIDES, IM2COL_INTERLEAVE
    mode, swz = _IM2COL_WIDE_MODE_W, _IM2COL_WIDE_SWIZZLE
    l2, oob = IM2COL_L2, IM2COL_OOB

    start = time.perf_counter()
    for _i in range(loops):
        encode(dtype, rank, addr, gdim, gstr, lower_w, upper_w, channels, pixels, estr, inter, mode, swz, l2, oob)
    return time.perf_counter() - start
1 change: 1 addition & 0 deletions benchmarks/cuda_bindings/benchmarks/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ add_driver_benchmark(bench_ctx_device)
add_driver_benchmark(bench_stream)
add_driver_benchmark(bench_event)
add_driver_benchmark(bench_memory)
add_driver_benchmark(bench_tensormap)

# NVRTC benchmarks (require nvrtc for kernel compilation)
if(NVRTC_INCLUDE_DIR AND NVRTC_LIBRARY)
Expand Down
19 changes: 19 additions & 0 deletions benchmarks/cuda_bindings/benchmarks/cpp/bench_ctx_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,25 @@ int main(int argc, char** argv) {
});
}

// --- device_primary_ctx_retain ---
// Outer retain so the benchmarked retain/release pair just bumps the refcount.
CUcontext primary_outer = nullptr;
check_cu(
cuDevicePrimaryCtxRetain(&primary_outer, device),
"cuDevicePrimaryCtxRetain (setup) failed"
);
{
CUcontext primary = nullptr;
suite.run("ctx_device.device_primary_ctx_retain", [&]() {
check_cu(cuDevicePrimaryCtxRetain(&primary, device), "cuDevicePrimaryCtxRetain failed");
check_cu(cuDevicePrimaryCtxRelease(device), "cuDevicePrimaryCtxRelease failed");
});
}
check_cu(
cuDevicePrimaryCtxRelease(device),
"cuDevicePrimaryCtxRelease (teardown) failed"
);

// Cleanup
check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,29 @@ int main(int argc, char** argv) {
});
}

// --- pointer_get_attributes ---
{
unsigned int memory_type = 0;
CUdeviceptr dev_ptr_out = 0;
void* host_ptr_out = nullptr;
unsigned long long buffer_id = 0;

CUpointer_attribute attrs[4] = {
CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
CU_POINTER_ATTRIBUTE_DEVICE_POINTER,
CU_POINTER_ATTRIBUTE_HOST_POINTER,
CU_POINTER_ATTRIBUTE_BUFFER_ID,
};
void* data[4] = {&memory_type, &dev_ptr_out, &host_ptr_out, &buffer_id};

suite.run("pointer_attributes.pointer_get_attributes", [&]() {
check_cu(
cuPointerGetAttributes(4, attrs, data, ptr),
"cuPointerGetAttributes failed"
);
});
}

// Cleanup
check_cu(cuMemFree(ptr), "cuMemFree failed");
check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed");
Expand Down
Loading
Loading