Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions benchmarks/cuda_bindings/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,13 @@ Driver APIs through cuda.bindings, relative to a similar C++ baseline.
The goal is to benchmark how much overhead the Python layer adds to calling
CUDA APIs and which operations fall outside our target of less than 1us of overhead.

Each Python benchmark has a C++ counterpart, which is used to compare the
operations. We try to make each implementation perform small operations
and nearly the same work as possible and are run under similar conditions.
Most Python benchmarks have a C++ counterpart that is used as a comparative
baseline. We try to make each implementation perform small operations, do
nearly the same work, and run under similar conditions.

A few benchmarks (e.g. in `bench_enum.py`) are intentionally Python-only
because they measure costs with no direct C++ equivalent — such as enum
construction and member access on `cuda.bindings` enum classes.

These are **not** throughput benchmarks to measure the overall performance
of kernels and applications.
Expand Down
17 changes: 17 additions & 0 deletions benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@
# Benchmark fixtures resolved once at import time.
# Check the cuDeviceGet status instead of discarding it, for consistency with
# the primary-context retain below — a bad device would otherwise surface as a
# confusing failure inside the benchmark loops.
_err, DEVICE = cuda.cuDeviceGet(0)
if _err != cuda.CUresult.CUDA_SUCCESS:
    raise RuntimeError(f"cuDeviceGet failed during setup: {_err}")
ATTRIBUTE = cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR

# Outer retain so the benchmarked retain/release pair just bumps the refcount.
_err, _PRIMARY_CTX = cuda.cuDevicePrimaryCtxRetain(DEVICE)
if _err != cuda.CUresult.CUDA_SUCCESS:
    raise RuntimeError(f"cuDevicePrimaryCtxRetain failed during setup: {_err}")


def bench_ctx_get_current(loops: int) -> float:
_fn = cuda.cuCtxGetCurrent
Expand Down Expand Up @@ -60,3 +65,15 @@ def bench_device_get_attribute(loops: int) -> float:
for _ in range(loops):
_fn(_attr, _dev)
return time.perf_counter() - t0


def bench_device_primary_ctx_retain(loops: int) -> float:
    """Time one retain/release pair on the device's primary context per iteration.

    The module-level outer retain keeps the primary context alive, so the pair
    timed here only adjusts the refcount rather than creating/destroying the
    context each iteration.
    """
    # Bind globals to locals so the hot loop uses fast local lookups.
    retain_fn = cuda.cuDevicePrimaryCtxRetain
    release_fn = cuda.cuDevicePrimaryCtxRelease
    dev = DEVICE

    start = time.perf_counter()
    for _i in range(loops):
        retain_fn(dev)
        release_fn(dev)
    return time.perf_counter() - start
34 changes: 34 additions & 0 deletions benchmarks/cuda_bindings/benchmarks/bench_enum.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

import time

from cuda.bindings import driver as cuda


def bench_curesult_construction(loops: int) -> float:
    """Time constructing a `CUresult` enum instance from an int, per iteration."""
    make = cuda.CUresult  # local alias: fast lookup in the hot loop

    start = time.perf_counter()
    for _i in range(loops):
        make(0)
    return time.perf_counter() - start


def bench_curesult_member_access(loops: int) -> float:
    """Time accessing the `CUDA_SUCCESS` member on the `CUresult` enum class."""
    enum_cls = cuda.CUresult  # local alias: fast lookup in the hot loop

    start = time.perf_counter()
    for _i in range(loops):
        enum_cls.CUDA_SUCCESS  # noqa: B018 -- deliberate bare attribute access
    return time.perf_counter() - start


def bench_device_attribute_construction(loops: int) -> float:
    """Time constructing a `CUdevice_attribute` enum instance from an int."""
    make = cuda.CUdevice_attribute  # local alias: fast lookup in the hot loop

    start = time.perf_counter()
    for _i in range(loops):
        make(1)
    return time.perf_counter() - start
20 changes: 20 additions & 0 deletions benchmarks/cuda_bindings/benchmarks/bench_pointer_attributes.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,14 @@
PTR = alloc_persistent(1 << 18)
ATTRIBUTE = cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE

# Attribute set for the batched cuPointerGetAttributes benchmark.
# NOTE(review): presumably kept in sync with the 4-attribute list in the C++
# counterpart benchmark — confirm if either side changes.
ATTRIBUTES = (
    cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
    cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_POINTER,
    cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_HOST_POINTER,
    cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_BUFFER_ID,
)
# Count passed as the first argument of cuPointerGetAttributes.
NUM_ATTRIBUTES = len(ATTRIBUTES)


def bench_pointer_get_attribute(loops: int) -> float:
# Local references to avoid global lookups in the hot loop
Expand All @@ -23,3 +31,15 @@ def bench_pointer_get_attribute(loops: int) -> float:
for _ in range(loops):
_fn(_attr, _ptr)
return time.perf_counter() - t0


def bench_pointer_get_attributes(loops: int) -> float:
    """Time one batched cuPointerGetAttributes query over ATTRIBUTES per iteration."""
    # Hoist globals into locals so the loop pays no global-lookup cost.
    query = cuda.cuPointerGetAttributes
    count, attr_list, target = NUM_ATTRIBUTES, ATTRIBUTES, PTR

    start = time.perf_counter()
    for _i in range(loops):
        query(count, attr_list, target)
    return time.perf_counter() - start
203 changes: 203 additions & 0 deletions benchmarks/cuda_bindings/benchmarks/bench_tensormap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

import time

from runner.runtime import alloc_persistent, ensure_context

from cuda.bindings import driver as cuda

# Make sure a CUDA context is current before any driver calls below.
ensure_context()

# Persistent 1 MiB device allocation reused as the global address for every
# tensor-map encode in this module.
PTR = alloc_persistent(1 << 20)

# Short aliases for the driver's fixed-width integer wrapper types.
cuuint32_t = cuda.cuuint32_t
cuuint64_t = cuda.cuuint64_t

# Tiled: rank-2 float32, 128x128, 64x64 tile.
TILED_DTYPE = cuda.CUtensorMapDataType.CU_TENSOR_MAP_DATA_TYPE_FLOAT32
TILED_RANK = 2
TILED_GLOBAL_DIM = (cuuint64_t(128), cuuint64_t(128))
# NOTE(review): only one stride entry for rank 2 — assumes the API takes
# rank-1 stride entries (innermost stride implied by dtype); confirm against
# the cuTensorMapEncodeTiled documentation.
TILED_GLOBAL_STRIDES = (cuuint64_t(128 * 4),)
TILED_BOX_DIM = (cuuint32_t(64), cuuint32_t(64))
TILED_ELEMENT_STRIDES = (cuuint32_t(1), cuuint32_t(1))
TILED_INTERLEAVE = cuda.CUtensorMapInterleave.CU_TENSOR_MAP_INTERLEAVE_NONE
TILED_SWIZZLE = cuda.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_NONE
TILED_L2 = cuda.CUtensorMapL2promotion.CU_TENSOR_MAP_L2_PROMOTION_NONE
TILED_OOB = cuda.CUtensorMapFloatOOBfill.CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE

# Im2col: rank-3 float16, 32x64x64.
IM2COL_DTYPE = cuda.CUtensorMapDataType.CU_TENSOR_MAP_DATA_TYPE_FLOAT16
IM2COL_RANK = 3
IM2COL_GLOBAL_DIM = (cuuint64_t(32), cuuint64_t(64), cuuint64_t(64))
IM2COL_GLOBAL_STRIDES = (cuuint64_t(32 * 2), cuuint64_t(32 * 64 * 2))
IM2COL_PIXEL_BOX_LOWER = (0,)
IM2COL_PIXEL_BOX_UPPER = (0,)
IM2COL_CHANNELS = 32
IM2COL_PIXELS = 32
IM2COL_ELEMENT_STRIDES = (cuuint32_t(1), cuuint32_t(1), cuuint32_t(1))
IM2COL_INTERLEAVE = cuda.CUtensorMapInterleave.CU_TENSOR_MAP_INTERLEAVE_NONE
IM2COL_SWIZZLE = cuda.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_NONE
IM2COL_L2 = cuda.CUtensorMapL2promotion.CU_TENSOR_MAP_L2_PROMOTION_NONE
IM2COL_OOB = cuda.CUtensorMapFloatOOBfill.CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE

# Success status compared against by the probe helpers below.
_SUCCESS = cuda.CUresult.CUDA_SUCCESS

# Resolve bindings once at module load. A missing attribute (old binding that
# predates a TMA API) is the only legitimate reason for a probe to skip —
# everything else (signature mismatches, unexpected TypeError, etc.) should
# surface loudly instead of being reclassified as "unsupported".
_ENCODE_TILED = getattr(cuda, "cuTensorMapEncodeTiled", None)
_ENCODE_IM2COL = getattr(cuda, "cuTensorMapEncodeIm2col", None)
_ENCODE_IM2COL_WIDE = getattr(cuda, "cuTensorMapEncodeIm2colWide", None)
_IM2COL_WIDE_MODE_CLS = getattr(cuda, "CUtensorMapIm2ColWideMode", None)


def _probe_tiled() -> bool:
    """Return True when a tiled tensor-map encode succeeds in this environment."""
    if _ENCODE_TILED is None:
        # Binding predates cuTensorMapEncodeTiled.
        return False
    tiled_args = (
        TILED_DTYPE,
        TILED_RANK,
        PTR,
        TILED_GLOBAL_DIM,
        TILED_GLOBAL_STRIDES,
        TILED_BOX_DIM,
        TILED_ELEMENT_STRIDES,
        TILED_INTERLEAVE,
        TILED_SWIZZLE,
        TILED_L2,
        TILED_OOB,
    )
    status, _tensor_map = _ENCODE_TILED(*tiled_args)
    return status == _SUCCESS


def _probe_im2col() -> bool:
    """Return True when an im2col tensor-map encode succeeds in this environment."""
    if _ENCODE_IM2COL is None:
        # Binding predates cuTensorMapEncodeIm2col.
        return False
    im2col_args = (
        IM2COL_DTYPE,
        IM2COL_RANK,
        PTR,
        IM2COL_GLOBAL_DIM,
        IM2COL_GLOBAL_STRIDES,
        IM2COL_PIXEL_BOX_LOWER,
        IM2COL_PIXEL_BOX_UPPER,
        IM2COL_CHANNELS,
        IM2COL_PIXELS,
        IM2COL_ELEMENT_STRIDES,
        IM2COL_INTERLEAVE,
        IM2COL_SWIZZLE,
        IM2COL_L2,
        IM2COL_OOB,
    )
    status, _tensor_map = _ENCODE_IM2COL(*im2col_args)
    return status == _SUCCESS


def _probe_im2col_wide() -> bool:
    """Return True when a wide-mode im2col encode succeeds in this environment."""
    if _ENCODE_IM2COL_WIDE is None or _IM2COL_WIDE_MODE_CLS is None:
        # Binding predates the im2col-wide API or its mode enum.
        return False
    wide_mode = _IM2COL_WIDE_MODE_CLS.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W
    wide_args = (
        IM2COL_DTYPE,
        IM2COL_RANK,
        PTR,
        IM2COL_GLOBAL_DIM,
        IM2COL_GLOBAL_STRIDES,
        0,  # lower corner (width)
        0,  # upper corner (width)
        IM2COL_CHANNELS,
        IM2COL_PIXELS,
        IM2COL_ELEMENT_STRIDES,
        IM2COL_INTERLEAVE,
        wide_mode,
        cuda.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_128B,
        IM2COL_L2,
        IM2COL_OOB,
    )
    status, _tensor_map = _ENCODE_IM2COL_WIDE(*wide_args)
    return status == _SUCCESS


# Probe each encode variant once at import time; each flag gates the matching
# benchmark below.
_TILED_OK = _probe_tiled()
_IM2COL_OK = _probe_im2col()
_IM2COL_WIDE_OK = _probe_im2col_wide()

if _IM2COL_WIDE_OK:
    # Only bound when the wide-mode probe succeeded; otherwise the wide
    # benchmark is listed in SKIPPED_BENCHMARKS and these names stay undefined.
    _IM2COL_WIDE_MODE_W = _IM2COL_WIDE_MODE_CLS.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W
    _IM2COL_WIDE_SWIZZLE = cuda.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_128B

# Names of benchmark functions to skip on this setup — presumably consumed by
# the benchmark runner; verify against the runner's skip handling.
SKIPPED_BENCHMARKS: set[str] = set()
if not _TILED_OK:
    SKIPPED_BENCHMARKS.add("bench_tensor_map_encode_tiled")
if not _IM2COL_OK:
    SKIPPED_BENCHMARKS.add("bench_tensor_map_encode_im2col")
if not _IM2COL_WIDE_OK:
    SKIPPED_BENCHMARKS.add("bench_tensor_map_encode_im2col_wide")


def bench_tensor_map_encode_tiled(loops: int) -> float:
    """Time cuTensorMapEncodeTiled with fixed rank-2 float32 parameters.

    Uses the binding resolved once at module load (`_ENCODE_TILED`) instead of
    re-reading `cuda.cuTensorMapEncodeTiled`, for consistency with the
    im2col-wide benchmark. When the binding is missing, this function is
    listed in SKIPPED_BENCHMARKS and is not expected to be called.
    """
    # Local aliases keep the hot loop free of global lookups.
    _fn = _ENCODE_TILED
    _dt = TILED_DTYPE
    _rank = TILED_RANK
    _addr = PTR
    _gdim = TILED_GLOBAL_DIM
    _gstr = TILED_GLOBAL_STRIDES
    _bdim = TILED_BOX_DIM
    _estr = TILED_ELEMENT_STRIDES
    _inter = TILED_INTERLEAVE
    _swz = TILED_SWIZZLE
    _l2 = TILED_L2
    _oob = TILED_OOB

    t0 = time.perf_counter()
    for _ in range(loops):
        _fn(_dt, _rank, _addr, _gdim, _gstr, _bdim, _estr, _inter, _swz, _l2, _oob)
    return time.perf_counter() - t0


def bench_tensor_map_encode_im2col(loops: int) -> float:
    """Time cuTensorMapEncodeIm2col with fixed rank-3 float16 parameters.

    Uses the binding resolved once at module load (`_ENCODE_IM2COL`) instead
    of re-reading `cuda.cuTensorMapEncodeIm2col`, for consistency with the
    im2col-wide benchmark. When the binding is missing, this function is
    listed in SKIPPED_BENCHMARKS and is not expected to be called.
    """
    # Local aliases keep the hot loop free of global lookups.
    _fn = _ENCODE_IM2COL
    _dt = IM2COL_DTYPE
    _rank = IM2COL_RANK
    _addr = PTR
    _gdim = IM2COL_GLOBAL_DIM
    _gstr = IM2COL_GLOBAL_STRIDES
    _lower = IM2COL_PIXEL_BOX_LOWER
    _upper = IM2COL_PIXEL_BOX_UPPER
    _ch = IM2COL_CHANNELS
    _px = IM2COL_PIXELS
    _estr = IM2COL_ELEMENT_STRIDES
    _inter = IM2COL_INTERLEAVE
    _swz = IM2COL_SWIZZLE
    _l2 = IM2COL_L2
    _oob = IM2COL_OOB

    t0 = time.perf_counter()
    for _ in range(loops):
        _fn(_dt, _rank, _addr, _gdim, _gstr, _lower, _upper, _ch, _px, _estr, _inter, _swz, _l2, _oob)
    return time.perf_counter() - t0


def bench_tensor_map_encode_im2col_wide(loops: int) -> float:
    """Time cuTensorMapEncodeIm2colWide with fixed rank-3 float16 parameters."""
    # Snapshot every argument into locals; the loop body is then one bound call.
    encode = _ENCODE_IM2COL_WIDE
    dtype, rank, addr = IM2COL_DTYPE, IM2COL_RANK, PTR
    gdim, gstr = IM2COL_GLOBAL_DIM, IM2COL_GLOBAL_STRIDES
    lower_w = upper_w = 0
    channels, pixels = IM2COL_CHANNELS, IM2COL_PIXELS
    estr, inter = IM2COL_ELEMENT_STRIDES, IM2COL_INTERLEAVE
    mode, swz = _IM2COL_WIDE_MODE_W, _IM2COL_WIDE_SWIZZLE
    l2, oob = IM2COL_L2, IM2COL_OOB

    start = time.perf_counter()
    for _i in range(loops):
        encode(dtype, rank, addr, gdim, gstr, lower_w, upper_w, channels, pixels, estr, inter, mode, swz, l2, oob)
    return time.perf_counter() - start
1 change: 1 addition & 0 deletions benchmarks/cuda_bindings/benchmarks/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ add_driver_benchmark(bench_ctx_device)
add_driver_benchmark(bench_stream)
add_driver_benchmark(bench_event)
add_driver_benchmark(bench_memory)
add_driver_benchmark(bench_tensormap)

# NVRTC benchmarks (require nvrtc for kernel compilation)
if(NVRTC_INCLUDE_DIR AND NVRTC_LIBRARY)
Expand Down
19 changes: 19 additions & 0 deletions benchmarks/cuda_bindings/benchmarks/cpp/bench_ctx_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,25 @@ int main(int argc, char** argv) {
});
}

// --- device_primary_ctx_retain ---
// Outer retain so the benchmarked retain/release pair just bumps the refcount.
CUcontext primary_outer = nullptr;
check_cu(
cuDevicePrimaryCtxRetain(&primary_outer, device),
"cuDevicePrimaryCtxRetain (setup) failed"
);
{
CUcontext primary = nullptr;
suite.run("ctx_device.device_primary_ctx_retain", [&]() {
check_cu(cuDevicePrimaryCtxRetain(&primary, device), "cuDevicePrimaryCtxRetain failed");
check_cu(cuDevicePrimaryCtxRelease(device), "cuDevicePrimaryCtxRelease failed");
});
}
check_cu(
cuDevicePrimaryCtxRelease(device),
"cuDevicePrimaryCtxRelease (teardown) failed"
);

// Cleanup
check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,29 @@ int main(int argc, char** argv) {
});
}

// --- pointer_get_attributes ---
{
unsigned int memory_type = 0;
CUdeviceptr dev_ptr_out = 0;
void* host_ptr_out = nullptr;
unsigned long long buffer_id = 0;

CUpointer_attribute attrs[4] = {
CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
CU_POINTER_ATTRIBUTE_DEVICE_POINTER,
CU_POINTER_ATTRIBUTE_HOST_POINTER,
CU_POINTER_ATTRIBUTE_BUFFER_ID,
};
void* data[4] = {&memory_type, &dev_ptr_out, &host_ptr_out, &buffer_id};

suite.run("pointer_attributes.pointer_get_attributes", [&]() {
check_cu(
cuPointerGetAttributes(4, attrs, data, ptr),
"cuPointerGetAttributes failed"
);
});
}

// Cleanup
check_cu(cuMemFree(ptr), "cuMemFree failed");
check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed");
Expand Down
Loading
Loading